From d442f641d6daeb4310a057bc6193f87517c9f6a0 Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sat, 20 Jan 2024 06:36:44 +0900 Subject: [PATCH 1/7] config class gpu_temperature_protection --- scripts/gpu_temperature_protection.py | 58 +++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index 20bc9d3..059799b 100644 --- a/scripts/gpu_temperature_protection.py +++ b/scripts/gpu_temperature_protection.py @@ -25,7 +25,28 @@ def download_open_hardware_monitor(): f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll')) +class TemperatureConfig: + def __init__(self, sleep_temp, wake_temp, max_sleep_time): + self.sleep_temp_key = sleep_temp + self.wake_temp_key = wake_temp + self.max_sleep_time_key = max_sleep_time + + @property + def sleep_temp(self): + return getattr(shared.opts, self.sleep_temp_key) + + @property + def wake_temp(self): + return getattr(shared.opts, self.wake_temp_key) + + @property + def max_sleep_time(self): + return getattr(shared.opts, self.max_sleep_time_key) + + class GPUTemperatureProtection(scripts.Script): + temperature_func = None + def title(self): return "GPU temperature protection" @@ -34,10 +55,8 @@ class GPUTemperatureProtection(scripts.Script): def setup(self, p, *args): if shared.opts.gpu_temps_sleep_enable: - sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator( - sd_samplers_common.store_latent, - GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src) - ) + GPUTemperatureProtection.temperature_func = GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src) + sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator(sd_samplers_common.store_latent) p.close = 
GPUTemperatureProtection.gpu_temperature_close_decorator(p.close) @staticmethod @@ -108,7 +127,7 @@ class GPUTemperatureProtection(scripts.Script): print(f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}") except Exception as e: - print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor \: {e}") + print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}") @staticmethod def get_gpu_temperature_open_hardware_monitor(): @@ -140,34 +159,41 @@ class GPUTemperatureProtection(scripts.Script): return GPUTemperatureProtection.temperature_src_dict.get(source_name, GPUTemperatureProtection.get_gpu_temperature_nvidia_smi) @staticmethod - def gpu_temperature_protection(temperature_src_fun): + def gpu_temperature_protection(config: TemperatureConfig): if shared.opts.gpu_temps_sleep_enable: call_time = time.time() if call_time - GPUTemperatureProtection.last_call_time > shared.opts.gpu_temps_sleep_minimum_interval: - gpu_core_temp = temperature_src_fun() - if gpu_core_temp > shared.opts.gpu_temps_sleep_sleep_temp: - + gpu_core_temp = GPUTemperatureProtection.temperature_func() + if gpu_core_temp > config.sleep_temp: if shared.opts.gpu_temps_sleep_print: print(f'\n\nGPU Temperature: {gpu_core_temp}') - time.sleep(shared.opts.gpu_temps_sleep_sleep_time) - gpu_core_temp = temperature_src_fun() - while gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp and (not shared.opts.gpu_temps_sleep_max_sleep_time or shared.opts.gpu_temps_sleep_max_sleep_time > time.time() - call_time) and shared.opts.gpu_temps_sleep_enable: + gpu_core_temp = GPUTemperatureProtection.temperature_func() + while (gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp + and (not config.max_sleep_time or config.max_sleep_time > time.time() - call_time) + and shared.opts.gpu_temps_sleep_enable): if shared.opts.gpu_temps_sleep_print: print(f'GPU Temperature: {gpu_core_temp}') 
- + if shared.state.interrupted or shared.state.skipped: + break time.sleep(shared.opts.gpu_temps_sleep_sleep_time) - gpu_core_temp = temperature_src_fun() + gpu_core_temp = GPUTemperatureProtection.temperature_func() GPUTemperatureProtection.last_call_time = time.time() else: GPUTemperatureProtection.last_call_time = call_time @staticmethod - def gpu_temperature_protection_decorator(fun, temperature_src_fun): + def gpu_temperature_protection_decorator(fun): + config = TemperatureConfig( + 'gpu_temps_sleep_sleep_temp', + 'gpu_temps_sleep_wake_temp', + 'gpu_temps_sleep_max_sleep_time', + ) + def wrapper(*args, **kwargs): + GPUTemperatureProtection.gpu_temperature_protection(config) result = fun(*args, **kwargs) - GPUTemperatureProtection.gpu_temperature_protection(temperature_src_fun) return result return wrapper From 217afc92646864854c69d4747d9c710a4d1fa122 Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sat, 20 Jan 2024 06:41:48 +0900 Subject: [PATCH 2/7] # noqa --- scripts/gpu_temperature_protection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index 059799b..7a9927f 100644 --- a/scripts/gpu_temperature_protection.py +++ b/scripts/gpu_temperature_protection.py @@ -97,7 +97,7 @@ class GPUTemperatureProtection(scripts.Script): # install and import Python.NET module if not launch.is_installed("pythonnet"): launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib") - import clr # import pythonnet module. + import clr # noqa import pythonnet module. 
# download OpenHardwareMonitor if not found download_open_hardware_monitor() @@ -105,7 +105,7 @@ class GPUTemperatureProtection(scripts.Script): # initialize OpenHardwareMonitor if GPUTemperatureProtection.computer is None: clr.AddReference(str(OpenHardwareMonitorLib_path)) - from OpenHardwareMonitor.Hardware import Computer + from OpenHardwareMonitor.Hardware import Computer # noqa GPUTemperatureProtection.computer = Computer() GPUTemperatureProtection.computer.CPUEnabled = False # Disable CPU GPUTemperatureProtection.computer.GPUEnabled = True # Enable GPU From 872331bc7a14281323b251305087ee597433e1dd Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sun, 19 May 2024 16:37:04 +0900 Subject: [PATCH 3/7] split into modules --- scripts/gpu_temperature_protection.py | 319 +++++++----------- temperature_sensor_modules/amd_rocm_smi.py | 19 ++ temperature_sensor_modules/nvidia_smi.py | 13 + .../open_hardware_monitor.py | 75 ++++ 4 files changed, 220 insertions(+), 206 deletions(-) create mode 100644 temperature_sensor_modules/amd_rocm_smi.py create mode 100644 temperature_sensor_modules/nvidia_smi.py create mode 100644 temperature_sensor_modules/open_hardware_monitor.py diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index 7a9927f..ebbe91b 100644 --- a/scripts/gpu_temperature_protection.py +++ b/scripts/gpu_temperature_protection.py @@ -1,31 +1,68 @@ +from temperature_sensor_modules import nvidia_smi, amd_rocm_smi, open_hardware_monitor from modules import scripts, shared, sd_samplers_common -from pathlib import Path -import urllib.request +from typing import Callable import gradio as gr import subprocess -import zipfile -import launch import time import re import os -OpenHardwareMonitorLibDownloadUrl = "https://openhardwaremonitor.org/files/openhardwaremonitor-v0.9.6.zip" -OpenHardwareMonitor_path = Path(scripts.current_basedir).joinpath('OpenHardwareMonitor') -OpenHardwareMonitorLib_path = 
OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib') -OpenHardwareMonitorLib_dll_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib.dll') +pre_decorate_store_latent = sd_samplers_common.store_latent +temperature_func: Callable[[], float] + +temperature_src_dict = { + "NVIDIA - nvidia-smi": nvidia_smi.get_gpu_temperature_nvidia_smi, + "AMD - ROCm-smi": amd_rocm_smi.get_gpu_temperature_amd_rocm_smi, + "NVIDIA & AMD - OpenHardwareMonitor": open_hardware_monitor.get_gpu_temperature_open_hardware_monitor +} -def download_open_hardware_monitor(): - if not OpenHardwareMonitorLib_dll_path.is_file(): - OpenHardwareMonitor_path.mkdir(parents=True, exist_ok=True) - print("Downloading OpenHardwareMonitor") - zip_path, _ = urllib.request.urlretrieve(OpenHardwareMonitorLibDownloadUrl) - with zipfile.ZipFile(zip_path, "r") as z: - with open(os.path.realpath(OpenHardwareMonitorLib_dll_path), 'wb') as f: - f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll')) +def init_temps_src(): + global temperature_func + temperature_func = temperature_src_dict.get(shared.opts.gpu_temps_sleep_temperature_src, nvidia_smi.get_gpu_temperature_nvidia_smi) + if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': + if os.name != 'nt': + assert False, "NVIDIA & AMD - OpenHardwareMonitor it's only supported on Windows" + open_hardware_monitor.init_open_hardware_monitor() + elif shared.opts.gpu_temps_sleep_temperature_src == 'AMD - ROCm-smi' and os.name == 'nt': + assert False, "AMD - ROCm-smi is not supported on Windows" -class TemperatureConfig: +if hasattr(shared, "OptionHTML"): # < 1.6.0 support + shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { + "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
+AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
+NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD. + """) + })) + + +shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { + "gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(temperature_src_dict.keys())}, init_temps_src), + "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"), + "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"), + "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"), + "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"), + "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"), + "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"), + "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"), + "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"), +})) + +if os.name == 'nt': + try: + all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines() + video_controller_filter = re.compile(r"^\s+$") + names_list = 
[name.strip() for name in all_lines if not video_controller_filter.match(name) and name != ''] + shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { + "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"), + })) + except Exception as _e: + if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': + print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}') + + +class TemperatureProtection: def __init__(self, sleep_temp, wake_temp, max_sleep_time): self.sleep_temp_key = sleep_temp self.wake_temp_key = wake_temp @@ -43,9 +80,60 @@ class TemperatureConfig: def max_sleep_time(self): return getattr(shared.opts, self.max_sleep_time_key) + def temperature_protection(self): + if not shared.opts.gpu_temps_sleep_enable: + return -class GPUTemperatureProtection(scripts.Script): - temperature_func = None + global last_call_time + call_time = time.time() + if call_time - last_call_time < shared.opts.gpu_temps_sleep_minimum_interval: + return + + gpu_core_temp = temperature_func() + if gpu_core_temp > self.sleep_temp: + if shared.opts.gpu_temps_sleep_print: + print(f'\n\nGPU Temperature: {gpu_core_temp}') + time.sleep(shared.opts.gpu_temps_sleep_sleep_time) + gpu_core_temp = temperature_func() + while (gpu_core_temp > self.wake_temp + and (not self.max_sleep_time or self.max_sleep_time > time.time() - call_time) + and shared.opts.gpu_temps_sleep_enable): + if shared.opts.gpu_temps_sleep_print: + print(f'GPU Temperature: {gpu_core_temp}') + if shared.state.interrupted or shared.state.skipped: + break + time.sleep(shared.opts.gpu_temps_sleep_sleep_time) + gpu_core_temp = temperature_func() + last_call_time = time.time() + else: + last_call_time = call_time + + +config_store_latent = TemperatureProtection( + 
'gpu_temps_sleep_sleep_temp', + 'gpu_temps_sleep_wake_temp', + 'gpu_temps_sleep_max_sleep_time', +) + + +def gpu_temperature_protection_decorator(fun): + def wrapper(*args, **kwargs): + # gpu_temperature_protection(config_store_latent) + config_store_latent.temperature_protection() + result = fun(*args, **kwargs) + return result + return wrapper + + +def gpu_temperature_close_decorator(fun): + def wrapper(*args, **kwargs): + sd_samplers_common.store_latent = pre_decorate_store_latent + result = fun(*args, **kwargs) + return result + return wrapper + + +class GPUTemperatureProtectionScript(scripts.Script): def title(self): return "GPU temperature protection" @@ -55,192 +143,11 @@ class GPUTemperatureProtection(scripts.Script): def setup(self, p, *args): if shared.opts.gpu_temps_sleep_enable: - GPUTemperatureProtection.temperature_func = GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src) - sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator(sd_samplers_common.store_latent) - p.close = GPUTemperatureProtection.gpu_temperature_close_decorator(p.close) - - @staticmethod - def get_gpu_temperature_nvidia_smi(): - try: - return int(subprocess.check_output( - ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']).decode().strip().splitlines()[shared.opts.gpu_temps_sleep_gpu_index]) - except subprocess.CalledProcessError as e: - print(f"\n[Error GPU temperature protection] nvidia-smi: {e.output.decode('utf-8').strip()}") - except Exception as e: - print(f'\n[Error GPU temperature protection] nvidia-smi: {e}') - return 0 - - amd_rocm_smi_regex = re.compile(r'Temperature \(Sensor edge\) \(C\): (\d+\.\d+)') - - @staticmethod - def get_gpu_temperature_amd_rocm_smi(): - try: - output = subprocess.check_output(['rocm-smi', '--showtemp']).decode().strip() - match = GPUTemperatureProtection.amd_rocm_smi_regex.search(output) - if match: - return int(float(match.group(1))) - 
else: - print("\n[Error GPU temperature protection]: Couldn't parse temperature from rocm-smi output") - except subprocess.CalledProcessError as e: - print(f"\n[Error GPU temperature protection] rocm-smi: {e.output.decode('utf-8').strip()}") - except Exception as e: - print(f'\n[Error GPU temperature protection] rocm-smi: {e}') - return 0 - - computer = None - sensors = None - hardware = None - - @staticmethod - def init_open_hardware_monitor(): - try: - # install and import Python.NET module - if not launch.is_installed("pythonnet"): - launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib") - import clr # noqa import pythonnet module. - - # download OpenHardwareMonitor if not found - download_open_hardware_monitor() - - # initialize OpenHardwareMonitor - if GPUTemperatureProtection.computer is None: - clr.AddReference(str(OpenHardwareMonitorLib_path)) - from OpenHardwareMonitor.Hardware import Computer # noqa - GPUTemperatureProtection.computer = Computer() - GPUTemperatureProtection.computer.CPUEnabled = False # Disable CPU - GPUTemperatureProtection.computer.GPUEnabled = True # Enable GPU - GPUTemperatureProtection.computer.Open() - - # find the first matching temperature sensor for the specified hardware - if GPUTemperatureProtection.sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(GPUTemperatureProtection.hardware.Name): - for hardware in GPUTemperatureProtection.computer.Hardware: - if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name): - for sensor in hardware.Sensors: - if '/temperature' in str(sensor.Identifier): - GPUTemperatureProtection.sensors = sensor - GPUTemperatureProtection.hardware = hardware - return # sensor is found early return - - # sensor not found - GPUTemperatureProtection.sensors = None - GPUTemperatureProtection.hardware = None - print(f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}") - 
- except Exception as e: - print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}") - - @staticmethod - def get_gpu_temperature_open_hardware_monitor(): - try: - GPUTemperatureProtection.hardware.Update() - return int(GPUTemperatureProtection.sensors.get_Value()) - except Exception as e: - print(f"\n[Error GPU temperature protection] OpenHardwareMonitor: Couldn't read temperature{e}") - return 0 - - @staticmethod - def on_change_temps_src(): - if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': - if os.name == 'nt': - GPUTemperatureProtection.init_open_hardware_monitor() - else: - assert False, "NVIDIA & AMD - OpenHardwareMonitor it's only supported on Windows" - elif shared.opts.gpu_temps_sleep_temperature_src == 'AMD - ROCm-smi' and os.name == 'nt': - assert False, "AMD - ROCm-smi is not supported on Windows" - - temperature_src_dict = { - "NVIDIA - nvidia-smi": get_gpu_temperature_nvidia_smi, - "AMD - ROCm-smi": get_gpu_temperature_amd_rocm_smi, - "NVIDIA & AMD - OpenHardwareMonitor": get_gpu_temperature_open_hardware_monitor - } - - @staticmethod - def get_temperature_src_function(source_name): - return GPUTemperatureProtection.temperature_src_dict.get(source_name, GPUTemperatureProtection.get_gpu_temperature_nvidia_smi) - - @staticmethod - def gpu_temperature_protection(config: TemperatureConfig): - if shared.opts.gpu_temps_sleep_enable: - call_time = time.time() - if call_time - GPUTemperatureProtection.last_call_time > shared.opts.gpu_temps_sleep_minimum_interval: - gpu_core_temp = GPUTemperatureProtection.temperature_func() - if gpu_core_temp > config.sleep_temp: - if shared.opts.gpu_temps_sleep_print: - print(f'\n\nGPU Temperature: {gpu_core_temp}') - time.sleep(shared.opts.gpu_temps_sleep_sleep_time) - gpu_core_temp = GPUTemperatureProtection.temperature_func() - while (gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp - and (not config.max_sleep_time or config.max_sleep_time > 
time.time() - call_time) - and shared.opts.gpu_temps_sleep_enable): - if shared.opts.gpu_temps_sleep_print: - print(f'GPU Temperature: {gpu_core_temp}') - if shared.state.interrupted or shared.state.skipped: - break - time.sleep(shared.opts.gpu_temps_sleep_sleep_time) - gpu_core_temp = GPUTemperatureProtection.temperature_func() - - GPUTemperatureProtection.last_call_time = time.time() - else: - GPUTemperatureProtection.last_call_time = call_time - - @staticmethod - def gpu_temperature_protection_decorator(fun): - config = TemperatureConfig( - 'gpu_temps_sleep_sleep_temp', - 'gpu_temps_sleep_wake_temp', - 'gpu_temps_sleep_max_sleep_time', - ) - - def wrapper(*args, **kwargs): - GPUTemperatureProtection.gpu_temperature_protection(config) - result = fun(*args, **kwargs) - return result - return wrapper - - @staticmethod - def gpu_temperature_close_decorator(fun): - def wrapper(*args, **kwargs): - sd_samplers_common.store_latent = GPUTemperatureProtection.pre_decorate_store_latent - result = fun(*args, **kwargs) - return result - return wrapper - - last_call_time = time.time() - pre_decorate_store_latent = sd_samplers_common.store_latent + global pre_decorate_store_latent + pre_decorate_store_latent = sd_samplers_common.store_latent + sd_samplers_common.store_latent = gpu_temperature_protection_decorator(sd_samplers_common.store_latent) + p.close = gpu_temperature_close_decorator(p.close) -if hasattr(shared, "OptionHTML"): # < 1.6.0 support - shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { - "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
-AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
-NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD. - """) - })) - - -shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { - "gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(GPUTemperatureProtection.temperature_src_dict.keys())}, GPUTemperatureProtection.on_change_temps_src), - "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"), - "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"), - "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"), - "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"), - "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"), - "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"), - "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"), - "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"), -})) - -if os.name == 'nt': - try: - all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines() - 
video_controller_filter = re.compile(r"^\s+$") - names_list = [name.strip() for name in all_lines if not video_controller_filter.match(name) and name != ''] - shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { - "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, GPUTemperatureProtection.on_change_temps_src).info("select your gpu"), - })) - except Exception as _e: - if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': - print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}') - -if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': - GPUTemperatureProtection.init_open_hardware_monitor() +init_temps_src() +last_call_time = time.time() diff --git a/temperature_sensor_modules/amd_rocm_smi.py b/temperature_sensor_modules/amd_rocm_smi.py new file mode 100644 index 0000000..cc5d32f --- /dev/null +++ b/temperature_sensor_modules/amd_rocm_smi.py @@ -0,0 +1,19 @@ +import subprocess +import re + +amd_rocm_smi_regex = re.compile(r'Temperature \(Sensor edge\) \(C\): (\d+\.\d+)') + + +def get_gpu_temperature_amd_rocm_smi(): + try: + output = subprocess.check_output(['rocm-smi', '--showtemp']).decode().strip() + match = amd_rocm_smi_regex.search(output) + if match: + return int(float(match.group(1))) + else: + print("\n[Error GPU temperature protection]: Couldn't parse temperature from rocm-smi output") + except subprocess.CalledProcessError as e: + print(f"\n[Error GPU temperature protection] rocm-smi: {e.output.decode('utf-8').strip()}") + except Exception as e: + print(f'\n[Error GPU temperature protection] rocm-smi: {e}') + return 0 diff --git a/temperature_sensor_modules/nvidia_smi.py b/temperature_sensor_modules/nvidia_smi.py new file mode 100644 index 0000000..4aea8ef --- /dev/null +++ 
b/temperature_sensor_modules/nvidia_smi.py @@ -0,0 +1,13 @@ +from modules import shared +import subprocess + + +def get_gpu_temperature_nvidia_smi(): + try: + return int(subprocess.check_output( + ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']).decode().strip().splitlines()[shared.opts.gpu_temps_sleep_gpu_index]) + except subprocess.CalledProcessError as e: + print(f"\n[Error GPU temperature protection] nvidia-smi: {e.output.decode('utf-8').strip()}") + except Exception as e: + print(f'\n[Error GPU temperature protection] nvidia-smi: {e}') + return 0 diff --git a/temperature_sensor_modules/open_hardware_monitor.py b/temperature_sensor_modules/open_hardware_monitor.py new file mode 100644 index 0000000..d78b094 --- /dev/null +++ b/temperature_sensor_modules/open_hardware_monitor.py @@ -0,0 +1,75 @@ +from modules import scripts, shared +from pathlib import Path +import urllib.request +import zipfile +import launch +import os + +ohm_hardware = None +ohm_computer = None +ohm_sensors = None + +OpenHardwareMonitorLibDownloadUrl = "https://openhardwaremonitor.org/files/openhardwaremonitor-v0.9.6.zip" +OpenHardwareMonitor_path = Path(scripts.current_basedir).joinpath('OpenHardwareMonitor') +OpenHardwareMonitorLib_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib') +OpenHardwareMonitorLib_dll_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib.dll') + + +def download_open_hardware_monitor(): + if not OpenHardwareMonitorLib_dll_path.is_file(): + OpenHardwareMonitor_path.mkdir(parents=True, exist_ok=True) + print("Downloading OpenHardwareMonitor") + zip_path, _ = urllib.request.urlretrieve(OpenHardwareMonitorLibDownloadUrl) + with zipfile.ZipFile(zip_path, "r") as z: + with open(os.path.realpath(OpenHardwareMonitorLib_dll_path), 'wb') as f: + f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll')) + + +def init_open_hardware_monitor(): + global ohm_computer, ohm_sensors, ohm_hardware + try: + # install and import 
Python.NET module + if not launch.is_installed("pythonnet"): + launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib") + import clr # noqa import pythonnet module. + + # download OpenHardwareMonitor if not found + download_open_hardware_monitor() + + # initialize OpenHardwareMonitor + if ohm_computer is None: + clr.AddReference(str(OpenHardwareMonitorLib_path)) + from OpenHardwareMonitor.Hardware import Computer # noqa + ohm_computer = Computer() + ohm_computer.CPUEnabled = False # Disable CPU + ohm_computer.GPUEnabled = True # Enable GPU + ohm_computer.Open() + + # find the first matching temperature sensor for the specified hardware + if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str( + ohm_hardware.Name): + for hardware in ohm_computer.Hardware: + if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name): + for sensor in hardware.Sensors: + if '/temperature' in str(sensor.Identifier): + ohm_sensors = sensor + ohm_hardware = hardware + return # sensor is found early return + + # sensor not found + ohm_sensors = None + ohm_hardware = None + print( + f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}") + + except Exception as e: + print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}") + + +def get_gpu_temperature_open_hardware_monitor(): + try: + ohm_hardware.Update() + return int(ohm_sensors.get_Value()) + except Exception as e: + print(f"\n[Error GPU temperature protection] OpenHardwareMonitor: Couldn't read temperature{e}") + return 0 From ce4aff494018c8916878547524a411dc46d76ced Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sat, 20 Jan 2024 23:14:08 +0900 Subject: [PATCH 4/7] patch store_latent on load --- scripts/gpu_temperature_protection.py | 64 +++++++++++---------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git 
a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index ebbe91b..baec1dd 100644 --- a/scripts/gpu_temperature_protection.py +++ b/scripts/gpu_temperature_protection.py @@ -1,5 +1,5 @@ from temperature_sensor_modules import nvidia_smi, amd_rocm_smi, open_hardware_monitor -from modules import scripts, shared, sd_samplers_common +from modules import scripts, shared, sd_samplers_common, patches, script_callbacks, errors from typing import Callable import gradio as gr import subprocess @@ -7,9 +7,7 @@ import time import re import os -pre_decorate_store_latent = sd_samplers_common.store_latent temperature_func: Callable[[], float] - temperature_src_dict = { "NVIDIA - nvidia-smi": nvidia_smi.get_gpu_temperature_nvidia_smi, "AMD - ROCm-smi": amd_rocm_smi.get_gpu_temperature_amd_rocm_smi, @@ -57,9 +55,9 @@ if os.name == 'nt': shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"), })) - except Exception as _e: + except Exception as e: if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': - print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}') + print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{e}') class TemperatureProtection: @@ -109,6 +107,26 @@ class TemperatureProtection: last_call_time = call_time +def gpu_temperature_protection_decorator(fun, config): + def wrapper(*args, **kwargs): + config.temperature_protection() + result = fun(*args, **kwargs) + return result + return wrapper + + +def patch_temperature_protection(obj, field, config): + try: + patches.patch(__name__, obj, field, gpu_temperature_protection_decorator(sd_samplers_common.store_latent, config)) + + def 
undo_hijack(): + patches.undo(__name__, obj, field) + + script_callbacks.on_script_unloaded(undo_hijack) + except RuntimeError: + errors.report(f"patch_temperature_protection {field} is already applied") + + config_store_latent = TemperatureProtection( 'gpu_temps_sleep_sleep_temp', 'gpu_temps_sleep_wake_temp', @@ -116,38 +134,6 @@ config_store_latent = TemperatureProtection( ) -def gpu_temperature_protection_decorator(fun): - def wrapper(*args, **kwargs): - # gpu_temperature_protection(config_store_latent) - config_store_latent.temperature_protection() - result = fun(*args, **kwargs) - return result - return wrapper - - -def gpu_temperature_close_decorator(fun): - def wrapper(*args, **kwargs): - sd_samplers_common.store_latent = pre_decorate_store_latent - result = fun(*args, **kwargs) - return result - return wrapper - - -class GPUTemperatureProtectionScript(scripts.Script): - - def title(self): - return "GPU temperature protection" - - def show(self, is_img2img): - return scripts.AlwaysVisible - - def setup(self, p, *args): - if shared.opts.gpu_temps_sleep_enable: - global pre_decorate_store_latent - pre_decorate_store_latent = sd_samplers_common.store_latent - sd_samplers_common.store_latent = gpu_temperature_protection_decorator(sd_samplers_common.store_latent) - p.close = gpu_temperature_close_decorator(p.close) - - -init_temps_src() +patch_temperature_protection(sd_samplers_common, 'store_latent', config_store_latent) last_call_time = time.time() +init_temps_src() From 9fe63493d297b7d1ef11a17863b7e62d568f2d1b Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sat, 20 Jan 2024 23:36:50 +0900 Subject: [PATCH 5/7] rework settings --- scripts/gpu_temperature_protection.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index baec1dd..63719d1 100644 --- a/scripts/gpu_temperature_protection.py +++ 
b/scripts/gpu_temperature_protection.py @@ -28,23 +28,13 @@ def init_temps_src(): if hasattr(shared, "OptionHTML"): # < 1.6.0 support shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { - "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
-AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
-NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD. - """) - })) + "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi available on both Windows and Linux.
+AMD - ROCm-smi - Linux only and does not support specifying GPU device index.
+NVIDIA & AMD - OpenHardwareMonitor - Windows only supports NVIDIA and AMD.""")})) shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { "gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(temperature_src_dict.keys())}, init_temps_src), - "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"), - "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"), - "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"), - "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"), - "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"), - "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"), - "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"), - "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"), })) if os.name == 'nt': @@ -59,6 +49,17 @@ if os.name == 'nt': if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{e}') 
+shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { + "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"), + "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"), + "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"), + "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"), + "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"), + "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"), + "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"), + "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"), +})) + class TemperatureProtection: def __init__(self, sleep_temp, wake_temp, max_sleep_time): From 61adc45ccd24aa483aed1f8f4a23218354b740a4 Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sun, 19 May 2024 17:38:17 +0900 Subject: [PATCH 6/7] switch wmic to Get-CimInstance --- scripts/gpu_temperature_protection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py index 
63719d1..a7f6e9a 100644 --- a/scripts/gpu_temperature_protection.py +++ b/scripts/gpu_temperature_protection.py @@ -39,11 +39,10 @@ shared.options_templates.update(shared.options_section(('GPU_temperature_protect if os.name == 'nt': try: - all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines() + Win32_VideoControllers = subprocess.check_output(['powershell.exe', '-Command', '(Get-CimInstance -ClassName Win32_VideoController | Select-Object -ExpandProperty Name) -join "`n"'], text=True).splitlines() video_controller_filter = re.compile(r"^\s+$") - names_list = [name.strip() for name in all_lines if not video_controller_filter.match(name) and name != ''] shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), { - "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"), + "gpu_temps_sleep_gpu_name": shared.OptionInfo(Win32_VideoControllers[0] if Win32_VideoControllers else "None", "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": Win32_VideoControllers}, init_temps_src).info("select your gpu"), })) except Exception as e: if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor': From 38de3a9615d6565d3fae70bb0d8aebdfa31cf13e Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Sun, 19 May 2024 17:58:33 +0900 Subject: [PATCH 7/7] improve error message in OpenHardwareMonitor --- .../open_hardware_monitor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/temperature_sensor_modules/open_hardware_monitor.py b/temperature_sensor_modules/open_hardware_monitor.py index d78b094..2a42d73 100644 --- a/temperature_sensor_modules/open_hardware_monitor.py +++ 
b/temperature_sensor_modules/open_hardware_monitor.py @@ -1,6 +1,7 @@ -from modules import scripts, shared +from modules import scripts, shared, errors from pathlib import Path import urllib.request +import gradio as gr import zipfile import launch import os @@ -46,8 +47,7 @@ def init_open_hardware_monitor(): ohm_computer.Open() # find the first matching temperature sensor for the specified hardware - if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str( - ohm_hardware.Name): + if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(ohm_hardware.Name): for hardware in ohm_computer.Hardware: if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name): for sensor in hardware.Sensors: @@ -59,11 +59,13 @@ def init_open_hardware_monitor(): # sensor not found ohm_sensors = None ohm_hardware = None - print( - f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}") + error_message = f"OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}" + gr.Warning(error_message) + print(f"[Error GPU temperature protection] {error_message}") except Exception as e: - print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}") + error_message = f"Failed to initialize OpenHardwareMonitor" + errors.report(f'[Error GPU temperature protection] {error_message}') def get_gpu_temperature_open_hardware_monitor():