From d442f641d6daeb4310a057bc6193f87517c9f6a0 Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sat, 20 Jan 2024 06:36:44 +0900
Subject: [PATCH 1/7] config class gpu_temperature_protection
---
scripts/gpu_temperature_protection.py | 58 +++++++++++++++++++--------
1 file changed, 42 insertions(+), 16 deletions(-)
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index 20bc9d3..059799b 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -25,7 +25,28 @@ def download_open_hardware_monitor():
f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll'))
+class TemperatureConfig:
+ def __init__(self, sleep_temp, wake_temp, max_sleep_time):
+ self.sleep_temp_key = sleep_temp
+ self.wake_temp_key = wake_temp
+ self.max_sleep_time_key = max_sleep_time
+
+ @property
+ def sleep_temp(self):
+ return getattr(shared.opts, self.sleep_temp_key)
+
+ @property
+ def wake_temp(self):
+ return getattr(shared.opts, self.wake_temp_key)
+
+ @property
+ def max_sleep_time(self):
+ return getattr(shared.opts, self.max_sleep_time_key)
+
+
class GPUTemperatureProtection(scripts.Script):
+ temperature_func = None
+
def title(self):
return "GPU temperature protection"
@@ -34,10 +55,8 @@ class GPUTemperatureProtection(scripts.Script):
def setup(self, p, *args):
if shared.opts.gpu_temps_sleep_enable:
- sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator(
- sd_samplers_common.store_latent,
- GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src)
- )
+ GPUTemperatureProtection.temperature_func = GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src)
+ sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator(sd_samplers_common.store_latent)
p.close = GPUTemperatureProtection.gpu_temperature_close_decorator(p.close)
@staticmethod
@@ -108,7 +127,7 @@ class GPUTemperatureProtection(scripts.Script):
print(f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}")
except Exception as e:
- print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor \: {e}")
+ print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}")
@staticmethod
def get_gpu_temperature_open_hardware_monitor():
@@ -140,34 +159,41 @@ class GPUTemperatureProtection(scripts.Script):
return GPUTemperatureProtection.temperature_src_dict.get(source_name, GPUTemperatureProtection.get_gpu_temperature_nvidia_smi)
@staticmethod
- def gpu_temperature_protection(temperature_src_fun):
+ def gpu_temperature_protection(config: TemperatureConfig):
if shared.opts.gpu_temps_sleep_enable:
call_time = time.time()
if call_time - GPUTemperatureProtection.last_call_time > shared.opts.gpu_temps_sleep_minimum_interval:
- gpu_core_temp = temperature_src_fun()
- if gpu_core_temp > shared.opts.gpu_temps_sleep_sleep_temp:
-
+ gpu_core_temp = GPUTemperatureProtection.temperature_func()
+ if gpu_core_temp > config.sleep_temp:
if shared.opts.gpu_temps_sleep_print:
print(f'\n\nGPU Temperature: {gpu_core_temp}')
-
time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
- gpu_core_temp = temperature_src_fun()
- while gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp and (not shared.opts.gpu_temps_sleep_max_sleep_time or shared.opts.gpu_temps_sleep_max_sleep_time > time.time() - call_time) and shared.opts.gpu_temps_sleep_enable:
+ gpu_core_temp = GPUTemperatureProtection.temperature_func()
+ while (gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp
+ and (not config.max_sleep_time or config.max_sleep_time > time.time() - call_time)
+ and shared.opts.gpu_temps_sleep_enable):
if shared.opts.gpu_temps_sleep_print:
print(f'GPU Temperature: {gpu_core_temp}')
-
+ if shared.state.interrupted or shared.state.skipped:
+ break
time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
- gpu_core_temp = temperature_src_fun()
+ gpu_core_temp = GPUTemperatureProtection.temperature_func()
GPUTemperatureProtection.last_call_time = time.time()
else:
GPUTemperatureProtection.last_call_time = call_time
@staticmethod
- def gpu_temperature_protection_decorator(fun, temperature_src_fun):
+ def gpu_temperature_protection_decorator(fun):
+ config = TemperatureConfig(
+ 'gpu_temps_sleep_sleep_temp',
+ 'gpu_temps_sleep_wake_temp',
+ 'gpu_temps_sleep_max_sleep_time',
+ )
+
def wrapper(*args, **kwargs):
+ GPUTemperatureProtection.gpu_temperature_protection(config)
result = fun(*args, **kwargs)
- GPUTemperatureProtection.gpu_temperature_protection(temperature_src_fun)
return result
return wrapper
From 217afc92646864854c69d4747d9c710a4d1fa122 Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sat, 20 Jan 2024 06:41:48 +0900
Subject: [PATCH 2/7] # noqa
---
scripts/gpu_temperature_protection.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index 059799b..7a9927f 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -97,7 +97,7 @@ class GPUTemperatureProtection(scripts.Script):
# install and import Python.NET module
if not launch.is_installed("pythonnet"):
launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib")
- import clr # import pythonnet module.
+ import clr # noqa import pythonnet module.
# download OpenHardwareMonitor if not found
download_open_hardware_monitor()
@@ -105,7 +105,7 @@ class GPUTemperatureProtection(scripts.Script):
# initialize OpenHardwareMonitor
if GPUTemperatureProtection.computer is None:
clr.AddReference(str(OpenHardwareMonitorLib_path))
- from OpenHardwareMonitor.Hardware import Computer
+ from OpenHardwareMonitor.Hardware import Computer # noqa
GPUTemperatureProtection.computer = Computer()
GPUTemperatureProtection.computer.CPUEnabled = False # Disable CPU
GPUTemperatureProtection.computer.GPUEnabled = True # Enable GPU
From 872331bc7a14281323b251305087ee597433e1dd Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sun, 19 May 2024 16:37:04 +0900
Subject: [PATCH 3/7] split into modules
---
scripts/gpu_temperature_protection.py | 319 +++++++-----------
temperature_sensor_modules/amd_rocm_smi.py | 19 ++
temperature_sensor_modules/nvidia_smi.py | 13 +
.../open_hardware_monitor.py | 75 ++++
4 files changed, 220 insertions(+), 206 deletions(-)
create mode 100644 temperature_sensor_modules/amd_rocm_smi.py
create mode 100644 temperature_sensor_modules/nvidia_smi.py
create mode 100644 temperature_sensor_modules/open_hardware_monitor.py
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index 7a9927f..ebbe91b 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -1,31 +1,68 @@
+from temperature_sensor_modules import nvidia_smi, amd_rocm_smi, open_hardware_monitor
from modules import scripts, shared, sd_samplers_common
-from pathlib import Path
-import urllib.request
+from typing import Callable
import gradio as gr
import subprocess
-import zipfile
-import launch
import time
import re
import os
-OpenHardwareMonitorLibDownloadUrl = "https://openhardwaremonitor.org/files/openhardwaremonitor-v0.9.6.zip"
-OpenHardwareMonitor_path = Path(scripts.current_basedir).joinpath('OpenHardwareMonitor')
-OpenHardwareMonitorLib_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib')
-OpenHardwareMonitorLib_dll_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib.dll')
+pre_decorate_store_latent = sd_samplers_common.store_latent
+temperature_func: Callable[[], float]
+
+temperature_src_dict = {
+ "NVIDIA - nvidia-smi": nvidia_smi.get_gpu_temperature_nvidia_smi,
+ "AMD - ROCm-smi": amd_rocm_smi.get_gpu_temperature_amd_rocm_smi,
+ "NVIDIA & AMD - OpenHardwareMonitor": open_hardware_monitor.get_gpu_temperature_open_hardware_monitor
+}
-def download_open_hardware_monitor():
- if not OpenHardwareMonitorLib_dll_path.is_file():
- OpenHardwareMonitor_path.mkdir(parents=True, exist_ok=True)
- print("Downloading OpenHardwareMonitor")
- zip_path, _ = urllib.request.urlretrieve(OpenHardwareMonitorLibDownloadUrl)
- with zipfile.ZipFile(zip_path, "r") as z:
- with open(os.path.realpath(OpenHardwareMonitorLib_dll_path), 'wb') as f:
- f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll'))
+def init_temps_src():
+ global temperature_func
+ temperature_func = temperature_src_dict.get(shared.opts.gpu_temps_sleep_temperature_src, nvidia_smi.get_gpu_temperature_nvidia_smi)
+ if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
+ if os.name != 'nt':
+ assert False, "NVIDIA & AMD - OpenHardwareMonitor it's only supported on Windows"
+ open_hardware_monitor.init_open_hardware_monitor()
+ elif shared.opts.gpu_temps_sleep_temperature_src == 'AMD - ROCm-smi' and os.name == 'nt':
+ assert False, "AMD - ROCm-smi is not supported on Windows"
-class TemperatureConfig:
+if hasattr(shared, "OptionHTML"): # < 1.6.0 support
+ shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
+ "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
+AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
+NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD.
+ """)
+ }))
+
+
+shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
+ "gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(temperature_src_dict.keys())}, init_temps_src),
+ "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"),
+ "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"),
+ "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"),
+ "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"),
+ "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"),
+ "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"),
+ "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"),
+ "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"),
+}))
+
+if os.name == 'nt':
+ try:
+ all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines()
+ video_controller_filter = re.compile(r"^\s+$")
+ names_list = [name.strip() for name in all_lines if not video_controller_filter.match(name) and name != '']
+ shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
+ "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"),
+ }))
+ except Exception as _e:
+ if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
+ print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}')
+
+
+class TemperatureProtection:
def __init__(self, sleep_temp, wake_temp, max_sleep_time):
self.sleep_temp_key = sleep_temp
self.wake_temp_key = wake_temp
@@ -43,9 +80,60 @@ class TemperatureConfig:
def max_sleep_time(self):
return getattr(shared.opts, self.max_sleep_time_key)
+ def temperature_protection(self):
+ if not shared.opts.gpu_temps_sleep_enable:
+ return
-class GPUTemperatureProtection(scripts.Script):
- temperature_func = None
+ global last_call_time
+ call_time = time.time()
+ if call_time - last_call_time < shared.opts.gpu_temps_sleep_minimum_interval:
+ return
+
+ gpu_core_temp = temperature_func()
+ if gpu_core_temp > self.sleep_temp:
+ if shared.opts.gpu_temps_sleep_print:
+ print(f'\n\nGPU Temperature: {gpu_core_temp}')
+ time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
+ gpu_core_temp = temperature_func()
+ while (gpu_core_temp > self.wake_temp
+ and (not self.max_sleep_time or self.max_sleep_time > time.time() - call_time)
+ and shared.opts.gpu_temps_sleep_enable):
+ if shared.opts.gpu_temps_sleep_print:
+ print(f'GPU Temperature: {gpu_core_temp}')
+ if shared.state.interrupted or shared.state.skipped:
+ break
+ time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
+ gpu_core_temp = temperature_func()
+ last_call_time = time.time()
+ else:
+ last_call_time = call_time
+
+
+config_store_latent = TemperatureProtection(
+ 'gpu_temps_sleep_sleep_temp',
+ 'gpu_temps_sleep_wake_temp',
+ 'gpu_temps_sleep_max_sleep_time',
+)
+
+
+def gpu_temperature_protection_decorator(fun):
+ def wrapper(*args, **kwargs):
+ # gpu_temperature_protection(config_store_latent)
+ config_store_latent.temperature_protection()
+ result = fun(*args, **kwargs)
+ return result
+ return wrapper
+
+
+def gpu_temperature_close_decorator(fun):
+ def wrapper(*args, **kwargs):
+ sd_samplers_common.store_latent = pre_decorate_store_latent
+ result = fun(*args, **kwargs)
+ return result
+ return wrapper
+
+
+class GPUTemperatureProtectionScript(scripts.Script):
def title(self):
return "GPU temperature protection"
@@ -55,192 +143,11 @@ class GPUTemperatureProtection(scripts.Script):
def setup(self, p, *args):
if shared.opts.gpu_temps_sleep_enable:
- GPUTemperatureProtection.temperature_func = GPUTemperatureProtection.get_temperature_src_function(shared.opts.gpu_temps_sleep_temperature_src)
- sd_samplers_common.store_latent = GPUTemperatureProtection.gpu_temperature_protection_decorator(sd_samplers_common.store_latent)
- p.close = GPUTemperatureProtection.gpu_temperature_close_decorator(p.close)
-
- @staticmethod
- def get_gpu_temperature_nvidia_smi():
- try:
- return int(subprocess.check_output(
- ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']).decode().strip().splitlines()[shared.opts.gpu_temps_sleep_gpu_index])
- except subprocess.CalledProcessError as e:
- print(f"\n[Error GPU temperature protection] nvidia-smi: {e.output.decode('utf-8').strip()}")
- except Exception as e:
- print(f'\n[Error GPU temperature protection] nvidia-smi: {e}')
- return 0
-
- amd_rocm_smi_regex = re.compile(r'Temperature \(Sensor edge\) \(C\): (\d+\.\d+)')
-
- @staticmethod
- def get_gpu_temperature_amd_rocm_smi():
- try:
- output = subprocess.check_output(['rocm-smi', '--showtemp']).decode().strip()
- match = GPUTemperatureProtection.amd_rocm_smi_regex.search(output)
- if match:
- return int(float(match.group(1)))
- else:
- print("\n[Error GPU temperature protection]: Couldn't parse temperature from rocm-smi output")
- except subprocess.CalledProcessError as e:
- print(f"\n[Error GPU temperature protection] rocm-smi: {e.output.decode('utf-8').strip()}")
- except Exception as e:
- print(f'\n[Error GPU temperature protection] rocm-smi: {e}')
- return 0
-
- computer = None
- sensors = None
- hardware = None
-
- @staticmethod
- def init_open_hardware_monitor():
- try:
- # install and import Python.NET module
- if not launch.is_installed("pythonnet"):
- launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib")
- import clr # noqa import pythonnet module.
-
- # download OpenHardwareMonitor if not found
- download_open_hardware_monitor()
-
- # initialize OpenHardwareMonitor
- if GPUTemperatureProtection.computer is None:
- clr.AddReference(str(OpenHardwareMonitorLib_path))
- from OpenHardwareMonitor.Hardware import Computer # noqa
- GPUTemperatureProtection.computer = Computer()
- GPUTemperatureProtection.computer.CPUEnabled = False # Disable CPU
- GPUTemperatureProtection.computer.GPUEnabled = True # Enable GPU
- GPUTemperatureProtection.computer.Open()
-
- # find the first matching temperature sensor for the specified hardware
- if GPUTemperatureProtection.sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(GPUTemperatureProtection.hardware.Name):
- for hardware in GPUTemperatureProtection.computer.Hardware:
- if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name):
- for sensor in hardware.Sensors:
- if '/temperature' in str(sensor.Identifier):
- GPUTemperatureProtection.sensors = sensor
- GPUTemperatureProtection.hardware = hardware
- return # sensor is found early return
-
- # sensor not found
- GPUTemperatureProtection.sensors = None
- GPUTemperatureProtection.hardware = None
- print(f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}")
-
- except Exception as e:
- print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}")
-
- @staticmethod
- def get_gpu_temperature_open_hardware_monitor():
- try:
- GPUTemperatureProtection.hardware.Update()
- return int(GPUTemperatureProtection.sensors.get_Value())
- except Exception as e:
- print(f"\n[Error GPU temperature protection] OpenHardwareMonitor: Couldn't read temperature{e}")
- return 0
-
- @staticmethod
- def on_change_temps_src():
- if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
- if os.name == 'nt':
- GPUTemperatureProtection.init_open_hardware_monitor()
- else:
- assert False, "NVIDIA & AMD - OpenHardwareMonitor it's only supported on Windows"
- elif shared.opts.gpu_temps_sleep_temperature_src == 'AMD - ROCm-smi' and os.name == 'nt':
- assert False, "AMD - ROCm-smi is not supported on Windows"
-
- temperature_src_dict = {
- "NVIDIA - nvidia-smi": get_gpu_temperature_nvidia_smi,
- "AMD - ROCm-smi": get_gpu_temperature_amd_rocm_smi,
- "NVIDIA & AMD - OpenHardwareMonitor": get_gpu_temperature_open_hardware_monitor
- }
-
- @staticmethod
- def get_temperature_src_function(source_name):
- return GPUTemperatureProtection.temperature_src_dict.get(source_name, GPUTemperatureProtection.get_gpu_temperature_nvidia_smi)
-
- @staticmethod
- def gpu_temperature_protection(config: TemperatureConfig):
- if shared.opts.gpu_temps_sleep_enable:
- call_time = time.time()
- if call_time - GPUTemperatureProtection.last_call_time > shared.opts.gpu_temps_sleep_minimum_interval:
- gpu_core_temp = GPUTemperatureProtection.temperature_func()
- if gpu_core_temp > config.sleep_temp:
- if shared.opts.gpu_temps_sleep_print:
- print(f'\n\nGPU Temperature: {gpu_core_temp}')
- time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
- gpu_core_temp = GPUTemperatureProtection.temperature_func()
- while (gpu_core_temp > shared.opts.gpu_temps_sleep_wake_temp
- and (not config.max_sleep_time or config.max_sleep_time > time.time() - call_time)
- and shared.opts.gpu_temps_sleep_enable):
- if shared.opts.gpu_temps_sleep_print:
- print(f'GPU Temperature: {gpu_core_temp}')
- if shared.state.interrupted or shared.state.skipped:
- break
- time.sleep(shared.opts.gpu_temps_sleep_sleep_time)
- gpu_core_temp = GPUTemperatureProtection.temperature_func()
-
- GPUTemperatureProtection.last_call_time = time.time()
- else:
- GPUTemperatureProtection.last_call_time = call_time
-
- @staticmethod
- def gpu_temperature_protection_decorator(fun):
- config = TemperatureConfig(
- 'gpu_temps_sleep_sleep_temp',
- 'gpu_temps_sleep_wake_temp',
- 'gpu_temps_sleep_max_sleep_time',
- )
-
- def wrapper(*args, **kwargs):
- GPUTemperatureProtection.gpu_temperature_protection(config)
- result = fun(*args, **kwargs)
- return result
- return wrapper
-
- @staticmethod
- def gpu_temperature_close_decorator(fun):
- def wrapper(*args, **kwargs):
- sd_samplers_common.store_latent = GPUTemperatureProtection.pre_decorate_store_latent
- result = fun(*args, **kwargs)
- return result
- return wrapper
-
- last_call_time = time.time()
- pre_decorate_store_latent = sd_samplers_common.store_latent
+ global pre_decorate_store_latent
+ pre_decorate_store_latent = sd_samplers_common.store_latent
+ sd_samplers_common.store_latent = gpu_temperature_protection_decorator(sd_samplers_common.store_latent)
+ p.close = gpu_temperature_close_decorator(p.close)
-if hasattr(shared, "OptionHTML"): # < 1.6.0 support
- shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
- "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
-AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
-NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD.
- """)
- }))
-
-
-shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
- "gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(GPUTemperatureProtection.temperature_src_dict.keys())}, GPUTemperatureProtection.on_change_temps_src),
- "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"),
- "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"),
- "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"),
- "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"),
- "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"),
- "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"),
- "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"),
- "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"),
-}))
-
-if os.name == 'nt':
- try:
- all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines()
- video_controller_filter = re.compile(r"^\s+$")
- names_list = [name.strip() for name in all_lines if not video_controller_filter.match(name) and name != '']
- shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
- "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, GPUTemperatureProtection.on_change_temps_src).info("select your gpu"),
- }))
- except Exception as _e:
- if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
- print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}')
-
-if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
- GPUTemperatureProtection.init_open_hardware_monitor()
+init_temps_src()
+last_call_time = time.time()
diff --git a/temperature_sensor_modules/amd_rocm_smi.py b/temperature_sensor_modules/amd_rocm_smi.py
new file mode 100644
index 0000000..cc5d32f
--- /dev/null
+++ b/temperature_sensor_modules/amd_rocm_smi.py
@@ -0,0 +1,19 @@
+import subprocess
+import re
+
+amd_rocm_smi_regex = re.compile(r'Temperature \(Sensor edge\) \(C\): (\d+\.\d+)')
+
+
+def get_gpu_temperature_amd_rocm_smi():
+ try:
+ output = subprocess.check_output(['rocm-smi', '--showtemp']).decode().strip()
+ match = amd_rocm_smi_regex.search(output)
+ if match:
+ return int(float(match.group(1)))
+ else:
+ print("\n[Error GPU temperature protection]: Couldn't parse temperature from rocm-smi output")
+ except subprocess.CalledProcessError as e:
+ print(f"\n[Error GPU temperature protection] rocm-smi: {e.output.decode('utf-8').strip()}")
+ except Exception as e:
+ print(f'\n[Error GPU temperature protection] rocm-smi: {e}')
+ return 0
diff --git a/temperature_sensor_modules/nvidia_smi.py b/temperature_sensor_modules/nvidia_smi.py
new file mode 100644
index 0000000..4aea8ef
--- /dev/null
+++ b/temperature_sensor_modules/nvidia_smi.py
@@ -0,0 +1,13 @@
+from modules import shared
+import subprocess
+
+
+def get_gpu_temperature_nvidia_smi():
+ try:
+ return int(subprocess.check_output(
+ ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']).decode().strip().splitlines()[shared.opts.gpu_temps_sleep_gpu_index])
+ except subprocess.CalledProcessError as e:
+ print(f"\n[Error GPU temperature protection] nvidia-smi: {e.output.decode('utf-8').strip()}")
+ except Exception as e:
+ print(f'\n[Error GPU temperature protection] nvidia-smi: {e}')
+ return 0
diff --git a/temperature_sensor_modules/open_hardware_monitor.py b/temperature_sensor_modules/open_hardware_monitor.py
new file mode 100644
index 0000000..d78b094
--- /dev/null
+++ b/temperature_sensor_modules/open_hardware_monitor.py
@@ -0,0 +1,75 @@
+from modules import scripts, shared
+from pathlib import Path
+import urllib.request
+import zipfile
+import launch
+import os
+
+ohm_hardware = None
+ohm_computer = None
+ohm_sensors = None
+
+OpenHardwareMonitorLibDownloadUrl = "https://openhardwaremonitor.org/files/openhardwaremonitor-v0.9.6.zip"
+OpenHardwareMonitor_path = Path(scripts.current_basedir).joinpath('OpenHardwareMonitor')
+OpenHardwareMonitorLib_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib')
+OpenHardwareMonitorLib_dll_path = OpenHardwareMonitor_path.joinpath('OpenHardwareMonitorLib.dll')
+
+
+def download_open_hardware_monitor():
+ if not OpenHardwareMonitorLib_dll_path.is_file():
+ OpenHardwareMonitor_path.mkdir(parents=True, exist_ok=True)
+ print("Downloading OpenHardwareMonitor")
+ zip_path, _ = urllib.request.urlretrieve(OpenHardwareMonitorLibDownloadUrl)
+ with zipfile.ZipFile(zip_path, "r") as z:
+ with open(os.path.realpath(OpenHardwareMonitorLib_dll_path), 'wb') as f:
+ f.write(z.read('OpenHardwareMonitor/OpenHardwareMonitorLib.dll'))
+
+
+def init_open_hardware_monitor():
+ global ohm_computer, ohm_sensors, ohm_hardware
+ try:
+ # install and import Python.NET module
+ if not launch.is_installed("pythonnet"):
+ launch.run_pip("install pythonnet==3.0.2", "Installing requirements for OpenHardwareMonitorLib")
+ import clr # noqa import pythonnet module.
+
+ # download OpenHardwareMonitor if not found
+ download_open_hardware_monitor()
+
+ # initialize OpenHardwareMonitor
+ if ohm_computer is None:
+ clr.AddReference(str(OpenHardwareMonitorLib_path))
+ from OpenHardwareMonitor.Hardware import Computer # noqa
+ ohm_computer = Computer()
+ ohm_computer.CPUEnabled = False # Disable CPU
+ ohm_computer.GPUEnabled = True # Enable GPU
+ ohm_computer.Open()
+
+ # find the first matching temperature sensor for the specified hardware
+ if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(
+ ohm_hardware.Name):
+ for hardware in ohm_computer.Hardware:
+ if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name):
+ for sensor in hardware.Sensors:
+ if '/temperature' in str(sensor.Identifier):
+ ohm_sensors = sensor
+ ohm_hardware = hardware
+ return # sensor is found early return
+
+ # sensor not found
+ ohm_sensors = None
+ ohm_hardware = None
+ print(
+ f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}")
+
+ except Exception as e:
+ print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}")
+
+
+def get_gpu_temperature_open_hardware_monitor():
+ try:
+ ohm_hardware.Update()
+ return int(ohm_sensors.get_Value())
+ except Exception as e:
+ print(f"\n[Error GPU temperature protection] OpenHardwareMonitor: Couldn't read temperature{e}")
+ return 0
From ce4aff494018c8916878547524a411dc46d76ced Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sat, 20 Jan 2024 23:14:08 +0900
Subject: [PATCH 4/7] patch store_latent on load
---
scripts/gpu_temperature_protection.py | 64 +++++++++++----------------
1 file changed, 25 insertions(+), 39 deletions(-)
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index ebbe91b..baec1dd 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -1,5 +1,5 @@
from temperature_sensor_modules import nvidia_smi, amd_rocm_smi, open_hardware_monitor
-from modules import scripts, shared, sd_samplers_common
+from modules import scripts, shared, sd_samplers_common, patches, script_callbacks, errors
from typing import Callable
import gradio as gr
import subprocess
@@ -7,9 +7,7 @@ import time
import re
import os
-pre_decorate_store_latent = sd_samplers_common.store_latent
temperature_func: Callable[[], float]
-
temperature_src_dict = {
"NVIDIA - nvidia-smi": nvidia_smi.get_gpu_temperature_nvidia_smi,
"AMD - ROCm-smi": amd_rocm_smi.get_gpu_temperature_amd_rocm_smi,
@@ -57,9 +55,9 @@ if os.name == 'nt':
shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
"gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"),
}))
- except Exception as _e:
+ except Exception as e:
if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
- print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{_e}')
+ print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{e}')
class TemperatureProtection:
@@ -109,6 +107,26 @@ class TemperatureProtection:
last_call_time = call_time
+def gpu_temperature_protection_decorator(fun, config):
+ def wrapper(*args, **kwargs):
+ config.temperature_protection()
+ result = fun(*args, **kwargs)
+ return result
+ return wrapper
+
+
+def patch_temperature_protection(obj, field, config):
+ try:
+ patches.patch(__name__, obj, field, gpu_temperature_protection_decorator(sd_samplers_common.store_latent, config))
+
+ def undo_hijack():
+ patches.undo(__name__, obj, field)
+
+ script_callbacks.on_script_unloaded(undo_hijack)
+ except RuntimeError:
+ errors.report(f"patch_temperature_protection {field} is already applied")
+
+
config_store_latent = TemperatureProtection(
'gpu_temps_sleep_sleep_temp',
'gpu_temps_sleep_wake_temp',
@@ -116,38 +134,6 @@ config_store_latent = TemperatureProtection(
)
-def gpu_temperature_protection_decorator(fun):
- def wrapper(*args, **kwargs):
- # gpu_temperature_protection(config_store_latent)
- config_store_latent.temperature_protection()
- result = fun(*args, **kwargs)
- return result
- return wrapper
-
-
-def gpu_temperature_close_decorator(fun):
- def wrapper(*args, **kwargs):
- sd_samplers_common.store_latent = pre_decorate_store_latent
- result = fun(*args, **kwargs)
- return result
- return wrapper
-
-
-class GPUTemperatureProtectionScript(scripts.Script):
-
- def title(self):
- return "GPU temperature protection"
-
- def show(self, is_img2img):
- return scripts.AlwaysVisible
-
- def setup(self, p, *args):
- if shared.opts.gpu_temps_sleep_enable:
- global pre_decorate_store_latent
- pre_decorate_store_latent = sd_samplers_common.store_latent
- sd_samplers_common.store_latent = gpu_temperature_protection_decorator(sd_samplers_common.store_latent)
- p.close = gpu_temperature_close_decorator(p.close)
-
-
-init_temps_src()
+patch_temperature_protection(sd_samplers_common, 'store_latent', config_store_latent)
last_call_time = time.time()
+init_temps_src()
From 9fe63493d297b7d1ef11a17863b7e62d568f2d1b Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sat, 20 Jan 2024 23:36:50 +0900
Subject: [PATCH 5/7] rework settings
---
scripts/gpu_temperature_protection.py | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index baec1dd..63719d1 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -28,23 +28,13 @@ def init_temps_src():
if hasattr(shared, "OptionHTML"): # < 1.6.0 support
shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
- "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi is available on both Windows and Linux.
-AMD - ROCm-smi is Linux only and does not support specifying GPU device index.
-NVIDIA & AMD - OpenHardwareMonitor is Windows only supports NVIDIA and AMD.
- """)
- }))
+ "gpu_temps_sleep_temperature_src_explanation": shared.OptionHTML("""NVIDIA - nvidia-smi available on both Windows and Linux.
+AMD - ROCm-smi - Linux only and does not support specifying GPU device index.
+NVIDIA & AMD - OpenHardwareMonitor - Windows only supports NVIDIA and AMD.""")}))
shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
"gpu_temps_sleep_temperature_src": shared.OptionInfo("NVIDIA - nvidia-smi", "Temperature source", gr.Radio, {"choices": list(temperature_src_dict.keys())}, init_temps_src),
- "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"),
- "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"),
- "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"),
- "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"),
- "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"),
- "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"),
- "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"),
- "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"),
}))
if os.name == 'nt':
@@ -59,6 +49,17 @@ if os.name == 'nt':
if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
print(f'[Error GPU temperature protection] Failed to retrieve list of video controllers: \n{e}')
+shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
+ "gpu_temps_sleep_gpu_index": shared.OptionInfo(0, "GPU device index - nvidia-smi", gr.Number, {"precision": 0}).info("selecting the correct temperature reading for multi GPU systems, for systems with 3 gpus the value should be an integer between 0~2, default 0"),
+ "gpu_temps_sleep_enable": shared.OptionInfo(True, "Enable GPU temperature protection"),
+ "gpu_temps_sleep_print": shared.OptionInfo(True, "Print GPU Core temperature while sleeping in terminal"),
+ "gpu_temps_sleep_minimum_interval": shared.OptionInfo(5.0, "GPU temperature monitor minimum interval", gr.Number).info("won't check the temperature again until this amount of seconds have passed"),
+ "gpu_temps_sleep_sleep_time": shared.OptionInfo(1.0, "Sleep Time", gr.Number).info("seconds to pause before checking temperature again"),
+ "gpu_temps_sleep_max_sleep_time": shared.OptionInfo(10.0, "Max sleep Time", gr.Number).info("max number of seconds that it's allowed to pause, 0=unlimited"),
+ "gpu_temps_sleep_sleep_temp": shared.OptionInfo(75.0, "GPU sleep temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause if GPU core temperature exceeds this temperature"),
+ "gpu_temps_sleep_wake_temp": shared.OptionInfo(75.0, "GPU wake temperature", gr.Slider, {"minimum": 0, "maximum": 125}).info("generation will pause until GPU core temperature drops below this temperature"),
+}))
+
class TemperatureProtection:
def __init__(self, sleep_temp, wake_temp, max_sleep_time):
From 61adc45ccd24aa483aed1f8f4a23218354b740a4 Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sun, 19 May 2024 17:38:17 +0900
Subject: [PATCH 6/7] switch wmic to Get-CimInstance
---
scripts/gpu_temperature_protection.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/scripts/gpu_temperature_protection.py b/scripts/gpu_temperature_protection.py
index 63719d1..a7f6e9a 100644
--- a/scripts/gpu_temperature_protection.py
+++ b/scripts/gpu_temperature_protection.py
@@ -39,11 +39,10 @@ shared.options_templates.update(shared.options_section(('GPU_temperature_protect
if os.name == 'nt':
try:
- all_lines = subprocess.check_output(['cmd.exe', '/c', 'wmic path win32_VideoController get name']).decode().strip("\nName").splitlines()
+ Win32_VideoControllers = [name.strip() for name in subprocess.check_output(['powershell.exe', '-Command', '(Get-CimInstance -ClassName Win32_VideoController | Select-Object -ExpandProperty Name) -join "`n"'], text=True).splitlines() if name.strip()]  # drop blank lines so an empty result still falls back to "None"
video_controller_filter = re.compile(r"^\s+$")
- names_list = [name.strip() for name in all_lines if not video_controller_filter.match(name) and name != '']
shared.options_templates.update(shared.options_section(('GPU_temperature_protection', "GPU Temperature"), {
- "gpu_temps_sleep_gpu_name": shared.OptionInfo("None" if len(names_list) == 0 else names_list[0], "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": names_list}, init_temps_src).info("select your gpu"),
+ "gpu_temps_sleep_gpu_name": shared.OptionInfo(Win32_VideoControllers[0] if Win32_VideoControllers else "None", "GPU Name - OpenHardwareMonitor", gr.Radio, {"choices": Win32_VideoControllers}, init_temps_src).info("select your gpu"),
}))
except Exception as e:
if shared.opts.gpu_temps_sleep_temperature_src == 'NVIDIA & AMD - OpenHardwareMonitor':
From 38de3a9615d6565d3fae70bb0d8aebdfa31cf13e Mon Sep 17 00:00:00 2001
From: w-e-w <40751091+w-e-w@users.noreply.github.com>
Date: Sun, 19 May 2024 17:58:33 +0900
Subject: [PATCH 7/7] improve error message in OpenHardwareMonitor
---
.../open_hardware_monitor.py | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/temperature_sensor_modules/open_hardware_monitor.py b/temperature_sensor_modules/open_hardware_monitor.py
index d78b094..2a42d73 100644
--- a/temperature_sensor_modules/open_hardware_monitor.py
+++ b/temperature_sensor_modules/open_hardware_monitor.py
@@ -1,6 +1,7 @@
-from modules import scripts, shared
+from modules import scripts, shared, errors
from pathlib import Path
import urllib.request
+import gradio as gr
import zipfile
import launch
import os
@@ -46,8 +47,7 @@ def init_open_hardware_monitor():
ohm_computer.Open()
# find the first matching temperature sensor for the specified hardware
- if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(
- ohm_hardware.Name):
+ if ohm_sensors is None or shared.opts.gpu_temps_sleep_gpu_name not in str(ohm_hardware.Name):
for hardware in ohm_computer.Hardware:
if shared.opts.gpu_temps_sleep_gpu_name in str(hardware.Name):
for sensor in hardware.Sensors:
@@ -59,11 +59,13 @@ def init_open_hardware_monitor():
# sensor not found
ohm_sensors = None
ohm_hardware = None
- print(
- f"[Error GPU temperature protection] OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}")
+ error_message = f"OpenHardwareMonitor Couldn't find temperature sensor for {shared.opts.gpu_temps_sleep_gpu_name}"
+ gr.Warning(error_message)
+ print(f"[Error GPU temperature protection] {error_message}")
except Exception as e:
- print(f"[Error GPU temperature protection] Failed to initialize OpenHardwareMonitor: {e}")
+ error_message = "Failed to initialize OpenHardwareMonitor"
+ errors.report(f'[Error GPU temperature protection] {error_message}', exc_info=True)  # include the traceback; the caught exception is otherwise lost
def get_gpu_temperature_open_hardware_monitor():