offload-never and offload-always per-module and new highvram profile

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4089/head
Vladimir Mandic 2025-07-31 11:40:24 -04:00
parent 1b3f5405a3
commit fa44521ea3
28 changed files with 144 additions and 91 deletions

View File

@ -8,7 +8,15 @@
- prompt parser allow explicit `BOS` and `EOS` tokens in prompt
- **UI**
- new embedded docs/wiki search!
**Docs** search: fully-local and works in real-time on all document pages
**Wiki** search: uses github api to search online wiki pages
- modernui checkbox/radio styling
- **Offloading**
- changed default values for offloading based on detected gpu memory
see [offloading docs](https://vladmandic.github.io/sdnext-docs/Offload/) for details
- new feature to specify which modules to offload always or never
in *settings -> models & loading -> offload always/never*
- new `highvram` profile provides significant performance boost on gpus with 24gb or more
- **Fixes**
- fix Wan 2.2-5B I2V workflow
- fix inpaint image metadata
@ -16,6 +24,7 @@
- fix progress bar with refine/detailer
- fix api progress reporting endpoint
- fix openvino backend failing to compile
- avoid forced gc and rely on thresholds
- add missing interrogate in output panel
## Update for 2025-07-29

View File

@ -280,7 +280,7 @@ def set_cuda_memory_limit():
return
try:
from modules.shared import cmd_opts
torch_gc(force=True)
torch_gc(force=True, reason='cuda')
mem = torch.cuda.get_device_properties(device).total_memory
torch.cuda.set_per_process_memory_fraction(float(opts.cuda_mem_fraction), cmd_opts.device_id if cmd_opts.device_id is not None else 0)
log.info(f'Torch memory limit: fraction={opts.cuda_mem_fraction:.2f} limit={round(opts.cuda_mem_fraction * mem / 1024 / 1024)} total={round(mem / 1024 / 1024)}')

View File

@ -176,7 +176,7 @@ def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
created_model = next((ckpt for ckpt in sd_models.checkpoints_list.values() if ckpt.name == filename), None)
if created_model:
created_model.calculate_shorthash()
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='merge')
shared.state.end()
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_titles()) for _ in range(4)], f"Model saved to {output_modelname}"]
@ -248,7 +248,7 @@ def run_model_modules(model_type:str, model_name:str, custom_name:str,
yield from modules_sdxl.merge()
status = modules_sdxl.status
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='merge')
yield msg("modules merge complete")
if modules_sdxl.pipeline is not None:
checkpoint_info = sd_models.CheckpointInfo(filename='None')

View File

@ -183,7 +183,7 @@ def load_model(variant:str=None, pipeline:str=None, text_encoder:str=None, text_
diffusers.loaders.peft._SET_ADAPTER_SCALE_FN_MAPPING['HunyuanVideoTransformer3DModelPacked'] = lambda model_cls, weights: weights # pylint: disable=protected-access
shared.log.info(f'FramePack load: model={shared.sd_model.__class__.__name__} variant="{variant}" type={shared.sd_model_type} time={t1-t0:.2f}')
sd_models.apply_balanced_offload(shared.sd_model)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
except Exception as e:
shared.log.error(f'FramePack load: {e}')

View File

@ -516,7 +516,7 @@ def openvino_fx(subgraph, example_inputs, options=None):
else:
# Delete unused subgraphs
subgraph = subgraph.apply(sd_models.convert_to_faketensors)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='openvino')
# Model is fully supported and already cached. Run the cached OV model directly.
compiled_model = openvino_compile_cached_model(maybe_fs_cached_name, *example_inputs)

View File

@ -632,7 +632,7 @@ def interrogate(question:str='', system_prompt:str=None, prompt:str=None, image:
if shared.opts.interrogate_offload and model is not None:
sd_models.move_model(model, devices.cpu, force=True)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='vqa')
answer = clean(answer, question)
t1 = time.time()
if not quiet:

View File

@ -436,7 +436,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
if do_gc:
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
quant_last_model_name = op
quant_last_model_device = model.device
@ -447,7 +447,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
elif shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
if do_gc:
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
return model
@ -465,7 +465,7 @@ def sdnq_quantize_weights(sd_model):
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
quant_last_model_name = None
quant_last_model_device = None
@ -510,7 +510,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
quant_last_model_name = op
quant_last_model_device = model.device
@ -518,7 +518,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
quant_last_model_name = None
quant_last_model_device = None
model.to(devices.device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
return model
@ -540,7 +540,7 @@ def optimum_quanto_weights(sd_model):
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
quant_last_model_name = None
quant_last_model_device = None
@ -572,7 +572,7 @@ def optimum_quanto_weights(sd_model):
sd_models.move_model(sd_model, devices.cpu)
if hasattr(sd_model, "encode_prompt"):
sd_model.encode_prompt = original_encode_prompt
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
t1 = time.time()
log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}")

View File

@ -407,6 +407,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
if shared.opts.return_mask_composite:
output_images.append(image_mask_composite)
if shared.cmd_opts.lowvram:
devices.torch_gc(force=True, reason='lowvram')
timer.process.record('post')
if not p.xyz:
@ -461,5 +463,6 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
shared.log.debug(f'Processed: timers={timer.process.dct()}')
shared.log.debug(f'Processed: memory={memstats.memory_stats()}')
# devices.torch_gc(force=True, reason='final')
if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
devices.torch_gc(force=True, reason='final')
return processed

View File

@ -678,7 +678,7 @@ def load_diffuser(checkpoint_info=None, timer=None, op='model', revision=None):
errors.display(e, "Model")
if shared.opts.diffusers_offload_mode != 'balanced':
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if sd_model is not None:
script_callbacks.model_loaded_callback(sd_model)
@ -1107,14 +1107,14 @@ def unload_model_weights(op='model'):
disable_offload(model_data.sd_model)
move_model(model_data.sd_model, 'meta')
model_data.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='unload')
shared.log.debug(f'Unload {op}: {memory_stats()} after')
elif (op == 'refiner') and model_data.sd_refiner:
shared.log.debug(f'Current {op}: {memory_stats()}')
disable_offload(model_data.sd_refiner)
move_model(model_data.sd_refiner, 'meta')
model_data.sd_refiner = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='unload')
shared.log.debug(f'Unload {op}: {memory_stats()}')

View File

@ -1,4 +1,5 @@
import os
import re
import sys
import time
import inspect
@ -172,12 +173,14 @@ class OffloadHook(accelerate.hooks.ModelHook):
self.min_watermark = shared.opts.diffusers_offload_min_gpu_memory
self.max_watermark = shared.opts.diffusers_offload_max_gpu_memory
self.cpu_watermark = shared.opts.diffusers_offload_max_cpu_memory
self.offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
self.offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
self.gpu = int(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory * 1024*1024*1024)
self.cpu = int(shared.cpu_memory * shared.opts.diffusers_offload_max_cpu_memory * 1024*1024*1024)
self.offload_map = {}
self.param_map = {}
gpu = f'{(shared.gpu_memory * shared.opts.diffusers_offload_min_gpu_memory):.2f}-{(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory):.2f}:{shared.gpu_memory:.2f}'
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f}')
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f} always={self.offload_always} never={self.offload_never}')
self.validate()
super().__init__()
@ -210,10 +213,7 @@ class OffloadHook(accelerate.hooks.ModelHook):
max_memory = { device_index: self.gpu, "cpu": self.cpu }
device_map = getattr(module, "balanced_offload_device_map", None)
if device_map is None or max_memory != getattr(module, "balanced_offload_max_memory", None):
# try:
device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory)
# except Exception as e:
# shared.log.error(f'Offload: type=balanced module={module.__class__.__name__} {e}')
offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
if devices.backend == "directml":
keys = device_map.keys()
@ -233,15 +233,22 @@ class OffloadHook(accelerate.hooks.ModelHook):
perc_gpu = used_gpu / shared.gpu_memory
try:
module_size = self.model_size()
module_cls = module.__class__.__name__
prev_gpu = used_gpu
offload_now = perc_gpu > shared.opts.diffusers_offload_min_gpu_memory
if offload_now:
op = 'post:skip'
if module_cls in self.offload_never:
op = 'post:never'
elif module_cls in self.offload_always:
op = 'post:always'
module = module.to(devices.cpu)
used_gpu -= module_size
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
op = 'post:mem'
module = module.to(devices.cpu)
used_gpu -= module_size
if debug:
cls = module.__class__.__name__
quant = getattr(module, "quantization_method", None)
debug_move(f'Offload: type=balanced op={"post" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
except Exception as e:
if 'out of memory' in str(e):
devices.torch_gc(fast=True, force=True, reason='oom')
@ -311,6 +318,8 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
else:
keys = get_signature(pipe).keys()
keys = [k for k in keys if k not in exclude and not k.startswith('_')]
offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
for module_name, module_size in get_pipe_modules(pipe): # pylint: disable=protected-access
# shared.log.trace(f'Offload: type=balanced op=apply pipe={pipe.__class__.__name__} module={module_name} size={module_size:.3f}')
module = getattr(pipe, module_name, None)
@ -326,16 +335,26 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
perc_gpu = used_gpu / shared.gpu_memory
try:
prev_gpu = used_gpu
offload_now = (perc_gpu > shared.opts.diffusers_offload_min_gpu_memory) and (module.device != devices.cpu)
if offload_now:
module_cls = module.__class__.__name__
op = 'apply:skip'
if module_cls in offload_never:
op = 'apply:never'
elif module_cls in offload_always:
op = 'apply:always'
module = module.to(devices.cpu)
used_gpu -= module_size
cls = module.__class__.__name__
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
op = 'apply:mem'
module = module.to(devices.cpu)
used_gpu -= module_size
if debug:
quant = getattr(module, "quantization_method", None)
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
quant = getattr(module, "quantization_method", None)
if not cached:
shared.log.debug(f'Model module={module_name} type={cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
shared.log.debug(f'Model module={module_name} type={module_cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
if debug:
debug_move(f'Offload: type=balanced op={"move" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
except Exception as e:
if 'out of memory' in str(e):
devices.torch_gc(fast=True, force=True, reason='oom')

View File

@ -346,7 +346,7 @@ class SDNQQuantizer(DiffusersQuantizer):
def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument
if shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
return model
def get_cuda_warm_up_factor(self):

View File

@ -8,6 +8,7 @@ import gradio as gr
import diffusers
from modules.json_helpers import readfile, writefile # pylint: disable=W0611
from modules.shared_helpers import listdir, walk_files, html_path, html, req, total_tqdm # pylint: disable=W0611
from modules.shared_defaults import get_default_modes
from modules import errors, devices, shared_items, shared_state, cmd_args, theme, history, files_cache
from modules.paths import models_path, script_path, data_path, sd_configs_path, sd_default_config, sd_model_file, default_sd_model_file, extensions_dir, extensions_builtin_dir # pylint: disable=W0611
from modules.dml import memory_providers, default_memory_provider, directml_do_hijack
@ -129,45 +130,7 @@ def list_samplers():
return modules.sd_samplers.all_samplers
def get_default_modes():
default_offload_mode = "none"
default_diffusers_offload_min_gpu_memory = 0.2
if not (cmd_opts.lowvram or cmd_opts.medvram):
if "gpu" in mem_stat:
if gpu_memory <= 4:
cmd_opts.lowvram = True
default_offload_mode = "sequential"
default_diffusers_offload_min_gpu_memory = 0
log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
elif gpu_memory <= 12:
cmd_opts.medvram = True # VAE Tiling and other stuff
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0
log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
else:
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0.2
log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")
elif cmd_opts.medvram:
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0
elif cmd_opts.lowvram:
default_offload_mode = "sequential"
default_diffusers_offload_min_gpu_memory = 0
default_cross_attention = "Scaled-Dot-Product"
if devices.backend == "zluda":
default_sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
elif devices.backend in {"rocm", "directml", "cpu", "mps"}:
default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
else:
default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention']
return default_offload_mode, default_diffusers_offload_min_gpu_memory, default_cross_attention, default_sdp_options
startup_offload_mode, startup_diffusers_offload_min_gpu_memory, startup_cross_attention, startup_sdp_options = get_default_modes()
startup_offload_mode, startup_offload_min_gpu, startup_offload_max_gpu, startup_cross_attention, startup_sdp_options, startup_offload_always, startup_offload_never = get_default_modes(cmd_opts=cmd_opts, mem_stat=mem_stat)
options_templates.update(options_section(('sd', "Models & Loading"), {
"sd_backend": OptionInfo('diffusers', "Execution backend", gr.Radio, {"choices": ['diffusers', 'original'], "visible": False }),
@ -179,9 +142,11 @@ options_templates.update(options_section(('sd', "Models & Loading"), {
"offload_sep": OptionInfo("<h2>Model Offloading</h2>", "", gr.HTML),
"diffusers_offload_mode": OptionInfo(startup_offload_mode, "Model offload mode", gr.Radio, {"choices": ['none', 'balanced', 'group', 'model', 'sequential']}),
"diffusers_offload_min_gpu_memory": OptionInfo(startup_diffusers_offload_min_gpu_memory, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_gpu_memory": OptionInfo(0.70, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
"diffusers_offload_min_gpu_memory": OptionInfo(startup_offload_min_gpu, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_gpu_memory": OptionInfo(startup_offload_max_gpu, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_cpu_memory": OptionInfo(0.90, "Balanced offload CPU high watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01, "visible": False }),
"diffusers_offload_always": OptionInfo(startup_offload_always, "Modules to always offload"),
"diffusers_offload_never": OptionInfo(startup_offload_never, "Modules to never offload"),
"advanced_sep": OptionInfo("<h2>Advanced Options</h2>", "", gr.HTML),
"sd_checkpoint_autoload": OptionInfo(True, "Model auto-load on start"),
@ -299,7 +264,7 @@ options_templates.update(options_section(("quantization", "Quantization Settings
"sdnq_quantize_conv_layers": OptionInfo(False, "Quantize convolutional layers", gr.Checkbox),
"sdnq_dequantize_compile": OptionInfo(devices.has_triton(), "Dequantize using torch.compile", gr.Checkbox),
"sdnq_use_quantized_matmul": OptionInfo(False, "Use quantized MatMul", gr.Checkbox),
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with convolutional layers", gr.Checkbox),
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with conv", gr.Checkbox),
"sdnq_quantize_with_gpu": OptionInfo(True, "Quantize using GPU", gr.Checkbox),
"sdnq_dequantize_fp32": OptionInfo(False, "Dequantize using full precision", gr.Checkbox),
"sdnq_quantize_shuffle_weights": OptionInfo(False, "Shuffle weights in post mode", gr.Checkbox),

View File

@ -0,0 +1,57 @@
from installer import log
from modules import devices
def get_default_modes(cmd_opts, mem_stat):
    """Detect sensible startup defaults based on available GPU memory.

    Args:
        cmd_opts: parsed command-line options; read for `lowvram`/`medvram`.
        mem_stat: memory statistics dict with optional `'gpu'` entry holding a
            `'total'` value (presumably in GB — confirm against the caller).

    Side effect: when neither `--lowvram` nor `--medvram` was given, this
    function sets `cmd_opts.lowvram = True` for GPUs with <=4GB and
    `cmd_opts.medvram = True` for GPUs with <=12GB.

    Returns a 7-tuple:
        (offload_mode, offload_min_gpu_memory, offload_max_gpu_memory,
         cross_attention, sdp_options, offload_always, offload_never)
    """
    default_offload_mode = "none"
    default_diffusers_offload_min_gpu_memory = 0.2
    default_diffusers_offload_max_gpu_memory = 0.6
    default_diffusers_offload_always = ''
    default_diffusers_offload_never = ''
    # note: original also computed cpu_memory from mem_stat['ram'] but never used it; removed as dead code
    gpu_memory = round(mem_stat['gpu']['total'] if "gpu" in mem_stat else 0)
    if not (cmd_opts.lowvram or cmd_opts.medvram):
        if "gpu" in mem_stat:
            if gpu_memory <= 4:
                cmd_opts.lowvram = True
                default_offload_mode = "sequential"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
            elif gpu_memory <= 12:
                cmd_opts.medvram = True # VAE Tiling and other stuff
                default_offload_mode = "balanced"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
            elif gpu_memory >= 24:
                # highvram profile: raise the high watermark and keep small modules resident on gpu
                default_offload_mode = "balanced"
                default_diffusers_offload_max_gpu_memory = 0.8
                default_diffusers_offload_never = ', '.join(['CLIPTextModel', 'CLIPTextModelWithProjection', 'AutoencoderKL'])
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=highvram")
            else:
                default_offload_mode = "balanced"
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")
    elif cmd_opts.medvram:
        default_offload_mode = "balanced"
        default_diffusers_offload_min_gpu_memory = 0
    elif cmd_opts.lowvram:
        default_offload_mode = "sequential"
        default_diffusers_offload_min_gpu_memory = 0
    default_cross_attention = "Scaled-Dot-Product"
    # sdp option defaults vary by backend capability
    if devices.backend == "zluda":
        default_sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
    elif devices.backend in {"rocm", "directml", "cpu", "mps"}:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
    else:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention']
    return (
        default_offload_mode,
        default_diffusers_offload_min_gpu_memory,
        default_diffusers_offload_max_gpu_memory,
        default_cross_attention,
        default_sdp_options,
        default_diffusers_offload_always,
        default_diffusers_offload_never
    )

View File

@ -71,7 +71,7 @@ def generate(*args, **kwargs):
# cleanup memory
shared.sd_model = sd_models.apply_balanced_offload(shared.sd_model)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='video')
# set args
processing.fix_seed(p)

View File

@ -17,5 +17,5 @@ def load_auraflow(checkpoint_info, diffusers_load_config={}):
cache_dir = shared.opts.diffusers_dir,
**diffusers_load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -187,7 +187,7 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
# unload current model
sd_models.unload_model_weights()
shared.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if shared.opts.teacache_enabled:
from modules import teacache
@ -277,5 +277,5 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
for k in kwargs.keys():
kwargs[k] = None
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe, allow_post_quant

View File

@ -220,7 +220,7 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
# unload current model
sd_models.unload_model_weights()
shared.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if shared.opts.teacache_enabled:
from modules import teacache
@ -356,5 +356,5 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
for k in kwargs.keys():
kwargs[k] = None
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe, allow_post_quant

View File

@ -196,5 +196,5 @@ def load_flux_nf4(checkpoint_info, prequantized: bool = True):
errors.display(e, 'FLUX:')
del original_state_dict
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return transformer, text_encoder_2

View File

@ -23,5 +23,5 @@ def load_kolors(_checkpoint_info, diffusers_load_config={}):
**diffusers_load_config,
)
pipe.vae.config.force_upcast = True
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -14,7 +14,7 @@ def load_lumina(_checkpoint_info, diffusers_load_config={}):
cache_dir = shared.opts.diffusers_dir,
**load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe
@ -91,5 +91,5 @@ def load_lumina2(checkpoint_info, diffusers_load_config={}):
)
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -52,5 +52,5 @@ def load_meissonic(checkpoint_info, diffusers_load_config={}):
diffusers.pipelines.auto_pipeline.AUTO_TEXT2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonic
diffusers.pipelines.auto_pipeline.AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicImg2Img
diffusers.pipelines.auto_pipeline.AUTO_INPAINT_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicInpaint
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -28,5 +28,5 @@ def load_omnigen(checkpoint_info, diffusers_load_config={}): # pylint: disable=u
**load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -45,5 +45,5 @@ def load_omnigen2(checkpoint_info, diffusers_load_config={}): # pylint: disable=
)
pipe.transformer = transformer # for omnigen2 transformer must be loaded after pipeline
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -40,5 +40,5 @@ def load_pixart(checkpoint_info, diffusers_load_config={}):
text_encoder=text_encoder,
**load_args,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -88,5 +88,5 @@ def load_sana(checkpoint_info, kwargs={}):
sd_hijack_te.init_hijack(pipe)
t1 = time.time()
shared.log.debug(f'Load model: type=Sana target={devices.dtype} te={pipe.text_encoder.dtype} transformer={pipe.transformer.dtype} vae={pipe.vae.dtype} time={t1-t0:.2f}')
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -124,5 +124,5 @@ def load_sd3(checkpoint_info, cache_dir=None, config=None):
config=config,
**kwargs,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -155,7 +155,7 @@ def load_cascade_combined(checkpoint_info, diffusers_load_config):
latent_dim_scale=sd_model.decoder_pipe.config.latent_dim_scale,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
shared.log.debug(f'StableCascade combined: {sd_model.__class__.__name__}')
return sd_model

View File

@ -228,7 +228,7 @@ class Script(scripts_manager.Script):
shared.sd_model.clip_vision_model = None
shared.sd_model.handler_ante = None
shared.sd_model = shared.sd_model.pipe
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='pulid')
shared.log.debug(f'PuLID complete: class={shared.sd_model.__class__.__name__} preprocess={self.preprocess:.2f} pipe={"restore" if restore else "cache"}')
return processed