mirror of https://github.com/vladmandic/automatic
offload-never and offload-always per-module and new highvram profile
Signed-off-by: Vladimir Mandic <mandic00@live.com>pull/4089/head
parent
1b3f5405a3
commit
fa44521ea3
|
|
@ -8,7 +8,15 @@
|
|||
- prompt parser allow explicit `BOS` and `EOS` tokens in prompt
|
||||
- **UI**
|
||||
- new embedded docs/wiki search!
|
||||
**Docs** search: fully-local and works in real-time on all document pages
|
||||
**Wiki** search: uses github api to search online wiki pages
|
||||
- modernui checkbox/radio styling
|
||||
- **Offloading**
|
||||
- changed default values for offloading based on detected gpu memory
|
||||
see [offloading docs](https://vladmandic.github.io/sdnext-docs/Offload/) for details
|
||||
- new feature to specify which modules to offload always or never
|
||||
in *settings -> models & loading -> offload always/never*
|
||||
- new `highvram` profile provides significant performance boost on gpus with 24gb or more
|
||||
- **Fixes**
|
||||
- fix Wan 2.2-5B I2V workflow
|
||||
- fix inpaint image metadata
|
||||
|
|
@ -16,6 +24,7 @@
|
|||
- fix progress bar with refine/detailer
|
||||
- fix api progress reporting endpoint
|
||||
- fix openvino backend failing to compile
|
||||
- avoid forced gc and rely on thresholds
|
||||
- add missing interrogate in output panel
|
||||
|
||||
## Update for 2025-07-29
|
||||
|
|
|
|||
|
|
@ -280,7 +280,7 @@ def set_cuda_memory_limit():
|
|||
return
|
||||
try:
|
||||
from modules.shared import cmd_opts
|
||||
torch_gc(force=True)
|
||||
torch_gc(force=True, reason='cuda')
|
||||
mem = torch.cuda.get_device_properties(device).total_memory
|
||||
torch.cuda.set_per_process_memory_fraction(float(opts.cuda_mem_fraction), cmd_opts.device_id if cmd_opts.device_id is not None else 0)
|
||||
log.info(f'Torch memory limit: fraction={opts.cuda_mem_fraction:.2f} limit={round(opts.cuda_mem_fraction * mem / 1024 / 1024)} total={round(mem / 1024 / 1024)}')
|
||||
|
|
|
|||
|
|
@ -176,7 +176,7 @@ def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
|
|||
created_model = next((ckpt for ckpt in sd_models.checkpoints_list.values() if ckpt.name == filename), None)
|
||||
if created_model:
|
||||
created_model.calculate_shorthash()
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='merge')
|
||||
shared.state.end()
|
||||
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_titles()) for _ in range(4)], f"Model saved to {output_modelname}"]
|
||||
|
||||
|
|
@ -248,7 +248,7 @@ def run_model_modules(model_type:str, model_name:str, custom_name:str,
|
|||
yield from modules_sdxl.merge()
|
||||
status = modules_sdxl.status
|
||||
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='merge')
|
||||
yield msg("modules merge complete")
|
||||
if modules_sdxl.pipeline is not None:
|
||||
checkpoint_info = sd_models.CheckpointInfo(filename='None')
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ def load_model(variant:str=None, pipeline:str=None, text_encoder:str=None, text_
|
|||
diffusers.loaders.peft._SET_ADAPTER_SCALE_FN_MAPPING['HunyuanVideoTransformer3DModelPacked'] = lambda model_cls, weights: weights # pylint: disable=protected-access
|
||||
shared.log.info(f'FramePack load: model={shared.sd_model.__class__.__name__} variant="{variant}" type={shared.sd_model_type} time={t1-t0:.2f}')
|
||||
sd_models.apply_balanced_offload(shared.sd_model)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
|
||||
except Exception as e:
|
||||
shared.log.error(f'FramePack load: {e}')
|
||||
|
|
|
|||
|
|
@ -516,7 +516,7 @@ def openvino_fx(subgraph, example_inputs, options=None):
|
|||
else:
|
||||
# Delete unused subgraphs
|
||||
subgraph = subgraph.apply(sd_models.convert_to_faketensors)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='openvino')
|
||||
|
||||
# Model is fully supported and already cached. Run the cached OV model directly.
|
||||
compiled_model = openvino_compile_cached_model(maybe_fs_cached_name, *example_inputs)
|
||||
|
|
|
|||
|
|
@ -632,7 +632,7 @@ def interrogate(question:str='', system_prompt:str=None, prompt:str=None, image:
|
|||
|
||||
if shared.opts.interrogate_offload and model is not None:
|
||||
sd_models.move_model(model, devices.cpu, force=True)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='vqa')
|
||||
answer = clean(answer, question)
|
||||
t1 = time.time()
|
||||
if not quiet:
|
||||
|
|
|
|||
|
|
@ -436,7 +436,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
|
|||
else:
|
||||
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
|
||||
if do_gc:
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='sdnq')
|
||||
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
|
||||
quant_last_model_name = op
|
||||
quant_last_model_device = model.device
|
||||
|
|
@ -447,7 +447,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
|
|||
elif shared.opts.diffusers_offload_mode != "none":
|
||||
model = model.to(devices.cpu)
|
||||
if do_gc:
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='sdnq')
|
||||
return model
|
||||
|
||||
|
||||
|
|
@ -465,7 +465,7 @@ def sdnq_quantize_weights(sd_model):
|
|||
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
|
||||
else:
|
||||
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='sdnq')
|
||||
quant_last_model_name = None
|
||||
quant_last_model_device = None
|
||||
|
||||
|
|
@ -510,7 +510,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
|
|||
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
|
||||
else:
|
||||
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='quanto')
|
||||
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
|
||||
quant_last_model_name = op
|
||||
quant_last_model_device = model.device
|
||||
|
|
@ -518,7 +518,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
|
|||
quant_last_model_name = None
|
||||
quant_last_model_device = None
|
||||
model.to(devices.device)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='quanto')
|
||||
return model
|
||||
|
||||
|
||||
|
|
@ -540,7 +540,7 @@ def optimum_quanto_weights(sd_model):
|
|||
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
|
||||
else:
|
||||
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='quanto')
|
||||
quant_last_model_name = None
|
||||
quant_last_model_device = None
|
||||
|
||||
|
|
@ -572,7 +572,7 @@ def optimum_quanto_weights(sd_model):
|
|||
sd_models.move_model(sd_model, devices.cpu)
|
||||
if hasattr(sd_model, "encode_prompt"):
|
||||
sd_model.encode_prompt = original_encode_prompt
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='quanto')
|
||||
|
||||
t1 = time.time()
|
||||
log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}")
|
||||
|
|
|
|||
|
|
@ -407,6 +407,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
|
|||
if shared.opts.return_mask_composite:
|
||||
output_images.append(image_mask_composite)
|
||||
|
||||
if shared.cmd_opts.lowvram:
|
||||
devices.torch_gc(force=True, reason='lowvram')
|
||||
timer.process.record('post')
|
||||
|
||||
if not p.xyz:
|
||||
|
|
@ -461,5 +463,6 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
|
|||
shared.log.debug(f'Processed: timers={timer.process.dct()}')
|
||||
shared.log.debug(f'Processed: memory={memstats.memory_stats()}')
|
||||
|
||||
# devices.torch_gc(force=True, reason='final')
|
||||
if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
|
||||
devices.torch_gc(force=True, reason='final')
|
||||
return processed
|
||||
|
|
|
|||
|
|
@ -678,7 +678,7 @@ def load_diffuser(checkpoint_info=None, timer=None, op='model', revision=None):
|
|||
errors.display(e, "Model")
|
||||
|
||||
if shared.opts.diffusers_offload_mode != 'balanced':
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
if sd_model is not None:
|
||||
script_callbacks.model_loaded_callback(sd_model)
|
||||
|
||||
|
|
@ -1107,14 +1107,14 @@ def unload_model_weights(op='model'):
|
|||
disable_offload(model_data.sd_model)
|
||||
move_model(model_data.sd_model, 'meta')
|
||||
model_data.sd_model = None
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='unload')
|
||||
shared.log.debug(f'Unload {op}: {memory_stats()} after')
|
||||
elif (op == 'refiner') and model_data.sd_refiner:
|
||||
shared.log.debug(f'Current {op}: {memory_stats()}')
|
||||
disable_offload(model_data.sd_refiner)
|
||||
move_model(model_data.sd_refiner, 'meta')
|
||||
model_data.sd_refiner = None
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='unload')
|
||||
shared.log.debug(f'Unload {op}: {memory_stats()}')
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import inspect
|
||||
|
|
@ -172,12 +173,14 @@ class OffloadHook(accelerate.hooks.ModelHook):
|
|||
self.min_watermark = shared.opts.diffusers_offload_min_gpu_memory
|
||||
self.max_watermark = shared.opts.diffusers_offload_max_gpu_memory
|
||||
self.cpu_watermark = shared.opts.diffusers_offload_max_cpu_memory
|
||||
self.offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
|
||||
self.offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
|
||||
self.gpu = int(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory * 1024*1024*1024)
|
||||
self.cpu = int(shared.cpu_memory * shared.opts.diffusers_offload_max_cpu_memory * 1024*1024*1024)
|
||||
self.offload_map = {}
|
||||
self.param_map = {}
|
||||
gpu = f'{(shared.gpu_memory * shared.opts.diffusers_offload_min_gpu_memory):.2f}-{(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory):.2f}:{shared.gpu_memory:.2f}'
|
||||
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f}')
|
||||
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f} always={self.offload_always} never={self.offload_never}')
|
||||
self.validate()
|
||||
super().__init__()
|
||||
|
||||
|
|
@ -210,10 +213,7 @@ class OffloadHook(accelerate.hooks.ModelHook):
|
|||
max_memory = { device_index: self.gpu, "cpu": self.cpu }
|
||||
device_map = getattr(module, "balanced_offload_device_map", None)
|
||||
if device_map is None or max_memory != getattr(module, "balanced_offload_max_memory", None):
|
||||
# try:
|
||||
device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory)
|
||||
# except Exception as e:
|
||||
# shared.log.error(f'Offload: type=balanced module={module.__class__.__name__} {e}')
|
||||
offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
|
||||
if devices.backend == "directml":
|
||||
keys = device_map.keys()
|
||||
|
|
@ -233,15 +233,22 @@ class OffloadHook(accelerate.hooks.ModelHook):
|
|||
perc_gpu = used_gpu / shared.gpu_memory
|
||||
try:
|
||||
module_size = self.model_size()
|
||||
module_cls = module.__class__.__name__
|
||||
prev_gpu = used_gpu
|
||||
offload_now = perc_gpu > shared.opts.diffusers_offload_min_gpu_memory
|
||||
if offload_now:
|
||||
op = 'post:skip'
|
||||
if module_cls in self.offload_never:
|
||||
op = 'post:never'
|
||||
elif module_cls in self.offload_always:
|
||||
op = 'post:always'
|
||||
module = module.to(devices.cpu)
|
||||
used_gpu -= module_size
|
||||
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
|
||||
op = 'post:mem'
|
||||
module = module.to(devices.cpu)
|
||||
used_gpu -= module_size
|
||||
if debug:
|
||||
cls = module.__class__.__name__
|
||||
quant = getattr(module, "quantization_method", None)
|
||||
debug_move(f'Offload: type=balanced op={"post" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
|
||||
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
|
||||
except Exception as e:
|
||||
if 'out of memory' in str(e):
|
||||
devices.torch_gc(fast=True, force=True, reason='oom')
|
||||
|
|
@ -311,6 +318,8 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
|
|||
else:
|
||||
keys = get_signature(pipe).keys()
|
||||
keys = [k for k in keys if k not in exclude and not k.startswith('_')]
|
||||
offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
|
||||
offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
|
||||
for module_name, module_size in get_pipe_modules(pipe): # pylint: disable=protected-access
|
||||
# shared.log.trace(f'Offload: type=balanced op=apply pipe={pipe.__class__.__name__} module={module_name} size={module_size:.3f}')
|
||||
module = getattr(pipe, module_name, None)
|
||||
|
|
@ -326,16 +335,26 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
|
|||
perc_gpu = used_gpu / shared.gpu_memory
|
||||
try:
|
||||
prev_gpu = used_gpu
|
||||
offload_now = (perc_gpu > shared.opts.diffusers_offload_min_gpu_memory) and (module.device != devices.cpu)
|
||||
if offload_now:
|
||||
module_cls = module.__class__.__name__
|
||||
op = 'apply:skip'
|
||||
if module_cls in offload_never:
|
||||
op = 'apply:never'
|
||||
elif module_cls in offload_always:
|
||||
op = 'apply:always'
|
||||
module = module.to(devices.cpu)
|
||||
used_gpu -= module_size
|
||||
cls = module.__class__.__name__
|
||||
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
|
||||
op = 'apply:mem'
|
||||
module = module.to(devices.cpu)
|
||||
used_gpu -= module_size
|
||||
if debug:
|
||||
quant = getattr(module, "quantization_method", None)
|
||||
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
|
||||
quant = getattr(module, "quantization_method", None)
|
||||
if not cached:
|
||||
shared.log.debug(f'Model module={module_name} type={cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
|
||||
shared.log.debug(f'Model module={module_name} type={module_cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
|
||||
if debug:
|
||||
debug_move(f'Offload: type=balanced op={"move" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
|
||||
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
|
||||
except Exception as e:
|
||||
if 'out of memory' in str(e):
|
||||
devices.torch_gc(fast=True, force=True, reason='oom')
|
||||
|
|
|
|||
|
|
@ -346,7 +346,7 @@ class SDNQQuantizer(DiffusersQuantizer):
|
|||
def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument
|
||||
if shared.opts.diffusers_offload_mode != "none":
|
||||
model = model.to(devices.cpu)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='sdnq')
|
||||
return model
|
||||
|
||||
def get_cuda_warm_up_factor(self):
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import gradio as gr
|
|||
import diffusers
|
||||
from modules.json_helpers import readfile, writefile # pylint: disable=W0611
|
||||
from modules.shared_helpers import listdir, walk_files, html_path, html, req, total_tqdm # pylint: disable=W0611
|
||||
from modules.shared_defaults import get_default_modes
|
||||
from modules import errors, devices, shared_items, shared_state, cmd_args, theme, history, files_cache
|
||||
from modules.paths import models_path, script_path, data_path, sd_configs_path, sd_default_config, sd_model_file, default_sd_model_file, extensions_dir, extensions_builtin_dir # pylint: disable=W0611
|
||||
from modules.dml import memory_providers, default_memory_provider, directml_do_hijack
|
||||
|
|
@ -129,45 +130,7 @@ def list_samplers():
|
|||
return modules.sd_samplers.all_samplers
|
||||
|
||||
|
||||
def get_default_modes():
    """Derive startup defaults for model offloading and attention.

    Reads the module-level `cmd_opts`, `mem_stat`, `gpu_memory` and
    `devices.backend`. When neither lowvram nor medvram was requested and
    `mem_stat` reports a GPU, the matching flag may be switched on in
    `cmd_opts` as a side effect.

    Returns:
        Tuple of (offload mode, balanced-offload GPU low watermark,
        cross-attention method, list of SDP options).
    """
    offload_mode = "none"
    min_gpu_memory = 0.2

    if cmd_opts.lowvram or cmd_opts.medvram:
        # a profile was requested explicitly; medvram wins if both are set
        if cmd_opts.medvram:
            offload_mode = "balanced"
        else:
            offload_mode = "sequential"
        min_gpu_memory = 0
    elif "gpu" in mem_stat:
        # no explicit profile: choose one from detected gpu memory
        # NOTE(review): thresholds presumably in gigabytes - confirm where gpu_memory is set
        if gpu_memory <= 4:
            cmd_opts.lowvram = True
            offload_mode, min_gpu_memory = "sequential", 0
            log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
        elif gpu_memory <= 12:
            cmd_opts.medvram = True # VAE Tiling and other stuff
            offload_mode, min_gpu_memory = "balanced", 0
            log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
        else:
            # low watermark keeps its initial 0.2 value
            offload_mode = "balanced"
            log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")

    # sdp options depend on the active backend
    backend = devices.backend
    if backend == "zluda":
        sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
    elif backend in {"rocm", "directml", "cpu", "mps"}:
        sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
    else:
        sdp_options = ['Flash attention', 'Memory attention', 'Math attention']

    return offload_mode, min_gpu_memory, "Scaled-Dot-Product", sdp_options
|
||||
|
||||
|
||||
startup_offload_mode, startup_diffusers_offload_min_gpu_memory, startup_cross_attention, startup_sdp_options = get_default_modes()
|
||||
startup_offload_mode, startup_offload_min_gpu, startup_offload_max_gpu, startup_cross_attention, startup_sdp_options, startup_offload_always, startup_offload_never = get_default_modes(cmd_opts=cmd_opts, mem_stat=mem_stat)
|
||||
|
||||
options_templates.update(options_section(('sd', "Models & Loading"), {
|
||||
"sd_backend": OptionInfo('diffusers', "Execution backend", gr.Radio, {"choices": ['diffusers', 'original'], "visible": False }),
|
||||
|
|
@ -179,9 +142,11 @@ options_templates.update(options_section(('sd', "Models & Loading"), {
|
|||
|
||||
"offload_sep": OptionInfo("<h2>Model Offloading</h2>", "", gr.HTML),
|
||||
"diffusers_offload_mode": OptionInfo(startup_offload_mode, "Model offload mode", gr.Radio, {"choices": ['none', 'balanced', 'group', 'model', 'sequential']}),
|
||||
"diffusers_offload_min_gpu_memory": OptionInfo(startup_diffusers_offload_min_gpu_memory, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
|
||||
"diffusers_offload_max_gpu_memory": OptionInfo(0.70, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
|
||||
"diffusers_offload_min_gpu_memory": OptionInfo(startup_offload_min_gpu, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
|
||||
"diffusers_offload_max_gpu_memory": OptionInfo(startup_offload_max_gpu, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
|
||||
"diffusers_offload_max_cpu_memory": OptionInfo(0.90, "Balanced offload CPU high watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01, "visible": False }),
|
||||
"diffusers_offload_always": OptionInfo(startup_offload_always, "Modules to always offload"),
|
||||
"diffusers_offload_never": OptionInfo(startup_offload_never, "Modules to never offload"),
|
||||
|
||||
"advanced_sep": OptionInfo("<h2>Advanced Options</h2>", "", gr.HTML),
|
||||
"sd_checkpoint_autoload": OptionInfo(True, "Model auto-load on start"),
|
||||
|
|
@ -299,7 +264,7 @@ options_templates.update(options_section(("quantization", "Quantization Settings
|
|||
"sdnq_quantize_conv_layers": OptionInfo(False, "Quantize convolutional layers", gr.Checkbox),
|
||||
"sdnq_dequantize_compile": OptionInfo(devices.has_triton(), "Dequantize using torch.compile", gr.Checkbox),
|
||||
"sdnq_use_quantized_matmul": OptionInfo(False, "Use quantized MatMul", gr.Checkbox),
|
||||
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with convolutional layers", gr.Checkbox),
|
||||
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with conv", gr.Checkbox),
|
||||
"sdnq_quantize_with_gpu": OptionInfo(True, "Quantize using GPU", gr.Checkbox),
|
||||
"sdnq_dequantize_fp32": OptionInfo(False, "Dequantize using full precision", gr.Checkbox),
|
||||
"sdnq_quantize_shuffle_weights": OptionInfo(False, "Shuffle weights in post mode", gr.Checkbox),
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
from installer import log
|
||||
from modules import devices
|
||||
|
||||
|
||||
def get_default_modes(cmd_opts, mem_stat):
    """Pick startup defaults for offloading and attention.

    The choice is driven by the lowvram/medvram flags and, when neither is set,
    by the GPU memory reported in `mem_stat`.

    Args:
        cmd_opts: parsed command-line options. `lowvram` and `medvram` are read;
            when neither is set and a GPU is reported, one of them may be
            switched on here as a side effect.
        mem_stat: memory statistics mapping. If it has a "gpu" entry, its
            "total" value is rounded and compared against the thresholds below
            (presumably gigabytes - confirm against the caller).

    Returns:
        Tuple of (offload mode, GPU low watermark, GPU high watermark,
        cross-attention method, SDP options list, offload-always modules,
        offload-never modules). The last two are strings of module class names.
    """
    default_offload_mode = "none"
    default_diffusers_offload_min_gpu_memory = 0.2
    default_diffusers_offload_max_gpu_memory = 0.6
    default_diffusers_offload_always = ''
    default_diffusers_offload_never = ''
    gpu_memory = round(mem_stat['gpu']['total'] if "gpu" in mem_stat else 0)
    if not (cmd_opts.lowvram or cmd_opts.medvram):
        # no explicit profile requested: auto-detect one from gpu memory
        if "gpu" in mem_stat:
            if gpu_memory <= 4:
                cmd_opts.lowvram = True
                default_offload_mode = "sequential"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
            elif gpu_memory <= 12:
                cmd_opts.medvram = True # VAE Tiling and other stuff
                default_offload_mode = "balanced"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
            elif gpu_memory >= 24:
                # highvram: raise the high watermark and never offload these module classes
                default_offload_mode = "balanced"
                default_diffusers_offload_max_gpu_memory = 0.8
                default_diffusers_offload_never = ', '.join(['CLIPTextModel', 'CLIPTextModelWithProjection', 'AutoencoderKL'])
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=highvram")
            else:
                default_offload_mode = "balanced"
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")
    elif cmd_opts.medvram:
        # medvram is checked before lowvram, so it wins if both are set
        default_offload_mode = "balanced"
        default_diffusers_offload_min_gpu_memory = 0
    elif cmd_opts.lowvram:
        default_offload_mode = "sequential"
        default_diffusers_offload_min_gpu_memory = 0

    default_cross_attention = "Scaled-Dot-Product"

    # sdp options depend on the active backend
    if devices.backend == "zluda":
        default_sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
    elif devices.backend in {"rocm", "directml", "cpu", "mps"}:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
    else:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention']

    return (
        default_offload_mode,
        default_diffusers_offload_min_gpu_memory,
        default_diffusers_offload_max_gpu_memory,
        default_cross_attention,
        default_sdp_options,
        default_diffusers_offload_always,
        default_diffusers_offload_never
    )
|
||||
|
|
@ -71,7 +71,7 @@ def generate(*args, **kwargs):
|
|||
|
||||
# cleanup memory
|
||||
shared.sd_model = sd_models.apply_balanced_offload(shared.sd_model)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='video')
|
||||
|
||||
# set args
|
||||
processing.fix_seed(p)
|
||||
|
|
|
|||
|
|
@ -17,5 +17,5 @@ def load_auraflow(checkpoint_info, diffusers_load_config={}):
|
|||
cache_dir = shared.opts.diffusers_dir,
|
||||
**diffusers_load_config,
|
||||
)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -187,7 +187,7 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
|
|||
# unload current model
|
||||
sd_models.unload_model_weights()
|
||||
shared.sd_model = None
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
|
||||
if shared.opts.teacache_enabled:
|
||||
from modules import teacache
|
||||
|
|
@ -277,5 +277,5 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
|
|||
for k in kwargs.keys():
|
||||
kwargs[k] = None
|
||||
sd_hijack_te.init_hijack(pipe)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe, allow_post_quant
|
||||
|
|
|
|||
|
|
@ -220,7 +220,7 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
|
|||
# unload current model
|
||||
sd_models.unload_model_weights()
|
||||
shared.sd_model = None
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
|
||||
if shared.opts.teacache_enabled:
|
||||
from modules import teacache
|
||||
|
|
@ -356,5 +356,5 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
|
|||
for k in kwargs.keys():
|
||||
kwargs[k] = None
|
||||
sd_hijack_te.init_hijack(pipe)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe, allow_post_quant
|
||||
|
|
|
|||
|
|
@ -196,5 +196,5 @@ def load_flux_nf4(checkpoint_info, prequantized: bool = True):
|
|||
errors.display(e, 'FLUX:')
|
||||
|
||||
del original_state_dict
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return transformer, text_encoder_2
|
||||
|
|
|
|||
|
|
@ -23,5 +23,5 @@ def load_kolors(_checkpoint_info, diffusers_load_config={}):
|
|||
**diffusers_load_config,
|
||||
)
|
||||
pipe.vae.config.force_upcast = True
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ def load_lumina(_checkpoint_info, diffusers_load_config={}):
|
|||
cache_dir = shared.opts.diffusers_dir,
|
||||
**load_config,
|
||||
)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
||||
|
||||
|
|
@ -91,5 +91,5 @@ def load_lumina2(checkpoint_info, diffusers_load_config={}):
|
|||
)
|
||||
|
||||
sd_hijack_te.init_hijack(pipe)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -52,5 +52,5 @@ def load_meissonic(checkpoint_info, diffusers_load_config={}):
|
|||
diffusers.pipelines.auto_pipeline.AUTO_TEXT2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonic
|
||||
diffusers.pipelines.auto_pipeline.AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicImg2Img
|
||||
diffusers.pipelines.auto_pipeline.AUTO_INPAINT_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicInpaint
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -28,5 +28,5 @@ def load_omnigen(checkpoint_info, diffusers_load_config={}): # pylint: disable=u
|
|||
**load_config,
|
||||
)
|
||||
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -45,5 +45,5 @@ def load_omnigen2(checkpoint_info, diffusers_load_config={}): # pylint: disable=
|
|||
)
|
||||
pipe.transformer = transformer # for omnigen2 transformer must be loaded after pipeline
|
||||
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -40,5 +40,5 @@ def load_pixart(checkpoint_info, diffusers_load_config={}):
|
|||
text_encoder=text_encoder,
|
||||
**load_args,
|
||||
)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -88,5 +88,5 @@ def load_sana(checkpoint_info, kwargs={}):
|
|||
sd_hijack_te.init_hijack(pipe)
|
||||
t1 = time.time()
|
||||
shared.log.debug(f'Load model: type=Sana target={devices.dtype} te={pipe.text_encoder.dtype} transformer={pipe.transformer.dtype} vae={pipe.vae.dtype} time={t1-t0:.2f}')
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -124,5 +124,5 @@ def load_sd3(checkpoint_info, cache_dir=None, config=None):
|
|||
config=config,
|
||||
**kwargs,
|
||||
)
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
return pipe
|
||||
|
|
|
|||
|
|
@ -155,7 +155,7 @@ def load_cascade_combined(checkpoint_info, diffusers_load_config):
|
|||
latent_dim_scale=sd_model.decoder_pipe.config.latent_dim_scale,
|
||||
)
|
||||
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='load')
|
||||
shared.log.debug(f'StableCascade combined: {sd_model.__class__.__name__}')
|
||||
return sd_model
|
||||
|
||||
|
|
|
|||
|
|
@ -228,7 +228,7 @@ class Script(scripts_manager.Script):
|
|||
shared.sd_model.clip_vision_model = None
|
||||
shared.sd_model.handler_ante = None
|
||||
shared.sd_model = shared.sd_model.pipe
|
||||
devices.torch_gc(force=True)
|
||||
devices.torch_gc(force=True, reason='pulid')
|
||||
shared.log.debug(f'PuLID complete: class={shared.sd_model.__class__.__name__} preprocess={self.preprocess:.2f} pipe={"restore" if restore else "cache"}')
|
||||
return processed
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue