offload-never and offload-always per-module and new highvram profile

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4089/head
Vladimir Mandic 2025-07-31 11:40:24 -04:00
parent 1b3f5405a3
commit fa44521ea3
28 changed files with 144 additions and 91 deletions

View File

@ -8,7 +8,15 @@
- prompt parser allow explicit `BOS` and `EOS` tokens in prompt
- **UI**
- new embedded docs/wiki search!
**Docs** search: fully-local and works in real-time on all document pages
**Wiki** search: uses github api to search online wiki pages
- modernui checkbox/radio styling
- **Offloading**
- changed default values for offloading based on detected gpu memory
see [offloading docs](https://vladmandic.github.io/sdnext-docs/Offload/) for details
- new feature to specify which modules to offload always or never
in *settings -> models & loading -> offload always/never*
- new `highvram` profile provides significant performance boost on gpus with 24gb or more
- **Fixes**
- fix Wan 2.2-5B I2V workflow
- fix inpaint image metadata
@ -16,6 +24,7 @@
- fix progress bar with refine/detailer
- fix api progress reporting endpoint
- fix openvino backend failing to compile
- avoid forced gc and rely on thresholds
- add missing interrogate in output panel
## Update for 2025-07-29

View File

@ -280,7 +280,7 @@ def set_cuda_memory_limit():
return
try:
from modules.shared import cmd_opts
torch_gc(force=True)
torch_gc(force=True, reason='cuda')
mem = torch.cuda.get_device_properties(device).total_memory
torch.cuda.set_per_process_memory_fraction(float(opts.cuda_mem_fraction), cmd_opts.device_id if cmd_opts.device_id is not None else 0)
log.info(f'Torch memory limit: fraction={opts.cuda_mem_fraction:.2f} limit={round(opts.cuda_mem_fraction * mem / 1024 / 1024)} total={round(mem / 1024 / 1024)}')

View File

@ -176,7 +176,7 @@ def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
created_model = next((ckpt for ckpt in sd_models.checkpoints_list.values() if ckpt.name == filename), None)
if created_model:
created_model.calculate_shorthash()
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='merge')
shared.state.end()
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_titles()) for _ in range(4)], f"Model saved to {output_modelname}"]
@ -248,7 +248,7 @@ def run_model_modules(model_type:str, model_name:str, custom_name:str,
yield from modules_sdxl.merge()
status = modules_sdxl.status
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='merge')
yield msg("modules merge complete")
if modules_sdxl.pipeline is not None:
checkpoint_info = sd_models.CheckpointInfo(filename='None')

View File

@ -183,7 +183,7 @@ def load_model(variant:str=None, pipeline:str=None, text_encoder:str=None, text_
diffusers.loaders.peft._SET_ADAPTER_SCALE_FN_MAPPING['HunyuanVideoTransformer3DModelPacked'] = lambda model_cls, weights: weights # pylint: disable=protected-access
shared.log.info(f'FramePack load: model={shared.sd_model.__class__.__name__} variant="{variant}" type={shared.sd_model_type} time={t1-t0:.2f}')
sd_models.apply_balanced_offload(shared.sd_model)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
except Exception as e:
shared.log.error(f'FramePack load: {e}')

View File

@ -516,7 +516,7 @@ def openvino_fx(subgraph, example_inputs, options=None):
else:
# Delete unused subgraphs
subgraph = subgraph.apply(sd_models.convert_to_faketensors)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='openvino')
# Model is fully supported and already cached. Run the cached OV model directly.
compiled_model = openvino_compile_cached_model(maybe_fs_cached_name, *example_inputs)

View File

@ -632,7 +632,7 @@ def interrogate(question:str='', system_prompt:str=None, prompt:str=None, image:
if shared.opts.interrogate_offload and model is not None:
sd_models.move_model(model, devices.cpu, force=True)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='vqa')
answer = clean(answer, question)
t1 = time.time()
if not quiet:

View File

@ -436,7 +436,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
if do_gc:
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
quant_last_model_name = op
quant_last_model_device = model.device
@ -447,7 +447,7 @@ def sdnq_quantize_model(model, op=None, sd_model=None, do_gc: bool = True, weigh
elif shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
if do_gc:
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
return model
@ -465,7 +465,7 @@ def sdnq_quantize_weights(sd_model):
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
quant_last_model_name = None
quant_last_model_device = None
@ -510,7 +510,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
if shared.cmd_opts.medvram or shared.cmd_opts.lowvram or shared.opts.diffusers_offload_mode != "none":
quant_last_model_name = op
quant_last_model_device = model.device
@ -518,7 +518,7 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
quant_last_model_name = None
quant_last_model_device = None
model.to(devices.device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
return model
@ -540,7 +540,7 @@ def optimum_quanto_weights(sd_model):
getattr(getattr(sd_model, last_model_names[0]), last_model_names[1]).to(quant_last_model_device)
else:
getattr(sd_model, quant_last_model_name).to(quant_last_model_device)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
quant_last_model_name = None
quant_last_model_device = None
@ -572,7 +572,7 @@ def optimum_quanto_weights(sd_model):
sd_models.move_model(sd_model, devices.cpu)
if hasattr(sd_model, "encode_prompt"):
sd_model.encode_prompt = original_encode_prompt
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='quanto')
t1 = time.time()
log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}")

View File

@ -407,6 +407,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
if shared.opts.return_mask_composite:
output_images.append(image_mask_composite)
if shared.cmd_opts.lowvram:
devices.torch_gc(force=True, reason='lowvram')
timer.process.record('post')
if not p.xyz:
@ -461,5 +463,6 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
shared.log.debug(f'Processed: timers={timer.process.dct()}')
shared.log.debug(f'Processed: memory={memstats.memory_stats()}')
# devices.torch_gc(force=True, reason='final')
if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
devices.torch_gc(force=True, reason='final')
return processed

View File

@ -678,7 +678,7 @@ def load_diffuser(checkpoint_info=None, timer=None, op='model', revision=None):
errors.display(e, "Model")
if shared.opts.diffusers_offload_mode != 'balanced':
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if sd_model is not None:
script_callbacks.model_loaded_callback(sd_model)
@ -1107,14 +1107,14 @@ def unload_model_weights(op='model'):
disable_offload(model_data.sd_model)
move_model(model_data.sd_model, 'meta')
model_data.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='unload')
shared.log.debug(f'Unload {op}: {memory_stats()} after')
elif (op == 'refiner') and model_data.sd_refiner:
shared.log.debug(f'Current {op}: {memory_stats()}')
disable_offload(model_data.sd_refiner)
move_model(model_data.sd_refiner, 'meta')
model_data.sd_refiner = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='unload')
shared.log.debug(f'Unload {op}: {memory_stats()}')

View File

@ -1,4 +1,5 @@
import os
import re
import sys
import time
import inspect
@ -172,12 +173,14 @@ class OffloadHook(accelerate.hooks.ModelHook):
self.min_watermark = shared.opts.diffusers_offload_min_gpu_memory
self.max_watermark = shared.opts.diffusers_offload_max_gpu_memory
self.cpu_watermark = shared.opts.diffusers_offload_max_cpu_memory
self.offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
self.offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
self.gpu = int(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory * 1024*1024*1024)
self.cpu = int(shared.cpu_memory * shared.opts.diffusers_offload_max_cpu_memory * 1024*1024*1024)
self.offload_map = {}
self.param_map = {}
gpu = f'{(shared.gpu_memory * shared.opts.diffusers_offload_min_gpu_memory):.2f}-{(shared.gpu_memory * shared.opts.diffusers_offload_max_gpu_memory):.2f}:{shared.gpu_memory:.2f}'
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f}')
shared.log.info(f'Offload: type=balanced op=init watermark={self.min_watermark}-{self.max_watermark} gpu={gpu} cpu={shared.cpu_memory:.3f} limit={shared.opts.cuda_mem_fraction:.2f} always={self.offload_always} never={self.offload_never}')
self.validate()
super().__init__()
@ -210,10 +213,7 @@ class OffloadHook(accelerate.hooks.ModelHook):
max_memory = { device_index: self.gpu, "cpu": self.cpu }
device_map = getattr(module, "balanced_offload_device_map", None)
if device_map is None or max_memory != getattr(module, "balanced_offload_max_memory", None):
# try:
device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory)
# except Exception as e:
# shared.log.error(f'Offload: type=balanced module={module.__class__.__name__} {e}')
offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
if devices.backend == "directml":
keys = device_map.keys()
@ -233,15 +233,22 @@ class OffloadHook(accelerate.hooks.ModelHook):
perc_gpu = used_gpu / shared.gpu_memory
try:
module_size = self.model_size()
module_cls = module.__class__.__name__
prev_gpu = used_gpu
offload_now = perc_gpu > shared.opts.diffusers_offload_min_gpu_memory
if offload_now:
op = 'post:skip'
if module_cls in self.offload_never:
op = 'post:never'
elif module_cls in self.offload_always:
op = 'post:always'
module = module.to(devices.cpu)
used_gpu -= module_size
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
op = 'post:mem'
module = module.to(devices.cpu)
used_gpu -= module_size
if debug:
cls = module.__class__.__name__
quant = getattr(module, "quantization_method", None)
debug_move(f'Offload: type=balanced op={"post" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
except Exception as e:
if 'out of memory' in str(e):
devices.torch_gc(fast=True, force=True, reason='oom')
@ -311,6 +318,8 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
else:
keys = get_signature(pipe).keys()
keys = [k for k in keys if k not in exclude and not k.startswith('_')]
offload_always = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_always) if len(m.strip()) > 2]
offload_never = [m.strip() for m in re.split(';|,| ', shared.opts.diffusers_offload_never) if len(m.strip()) > 2]
for module_name, module_size in get_pipe_modules(pipe): # pylint: disable=protected-access
# shared.log.trace(f'Offload: type=balanced op=apply pipe={pipe.__class__.__name__} module={module_name} size={module_size:.3f}')
module = getattr(pipe, module_name, None)
@ -326,16 +335,26 @@ def apply_balanced_offload(sd_model=None, exclude=[]):
perc_gpu = used_gpu / shared.gpu_memory
try:
prev_gpu = used_gpu
offload_now = (perc_gpu > shared.opts.diffusers_offload_min_gpu_memory) and (module.device != devices.cpu)
if offload_now:
module_cls = module.__class__.__name__
op = 'apply:skip'
if module_cls in offload_never:
op = 'apply:never'
elif module_cls in offload_always:
op = 'apply:always'
module = module.to(devices.cpu)
used_gpu -= module_size
cls = module.__class__.__name__
elif perc_gpu > shared.opts.diffusers_offload_min_gpu_memory:
op = 'apply:mem'
module = module.to(devices.cpu)
used_gpu -= module_size
if debug:
quant = getattr(module, "quantization_method", None)
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
quant = getattr(module, "quantization_method", None)
if not cached:
shared.log.debug(f'Model module={module_name} type={cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
shared.log.debug(f'Model module={module_name} type={module_cls} dtype={module.dtype} quant={quant} params={offload_hook_instance.param_map[module_name]:.3f} size={offload_hook_instance.offload_map[module_name]:.3f}')
if debug:
debug_move(f'Offload: type=balanced op={"move" if offload_now else "skip"} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={cls} size={module_size:.3f}')
debug_move(f'Offload: type=balanced op={op} gpu={prev_gpu:.3f}:{used_gpu:.3f} perc={perc_gpu:.2f} ram={used_ram:.3f} current={module.device} dtype={module.dtype} quant={quant} module={module_cls} size={module_size:.3f}')
except Exception as e:
if 'out of memory' in str(e):
devices.torch_gc(fast=True, force=True, reason='oom')

View File

@ -346,7 +346,7 @@ class SDNQQuantizer(DiffusersQuantizer):
def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument
if shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='sdnq')
return model
def get_cuda_warm_up_factor(self):

View File

@ -8,6 +8,7 @@ import gradio as gr
import diffusers
from modules.json_helpers import readfile, writefile # pylint: disable=W0611
from modules.shared_helpers import listdir, walk_files, html_path, html, req, total_tqdm # pylint: disable=W0611
from modules.shared_defaults import get_default_modes
from modules import errors, devices, shared_items, shared_state, cmd_args, theme, history, files_cache
from modules.paths import models_path, script_path, data_path, sd_configs_path, sd_default_config, sd_model_file, default_sd_model_file, extensions_dir, extensions_builtin_dir # pylint: disable=W0611
from modules.dml import memory_providers, default_memory_provider, directml_do_hijack
@ -129,45 +130,7 @@ def list_samplers():
return modules.sd_samplers.all_samplers
def get_default_modes():
default_offload_mode = "none"
default_diffusers_offload_min_gpu_memory = 0.2
if not (cmd_opts.lowvram or cmd_opts.medvram):
if "gpu" in mem_stat:
if gpu_memory <= 4:
cmd_opts.lowvram = True
default_offload_mode = "sequential"
default_diffusers_offload_min_gpu_memory = 0
log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
elif gpu_memory <= 12:
cmd_opts.medvram = True # VAE Tiling and other stuff
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0
log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
else:
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0.2
log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")
elif cmd_opts.medvram:
default_offload_mode = "balanced"
default_diffusers_offload_min_gpu_memory = 0
elif cmd_opts.lowvram:
default_offload_mode = "sequential"
default_diffusers_offload_min_gpu_memory = 0
default_cross_attention = "Scaled-Dot-Product"
if devices.backend == "zluda":
default_sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
elif devices.backend in {"rocm", "directml", "cpu", "mps"}:
default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
else:
default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention']
return default_offload_mode, default_diffusers_offload_min_gpu_memory, default_cross_attention, default_sdp_options
startup_offload_mode, startup_diffusers_offload_min_gpu_memory, startup_cross_attention, startup_sdp_options = get_default_modes()
startup_offload_mode, startup_offload_min_gpu, startup_offload_max_gpu, startup_cross_attention, startup_sdp_options, startup_offload_always, startup_offload_never = get_default_modes(cmd_opts=cmd_opts, mem_stat=mem_stat)
options_templates.update(options_section(('sd', "Models & Loading"), {
"sd_backend": OptionInfo('diffusers', "Execution backend", gr.Radio, {"choices": ['diffusers', 'original'], "visible": False }),
@ -179,9 +142,11 @@ options_templates.update(options_section(('sd', "Models & Loading"), {
"offload_sep": OptionInfo("<h2>Model Offloading</h2>", "", gr.HTML),
"diffusers_offload_mode": OptionInfo(startup_offload_mode, "Model offload mode", gr.Radio, {"choices": ['none', 'balanced', 'group', 'model', 'sequential']}),
"diffusers_offload_min_gpu_memory": OptionInfo(startup_diffusers_offload_min_gpu_memory, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_gpu_memory": OptionInfo(0.70, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
"diffusers_offload_min_gpu_memory": OptionInfo(startup_offload_min_gpu, "Balanced offload GPU low watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_gpu_memory": OptionInfo(startup_offload_max_gpu, "Balanced offload GPU high watermark", gr.Slider, {"minimum": 0.1, "maximum": 1, "step": 0.01 }),
"diffusers_offload_max_cpu_memory": OptionInfo(0.90, "Balanced offload CPU high watermark", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01, "visible": False }),
"diffusers_offload_always": OptionInfo(startup_offload_always, "Modules to always offload"),
"diffusers_offload_never": OptionInfo(startup_offload_never, "Modules to never offload"),
"advanced_sep": OptionInfo("<h2>Advanced Options</h2>", "", gr.HTML),
"sd_checkpoint_autoload": OptionInfo(True, "Model auto-load on start"),
@ -299,7 +264,7 @@ options_templates.update(options_section(("quantization", "Quantization Settings
"sdnq_quantize_conv_layers": OptionInfo(False, "Quantize convolutional layers", gr.Checkbox),
"sdnq_dequantize_compile": OptionInfo(devices.has_triton(), "Dequantize using torch.compile", gr.Checkbox),
"sdnq_use_quantized_matmul": OptionInfo(False, "Use quantized MatMul", gr.Checkbox),
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with convolutional layers", gr.Checkbox),
"sdnq_use_quantized_matmul_conv": OptionInfo(False, "Use quantized MatMul with conv", gr.Checkbox),
"sdnq_quantize_with_gpu": OptionInfo(True, "Quantize using GPU", gr.Checkbox),
"sdnq_dequantize_fp32": OptionInfo(False, "Dequantize using full precision", gr.Checkbox),
"sdnq_quantize_shuffle_weights": OptionInfo(False, "Shuffle weights in post mode", gr.Checkbox),

View File

@ -0,0 +1,57 @@
from installer import log
from modules import devices
def get_default_modes(cmd_opts, mem_stat):
    """Detect sensible startup defaults based on available GPU memory.

    Args:
        cmd_opts: parsed command-line options; read for `lowvram`/`medvram`.
        mem_stat: memory statistics dict with optional `'gpu'` entry holding a
            `'total'` value (presumably in GB — confirm against the caller).

    Side effect: when neither `--lowvram` nor `--medvram` was given, this
    function sets `cmd_opts.lowvram = True` for GPUs with <=4GB and
    `cmd_opts.medvram = True` for GPUs with <=12GB.

    Returns a 7-tuple:
        (offload_mode, offload_min_gpu_memory, offload_max_gpu_memory,
         cross_attention, sdp_options, offload_always, offload_never)
    """
    default_offload_mode = "none"
    default_diffusers_offload_min_gpu_memory = 0.2
    default_diffusers_offload_max_gpu_memory = 0.6
    default_diffusers_offload_always = ''
    default_diffusers_offload_never = ''
    # note: original also computed cpu_memory from mem_stat['ram'] but never used it; removed as dead code
    gpu_memory = round(mem_stat['gpu']['total'] if "gpu" in mem_stat else 0)
    if not (cmd_opts.lowvram or cmd_opts.medvram):
        if "gpu" in mem_stat:
            if gpu_memory <= 4:
                cmd_opts.lowvram = True
                default_offload_mode = "sequential"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=sequential optimization=lowvram")
            elif gpu_memory <= 12:
                cmd_opts.medvram = True # VAE Tiling and other stuff
                default_offload_mode = "balanced"
                default_diffusers_offload_min_gpu_memory = 0
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=medvram")
            elif gpu_memory >= 24:
                # highvram profile: raise the high watermark and keep small modules resident on gpu
                default_offload_mode = "balanced"
                default_diffusers_offload_max_gpu_memory = 0.8
                default_diffusers_offload_never = ', '.join(['CLIPTextModel', 'CLIPTextModelWithProjection', 'AutoencoderKL'])
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced optimization=highvram")
            else:
                default_offload_mode = "balanced"
                log.info(f"Device detect: memory={gpu_memory:.1f} default=balanced")
    elif cmd_opts.medvram:
        default_offload_mode = "balanced"
        default_diffusers_offload_min_gpu_memory = 0
    elif cmd_opts.lowvram:
        default_offload_mode = "sequential"
        default_diffusers_offload_min_gpu_memory = 0
    default_cross_attention = "Scaled-Dot-Product"
    # sdp option defaults vary by backend capability
    if devices.backend == "zluda":
        default_sdp_options = ['Flash attention', 'Math attention', 'Dynamic attention']
    elif devices.backend in {"rocm", "directml", "cpu", "mps"}:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention', 'Dynamic attention']
    else:
        default_sdp_options = ['Flash attention', 'Memory attention', 'Math attention']
    return (
        default_offload_mode,
        default_diffusers_offload_min_gpu_memory,
        default_diffusers_offload_max_gpu_memory,
        default_cross_attention,
        default_sdp_options,
        default_diffusers_offload_always,
        default_diffusers_offload_never
    )

View File

@ -71,7 +71,7 @@ def generate(*args, **kwargs):
# cleanup memory
shared.sd_model = sd_models.apply_balanced_offload(shared.sd_model)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='video')
# set args
processing.fix_seed(p)

View File

@ -17,5 +17,5 @@ def load_auraflow(checkpoint_info, diffusers_load_config={}):
cache_dir = shared.opts.diffusers_dir,
**diffusers_load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -187,7 +187,7 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
# unload current model
sd_models.unload_model_weights()
shared.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if shared.opts.teacache_enabled:
from modules import teacache
@ -277,5 +277,5 @@ def load_chroma(checkpoint_info, diffusers_load_config): # triggered by opts.sd_
for k in kwargs.keys():
kwargs[k] = None
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe, allow_post_quant

View File

@ -220,7 +220,7 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
# unload current model
sd_models.unload_model_weights()
shared.sd_model = None
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
if shared.opts.teacache_enabled:
from modules import teacache
@ -356,5 +356,5 @@ def load_flux(checkpoint_info, diffusers_load_config): # triggered by opts.sd_ch
for k in kwargs.keys():
kwargs[k] = None
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe, allow_post_quant

View File

@ -196,5 +196,5 @@ def load_flux_nf4(checkpoint_info, prequantized: bool = True):
errors.display(e, 'FLUX:')
del original_state_dict
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return transformer, text_encoder_2

View File

@ -23,5 +23,5 @@ def load_kolors(_checkpoint_info, diffusers_load_config={}):
**diffusers_load_config,
)
pipe.vae.config.force_upcast = True
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -14,7 +14,7 @@ def load_lumina(_checkpoint_info, diffusers_load_config={}):
cache_dir = shared.opts.diffusers_dir,
**load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe
@ -91,5 +91,5 @@ def load_lumina2(checkpoint_info, diffusers_load_config={}):
)
sd_hijack_te.init_hijack(pipe)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -52,5 +52,5 @@ def load_meissonic(checkpoint_info, diffusers_load_config={}):
diffusers.pipelines.auto_pipeline.AUTO_TEXT2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonic
diffusers.pipelines.auto_pipeline.AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicImg2Img
diffusers.pipelines.auto_pipeline.AUTO_INPAINT_PIPELINES_MAPPING["meissonic"] = PipelineMeissonicInpaint
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -28,5 +28,5 @@ def load_omnigen(checkpoint_info, diffusers_load_config={}): # pylint: disable=u
**load_config,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -45,5 +45,5 @@ def load_omnigen2(checkpoint_info, diffusers_load_config={}): # pylint: disable=
)
pipe.transformer = transformer # for omnigen2 transformer must be loaded after pipeline
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -40,5 +40,5 @@ def load_pixart(checkpoint_info, diffusers_load_config={}):
text_encoder=text_encoder,
**load_args,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -88,5 +88,5 @@ def load_sana(checkpoint_info, kwargs={}):
sd_hijack_te.init_hijack(pipe)
t1 = time.time()
shared.log.debug(f'Load model: type=Sana target={devices.dtype} te={pipe.text_encoder.dtype} transformer={pipe.transformer.dtype} vae={pipe.vae.dtype} time={t1-t0:.2f}')
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -124,5 +124,5 @@ def load_sd3(checkpoint_info, cache_dir=None, config=None):
config=config,
**kwargs,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
return pipe

View File

@ -155,7 +155,7 @@ def load_cascade_combined(checkpoint_info, diffusers_load_config):
latent_dim_scale=sd_model.decoder_pipe.config.latent_dim_scale,
)
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='load')
shared.log.debug(f'StableCascade combined: {sd_model.__class__.__name__}')
return sd_model

View File

@ -228,7 +228,7 @@ class Script(scripts_manager.Script):
shared.sd_model.clip_vision_model = None
shared.sd_model.handler_ante = None
shared.sd_model = shared.sd_model.pipe
devices.torch_gc(force=True)
devices.torch_gc(force=True, reason='pulid')
shared.log.debug(f'PuLID complete: class={shared.sd_model.__class__.__name__} preprocess={self.preprocess:.2f} pipe={"restore" if restore else "cache"}')
return processed