improve offloading

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4497/head
Vladimir Mandic 2025-12-25 10:24:02 +00:00
parent 76aae17e6f
commit 0b1e6d2d3c
8 changed files with 47 additions and 15 deletions

View File

@ -82,6 +82,7 @@ ignore = [
"E731", # Do not assign a `lambda` expression, use a `def`
"E741", # Ambiguous variable name
"F401", # Imported but unused
"EXE001", # file with shebang is not marked executable
"NPY002", # replace legacy random
"RUF005", # Consider iterable unpacking
"RUF008", # Do not use mutable default values for dataclass

View File

@ -7,7 +7,7 @@
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
## Update for 2025-12-24
## Update for 2025-12-25
- **Models**
- [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants
@ -44,6 +44,7 @@
- kanvas css with standardui
- control input media with non-english locales
- handle embeds when on meta device
- improve offloading when model has manual modules
## Update for 2025-12-11

0
cli/api-xyz.py Normal file → Executable file
View File

View File

@ -46,8 +46,8 @@ function forceLogin() {
})
.then(async (res) => {
const json = await res.json();
let txt = ''
if (res.status === 200) txt = 'login verified'
let txt = '';
if (res.status === 200) txt = 'login verified';
else txt = `${res.status}: ${res.statusText} - ${json.detail}`;
status.textContent = txt;
console.log('login', txt);

View File

@ -536,7 +536,7 @@ class ControlNetPipeline():
scheduler=pipeline.scheduler,
controlnet=controlnets[0] if isinstance(controlnets, list) else controlnets, # can be a list
)
self.pipeline.task_args['guidance_scale'] = 0
self.pipeline.task_args = { 'guidance_scale': 1 }
elif len(loras) > 0:
self.pipeline = pipeline
for lora in loras:

View File

@ -238,8 +238,8 @@ def torch_gc(force:bool=False, fast:bool=False, reason:str=None):
torch.cuda.synchronize()
torch.cuda.empty_cache() # cuda gc
torch.cuda.ipc_collect()
except Exception:
pass
except Exception as e:
log.error(f'GC: {e}')
else:
return gpu, ram
t1 = time.time()

View File

@ -1071,11 +1071,13 @@ def copy_diffuser_options(new_pipe, orig_pipe):
new_pipe.feature_extractor = getattr(orig_pipe, 'feature_extractor', None)
new_pipe.mask_processor = getattr(orig_pipe, 'mask_processor', None)
new_pipe.restore_pipeline = getattr(orig_pipe, 'restore_pipeline', None)
new_pipe.task_args = getattr(orig_pipe, 'task_args', None)
new_pipe.is_sdxl = getattr(orig_pipe, 'is_sdxl', False) # a1111 compatibility item
new_pipe.is_sd2 = getattr(orig_pipe, 'is_sd2', False)
new_pipe.is_sd1 = getattr(orig_pipe, 'is_sd1', True)
add_noise_pred_to_diffusers_callback(new_pipe)
if getattr(new_pipe, 'task_args', None) is None:
new_pipe.task_args = {}
new_pipe.task_args.update(getattr(orig_pipe, 'task_args', {}))
if new_pipe.has_accelerate:
set_accelerate(new_pipe)

View File

@ -18,7 +18,7 @@ offload_warn = ['sc', 'sd3', 'f1', 'f2', 'h1', 'hunyuandit', 'auraflow', 'omnige
offload_post = ['h1']
offload_hook_instance = None
balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock"]
no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock", "ModuleDict", "ModuleList"]
accelerate_dtype_byte_size = None
move_stream = None
@ -240,14 +240,26 @@ class OffloadHook(accelerate.hooks.ModelHook):
max_memory = { device_index: self.gpu, "cpu": self.cpu }
device_map = getattr(module, "balanced_offload_device_map", None)
if (device_map is None) or (max_memory != getattr(module, "balanced_offload_max_memory", None)):
device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory, no_split_module_classes=no_split_module_classes, verbose=verbose)
device_map = accelerate.infer_auto_device_map(module,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes,
verbose=verbose,
clean_result=False,
)
offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
if devices.backend == "directml":
for k, v in device_map.items():
if isinstance(v, int):
device_map[k] = f"{devices.device.type}:{v}" # int implies CUDA or XPU device, but it will break DirectML backend so we add type
if debug:
shared.log.trace(f'Offload: type=balanced op=dispatch map={device_map}')
if device_map is not None:
module = accelerate.dispatch_model(module, device_map=device_map, offload_dir=offload_dir)
module = accelerate.dispatch_model(module,
main_device=torch.device(devices.device),
device_map=device_map,
offload_dir=offload_dir,
force_hooks=True,
)
module._hf_hook.execution_device = torch.device(devices.device) # pylint: disable=protected-access
module.balanced_offload_device_map = device_map
module.balanced_offload_max_memory = max_memory
@ -291,6 +303,15 @@ def get_pipe_variants(pipe=None):
def get_module_names(pipe=None, exclude=None):
def is_valid(module):
if isinstance(getattr(pipe, module, None), torch.nn.ModuleDict):
return True
if isinstance(getattr(pipe, module, None), torch.nn.ModuleList):
return True
if isinstance(getattr(pipe, module, None), torch.nn.Module):
return True
return False
if exclude is None:
exclude = []
if pipe is None:
@ -298,12 +319,19 @@ def get_module_names(pipe=None, exclude=None):
pipe = shared.sd_model
else:
return []
if hasattr(pipe, "_internal_dict"):
modules_names = pipe._internal_dict.keys() # pylint: disable=protected-access
else:
modules_names = get_signature(pipe).keys()
modules_names = []
try:
dict_keys = pipe._internal_dict.keys() # pylint: disable=protected-access
modules_names.extend(dict_keys)
except Exception:
pass
try:
dict_keys = get_signature(pipe).keys()
modules_names.extend(dict_keys)
except Exception:
pass
modules_names = [m for m in modules_names if m not in exclude and not m.startswith('_')]
modules_names = [m for m in modules_names if isinstance(getattr(pipe, m, None), torch.nn.Module)]
modules_names = [m for m in modules_names if is_valid(m)]
modules_names = sorted(set(modules_names))
return modules_names