From 0b1e6d2d3c832f1cf2f8cfdfc9fcd9be94d7957f Mon Sep 17 00:00:00 2001
From: Vladimir Mandic <mandic00@live.com>
Date: Thu, 25 Dec 2025 10:24:02 +0000
Subject: [PATCH] improve offloading

Signed-off-by: Vladimir Mandic <mandic00@live.com>
---
 .ruff.toml                          |  1 +
 CHANGELOG.md                        |  3 +-
 cli/api-xyz.py                      |  0
 javascript/login.js                 |  4 +--
 modules/control/units/controlnet.py |  2 +-
 modules/devices.py                  |  4 +--
 modules/sd_models.py                |  4 ++-
 modules/sd_offload.py               | 44 +++++++++++++++++++++++------
 8 files changed, 47 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 cli/api-xyz.py

diff --git a/.ruff.toml b/.ruff.toml
index e043d6906..7bd68e119 100644
--- a/.ruff.toml
+++ b/.ruff.toml
@@ -82,6 +82,7 @@ ignore = [
   "E731",   # Do not assign a `lambda` expression, use a `def`
   "E741",   # Ambiguous variable name
   "F401",   # Imported by unused
+  "EXE001", # file with shebang is not marked executable
   "NPY002", # replace legacy random
   "RUF005", # Consider iterable unpacking
   "RUF008", # Do not use mutable default values for dataclass
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f8a67958..1f824f175 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@
 - kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
 - peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
 
-## Update for 2025-12-24
+## Update for 2025-12-25
 
 - **Models**
   - [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants  
@@ -44,6 +44,7 @@
   - kanvas css with standardui  
   - control input media with non-english locales  
   - handle embeds when on meta device  
+  - improve offloading when model has manual modules  
 
 ## Update for 2025-12-11
 
diff --git a/cli/api-xyz.py b/cli/api-xyz.py
old mode 100644
new mode 100755
diff --git a/javascript/login.js b/javascript/login.js
index ab8522b81..0dc438325 100644
--- a/javascript/login.js
+++ b/javascript/login.js
@@ -46,8 +46,8 @@ function forceLogin() {
     })
       .then(async (res) => {
         const json = await res.json();
-        let txt = ''
-        if (res.status === 200) txt = 'login verified'
+        let txt = '';
+        if (res.status === 200) txt = 'login verified';
         else txt = `${res.status}: ${res.statusText} - ${json.detail}`;
         status.textContent = txt;
         console.log('login', txt);
diff --git a/modules/control/units/controlnet.py b/modules/control/units/controlnet.py
index 7727dc476..9e026923f 100644
--- a/modules/control/units/controlnet.py
+++ b/modules/control/units/controlnet.py
@@ -536,7 +536,7 @@ class ControlNetPipeline():
                 scheduler=pipeline.scheduler,
                 controlnet=controlnets[0] if isinstance(controlnets, list) else controlnets, # can be a list
             )
-            self.pipeline.task_args['guidance_scale'] = 0
+            self.pipeline.task_args = { 'guidance_scale': 1 }
         elif len(loras) > 0:
             self.pipeline = pipeline
             for lora in loras:
diff --git a/modules/devices.py b/modules/devices.py
index 70bfdfefc..15b65368c 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -238,8 +238,8 @@ def torch_gc(force:bool=False, fast:bool=False, reason:str=None):
                     torch.cuda.synchronize()
                     torch.cuda.empty_cache() # cuda gc
                     torch.cuda.ipc_collect()
-            except Exception:
-                pass
+            except Exception as e:
+                log.error(f'GC: {e}')
     else:
         return gpu, ram
     t1 = time.time()
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 682706223..724aae574 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -1071,11 +1071,13 @@ def copy_diffuser_options(new_pipe, orig_pipe):
     new_pipe.feature_extractor = getattr(orig_pipe, 'feature_extractor', None)
     new_pipe.mask_processor = getattr(orig_pipe, 'mask_processor', None)
     new_pipe.restore_pipeline = getattr(orig_pipe, 'restore_pipeline', None)
-    new_pipe.task_args = getattr(orig_pipe, 'task_args', None)
     new_pipe.is_sdxl = getattr(orig_pipe, 'is_sdxl', False) # a1111 compatibility item
     new_pipe.is_sd2 = getattr(orig_pipe, 'is_sd2', False)
     new_pipe.is_sd1 = getattr(orig_pipe, 'is_sd1', True)
     add_noise_pred_to_diffusers_callback(new_pipe)
+    if getattr(new_pipe, 'task_args', None) is None:
+        new_pipe.task_args = {}
+        new_pipe.task_args.update(getattr(orig_pipe, 'task_args', {}))
     if new_pipe.has_accelerate:
         set_accelerate(new_pipe)
 
diff --git a/modules/sd_offload.py b/modules/sd_offload.py
index 721716bde..caafcc46a 100644
--- a/modules/sd_offload.py
+++ b/modules/sd_offload.py
@@ -18,7 +18,7 @@ offload_warn = ['sc', 'sd3', 'f1', 'f2', 'h1', 'hunyuandit', 'auraflow', 'omnige
 offload_post = ['h1']
 offload_hook_instance = None
 balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
-no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock"]
+no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock", "ModuleDict", "ModuleList"]
 accelerate_dtype_byte_size = None
 move_stream = None
 
@@ -240,14 +240,26 @@ class OffloadHook(accelerate.hooks.ModelHook):
             max_memory = { device_index: self.gpu, "cpu": self.cpu }
             device_map = getattr(module, "balanced_offload_device_map", None)
             if (device_map is None) or (max_memory != getattr(module, "balanced_offload_max_memory", None)):
-                device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory, no_split_module_classes=no_split_module_classes, verbose=verbose)
+                device_map = accelerate.infer_auto_device_map(module,
+                                                              max_memory=max_memory,
+                                                              no_split_module_classes=no_split_module_classes,
+                                                              verbose=verbose,
+                                                              clean_result=False,
+                                                             )
             offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
             if devices.backend == "directml":
                 for k, v in device_map.items():
                     if isinstance(v, int):
                         device_map[k] = f"{devices.device.type}:{v}" # int implies CUDA or XPU device, but it will break DirectML backend so we add type
+            if debug:
+                shared.log.trace(f'Offload: type=balanced op=dispatch map={device_map}')
             if device_map is not None:
-                module = accelerate.dispatch_model(module, device_map=device_map, offload_dir=offload_dir)
+                module = accelerate.dispatch_model(module,
+                                                   main_device=torch.device(devices.device),
+                                                   device_map=device_map,
+                                                   offload_dir=offload_dir,
+                                                   force_hooks=True,
+                                                  )
             module._hf_hook.execution_device = torch.device(devices.device) # pylint: disable=protected-access
             module.balanced_offload_device_map = device_map
             module.balanced_offload_max_memory = max_memory
@@ -291,6 +303,15 @@ def get_pipe_variants(pipe=None):
 
 
 def get_module_names(pipe=None, exclude=None):
+    def is_valid(module):
+        # True when the pipe attribute named `module` is a torch module; ModuleDict
+        # and ModuleList are subclasses of torch.nn.Module and are listed only to
+        # make it explicit that bare containers are accepted as well
+        # missing attributes resolve to None and are therefore rejected
+        component = getattr(pipe, module, None) # look the attribute up once
+        module_types = (torch.nn.ModuleDict, torch.nn.ModuleList, torch.nn.Module)
+        return isinstance(component, module_types)
+
     if exclude is None:
         exclude = []
     if pipe is None:
@@ -298,12 +319,19 @@ def get_module_names(pipe=None, exclude=None):
             pipe = shared.sd_model
         else:
             return []
-    if hasattr(pipe, "_internal_dict"):
-        modules_names = pipe._internal_dict.keys() # pylint: disable=protected-access
-    else:
-        modules_names = get_signature(pipe).keys()
+    modules_names = []
+    try:
+        dict_keys = pipe._internal_dict.keys() # pylint: disable=protected-access
+        modules_names.extend(dict_keys)
+    except Exception:
+        pass
+    try:
+        dict_keys = get_signature(pipe).keys()
+        modules_names.extend(dict_keys)
+    except Exception:
+        pass
     modules_names = [m for m in modules_names if m not in exclude and not m.startswith('_')]
-    modules_names = [m for m in modules_names if isinstance(getattr(pipe, m, None), torch.nn.Module)]
+    modules_names = [m for m in modules_names if is_valid(m)]
     modules_names = sorted(set(modules_names))
     return modules_names