From 0b1e6d2d3c832f1cf2f8cfdfc9fcd9be94d7957f Mon Sep 17 00:00:00 2001 From: Vladimir Mandic Date: Thu, 25 Dec 2025 10:24:02 +0000 Subject: [PATCH] improve offloading Signed-off-by: Vladimir Mandic --- .ruff.toml | 1 + CHANGELOG.md | 3 +- cli/api-xyz.py | 0 javascript/login.js | 4 +-- modules/control/units/controlnet.py | 2 +- modules/devices.py | 4 +-- modules/sd_models.py | 4 ++- modules/sd_offload.py | 44 +++++++++++++++++++++++------ 8 files changed, 47 insertions(+), 15 deletions(-) mode change 100644 => 100755 cli/api-xyz.py diff --git a/.ruff.toml b/.ruff.toml index e043d6906..7bd68e119 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -82,6 +82,7 @@ ignore = [ "E731", # Do not assign a `lambda` expression, use a `def` "E741", # Ambiguous variable name "F401", # Imported by unused + "EXE001", # file with shebang is not marked executable "NPY002", # replace legacy random "RUF005", # Consider iterable unpacking "RUF008", # Do not use mutable default values for dataclass diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f8a67958..1f824f175 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ - kandinsky-image-5 hardcoded cuda: - peft lora with torch-rocm-windows: -## Update for 2025-12-24 +## Update for 2025-12-25 - **Models** - [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants @@ -44,6 +44,7 @@ - kanvas css with standardui - control input media with non-english locales - handle embeds when on meta device + - improve offloading when model has manual modules ## Update for 2025-12-11 diff --git a/cli/api-xyz.py b/cli/api-xyz.py old mode 100644 new mode 100755 diff --git a/javascript/login.js b/javascript/login.js index ab8522b81..0dc438325 100644 --- a/javascript/login.js +++ b/javascript/login.js @@ -46,8 +46,8 @@ function forceLogin() { }) .then(async (res) => { const json = await res.json(); - let txt = '' - if (res.status === 200) txt = 'login verified' + let txt = ''; + if (res.status === 200) txt = 'login verified'; else txt = `${res.status}: ${res.statusText} - ${json.detail}`; status.textContent = txt; console.log('login', txt); diff --git a/modules/control/units/controlnet.py b/modules/control/units/controlnet.py index 7727dc476..9e026923f 100644 --- a/modules/control/units/controlnet.py +++ b/modules/control/units/controlnet.py @@ -536,7 +536,7 @@ class ControlNetPipeline(): scheduler=pipeline.scheduler, controlnet=controlnets[0] if isinstance(controlnets, list) else controlnets, # can be a list ) - self.pipeline.task_args['guidance_scale'] = 0 + self.pipeline.task_args = { 'guidance_scale': 1 } elif len(loras) > 0: self.pipeline = pipeline for lora in loras: diff --git a/modules/devices.py b/modules/devices.py index 70bfdfefc..15b65368c 100644 --- a/modules/devices.py +++ b/modules/devices.py @@ -238,8 +238,8 @@ def torch_gc(force:bool=False, fast:bool=False, reason:str=None): torch.cuda.synchronize() torch.cuda.empty_cache() # cuda gc torch.cuda.ipc_collect() - except Exception: - pass + except Exception as e: + log.error(f'GC: {e}') else: return gpu, ram t1 = time.time() diff --git a/modules/sd_models.py b/modules/sd_models.py index 682706223..724aae574 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -1071,11 +1071,13 @@ def copy_diffuser_options(new_pipe, orig_pipe): new_pipe.feature_extractor = getattr(orig_pipe, 'feature_extractor', None) new_pipe.mask_processor = getattr(orig_pipe, 'mask_processor', None) new_pipe.restore_pipeline = getattr(orig_pipe, 'restore_pipeline', None) - new_pipe.task_args = getattr(orig_pipe, 'task_args', None) new_pipe.is_sdxl = getattr(orig_pipe, 'is_sdxl', False) # a1111 compatibility item new_pipe.is_sd2 = getattr(orig_pipe, 'is_sd2', False) new_pipe.is_sd1 = getattr(orig_pipe, 'is_sd1', True) add_noise_pred_to_diffusers_callback(new_pipe) + if getattr(new_pipe, 'task_args', None) is None: + new_pipe.task_args = {} + new_pipe.task_args.update(getattr(orig_pipe, 'task_args', {})) if new_pipe.has_accelerate: set_accelerate(new_pipe) diff --git a/modules/sd_offload.py b/modules/sd_offload.py index 721716bde..caafcc46a 100644 --- a/modules/sd_offload.py +++ b/modules/sd_offload.py @@ -18,7 +18,7 @@ offload_warn = ['sc', 'sd3', 'f1', 'f2', 'h1', 'hunyuandit', 'auraflow', 'omnige offload_post = ['h1'] offload_hook_instance = None balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline'] -no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock"] +no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock", "ModuleDict", "ModuleList"] accelerate_dtype_byte_size = None move_stream = None @@ -240,14 +240,26 @@ class OffloadHook(accelerate.hooks.ModelHook): max_memory = { device_index: self.gpu, "cpu": self.cpu } device_map = getattr(module, "balanced_offload_device_map", None) if (device_map is None) or (max_memory != getattr(module, "balanced_offload_max_memory", None)): - device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory, no_split_module_classes=no_split_module_classes, verbose=verbose) + device_map = accelerate.infer_auto_device_map(module, + max_memory=max_memory, + no_split_module_classes=no_split_module_classes, + verbose=verbose, + clean_result=False, + ) offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__)) if devices.backend == "directml": for k, v in device_map.items(): if isinstance(v, int): device_map[k] = f"{devices.device.type}:{v}" # int implies CUDA or XPU device, but it will break DirectML backend so we add type + if debug: + shared.log.trace(f'Offload: type=balanced op=dispatch map={device_map}') if device_map is not None: - module = accelerate.dispatch_model(module, device_map=device_map, offload_dir=offload_dir) + module = accelerate.dispatch_model(module, + main_device=torch.device(devices.device), + device_map=device_map, + offload_dir=offload_dir, + force_hooks=True, + ) module._hf_hook.execution_device = torch.device(devices.device) # pylint: disable=protected-access module.balanced_offload_device_map = device_map module.balanced_offload_max_memory = max_memory @@ -291,6 +303,15 @@ def get_pipe_variants(pipe=None): def get_module_names(pipe=None, exclude=None): + def is_valid(module): + if isinstance(getattr(pipe, module, None), torch.nn.ModuleDict): + return True + if isinstance(getattr(pipe, module, None), torch.nn.ModuleList): + return True + if isinstance(getattr(pipe, module, None), torch.nn.Module): + return True + return False + if exclude is None: exclude = [] if pipe is None: @@ -298,12 +319,19 @@ def get_module_names(pipe=None, exclude=None): pipe = shared.sd_model else: return [] - if hasattr(pipe, "_internal_dict"): - modules_names = pipe._internal_dict.keys() # pylint: disable=protected-access - else: - modules_names = get_signature(pipe).keys() + modules_names = [] + try: + dict_keys = pipe._internal_dict.keys() # pylint: disable=protected-access + modules_names.extend(dict_keys) + except Exception: + pass + try: + dict_keys = get_signature(pipe).keys() + modules_names.extend(dict_keys) + except Exception: + pass modules_names = [m for m in modules_names if m not in exclude and not m.startswith('_')] - modules_names = [m for m in modules_names if isinstance(getattr(pipe, m, None), torch.nn.Module)] + modules_names = [m for m in modules_names if is_valid(m)] modules_names = sorted(set(modules_names)) return modules_names