improve offloading

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4497/head
Vladimir Mandic 2025-12-25 10:24:02 +00:00
parent 76aae17e6f
commit 0b1e6d2d3c
8 changed files with 47 additions and 15 deletions

View File

@ -82,6 +82,7 @@ ignore = [
"E731", # Do not assign a `lambda` expression, use a `def`
"E741", # Ambiguous variable name
"F401", # Imported but unused
"EXE001", # file with shebang is not marked executable
"NPY002", # replace legacy random
"RUF005", # Consider iterable unpacking
"RUF008", # Do not use mutable default values for dataclass

View File

@ -7,7 +7,7 @@
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
## Update for 2025-12-24
## Update for 2025-12-25
- **Models**
- [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants
@ -44,6 +44,7 @@
- kanvas css with standardui
- control input media with non-english locales
- handle embeds when on meta device
- improve offloading when model has manual modules
## Update for 2025-12-11

0
cli/api-xyz.py Normal file → Executable file
View File

View File

@ -46,8 +46,8 @@ function forceLogin() {
})
.then(async (res) => {
const json = await res.json();
let txt = ''
if (res.status === 200) txt = 'login verified'
let txt = '';
if (res.status === 200) txt = 'login verified';
else txt = `${res.status}: ${res.statusText} - ${json.detail}`;
status.textContent = txt;
console.log('login', txt);

View File

@ -536,7 +536,7 @@ class ControlNetPipeline():
scheduler=pipeline.scheduler,
controlnet=controlnets[0] if isinstance(controlnets, list) else controlnets, # can be a list
)
self.pipeline.task_args['guidance_scale'] = 0
self.pipeline.task_args = { 'guidance_scale': 1 }
elif len(loras) > 0:
self.pipeline = pipeline
for lora in loras:

View File

@ -238,8 +238,8 @@ def torch_gc(force:bool=False, fast:bool=False, reason:str=None):
torch.cuda.synchronize()
torch.cuda.empty_cache() # cuda gc
torch.cuda.ipc_collect()
except Exception:
pass
except Exception as e:
log.error(f'GC: {e}')
else:
return gpu, ram
t1 = time.time()

View File

@ -1071,11 +1071,13 @@ def copy_diffuser_options(new_pipe, orig_pipe):
new_pipe.feature_extractor = getattr(orig_pipe, 'feature_extractor', None)
new_pipe.mask_processor = getattr(orig_pipe, 'mask_processor', None)
new_pipe.restore_pipeline = getattr(orig_pipe, 'restore_pipeline', None)
new_pipe.task_args = getattr(orig_pipe, 'task_args', None)
new_pipe.is_sdxl = getattr(orig_pipe, 'is_sdxl', False) # a1111 compatibility item
new_pipe.is_sd2 = getattr(orig_pipe, 'is_sd2', False)
new_pipe.is_sd1 = getattr(orig_pipe, 'is_sd1', True)
add_noise_pred_to_diffusers_callback(new_pipe)
if getattr(new_pipe, 'task_args', None) is None:
new_pipe.task_args = {}
new_pipe.task_args.update(getattr(orig_pipe, 'task_args', {}))
if new_pipe.has_accelerate:
set_accelerate(new_pipe)

View File

@ -18,7 +18,7 @@ offload_warn = ['sc', 'sd3', 'f1', 'f2', 'h1', 'hunyuandit', 'auraflow', 'omnige
offload_post = ['h1']
offload_hook_instance = None
balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock"]
no_split_module_classes = ["Linear", "Conv1d", "Conv2d", "Conv3d", "ConvTranspose1d", "ConvTranspose2d", "ConvTranspose3d", "WanTransformerBlock", "ModuleDict", "ModuleList"]
accelerate_dtype_byte_size = None
move_stream = None
@ -240,14 +240,26 @@ class OffloadHook(accelerate.hooks.ModelHook):
max_memory = { device_index: self.gpu, "cpu": self.cpu }
device_map = getattr(module, "balanced_offload_device_map", None)
if (device_map is None) or (max_memory != getattr(module, "balanced_offload_max_memory", None)):
device_map = accelerate.infer_auto_device_map(module, max_memory=max_memory, no_split_module_classes=no_split_module_classes, verbose=verbose)
device_map = accelerate.infer_auto_device_map(module,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes,
verbose=verbose,
clean_result=False,
)
offload_dir = getattr(module, "offload_dir", os.path.join(shared.opts.accelerate_offload_path, module.__class__.__name__))
if devices.backend == "directml":
for k, v in device_map.items():
if isinstance(v, int):
device_map[k] = f"{devices.device.type}:{v}" # int implies CUDA or XPU device, but it will break DirectML backend so we add type
if debug:
shared.log.trace(f'Offload: type=balanced op=dispatch map={device_map}')
if device_map is not None:
module = accelerate.dispatch_model(module, device_map=device_map, offload_dir=offload_dir)
module = accelerate.dispatch_model(module,
main_device=torch.device(devices.device),
device_map=device_map,
offload_dir=offload_dir,
force_hooks=True,
)
module._hf_hook.execution_device = torch.device(devices.device) # pylint: disable=protected-access
module.balanced_offload_device_map = device_map
module.balanced_offload_max_memory = max_memory
@ -291,6 +303,15 @@ def get_pipe_variants(pipe=None):
def get_module_names(pipe=None, exclude=None):
def is_valid(module):
if isinstance(getattr(pipe, module, None), torch.nn.ModuleDict):
return True
if isinstance(getattr(pipe, module, None), torch.nn.ModuleList):
return True
if isinstance(getattr(pipe, module, None), torch.nn.Module):
return True
return False
if exclude is None:
exclude = []
if pipe is None:
@ -298,12 +319,19 @@ def get_module_names(pipe=None, exclude=None):
pipe = shared.sd_model
else:
return []
if hasattr(pipe, "_internal_dict"):
modules_names = pipe._internal_dict.keys() # pylint: disable=protected-access
else:
modules_names = get_signature(pipe).keys()
modules_names = []
try:
dict_keys = pipe._internal_dict.keys() # pylint: disable=protected-access
modules_names.extend(dict_keys)
except Exception:
pass
try:
dict_keys = get_signature(pipe).keys()
modules_names.extend(dict_keys)
except Exception:
pass
modules_names = [m for m in modules_names if m not in exclude and not m.startswith('_')]
modules_names = [m for m in modules_names if isinstance(getattr(pipe, m, None), torch.nn.Module)]
modules_names = [m for m in modules_names if is_valid(m)]
modules_names = sorted(set(modules_names))
return modules_names