diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cf2a1722..005fe1091 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,15 +2,21 @@ ## Update for 2025-10-18 -Post-release fixes: -- ROCm-on-Windows additional checks -- SDNQ-SVD fallback on incompatible layers -- Huggingface model download -- Remove unused UI settings -- Video implement dynamic and manual sampler shift -- Fix interrupt batch processing -- Delay import of control processors until used -- Fix tiny VAE with batched results +- **Models** + [Kandinsky 5 Lite](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers) in *SFT, CFG-distilled and Steps-distilled* variants + the first model in the Kandinsky5 series is a T2V model optimized for 5-second videos and uses the Qwen2.5 text encoder +- **Fixes** + - ROCm-on-Windows additional checks + - SDNQ-SVD fallback on incompatible layers + - Huggingface model download + - Video implement dynamic and manual sampler shift + - Fix interrupt batch processing + - Delay import of control processors until used + - Fix tiny VAE with batched results +- **Other** + - Video enable VAE slicing and framewise decoding when possible + - Detect and log `flash-attn` and `sageattention` if installed + - Remove unused UI settings ## Update for 2025-10-17 diff --git a/installer.py b/installer.py index aaf2713e9..71ec4cbe0 100644 --- a/installer.py +++ b/installer.py @@ -608,7 +608,7 @@ def check_diffusers(): if args.skip_git: install('diffusers') return - sha = 'af769881d37fe916afef2c47279f66c79f5f2714' # diffusers commit hash + sha = '23ebbb4bc81a17ebea17cb7cb94f301199e49a7f' # diffusers commit hash # if args.use_rocm or args.use_zluda or args.use_directml: # sha = '043ab2520f6a19fce78e6e060a68dbc947edb9f9' # lock diffusers versions for now pkg = pkg_resources.working_set.by_key.get('diffusers', None) diff --git a/modules/devices.py b/modules/devices.py index a98b53d02..723566656 100644 --- a/modules/devices.py +++ b/modules/devices.py @@ -434,7 +434,7 @@ def set_sdpa_params(): 
torch.backends.cuda.enable_math_sdp('Math attention' in opts.sdp_options) if hasattr(torch.backends.cuda, "allow_fp16_bf16_reduction_math_sdp"): # only valid for torch >= 2.5 torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) - log.debug(f'Torch attention: type="sdpa" flash={"Flash attention" in opts.sdp_options} memory={"Memory attention" in opts.sdp_options} math={"Math attention" in opts.sdp_options}') + log.debug(f'Torch attention: type="sdpa" opts={opts.sdp_options}') except Exception as err: log.warning(f'Torch attention: type="sdpa" {err}') @@ -447,7 +447,6 @@ def set_sdpa_params(): sdpa_pre_dyanmic_atten = torch.nn.functional.scaled_dot_product_attention from modules.sd_hijack_dynamic_atten import dynamic_scaled_dot_product_attention torch.nn.functional.scaled_dot_product_attention = dynamic_scaled_dot_product_attention - log.debug('Torch attention: type="dynamic attention"') except Exception as err: log.error(f'Torch attention: type="dynamic attention" {err}') @@ -542,6 +541,17 @@ def set_sdpa_params(): log.debug('Torch attention: type="sage attention"') except Exception as err: log.error(f'Torch attention: type="sage attention" {err}') + + from importlib.metadata import version + try: + flash = version('flash-attn') + except: + flash = False + try: + sage = version('sageattention') + except: + sage = False + log.info(f'Torch attention: flashattn={flash} sageattention={sage}') except Exception as e: log.warning(f'Torch SDPA: {e}') diff --git a/modules/video_models/models_def.py b/modules/video_models/models_def.py index 14f0eb8e0..f3b64b40c 100644 --- a/modules/video_models/models_def.py +++ b/modules/video_models/models_def.py @@ -349,4 +349,24 @@ models = { te_cls=transformers.T5EncoderModel, dit_cls=diffusers.CosmosTransformer3DModel), ], + 'Kandinsky': [ + Model(name='Kandinsky 5.0 Lite SFT T2V', + url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers', + repo='ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers', + 
repo_cls=diffusers.Kandinsky5T2VPipeline, + te_cls=transformers.Qwen2_5_VLForConditionalGeneration, + dit_cls=diffusers.Kandinsky5Transformer3DModel), + Model(name='Kandinsky 5.0 Lite CFG-distilled T2V', + url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers', + repo='ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers', + repo_cls=diffusers.Kandinsky5T2VPipeline, + te_cls=transformers.Qwen2_5_VLForConditionalGeneration, + dit_cls=diffusers.Kandinsky5Transformer3DModel), + Model(name='Kandinsky 5.0 Lite Steps-distilled T2V', + url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers', + repo='ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers', + repo_cls=diffusers.Kandinsky5T2VPipeline, + te_cls=transformers.Qwen2_5_VLForConditionalGeneration, + dit_cls=diffusers.Kandinsky5Transformer3DModel), + ], } diff --git a/modules/video_models/video_load.py b/modules/video_models/video_load.py index d41e3d463..43da69265 100644 --- a/modules/video_models/video_load.py +++ b/modules/video_models/video_load.py @@ -39,6 +39,10 @@ def load_model(selected: models_def.Model): selected.te = 'hunyuanvideo-community/HunyuanVideo' selected.te_folder = 'text_encoder' selected.te_revision = None + if selected.te_cls.__name__ == 'Qwen2_5_VLForConditionalGeneration' and shared.opts.te_shared_t5: + selected.te = 'ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers' + selected.te_folder = 'text_encoder' + selected.te_revision = None shared.log.debug(f'Video load: module=te repo="{selected.te or selected.repo}" folder="{selected.te_folder}" cls={selected.te_cls.__name__} quant={model_quant.get_quant_type(quant_args)}') kwargs["text_encoder"] = selected.te_cls.from_pretrained( @@ -104,7 +108,7 @@ def load_model(selected: models_def.Model): shared.sd_model.sd_model_hash = None sd_models.set_diffuser_options(shared.sd_model, offload=False) - decode, text, image, slicing, tiling = False, False, False, False, False + 
decode, text, image, slicing, tiling, framewise = False, False, False, False, False, False if selected.vae_hijack and hasattr(shared.sd_model.vae, 'decode'): sd_hijack_vae.init_hijack(shared.sd_model) decode = True @@ -115,6 +119,9 @@ def load_model(selected: models_def.Model): shared.sd_model.orig_encode_image = shared.sd_model.encode_image shared.sd_model.encode_image = video_utils.hijack_encode_image image = True + if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'use_framewise_decoding'): + shared.sd_model.vae.use_framewise_decoding = True + framewise = True if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'enable_slicing'): shared.sd_model.vae.enable_slicing() slicing = True @@ -130,6 +137,6 @@ def load_model(selected: models_def.Model): loaded_model = selected.name msg = f'Video load: cls={shared.sd_model.__class__.__name__} model="{selected.name}" time={t1-t0:.2f}' shared.log.info(msg) - shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling}') + shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling} framewise={framewise}') shared.state.end(jobid) return msg diff --git a/modules/video_models/video_ui.py b/modules/video_models/video_ui.py index cd78f45a2..ec82cd4c7 100644 --- a/modules/video_models/video_ui.py +++ b/modules/video_models/video_ui.py @@ -72,6 +72,8 @@ def run_video(*args): return video_run.generate(*args) elif selected and 'anisora' in selected.name.lower(): return video_run.generate(*args) + elif selected and 'Kandinsky' in selected.name: + return video_run.generate(*args) return video_utils.queue_err(f'model not found: engine="{engine}" model="{model}"') diff --git a/modules/video_models/video_vae.py b/modules/video_models/video_vae.py index 3b7a52b70..aff67c3f6 100644 --- a/modules/video_models/video_vae.py +++ b/modules/video_models/video_vae.py @@ -9,7 +9,9 @@ vae_type = None def 
set_vae_params(p): global vae_type # pylint: disable=global-statement vae_type = p.vae_type - if p.vae_tile_frames > p.frames: + if hasattr(shared.sd_model.vae, 'enable_slicing'): + shared.sd_model.vae.enable_slicing() + if p.frames > p.vae_tile_frames: if hasattr(shared.sd_model.vae, 'tile_sample_min_num_frames'): shared.sd_model.vae.tile_sample_min_num_frames = p.vae_tile_frames if hasattr(shared.sd_model.vae, 'use_framewise_decoding'): @@ -30,6 +32,8 @@ def vae_decode_tiny(latents): variant = 'TAE MochiVideo' elif 'WAN' in shared.sd_model.__class__.__name__: variant = 'TAE WanVideo' + elif 'Kandinsky' in shared.sd_model.__class__.__name__: + variant = 'TAE HunyuanVideo' else: shared.log.warning(f'Decode: type=Tiny cls={shared.sd_model.__class__.__name__} not supported') return None diff --git a/wiki b/wiki index e5179375d..847eae26c 160000 --- a/wiki +++ b/wiki @@ -1 +1 @@ -Subproject commit e5179375d69a6363692e734911e0c76f31d67029 +Subproject commit 847eae26c796ae32b8cd74b0cc79b705bfdc8f54