mirror of https://github.com/vladmandic/automatic
add kandinsky5-lite t2v
Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4275/head
parent
95f8fd2213
commit
1ebd96fdc6
24
CHANGELOG.md
24
CHANGELOG.md
|
|
@@ -2,15 +2,21 @@
|
|||
|
||||
## Update for 2025-10-18
|
||||
|
||||
Post-release fixes:
|
||||
- ROCm-on-Windows additional checks
|
||||
- SDNQ-SVD fallback on incompatible layers
|
||||
- Huggingface model download
|
||||
- Remove unused UI settings
|
||||
- Video implement dynamic and manual sampler shift
|
||||
- Fix interrupt batch processing
|
||||
- Delay import of control processors until used
|
||||
- Fix tiny VAE with batched results
|
||||
- **Models**
|
||||
[Kandinsky 5 Lite](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers) in *SFT, CFG-distilled and Steps-distilled* variants
|
||||
The first model in the Kandinsky5 series is a T2V model optimized for 5-second videos; it uses the Qwen2.5 text encoder
|
||||
- **Fixes**
|
||||
- ROCm-on-Windows additional checks
|
||||
- SDNQ-SVD fallback on incompatible layers
|
||||
- Huggingface model download
|
||||
- Video implement dynamic and manual sampler shift
|
||||
- Fix interrupt batch processing
|
||||
- Delay import of control processors until used
|
||||
- Fix tiny VAE with batched results
|
||||
- **Other**
|
||||
- Video enable VAE slicing and framewise decoding when possible
|
||||
- Detect and log `flash-attn` and `sageattention` if installed
|
||||
- Remove unused UI settings
|
||||
|
||||
## Update for 2025-10-17
|
||||
|
||||
|
|
|
|||
|
|
@@ -608,7 +608,7 @@ def check_diffusers():
|
|||
if args.skip_git:
|
||||
install('diffusers')
|
||||
return
|
||||
sha = 'af769881d37fe916afef2c47279f66c79f5f2714' # diffusers commit hash
|
||||
sha = '23ebbb4bc81a17ebea17cb7cb94f301199e49a7f' # diffusers commit hash
|
||||
# if args.use_rocm or args.use_zluda or args.use_directml:
|
||||
# sha = '043ab2520f6a19fce78e6e060a68dbc947edb9f9' # lock diffusers versions for now
|
||||
pkg = pkg_resources.working_set.by_key.get('diffusers', None)
|
||||
|
|
|
|||
|
|
@@ -434,7 +434,7 @@ def set_sdpa_params():
|
|||
torch.backends.cuda.enable_math_sdp('Math attention' in opts.sdp_options)
|
||||
if hasattr(torch.backends.cuda, "allow_fp16_bf16_reduction_math_sdp"): # only valid for torch >= 2.5
|
||||
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
|
||||
log.debug(f'Torch attention: type="sdpa" flash={"Flash attention" in opts.sdp_options} memory={"Memory attention" in opts.sdp_options} math={"Math attention" in opts.sdp_options}')
|
||||
log.debug(f'Torch attention: type="sdpa" opts={opts.sdp_options}')
|
||||
except Exception as err:
|
||||
log.warning(f'Torch attention: type="sdpa" {err}')
|
||||
|
||||
|
|
@@ -447,7 +447,6 @@ def set_sdpa_params():
|
|||
sdpa_pre_dyanmic_atten = torch.nn.functional.scaled_dot_product_attention
|
||||
from modules.sd_hijack_dynamic_atten import dynamic_scaled_dot_product_attention
|
||||
torch.nn.functional.scaled_dot_product_attention = dynamic_scaled_dot_product_attention
|
||||
log.debug('Torch attention: type="dynamic attention"')
|
||||
except Exception as err:
|
||||
log.error(f'Torch attention: type="dynamic attention" {err}')
|
||||
|
||||
|
|
@@ -542,6 +541,17 @@ def set_sdpa_params():
|
|||
log.debug('Torch attention: type="sage attention"')
|
||||
except Exception as err:
|
||||
log.error(f'Torch attention: type="sage attention" {err}')
|
||||
|
||||
from importlib.metadata import version
|
||||
try:
|
||||
flash = version('flash-attn')
|
||||
except:
|
||||
flash = False
|
||||
try:
|
||||
sage = version('sageattention')
|
||||
except:
|
||||
sage = False
|
||||
log.info(f'Torch attention: flashattn={flash} sageattention={sage}')
|
||||
except Exception as e:
|
||||
log.warning(f'Torch SDPA: {e}')
|
||||
|
||||
|
|
|
|||
|
|
@@ -349,4 +349,24 @@ models = {
|
|||
te_cls=transformers.T5EncoderModel,
|
||||
dit_cls=diffusers.CosmosTransformer3DModel),
|
||||
],
|
||||
'Kandinsky': [
|
||||
Model(name='Kandinsky 5.0 Lite SFT T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
Model(name='Kandinsky 5.0 Lite CFG-distilled T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
Model(name='Kandinsky 5.0 Lite Steps-distilled T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
],
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -39,6 +39,10 @@ def load_model(selected: models_def.Model):
|
|||
selected.te = 'hunyuanvideo-community/HunyuanVideo'
|
||||
selected.te_folder = 'text_encoder'
|
||||
selected.te_revision = None
|
||||
if selected.te_cls.__name__ == 'Qwen2_5_VLForConditionalGeneration' and shared.opts.te_shared_t5:
|
||||
selected.te = 'ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers'
|
||||
selected.te_folder = 'text_encoder'
|
||||
selected.te_revision = None
|
||||
|
||||
shared.log.debug(f'Video load: module=te repo="{selected.te or selected.repo}" folder="{selected.te_folder}" cls={selected.te_cls.__name__} quant={model_quant.get_quant_type(quant_args)}')
|
||||
kwargs["text_encoder"] = selected.te_cls.from_pretrained(
|
||||
|
|
@@ -104,7 +108,7 @@ def load_model(selected: models_def.Model):
|
|||
shared.sd_model.sd_model_hash = None
|
||||
sd_models.set_diffuser_options(shared.sd_model, offload=False)
|
||||
|
||||
decode, text, image, slicing, tiling = False, False, False, False, False
|
||||
decode, text, image, slicing, tiling, framewise = False, False, False, False, False, False
|
||||
if selected.vae_hijack and hasattr(shared.sd_model.vae, 'decode'):
|
||||
sd_hijack_vae.init_hijack(shared.sd_model)
|
||||
decode = True
|
||||
|
|
@@ -115,6 +119,9 @@ def load_model(selected: models_def.Model):
|
|||
shared.sd_model.orig_encode_image = shared.sd_model.encode_image
|
||||
shared.sd_model.encode_image = video_utils.hijack_encode_image
|
||||
image = True
|
||||
if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'use_framewise_decoding'):
|
||||
shared.sd_model.vae.use_framewise_decoding = True
|
||||
framewise = True
|
||||
if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'enable_slicing'):
|
||||
shared.sd_model.vae.enable_slicing()
|
||||
slicing = True
|
||||
|
|
@@ -130,6 +137,6 @@ def load_model(selected: models_def.Model):
|
|||
loaded_model = selected.name
|
||||
msg = f'Video load: cls={shared.sd_model.__class__.__name__} model="{selected.name}" time={t1-t0:.2f}'
|
||||
shared.log.info(msg)
|
||||
shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling}')
|
||||
shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling} framewise={framewise}')
|
||||
shared.state.end(jobid)
|
||||
return msg
|
||||
|
|
|
|||
|
|
@@ -72,6 +72,8 @@ def run_video(*args):
|
|||
return video_run.generate(*args)
|
||||
elif selected and 'anisora' in selected.name.lower():
|
||||
return video_run.generate(*args)
|
||||
elif selected and 'Kandinsky' in selected.name:
|
||||
return video_run.generate(*args)
|
||||
return video_utils.queue_err(f'model not found: engine="{engine}" model="{model}"')
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -9,7 +9,9 @@ vae_type = None
|
|||
def set_vae_params(p):
|
||||
global vae_type # pylint: disable=global-statement
|
||||
vae_type = p.vae_type
|
||||
if p.vae_tile_frames > p.frames:
|
||||
if hasattr(shared.sd_model.vae, 'enable_slicing'):
|
||||
shared.sd_model.vae.enable_slicing()
|
||||
if p.frames > p.vae_tile_frames:
|
||||
if hasattr(shared.sd_model.vae, 'tile_sample_min_num_frames'):
|
||||
shared.sd_model.vae.tile_sample_min_num_frames = p.vae_tile_frames
|
||||
if hasattr(shared.sd_model.vae, 'use_framewise_decoding'):
|
||||
|
|
@@ -30,6 +32,8 @@ def vae_decode_tiny(latents):
|
|||
variant = 'TAE MochiVideo'
|
||||
elif 'WAN' in shared.sd_model.__class__.__name__:
|
||||
variant = 'TAE WanVideo'
|
||||
elif 'Kandinsky' in shared.sd_model.__class__.__name__:
|
||||
variant = 'TAE HunyuanVideo'
|
||||
else:
|
||||
shared.log.warning(f'Decode: type=Tiny cls={shared.sd_model.__class__.__name__} not supported')
|
||||
return None
|
||||
|
|
|
|||
2
wiki
2
wiki
|
|
@@ -1 +1 @@
|
|||
Subproject commit e5179375d69a6363692e734911e0c76f31d67029
|
||||
Subproject commit 847eae26c796ae32b8cd74b0cc79b705bfdc8f54
|
||||
Loading…
Reference in New Issue