mirror of https://github.com/vladmandic/automatic
add kandinsky5-lite t2v
Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4275/head
parent
95f8fd2213
commit
1ebd96fdc6
24
CHANGELOG.md
24
CHANGELOG.md
|
|
@@ -2,15 +2,21 @@
|
|||
|
||||
## Update for 2025-10-18
|
||||
|
||||
Post-release fixes:
|
||||
- ROCm-on-Windows additional checks
|
||||
- SDNQ-SVD fallback on incompatible layers
|
||||
- Huggingface model download
|
||||
- Remove unused UI settings
|
||||
- Video implement dynamic and manual sampler shift
|
||||
- Fix interrupt batch processing
|
||||
- Delay import of control processors until used
|
||||
- Fix tiny VAE with batched results
|
||||
- **Models**
|
||||
[Kandinsky 5 Lite](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers) in *SFT, CFG-distilled and Steps-distilled* variants
|
||||
The first model in the Kandinsky5 series is a T2V model optimized for 5-second videos; it uses the Qwen2.5 text encoder
|
||||
- **Fixes**
|
||||
- ROCm-on-Windows additional checks
|
||||
- SDNQ-SVD fallback on incompatible layers
|
||||
- Huggingface model download
|
||||
- Video implement dynamic and manual sampler shift
|
||||
- Fix interrupt batch processing
|
||||
- Delay import of control processors until used
|
||||
- Fix tiny VAE with batched results
|
||||
- **Other**
|
||||
- Video enable VAE slicing and framewise decoding when possible
|
||||
- Detect and log `flash-attn` and `sageattention` if installed
|
||||
- Remove unused UI settings
|
||||
|
||||
## Update for 2025-10-17
|
||||
|
||||
|
|
|
|||
|
|
@@ -608,7 +608,7 @@ def check_diffusers():
|
|||
if args.skip_git:
|
||||
install('diffusers')
|
||||
return
|
||||
sha = 'af769881d37fe916afef2c47279f66c79f5f2714' # diffusers commit hash
|
||||
sha = '23ebbb4bc81a17ebea17cb7cb94f301199e49a7f' # diffusers commit hash
|
||||
# if args.use_rocm or args.use_zluda or args.use_directml:
|
||||
# sha = '043ab2520f6a19fce78e6e060a68dbc947edb9f9' # lock diffusers versions for now
|
||||
pkg = pkg_resources.working_set.by_key.get('diffusers', None)
|
||||
|
|
|
|||
|
|
@@ -434,7 +434,7 @@ def set_sdpa_params():
|
|||
torch.backends.cuda.enable_math_sdp('Math attention' in opts.sdp_options)
|
||||
if hasattr(torch.backends.cuda, "allow_fp16_bf16_reduction_math_sdp"): # only valid for torch >= 2.5
|
||||
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
|
||||
log.debug(f'Torch attention: type="sdpa" flash={"Flash attention" in opts.sdp_options} memory={"Memory attention" in opts.sdp_options} math={"Math attention" in opts.sdp_options}')
|
||||
log.debug(f'Torch attention: type="sdpa" opts={opts.sdp_options}')
|
||||
except Exception as err:
|
||||
log.warning(f'Torch attention: type="sdpa" {err}')
|
||||
|
||||
|
|
@@ -447,7 +447,6 @@ def set_sdpa_params():
|
|||
sdpa_pre_dyanmic_atten = torch.nn.functional.scaled_dot_product_attention
|
||||
from modules.sd_hijack_dynamic_atten import dynamic_scaled_dot_product_attention
|
||||
torch.nn.functional.scaled_dot_product_attention = dynamic_scaled_dot_product_attention
|
||||
log.debug('Torch attention: type="dynamic attention"')
|
||||
except Exception as err:
|
||||
log.error(f'Torch attention: type="dynamic attention" {err}')
|
||||
|
||||
|
|
@@ -542,6 +541,17 @@ def set_sdpa_params():
|
|||
log.debug('Torch attention: type="sage attention"')
|
||||
except Exception as err:
|
||||
log.error(f'Torch attention: type="sage attention" {err}')
|
||||
|
||||
from importlib.metadata import version
|
||||
try:
|
||||
flash = version('flash-attn')
|
||||
except:
|
||||
flash = False
|
||||
try:
|
||||
sage = version('sageattention')
|
||||
except:
|
||||
sage = False
|
||||
log.info(f'Torch attention: flashattn={flash} sageattention={sage}')
|
||||
except Exception as e:
|
||||
log.warning(f'Torch SDPA: {e}')
|
||||
|
||||
|
|
|
|||
|
|
@@ -349,4 +349,24 @@ models = {
|
|||
te_cls=transformers.T5EncoderModel,
|
||||
dit_cls=diffusers.CosmosTransformer3DModel),
|
||||
],
|
||||
'Kandinsky': [
|
||||
Model(name='Kandinsky 5.0 Lite SFT T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
Model(name='Kandinsky 5.0 Lite CFG-distilled T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
Model(name='Kandinsky 5.0 Lite Steps-distilled T2V',
|
||||
url='https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers',
|
||||
repo='ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers',
|
||||
repo_cls=diffusers.Kandinsky5T2VPipeline,
|
||||
te_cls=transformers.Qwen2_5_VLForConditionalGeneration,
|
||||
dit_cls=diffusers.Kandinsky5Transformer3DModel),
|
||||
],
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -39,6 +39,10 @@ def load_model(selected: models_def.Model):
|
|||
selected.te = 'hunyuanvideo-community/HunyuanVideo'
|
||||
selected.te_folder = 'text_encoder'
|
||||
selected.te_revision = None
|
||||
if selected.te_cls.__name__ == 'Qwen2_5_VLForConditionalGeneration' and shared.opts.te_shared_t5:
|
||||
selected.te = 'ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers'
|
||||
selected.te_folder = 'text_encoder'
|
||||
selected.te_revision = None
|
||||
|
||||
shared.log.debug(f'Video load: module=te repo="{selected.te or selected.repo}" folder="{selected.te_folder}" cls={selected.te_cls.__name__} quant={model_quant.get_quant_type(quant_args)}')
|
||||
kwargs["text_encoder"] = selected.te_cls.from_pretrained(
|
||||
|
|
@@ -104,7 +108,7 @@ def load_model(selected: models_def.Model):
|
|||
shared.sd_model.sd_model_hash = None
|
||||
sd_models.set_diffuser_options(shared.sd_model, offload=False)
|
||||
|
||||
decode, text, image, slicing, tiling = False, False, False, False, False
|
||||
decode, text, image, slicing, tiling, framewise = False, False, False, False, False, False
|
||||
if selected.vae_hijack and hasattr(shared.sd_model.vae, 'decode'):
|
||||
sd_hijack_vae.init_hijack(shared.sd_model)
|
||||
decode = True
|
||||
|
|
@@ -115,6 +119,9 @@ def load_model(selected: models_def.Model):
|
|||
shared.sd_model.orig_encode_image = shared.sd_model.encode_image
|
||||
shared.sd_model.encode_image = video_utils.hijack_encode_image
|
||||
image = True
|
||||
if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'use_framewise_decoding'):
|
||||
shared.sd_model.vae.use_framewise_decoding = True
|
||||
framewise = True
|
||||
if hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'enable_slicing'):
|
||||
shared.sd_model.vae.enable_slicing()
|
||||
slicing = True
|
||||
|
|
@@ -130,6 +137,6 @@ def load_model(selected: models_def.Model):
|
|||
loaded_model = selected.name
|
||||
msg = f'Video load: cls={shared.sd_model.__class__.__name__} model="{selected.name}" time={t1-t0:.2f}'
|
||||
shared.log.info(msg)
|
||||
shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling}')
|
||||
shared.log.debug(f'Video hijacks: decode={decode} text={text} image={image} slicing={slicing} tiling={tiling} framewise={framewise}')
|
||||
shared.state.end(jobid)
|
||||
return msg
|
||||
|
|
|
|||
|
|
@@ -72,6 +72,8 @@ def run_video(*args):
|
|||
return video_run.generate(*args)
|
||||
elif selected and 'anisora' in selected.name.lower():
|
||||
return video_run.generate(*args)
|
||||
elif selected and 'Kandinsky' in selected.name:
|
||||
return video_run.generate(*args)
|
||||
return video_utils.queue_err(f'model not found: engine="{engine}" model="{model}"')
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -9,7 +9,9 @@ vae_type = None
|
|||
def set_vae_params(p):
|
||||
global vae_type # pylint: disable=global-statement
|
||||
vae_type = p.vae_type
|
||||
if p.vae_tile_frames > p.frames:
|
||||
if hasattr(shared.sd_model.vae, 'enable_slicing'):
|
||||
shared.sd_model.vae.enable_slicing()
|
||||
if p.frames > p.vae_tile_frames:
|
||||
if hasattr(shared.sd_model.vae, 'tile_sample_min_num_frames'):
|
||||
shared.sd_model.vae.tile_sample_min_num_frames = p.vae_tile_frames
|
||||
if hasattr(shared.sd_model.vae, 'use_framewise_decoding'):
|
||||
|
|
@@ -30,6 +32,8 @@ def vae_decode_tiny(latents):
|
|||
variant = 'TAE MochiVideo'
|
||||
elif 'WAN' in shared.sd_model.__class__.__name__:
|
||||
variant = 'TAE WanVideo'
|
||||
elif 'Kandinsky' in shared.sd_model.__class__.__name__:
|
||||
variant = 'TAE HunyuanVideo'
|
||||
else:
|
||||
shared.log.warning(f'Decode: type=Tiny cls={shared.sd_model.__class__.__name__} not supported')
|
||||
return None
|
||||
|
|
|
|||
2
wiki
2
wiki
|
|
@@ -1 +1 @@
|
|||
Subproject commit e5179375d69a6363692e734911e0c76f31d67029
|
||||
Subproject commit 847eae26c796ae32b8cd74b0cc79b705bfdc8f54
|
||||
Loading…
Reference in New Issue