From 3a65d561a70f60d2c67f607d2b00a944c7c427ed Mon Sep 17 00:00:00 2001
From: vladmandic
Date: Tue, 9 Dec 2025 19:14:08 +0100
Subject: [PATCH] add google-veo-3.1

Signed-off-by: vladmandic
---
 CHANGELOG.md                        |   8 +-
 html/reference.json                 |   4 +-
 modules/modeldata.py                |   4 +-
 modules/processing.py               |  20 +++-
 modules/processing_diffusers.py     |   7 +-
 modules/video_models/google_veo.py  | 145 ++++++++++++++++++++++++
 modules/video_models/models_def.py  |  20 ++++
 modules/video_models/video_cache.py |   2 +-
 modules/video_models/video_load.py  | 169 ++++++++++++++++------------
 modules/video_models/video_run.py   |  11 +-
 modules/video_models/video_save.py  |  20 +++-
 modules/video_models/video_vae.py   |   2 +
 pipelines/model_google.py           |  14 ++-
 wiki                                |   2 +-
 14 files changed, 330 insertions(+), 98 deletions(-)
 create mode 100644 modules/video_models/google_veo.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4c07a5c7d..f92934cc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,8 +9,10 @@ Merge commit: `f903a36d9`
 ### Highlights for 2025-12-09
 
 New native [kanvas](https://vladmandic.github.io/sdnext-docs/Kanvas/) module for image manipulation that fully replaces *img2img*, *inpaint* and *outpaint* controls, massive update to **Captioning/VQA** models and features
-New generation of **Flux.2** large image model, new **Z-Image** model that is creating a lot of buzz, new **Kandinsky 5 Lite** image model and a first cloud model with **Google's Nano Banana** *2.5 Flash and 3.0 Pro*, new **Photoroom PRX** model
-Also new are **HunyuanVideo 1.5** and **Kandinsky 5 Pro** video models, plus a lot of internal improvements and fixes
+New generation of **Flux.2** large image model, new **Z-Image** model that is creating a lot of buzz, new **Kandinsky 5 Lite** image model and new **Photoroom PRX** model
+And the first cloud models: **Google Nano Banana** *2.5 Flash and 3.0 Pro* and the **Google Veo** *3.1* video model
+Also new are **HunyuanVideo 1.5** and **Kandinsky 5 Pro** video models
+Plus a lot of internal improvements and fixes
 
 ![Screenshot](https://github.com/user-attachments/assets/54b25586-b611-4d70-a28f-ee3360944034)
 
@@ -42,6 +44,8 @@ Also new are **HunyuanVideo 1.5** and **Kandinsky 5 Pro** video models, plus a l
     distilled variants provide faster generation with slightly reduced quality
   - [Kandinsky 5.0 Pro Video](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers) in T2V and I2V variants
     larger 19B (and more powerful version) of previously released Lite 2B models
+  - [Google Veo 3.1](https://gemini.google/us/overview/video-generation/) for T2V and I2V workflows
+    *note*: set the `GOOGLE_API_KEY` environment variable to your API key to use this model
 - **Kanvas**: new module for native canvas-based image manipulation
   kanvas is a full replacement for *img2img, inpaint and outpaint* controls
   see [docs](https://vladmandic.github.io/sdnext-docs/Kanvas/) for details
diff --git a/html/reference.json b/html/reference.json
index db5689aae..f9c08e89c 100644
--- a/html/reference.json
+++ b/html/reference.json
@@ -782,7 +782,7 @@
   "Kandinsky 5.0 T2I Lite": {
     "path": "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers",
     "desc": "Kandinsky 5.0 Image Lite is a 6B image generation models 1K resulution, high visual quality and strong text-writing",
-    "preview": "kandinskylab--Kandinsky-5.0-T2I-Lite-sft-Diffusers.jpg",
+    "preview": "kandinsky-community--kandinsky-3.jpg",
     "skip": true,
     "size": 33.20,
     "date": "2025 November"
@@ -790,7 +790,7 @@
   "Kandinsky 5.0 I2I Lite": {
    "path": 
"kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers", "desc": "Kandinsky 5.0 Image Lite is a 6B image editing models 1K resulution, high visual quality and strong text-writing", - "preview": "kandinskylab--Kandinsky-5.0-I2I-Lite-sft-Diffusers.jpg", + "preview": "kandinsky-community--kandinsky-3.jpg", "skip": true, "size": 33.20, "date": "2025 November" diff --git a/modules/modeldata.py b/modules/modeldata.py index c7358d162..6a7f7451c 100644 --- a/modules/modeldata.py +++ b/modules/modeldata.py @@ -79,7 +79,7 @@ def get_model_type(pipe): # video models elif "CogVideo" in name: model_type = 'cogvideo' - elif 'HunyuanVideo15': + elif 'HunyuanVideo15' in name: model_type = 'hunyuanvideo15' elif 'HunyuanVideoPipeline' in name or 'HunyuanSkyreels' in name: model_type = 'hunyuanvideo' @@ -101,6 +101,8 @@ def get_model_type(pipe): elif 'HunyuanImage' in name: model_type = 'hunyuanimage' # cloud models + elif 'GoogleVeo' in name: + model_type = 'veo3' elif 'NanoBanana' in name: model_type = 'nanobanana' else: diff --git a/modules/processing.py b/modules/processing.py index c363a58fa..79e62d669 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -31,7 +31,7 @@ processed = None # last known processed results class Processed: - def __init__(self, p: StableDiffusionProcessing, images_list, seed=-1, info=None, subseed=None, all_prompts=None, all_negative_prompts=None, all_seeds=None, all_subseeds=None, index_of_first_image=0, infotexts=None, comments=""): + def __init__(self, p: StableDiffusionProcessing, images_list, seed=-1, info=None, subseed=None, all_prompts=None, all_negative_prompts=None, all_seeds=None, all_subseeds=None, index_of_first_image=0, infotexts=None, comments="", binary=None): self.sd_model_hash = getattr(shared.sd_model, 'sd_model_hash', '') if model_data.sd_model is not None else '' self.prompt = p.prompt or '' @@ -40,6 +40,7 @@ class Processed: self.negative_prompt = self.negative_prompt if type(self.negative_prompt) != list else self.negative_prompt[0] self.styles = p.styles + self.bytes = binary self.images = images_list self.width = p.width if hasattr(p, 'width') else (self.images[0].width if len(self.images) > 0 else 0) self.height = p.height if hasattr(p, 'height') else (self.images[0].height if len(self.images) > 0 else 0) @@ -275,6 +276,8 @@ def process_init(p: StableDiffusionProcessing): def process_samples(p: StableDiffusionProcessing, samples): out_images = [] out_infotexts = [] + if not isinstance(samples, list): + return samples, [] for i, sample in enumerate(samples): debug(f'Processing result: index={i+1}/{len(samples)}') p.batch_index = i @@ -394,6 +397,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed: comments = {} infotexts = [] output_images = [] + output_binary = None process_init(p) if p.scripts is not None and isinstance(p.scripts, scripts_manager.ScriptRunner): @@ -471,11 +475,14 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed: p.scripts.postprocess_batch_list(p, batch_params, batch_number=n) samples = batch_params.images - batch_images, batch_infotexts = process_samples(p, samples) - for batch_image, batch_infotext in zip(batch_images, batch_infotexts): - if batch_image is not None and batch_image not in output_images: - output_images.append(batch_image) - infotexts.append(batch_infotext) + if hasattr(samples, 'bytes') and samples.bytes is not None: + output_binary = samples.bytes + else: + batch_images, batch_infotexts = process_samples(p, samples) + for batch_image, batch_infotext in 
zip(batch_images, batch_infotexts):
+                    if batch_image is not None and batch_image not in output_images:
+                        output_images.append(batch_image)
+                        infotexts.append(batch_infotext)
 
         if shared.cmd_opts.lowvram:
             devices.torch_gc(force=True, reason='lowvram')
@@ -508,6 +515,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
     results = get_processed(
         p,
         images_list=output_images,
+        binary=output_binary,
         seed=p.all_seeds[0],
         info=infotexts[0] if len(infotexts) > 0 else '',
         comments="\n".join(comments),
diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py
index c5e0ebb06..97b222b48 100644
--- a/modules/processing_diffusers.py
+++ b/modules/processing_diffusers.py
@@ -435,6 +435,9 @@ def process_refine(p: processing.StableDiffusionProcessing, output):
 def process_decode(p: processing.StableDiffusionProcessing, output):
     shared.sd_model = sd_models.apply_balanced_offload(shared.sd_model, exclude=['vae'])
     if output is not None:
+        if hasattr(output, 'bytes') and output.bytes is not None:
+            shared.log.debug(f'Generated: bytes={len(output.bytes)}')
+            return output
         if not hasattr(output, 'images') and hasattr(output, 'frames'):
             shared.log.debug(f'Generated: frames={len(output.frames[0])}')
             output.images = output.frames[0]
@@ -508,6 +511,8 @@ def validate_pipeline(p: processing.StableDiffusionProcessing):
         for m in video_models[family]:
             if m.repo_cls is not None:
                 models_cls.append(m.repo_cls.__name__)
+            if m.custom is not None:
+                models_cls.append(m.custom)
     is_video_model = shared.sd_model.__class__.__name__ in models_cls
     override_video_pipelines = ['WanPipeline', 'WanImageToVideoPipeline', 'WanVACEPipeline']
     is_video_pipeline = ('video' in p.__class__.__name__.lower()) or (shared.sd_model.__class__.__name__ in override_video_pipelines)
@@ -569,7 +574,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
             images, _index=shared.history.selected
             output = SimpleNamespace(images=images)
-    if (output is None or len(output.images) == 0) and has_images:
+    if (output is None or (hasattr(output, 'images') and len(output.images) == 0)) and has_images:
         if output is not None:
             shared.log.debug('Processing: using input as base output')
             output.images = p.init_images
diff --git a/modules/video_models/google_veo.py b/modules/video_models/google_veo.py
new file mode 100644
index 000000000..92ee43a2e
--- /dev/null
+++ b/modules/video_models/google_veo.py
@@ -0,0 +1,145 @@
+import io
+import os
+import time
+
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+
+from PIL import Image
+from installer import install, reload, log
+
+
+image_size_buckets = {
+    '720p': 1280*720,
+    '1080p': 1920*1080,
+}
+aspect_ratios_buckets = {
+    '1:1': 1/1,
+    '2:3': 2/3,
+    '3:2': 3/2,
+    '4:3': 4/3,
+    '3:4': 3/4,
+    '4:5': 4/5,
+    '5:4': 5/4,
+    '16:9': 16/9,
+    '9:16': 9/16,
+    '21:9': 21/9,
+    '9:21': 9/21,
+}
+
+
+def google_requirements():
+    install('google-genai==1.52.0')
+    install('pydantic==2.11.7', ignore=True, quiet=True)
+    reload('pydantic', '2.11.7')
+
+
+def get_size_buckets(width: int, height: int) -> tuple[str, str]:
+    aspect_ratio = width / height
+    closest_aspect_ratio = min(aspect_ratios_buckets.items(), key=lambda x: abs(x[1] - aspect_ratio))[0]
+    pixel_count = width * height
+    closest_size = min(image_size_buckets.items(), key=lambda x: abs(x[1] - pixel_count))[0]
+    return closest_size, closest_aspect_ratio
+
+
+class 
GoogleVeoVideoPipeline():
+    def __init__(self, model_name: str):
+        self.model = model_name
+        self.client = None
+        self.config = None
+        google_requirements()
+        log.debug(f'Load model: type=GoogleVeo model="{model_name}"')
+
+    def txt2vid(self, prompt):
+        return self.client.models.generate_videos(
+            model=self.model,
+            prompt=prompt,
+            config=self.config,
+        )
+
+    def img2vid(self, prompt, image):
+        from google import genai
+        image_bytes = io.BytesIO()
+        image.save(image_bytes, format='JPEG')
+        return self.client.models.generate_videos(
+            model=self.model,
+            prompt=prompt,
+            config=self.config,
+            image=genai.types.Image(image_bytes=image_bytes.getvalue(), mime_type='image/jpeg'),
+        )
+
+    def __call__(self, prompt: list[str], width: int, height: int, image: Image.Image = None, num_frames: int = 4*24):
+        from google import genai
+
+        if isinstance(prompt, list) and len(prompt) > 0:
+            prompt = prompt[0]
+        if self.client is None:
+            api_key = os.getenv("GOOGLE_API_KEY", None)
+            if api_key is None:
+                log.error(f'Cloud: model="{self.model}" GOOGLE_API_KEY environment variable not set')
+                return None
+            self.client = genai.Client(api_key=api_key, vertexai=False)
+
+        resolution, aspect_ratio = get_size_buckets(width, height)
+        duration = num_frames // 24
+        if duration < 4:
+            duration = 4
+        if duration > 8:
+            duration = 8
+        self.config = genai.types.GenerateVideosConfig(
+            # seed=42,
+            # fps=24,
+            duration_seconds=duration,
+            aspect_ratio=aspect_ratio,
+            resolution=resolution,
+            # person_generation='ALLOW_ALL',
+            # safety_filter_level='BLOCK_NONE',
+            # negative_prompt=None,
+            # enhance_prompt=True,
+            # generate_audio=True,
+        )
+        log.debug(f'Cloud: prompt="{prompt}" size={resolution} ar={aspect_ratio} image={image} model="{self.model}" frames={num_frames} duration={duration}')
+
+        operation = None
+        try:
+            if image is not None:
+                operation = self.img2vid(prompt, image)
+            else:
+                operation = self.txt2vid(prompt)
+            while not operation.done:
+                log.debug(f"Cloud processing: {operation}")
+                time.sleep(10)
+                operation = self.client.operations.get(operation)
+        except Exception as e:
+            log.error(f'Cloud video: model="{self.model}" {operation} {e}')
+            return None
+
+        if operation is None or operation.response is None or operation.response.generated_videos is None or len(operation.response.generated_videos) == 0:
+            log.error(f'Cloud video: model="{self.model}" no response {operation}')
+            return None
+        try:
+            response: genai.types.GeneratedVideo = operation.response.generated_videos[0]
+            self.client.files.download(file=response.video)
+            video_bytes = response.video.video_bytes
+            return { 'bytes': video_bytes, 'images': [] }
+        except Exception as e:
+            log.error(f'Cloud download: model="{self.model}" {e}')
+            return None
+
+
+def load_veo(model_name):
+    pipe = GoogleVeoVideoPipeline(model_name=model_name)
+    return pipe
+
+
+if __name__ == "__main__":
+    from installer import setup_logging
+    setup_logging()
+    log.info('test')
+    model = GoogleVeoVideoPipeline('veo-3.1-generate-preview')
+    img = Image.open('C:\\Users\\mandi\\OneDrive\\Generative\\Samples\\cartoon.png')
+    vid = model(['A beautiful young woman walking through the fantasy city'], 1280, 720, image=img)
+    if vid is not None:
+        with open("veo.mp4", "wb") as f:
+            f.write(vid['bytes'])
diff --git a/modules/video_models/models_def.py b/modules/video_models/models_def.py
index f2e2f21c9..c89995da8 100644
--- a/modules/video_models/models_def.py
+++ b/modules/video_models/models_def.py
@@ -10,6 +10,7 @@ class Model():
     name: str
     url: 
str = '' repo: str = None + custom: str = None repo_cls: classmethod = None repo_revision: str = None dit: str = None @@ -480,6 +481,22 @@ try: te_cls=getattr(transformers, 'Qwen2_5_VLForConditionalGeneration', None), dit_cls=getattr(diffusers, 'Kandinsky5Transformer3DModel', None)), ], + 'Google Veo': [ + Model(name='Google Veo 3.1 T2V', + url='https://gemini.google/overview/video-generation/', + repo='veo-3.1-generate-preview', + custom='GoogleVeoVideoPipeline', + repo_cls=None, + te_cls=None, + dit_cls=None), + Model(name='Google Veo 3.1 I2V', + url='https://gemini.google/overview/video-generation/', + repo='veo-3.1-generate-preview', + custom='GoogleVeoVideoPipeline', + repo_cls=None, + te_cls=None, + dit_cls=None), + ], } t1 = time.time() errors = 0 @@ -488,11 +505,14 @@ try: for m in model: if m.name == 'None': continue + """ if (m.repo_cls is None) or (m.dit_cls is None) or (m.te_cls is None): log.error(f'Video: pipeline="{m.name}" not available') errors += 1 else: total += 1 + """ + total += 1 log.info(f'Networks: type="video" engines={len(models)} models={total} errors={errors} time={t1 - t0:.2f}') except Exception as e: models = {} diff --git a/modules/video_models/video_cache.py b/modules/video_models/video_cache.py index 01f541be3..b4b0d1472 100644 --- a/modules/video_models/video_cache.py +++ b/modules/video_models/video_cache.py @@ -3,7 +3,7 @@ from modules import shared def apply_teacache_patch(cls): - if shared.opts.teacache_enabled: + if shared.opts.teacache_enabled and cls is not None: from modules import teacache shared.log.debug(f'Transformers cache: type=teacache patch=forward cls={cls.__name__}') if cls.__name__ == 'LTXVideoTransformer3DModel': diff --git a/modules/video_models/video_load.py b/modules/video_models/video_load.py index 375861840..aae853a8a 100644 --- a/modules/video_models/video_load.py +++ b/modules/video_models/video_load.py @@ -20,15 +20,26 @@ def _loader(component): loaded_model = None +def load_custom(model_name: str): + shared.log.debug(f'Video load: module=pipe repo="{model_name}" cls=Custom') + if 'veo-3.1' in model_name: + from modules.video_models.google_veo import load_veo + pipe = load_veo(model_name) + return pipe + return None + + def load_model(selected: models_def.Model): - if selected is None or selected.te_cls is None or selected.dit_cls is None: + if selected is None or selected.repo is None: return '' global loaded_model # pylint: disable=global-statement if not shared.sd_loaded: loaded_model = None if loaded_model == selected.name: return '' - sd_models.unload_model_weights() + if shared.sd_loaded: + sd_models.unload_model_weights() + t0 = time.time() jobid = shared.state.begin('Load model') @@ -46,89 +57,99 @@ def load_model(selected: models_def.Model): kwargs = video_overrides.load_override(selected, **offline_args) # text encoder - try: - load_args, quant_args = model_quant.get_dit_args({}, module='TE', device_map=True) + if selected.te_cls is not None: + try: + load_args, quant_args = model_quant.get_dit_args({}, module='TE', device_map=True) - # loader deduplication of text-encoder models - if selected.te_cls.__name__ == 'T5EncoderModel' and shared.opts.te_shared_t5: - selected.te = 'Disty0/t5-xxl' - selected.te_folder = '' - selected.te_revision = None - if selected.te_cls.__name__ == 'UMT5EncoderModel' and shared.opts.te_shared_t5: - if 'SDNQ' in selected.name: - selected.te = 'Disty0/Wan2.2-T2V-A14B-SDNQ-uint4-svd-r32' - else: - selected.te = 'Wan-AI/Wan2.2-TI2V-5B-Diffusers' - selected.te_folder = 'text_encoder' - 
selected.te_revision = None - if selected.te_cls.__name__ == 'LlamaModel' and shared.opts.te_shared_t5: - selected.te = 'hunyuanvideo-community/HunyuanVideo' - selected.te_folder = 'text_encoder' - selected.te_revision = None - if selected.te_cls.__name__ == 'Qwen2_5_VLForConditionalGeneration' and shared.opts.te_shared_t5: - selected.te = 'ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers' - selected.te_folder = 'text_encoder' - selected.te_revision = None + # loader deduplication of text-encoder models + if selected.te_cls.__name__ == 'T5EncoderModel' and shared.opts.te_shared_t5: + selected.te = 'Disty0/t5-xxl' + selected.te_folder = '' + selected.te_revision = None + if selected.te_cls.__name__ == 'UMT5EncoderModel' and shared.opts.te_shared_t5: + if 'SDNQ' in selected.name: + selected.te = 'Disty0/Wan2.2-T2V-A14B-SDNQ-uint4-svd-r32' + else: + selected.te = 'Wan-AI/Wan2.2-TI2V-5B-Diffusers' + selected.te_folder = 'text_encoder' + selected.te_revision = None + if selected.te_cls.__name__ == 'LlamaModel' and shared.opts.te_shared_t5: + selected.te = 'hunyuanvideo-community/HunyuanVideo' + selected.te_folder = 'text_encoder' + selected.te_revision = None + if selected.te_cls.__name__ == 'Qwen2_5_VLForConditionalGeneration' and shared.opts.te_shared_t5: + selected.te = 'ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers' + selected.te_folder = 'text_encoder' + selected.te_revision = None - shared.log.debug(f'Video load: module=te repo="{selected.te or selected.repo}" folder="{selected.te_folder}" cls={selected.te_cls.__name__} quant={model_quant.get_quant_type(quant_args)} loader={_loader("transformers")}') - kwargs["text_encoder"] = selected.te_cls.from_pretrained( - pretrained_model_name_or_path=selected.te or selected.repo, - subfolder=selected.te_folder, - revision=selected.te_revision or selected.repo_revision, - cache_dir=shared.opts.hfcache_dir, - **load_args, - **quant_args, - **offline_args, - ) - except Exception as e: - shared.log.error(f'video load: module=te cls={selected.te_cls.__name__} {e}') - errors.display(e, 'video') + shared.log.debug(f'Video load: module=te repo="{selected.te or selected.repo}" folder="{selected.te_folder}" cls={selected.te_cls.__name__} quant={model_quant.get_quant_type(quant_args)} loader={_loader("transformers")}') + kwargs["text_encoder"] = selected.te_cls.from_pretrained( + pretrained_model_name_or_path=selected.te or selected.repo, + subfolder=selected.te_folder, + revision=selected.te_revision or selected.repo_revision, + cache_dir=shared.opts.hfcache_dir, + **load_args, + **quant_args, + **offline_args, + ) + except Exception as e: + shared.log.error(f'video load: module=te cls={selected.te_cls.__name__} {e}') + errors.display(e, 'video') # transformer - try: - def load_dit_folder(dit_folder): - if dit_folder is not None and dit_folder not in kwargs: - # get a new quant arg on every loop to prevent the quant config classes getting entangled - load_args, quant_args = model_quant.get_dit_args({}, module='Model', device_map=True) - shared.log.debug(f'Video load: module=transformer repo="{selected.dit or selected.repo}" module="{dit_folder}" folder="{dit_folder}" cls={selected.dit_cls.__name__} quant={model_quant.get_quant_type(quant_args)} loader={_loader("diffusers")}') - kwargs[dit_folder] = selected.dit_cls.from_pretrained( - pretrained_model_name_or_path=selected.dit or selected.repo, - subfolder=dit_folder, - revision=selected.dit_revision or selected.repo_revision, - cache_dir=shared.opts.hfcache_dir, - **load_args, - **quant_args, - 
**offline_args,
+                )
+            else:
+                shared.log.debug(f'Video load: module=transformer repo="{selected.dit or selected.repo}" module="{dit_folder}" folder="{dit_folder}" cls={selected.dit_cls.__name__} loader={_loader("diffusers")} skip')
 
-        if selected.dit_folder is None:
-            selected.dit_folder = ['transformer']
-        if isinstance(selected.dit_folder, list) or isinstance(selected.dit_folder, tuple):
-            for dit_folder in selected.dit_folder: # wan a14b has transformer and transformer_2
-                load_dit_folder(dit_folder)
-        else:
-            load_dit_folder(selected.dit_folder)
-    except Exception as e:
-        shared.log.error(f'video load: module=transformer cls={selected.dit_cls.__name__} {e}')
-        errors.display(e, 'video')
+            if selected.dit_folder is None:
+                selected.dit_folder = ['transformer']
+            if isinstance(selected.dit_folder, list) or isinstance(selected.dit_folder, tuple):
+                for dit_folder in selected.dit_folder: # wan a14b has transformer and transformer_2
+                    load_dit_folder(dit_folder)
+            else:
+                load_dit_folder(selected.dit_folder)
+        except Exception as e:
+            shared.log.error(f'video load: module=transformer cls={selected.dit_cls.__name__} {e}')
+            errors.display(e, 'video')
 
     # model
     try:
-        shared.log.debug(f'Video load: module=pipe repo="{selected.repo}" cls={selected.repo_cls.__name__}')
-        shared.sd_model = selected.repo_cls.from_pretrained(
-            pretrained_model_name_or_path=selected.repo,
-            revision=selected.repo_revision,
-            cache_dir=shared.opts.hfcache_dir,
-            torch_dtype=devices.dtype,
-            **kwargs,
-            **offline_args,
-        )
+        if selected.repo_cls is None:
+            shared.sd_model = load_custom(selected.repo)
+        else:
+            shared.log.debug(f'Video load: module=pipe repo="{selected.repo}" cls={selected.repo_cls.__name__}')
+            shared.sd_model = selected.repo_cls.from_pretrained(
+                pretrained_model_name_or_path=selected.repo,
+                revision=selected.repo_revision,
+                cache_dir=shared.opts.hfcache_dir,
+                torch_dtype=devices.dtype,
+                **kwargs,
+                **offline_args,
+            )
     except Exception as e:
-        shared.log.error(f'video load: module=pipe repo="{selected.repo}" cls={selected.repo_cls.__name__} {e}')
+        shared.log.error(f'video load: module=pipe repo="{selected.repo}" cls={selected.repo_cls.__name__ if selected.repo_cls is not None else "custom"} {e}')
         errors.display(e, 'video')
+    if shared.sd_model is None:
+        msg = f'Video load: model="{selected.name}" failed'
+        shared.log.error(msg)
+        return msg
+
     t1 = time.time()
     if shared.sd_model.__class__.__name__.startswith("LTX"):
         shared.sd_model.scheduler.config.use_dynamic_shifting = False
@@ -138,7 +159,7 @@ def load_model(selected: models_def.Model):
     sd_models.set_diffuser_options(shared.sd_model, offload=False)
decode, text, image, slicing, tiling, framewise = False, False, False, False, False, False
-    if selected.vae_hijack and hasattr(shared.sd_model.vae, 'decode'):
+    if selected.vae_hijack and hasattr(shared.sd_model, 'vae') and hasattr(shared.sd_model.vae, 'decode'):
         sd_hijack_vae.init_hijack(shared.sd_model)
         decode = True
     if selected.te_hijack and hasattr(shared.sd_model, 'encode_prompt'):
diff --git a/modules/video_models/video_run.py b/modules/video_models/video_run.py
index cdb1771d1..596497b6f 100644
--- a/modules/video_models/video_run.py
+++ b/modules/video_models/video_run.py
@@ -109,7 +109,7 @@ def generate(*args, **kwargs):
     orig_sampler_shift = shared.opts.schedulers_shift
     shared.opts.data['schedulers_dynamic_shift'] = dynamic_shift
     shared.opts.data['schedulers_shift'] = sampler_shift
-    if hasattr(shared.sd_model.scheduler, 'config') and hasattr(shared.sd_model.scheduler, 'register_to_config'):
+    if hasattr(shared.sd_model, 'scheduler') and hasattr(shared.sd_model.scheduler, 'config') and hasattr(shared.sd_model.scheduler, 'register_to_config'):
         if hasattr(shared.sd_model.scheduler.config, 'use_dynamic_shifting'):
             shared.sd_model.scheduler.config.use_dynamic_shifting = dynamic_shift
             shared.sd_model.scheduler.register_to_config(use_dynamic_shifting = dynamic_shift)
@@ -146,15 +146,18 @@ def generate(*args, **kwargs):
 
     # done
     if err:
         return video_utils.queue_err(err)
-    if processed is None or len(processed.images) == 0:
+    if processed is None or (len(processed.images) == 0 and processed.bytes is None):
         return video_utils.queue_err('processing failed')
     shared.log.info(f'Video: name="{selected.name}" cls={shared.sd_model.__class__.__name__} frames={len(processed.images)} time={t1-t0:.2f}')
-    # video_file = images.save_video(p, filename=None, images=processed.images, video_type=video_type, duration=video_duration, loop=video_loop, pad=video_pad, interpolate=video_interpolate) # legacy video save from list of images
-    pixels = video_save.images_to_tensor(processed.images)
+    if hasattr(processed, 'images') and processed.images is not None:
+        pixels = video_save.images_to_tensor(processed.images)
+    else:
+        pixels = None
     _num_frames, video_file = video_save.save_video(
         p=p,
         pixels=pixels,
+        binary=processed.bytes,
         mp4_fps=mp4_fps,
         mp4_codec=mp4_codec,
         mp4_opt=mp4_opt,
diff --git a/modules/video_models/video_save.py b/modules/video_models/video_save.py
index 29a960a4c..38bc961ef 100644
--- a/modules/video_models/video_save.py
+++ b/modules/video_models/video_save.py
@@ -107,7 +107,8 @@ def atomic_save_video(filename, tensor:torch.Tensor, fps:float=24, codec:str='li
 
 def save_video(
     p:processing.StableDiffusionProcessingVideo,
-    pixels:torch.Tensor,
+    pixels:torch.Tensor=None,
+    binary:bytes=None,
     mp4_fps:int=24,
     mp4_codec:str='libx264',
     mp4_opt:str='',
@@ -121,6 +122,23 @@ def save_video(
     pbar=None, # progress bar for video
 ):
     output_video = None
+
+    if binary is not None:
+        output_filename = get_video_filename(p)
+        output_video = f'{output_filename}.{mp4_ext}'
+        try:
+            with open(output_video, 'wb') as f:
+                f.write(binary)
+            shared.log.info(f'Video output: file="{output_video}" size={len(binary)}')
+            shared.state.outputs(output_video)
+        except Exception as e:
+            shared.log.error(f'Video output: file="{output_video}" {e}')
+            errors.display(e, 'video')
+        return 0, output_video
+
     if pixels is None:
         return 0, output_video
     if not torch.is_tensor(pixels):
diff --git 
a/modules/video_models/video_vae.py b/modules/video_models/video_vae.py index aff67c3f6..fefb116d9 100644 --- a/modules/video_models/video_vae.py +++ b/modules/video_models/video_vae.py @@ -9,6 +9,8 @@ vae_type = None def set_vae_params(p): global vae_type # pylint: disable=global-statement vae_type = p.vae_type + if not hasattr(shared.sd_model, 'vae'): + return if hasattr(shared.sd_model.vae, 'enable_slicing'): shared.sd_model.vae.enable_slicing() if p.frames > p.vae_tile_frames: diff --git a/pipelines/model_google.py b/pipelines/model_google.py index eff75e2a6..89089f211 100644 --- a/pipelines/model_google.py +++ b/pipelines/model_google.py @@ -24,6 +24,12 @@ aspect_ratios_buckets = { } +def google_requirements(): + install('google-genai==1.52.0') + install('pydantic==2.11.7', ignore=True, quiet=True) + reload('pydantic', '2.11.7') + + def get_size_buckets(width: int, height: int) -> str: aspect_ratio = width / height closest_aspect_ratio = min(aspect_ratios_buckets.items(), key=lambda x: abs(x[1] - aspect_ratio))[0] @@ -38,9 +44,7 @@ class GoogleNanoBananaPipeline(): self.model = model_name self.client = None self.config = None - install('google-genai==1.52.0') - install('pydantic==2.11.7', ignore=True, quiet=True) - reload('pydantic', '2.11.7') + google_requirements() log.debug(f'Load model: type=NanoBanana model="{model_name}"') def txt2img(self, prompt): @@ -81,7 +85,7 @@ class GoogleNanoBananaPipeline(): response_modalities=["IMAGE"], image_config=image_config ) - log.debug(f'Cloud: prompt={prompt} size={image_size} ar={aspect_ratio} image={image} model="{self.model}"') + log.debug(f'Cloud: prompt="{prompt}" size={image_size} ar={aspect_ratio} image={image} model="{self.model}"') # log.debug(f'Cloud: config={self.config}') try: @@ -114,6 +118,6 @@ if __name__ == "__main__": import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) log.info('test') - model =GoogleNanoBananaPipeline('gemini-3-pro-image-preview') + model = GoogleNanoBananaPipeline('gemini-3-pro-image-preview') img = model(['A beautiful landscape with mountains and a river'], 1024, 1024) img.save('test.png') diff --git a/wiki b/wiki index f7289d6c0..2fb950abd 160000 --- a/wiki +++ b/wiki @@ -1 +1 @@ -Subproject commit f7289d6c03899f519de8692efe8ea2731779607c +Subproject commit 2fb950abdeaad2d2a7976857f04646fc8c6963e1
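
---

Notes on the Google Veo integration, with usage sketches.

The new `get_size_buckets` helper in `modules/video_models/google_veo.py` maps any requested width/height to the closest resolution and aspect-ratio buckets that Veo accepts: it minimizes the absolute distance in total pixel count for the resolution, and in w/h ratio for the aspect ratio. A minimal standalone sketch of that logic (bucket tables copied from the patch, aspect-ratio table shortened for brevity):

    # nearest-bucket mapping as used by GoogleVeoVideoPipeline
    image_size_buckets = {'720p': 1280 * 720, '1080p': 1920 * 1080}
    aspect_ratios_buckets = {'1:1': 1.0, '16:9': 16 / 9, '9:16': 9 / 16, '21:9': 21 / 9}

    def get_size_buckets(width: int, height: int) -> tuple[str, str]:
        aspect_ratio = width / height
        # closest aspect-ratio bucket by distance of the w/h ratio
        closest_aspect_ratio = min(aspect_ratios_buckets.items(), key=lambda x: abs(x[1] - aspect_ratio))[0]
        # closest resolution bucket by distance of the total pixel count
        pixel_count = width * height
        closest_size = min(image_size_buckets.items(), key=lambda x: abs(x[1] - pixel_count))[0]
        return closest_size, closest_aspect_ratio

    assert get_size_buckets(1280, 720) == ('720p', '16:9')
    assert get_size_buckets(1024, 1024) == ('720p', '1:1')    # 1.05MP is nearer 0.92MP than 2.07MP
    assert get_size_buckets(2560, 1080) == ('1080p', '21:9')  # 2560/1080 = 2.37 is nearest 21:9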
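
The pipeline itself is a thin client around the google-genai long-running operations API: `generate_videos` returns an operation handle that is re-polled via `client.operations.get` until `done`, and the finished clip is then fetched with `files.download`, which populates `video_bytes`. A condensed end-to-end sketch of the same flow, assuming `GOOGLE_API_KEY` is set and `google-genai==1.52.0` is installed (the calls mirror the ones in the patch):

    import os
    import time
    from google import genai

    client = genai.Client(api_key=os.environ['GOOGLE_API_KEY'], vertexai=False)
    config = genai.types.GenerateVideosConfig(duration_seconds=4, aspect_ratio='16:9', resolution='720p')
    operation = client.models.generate_videos(model='veo-3.1-generate-preview', prompt='A city street at dusk', config=config)
    while not operation.done:  # generation is asynchronous on the server
        time.sleep(10)         # poll every 10 seconds, as the pipeline does
        operation = client.operations.get(operation)
    video = operation.response.generated_videos[0]
    client.files.download(file=video.video)  # fills in video.video.video_bytes
    with open('veo.mp4', 'wb') as f:
        f.write(video.video.video_bytes)     # Veo returns a fully muxed mp4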
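
Because the cloud backend returns an encoded mp4 rather than frames, the result is threaded through as raw bytes: the pipeline returns {'bytes': ..., 'images': []}, process_decode short-circuits when output.bytes is set, process_images_inner stores it on Processed via the new binary= argument, and save_video writes the container directly instead of running the tensor/ffmpeg encode path. Two small sketches of the supporting arithmetic and the write step (function names here are illustrative stand-ins, not the patch's API):

    def veo_duration(num_frames: int, fps: int = 24) -> int:
        # Veo 3.1 clips span 4..8 seconds; the pipeline derives seconds from
        # the requested frame count at 24 fps and clamps to that range
        return max(4, min(8, num_frames // fps))

    def write_binary_video(binary: bytes, output_video: str) -> tuple[int, str]:
        # bytes path: the backend already muxed the container, so the file is
        # written verbatim; save_video returns (num_frames, filename)
        with open(output_video, 'wb') as f:
            f.write(binary)
        return 0, output_video

    assert veo_duration(4 * 24) == 4   # default num_frames=4*24 -> 4 seconds
    assert veo_duration(16 * 24) == 8  # long requests clamp to 8 seconds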