fix meissonic

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4115/head
Vladimir Mandic 2025-08-11 11:02:39 -04:00
parent e42a27a0e4
commit dc8a72947d
12 changed files with 134 additions and 42 deletions

View File

@ -1,14 +1,16 @@
#!/usr/bin/env python
"""
Warning:
Warnings:
- fal/AuraFlow-v0.3: layer_class_name=Linear layer_weight_shape=torch.Size([3072, 2, 1024]) weights_dtype=int8 unsupported
- Kwai-Kolors/Kolors-diffusers: `set_input_embeddings` not autohandled for ChatGLMModel
Error:
- kandinsky-community/kandinsky-2-1: `get_input_embeddings` not autohandled for MultilingualCLIP
Errors:
- kandinsky-community/kandinsky-3: corrupt output
- nvidia/Cosmos-Predict2-2B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x2048)
- nvidia/Cosmos-Predict2-14B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x5120)
- Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers: CUDA error: device-side assert triggered
Other:
- HiDream-ai/HiDream-I1-Full: 30+s/it
- HiDream-ai/HiDream-I1-Full: very slow at 30+s/it
"""
import io
@ -54,10 +56,9 @@ models = {
"OmniGen2/OmniGen2": {},
# "HiDream-ai/HiDream-I1-Full": {},
"Kwai-Kolors/Kolors-diffusers": {},
"lodestones/Chroma1-HD": {},
"vladmandic/chroma-unlocked-v50-annealed": {},
"vladmandic/chroma-unlocked-v48": {},
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
# "kandinsky-community/kandinsky-3": {},
"kandinsky-community/kandinsky-2-2-decoder": {},
"kandinsky-community/kandinsky-2-1": {},
"Alpha-VLLM/Lumina-Next-SFT-diffusers": {},
"Alpha-VLLM/Lumina-Image-2.0": {},
"MeissonFlow/Meissonic": {},
@ -68,14 +69,15 @@ models = {
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers": {},
"Wan-AI/Wan2.1-T2V-14B-Diffusers": {},
"stabilityai/stable-cascade": {},
"lodestones/Chroma1-HD": {},
"vladmandic/chroma-unlocked-v50-annealed": {},
"vladmandic/chroma-unlocked-v48": {},
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
}
models_tbd = [
"black-forest-labs/FLUX.1-dev",
"black-forest-labs/FLUX.1-Kontext-dev",
"black-forest-labs/FLUX.1-Krea-dev",
"kandinsky-community/kandinsky-3", # TODO
"kandinsky-community/kandinsky-2-2-decoder",
"kandinsky-community/kandinsky-2-1",
]
styles = [
'Fixed Astronaut',

View File

@ -490,7 +490,7 @@
},
"Kandinsky 2.2": {
"path": "kandinsky-community/kandinsky-2-2-decoder",
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.2 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
"preview": "kandinsky-community--kandinsky-2-2-decoder.jpg",
"extras": "width: 768, height: 768, sampler: Default"
},

View File

@ -103,6 +103,12 @@ def guess_by_name(fn, current_guess):
return 'Bria'
elif 'qwen' in fn.lower():
return 'Qwen'
elif 'kandinsky-2-1' in fn.lower():
return 'Kandinsky 2.1'
elif 'kandinsky-2-2' in fn.lower():
return 'Kandinsky 2.2'
elif 'kandinsky-3' in fn.lower():
return 'Kandinsky 3.0'
return current_guess

View File

@ -381,6 +381,18 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op='
from pipelines.model_hunyuandit import load_hunyuandit
sd_model = load_hunyuandit(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Kandinsky 2.1']:
from pipelines.model_kandinsky import load_kandinsky21
sd_model = load_kandinsky21(checkpoint_info, diffusers_load_config)
allow_post_quant = True
elif model_type in ['Kandinsky 2.2']:
from pipelines.model_kandinsky import load_kandinsky22
sd_model = load_kandinsky22(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Kandinsky 3.0']:
from pipelines.model_kandinsky import load_kandinsky3
sd_model = load_kandinsky3(checkpoint_info, diffusers_load_config)
allow_post_quant = False
except Exception as e:
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
if debug_load:

View File

@ -16,7 +16,7 @@ debug_move = log.trace if debug else lambda *args, **kwargs: None
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'omnigen2', 'cogview4', 'cosmos', 'chroma']
offload_post = ['h1']
offload_hook_instance = None
balanced_offload_exclude = ['CogView4Pipeline']
balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
accelerate_dtype_byte_size = None

View File

@ -34,7 +34,7 @@ def get_vae_scale_factor(model=None):
elif hasattr(model, 'config') and hasattr(model.config, 'vae_scale_factor'):
vae_scale_factor = model.config.vae_scale_factor
else:
shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
# shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
vae_scale_factor = 8
if hasattr(model, 'patch_size'):
patch_size = model.patch_size

View File

@ -45,7 +45,7 @@ def warn_once(msg, variant=None):
global prev_warnings # pylint: disable=global-statement
if not prev_warnings:
prev_warnings = True
shared.log.error(f'Decode: type="taesd" variant="{variant}": {msg}')
shared.log.warning(f'Decode: type="taesd" variant="{variant}": {msg}')
return Image.new('RGB', (8, 8), color = (0, 0, 0))

View File

@ -8,9 +8,10 @@ from modules import shared, devices, sd_models, model_quant
debug = shared.log.trace if os.environ.get('SD_LOAD_DEBUG', None) is not None else lambda *args, **kwargs: None
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None):
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None, dtype=None):
load_args, quant_args = model_quant.get_dit_args(load_config, module='Model', device_map=True, allow_quant=allow_quant)
quant_type = model_quant.get_quant_type(quant_args)
dtype = dtype or devices.dtype
local_file = None
if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default':
@ -27,7 +28,7 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
loader = cls_name.from_single_file if hasattr(cls_name, 'from_single_file') else cls_name.from_pretrained
transformer = loader(
local_file,
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
cache_dir=shared.opts.hfcache_dir,
**load_args,
)
@ -43,6 +44,8 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
transformer = model_quant.do_post_load_quant(transformer, allow=quant_type is not None)
else:
shared.log.debug(f'Load model: transformer="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" args={load_args}')
if dtype is not None:
load_args['torch_dtype'] = dtype
if subfolder is not None:
load_args['subfolder'] = subfolder
if variant is not None:
@ -58,10 +61,11 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
return transformer
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None):
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None, dtype=None):
load_args, quant_args = model_quant.get_dit_args(load_config, module='TE', device_map=True, allow_quant=allow_quant)
quant_type = model_quant.get_quant_type(quant_args)
text_encoder = None
dtype = dtype or devices.dtype
# load from local file if specified
local_file = None
@ -79,7 +83,7 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
ggml.install_gguf()
text_encoder = cls_name.from_pretrained(
gguf_file=local_file,
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
cache_dir=shared.opts.hfcache_dir,
**load_args,
)
@ -104,12 +108,14 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="SVDQuant"')
text_encoder = nunchaku.NunchakuT5EncoderModel.from_pretrained(
repo_id,
torch_dtype=devices.dtype,
torch_dtype=dtype,
)
text_encoder.quantization_method = 'SVDQuant'
elif shared.opts.te_shared_t5:
repo_id = 'Disty0/t5-xxl'
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
if dtype is not None:
load_args['torch_dtype'] = dtype
text_encoder = cls_name.from_pretrained(
repo_id,
cache_dir=shared.opts.hfcache_dir,
@ -120,6 +126,8 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
# load from repo
if text_encoder is None:
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
if dtype is not None:
load_args['torch_dtype'] = dtype
if subfolder is not None:
load_args['subfolder'] = subfolder
if variant is not None:

View File

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
@ -49,7 +48,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class Pipeline(DiffusionPipeline):
class MeissonicPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer
@ -212,27 +211,27 @@ class Pipeline(DiffusionPipeline):
width = self.transformer.config.sample_size * self.vae_scale_factor
if prompt_embeds is None:
input_ids = self.tokenizer(
prompt,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=77, #self.tokenizer.model_max_length,
).input_ids.to(self._execution_device)
# input_ids_t5 = self.tokenizer_t5(
# prompt,
# return_tensors="pt",
# padding="max_length",
# truncation=True,
# max_length=512,
# ).input_ids.to(self._execution_device)
input_ids = self.tokenizer(
prompt,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=77, #self.tokenizer.model_max_length,
).input_ids.to(self._execution_device)
# input_ids_t5 = self.tokenizer_t5(
# prompt,
# return_tensors="pt",
# padding="max_length",
# truncation=True,
# max_length=512,
# ).input_ids.to(self._execution_device)
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
prompt_embeds = outputs.text_embeds
encoder_hidden_states = outputs.hidden_states[-2]
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
prompt_embeds = outputs.text_embeds
encoder_hidden_states = outputs.hidden_states[-2]
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)

View File

@ -46,7 +46,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class Img2ImgPipeline(DiffusionPipeline):
class MeissonicImg2ImgPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer

View File

@ -43,7 +43,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class InpaintPipeline(DiffusionPipeline):
class MeissonicInpaintPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer

View File

@ -0,0 +1,65 @@
import transformers
import diffusers
from modules import shared, sd_models, devices, model_quant, sd_hijack_te
from pipelines import generic
def load_kandinsky21(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 2.1 combined (prior + decoder) pipeline for a checkpoint.

    Resolves the HF repo from *checkpoint_info*, verifies hub auth, loads the
    combined pipeline, installs the shared text-encoder hijack, and returns it.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky21 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipeline_cls = diffusers.KandinskyCombinedPipeline
    pipeline = pipeline_cls.from_pretrained(repo_id, cache_dir=shared.opts.diffusers_dir, **load_args)
    # route text-encoder handling through the project-wide hijack before use
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline
def load_kandinsky22(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 2.2 combined (prior + decoder) pipeline for a checkpoint.

    Mirrors the 2.1 loader but instantiates ``KandinskyV22CombinedPipeline``.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky22 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipeline_cls = diffusers.KandinskyV22CombinedPipeline
    pipeline = pipeline_cls.from_pretrained(repo_id, cache_dir=shared.opts.diffusers_dir, **load_args)
    # route text-encoder handling through the project-wide hijack before use
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline
def load_kandinsky3(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 3.0 pipeline, loading the UNet and T5 text encoder separately.

    The two large submodules go through the ``generic`` helpers (which apply the
    project quantization/offload settings) and are then handed to
    ``Kandinsky3Pipeline.from_pretrained`` so the remaining components come from
    the repo. Returns the assembled pipeline with ``task_args`` preset.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky30 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    # heavyweight submodules are loaded individually so quantization can apply
    unet = generic.load_transformer(repo_id, cls_name=diffusers.Kandinsky3UNet, load_config=diffusers_load_config, subfolder="unet", variant="fp16")
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.T5EncoderModel, load_config=diffusers_load_config, subfolder="text_encoder", variant="fp16")
    pipeline = diffusers.Kandinsky3Pipeline.from_pretrained(
        repo_id,
        unet=unet,
        text_encoder=text_encoder,
        variant="fp16",
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )
    # pipeline produces numpy output downstream; keep the preset task args
    pipeline.task_args = {
        'output_type': 'np',
    }
    # drop local references so only the pipeline keeps the submodules alive
    del unet
    del text_encoder
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline