fix meissonic

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4115/head
Vladimir Mandic 2025-08-11 11:02:39 -04:00
parent e42a27a0e4
commit dc8a72947d
12 changed files with 134 additions and 42 deletions

View File

@ -1,14 +1,16 @@
#!/usr/bin/env python
"""
Warning:
Warnings:
- fal/AuraFlow-v0.3: layer_class_name=Linear layer_weight_shape=torch.Size([3072, 2, 1024]) weights_dtype=int8 unsupported
- Kwai-Kolors/Kolors-diffusers: `set_input_embeddings` not autohandled for ChatGLMModel
Error:
- kandinsky-community/kandinsky-2-1: `get_input_embeddings` not autohandled for MultilingualCLIP
Errors:
- kandinsky-community/kandinsky-3: corrupt output
- nvidia/Cosmos-Predict2-2B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x2048)
- nvidia/Cosmos-Predict2-14B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x5120)
- Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers: CUDA error: device-side assert triggered
Other:
- HiDream-ai/HiDream-I1-Full: 30+s/it
- HiDream-ai/HiDream-I1-Full: very slow at 30+s/it
"""
import io
@ -54,10 +56,9 @@ models = {
"OmniGen2/OmniGen2": {},
# "HiDream-ai/HiDream-I1-Full": {},
"Kwai-Kolors/Kolors-diffusers": {},
"lodestones/Chroma1-HD": {},
"vladmandic/chroma-unlocked-v50-annealed": {},
"vladmandic/chroma-unlocked-v48": {},
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
# "kandinsky-community/kandinsky-3": {},
"kandinsky-community/kandinsky-2-2-decoder": {},
"kandinsky-community/kandinsky-2-1": {},
"Alpha-VLLM/Lumina-Next-SFT-diffusers": {},
"Alpha-VLLM/Lumina-Image-2.0": {},
"MeissonFlow/Meissonic": {},
@ -68,14 +69,15 @@ models = {
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers": {},
"Wan-AI/Wan2.1-T2V-14B-Diffusers": {},
"stabilityai/stable-cascade": {},
"lodestones/Chroma1-HD": {},
"vladmandic/chroma-unlocked-v50-annealed": {},
"vladmandic/chroma-unlocked-v48": {},
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
}
models_tbd = [
"black-forest-labs/FLUX.1-dev",
"black-forest-labs/FLUX.1-Kontext-dev",
"black-forest-labs/FLUX.1-Krea-dev",
"kandinsky-community/kandinsky-3", # TODO
"kandinsky-community/kandinsky-2-2-decoder",
"kandinsky-community/kandinsky-2-1",
]
styles = [
'Fixed Astronaut',

View File

@ -490,7 +490,7 @@
},
"Kandinsky 2.2": {
"path": "kandinsky-community/kandinsky-2-2-decoder",
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.2 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
"preview": "kandinsky-community--kandinsky-2-2-decoder.jpg",
"extras": "width: 768, height: 768, sampler: Default"
},

View File

@ -103,6 +103,12 @@ def guess_by_name(fn, current_guess):
return 'Bria'
elif 'qwen' in fn.lower():
return 'Qwen'
elif 'kandinsky-2-1' in fn.lower():
return 'Kandinsky 2.1'
elif 'kandinsky-2-2' in fn.lower():
return 'Kandinsky 2.2'
elif 'kandinsky-3' in fn.lower():
return 'Kandinsky 3.0'
return current_guess

View File

@ -381,6 +381,18 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op='
from pipelines.model_hunyuandit import load_hunyuandit
sd_model = load_hunyuandit(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Kandinsky 2.1']:
from pipelines.model_kandinsky import load_kandinsky21
sd_model = load_kandinsky21(checkpoint_info, diffusers_load_config)
allow_post_quant = True
elif model_type in ['Kandinsky 2.2']:
from pipelines.model_kandinsky import load_kandinsky22
sd_model = load_kandinsky22(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Kandinsky 3.0']:
from pipelines.model_kandinsky import load_kandinsky3
sd_model = load_kandinsky3(checkpoint_info, diffusers_load_config)
allow_post_quant = False
except Exception as e:
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
if debug_load:

View File

@ -16,7 +16,7 @@ debug_move = log.trace if debug else lambda *args, **kwargs: None
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'omnigen2', 'cogview4', 'cosmos', 'chroma']
offload_post = ['h1']
offload_hook_instance = None
balanced_offload_exclude = ['CogView4Pipeline']
balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
accelerate_dtype_byte_size = None

View File

@ -34,7 +34,7 @@ def get_vae_scale_factor(model=None):
elif hasattr(model, 'config') and hasattr(model.config, 'vae_scale_factor'):
vae_scale_factor = model.config.vae_scale_factor
else:
shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
# shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
vae_scale_factor = 8
if hasattr(model, 'patch_size'):
patch_size = model.patch_size

View File

@ -45,7 +45,7 @@ def warn_once(msg, variant=None):
global prev_warnings # pylint: disable=global-statement
if not prev_warnings:
prev_warnings = True
shared.log.error(f'Decode: type="taesd" variant="{variant}": {msg}')
shared.log.warning(f'Decode: type="taesd" variant="{variant}": {msg}')
return Image.new('RGB', (8, 8), color = (0, 0, 0))

View File

@ -8,9 +8,10 @@ from modules import shared, devices, sd_models, model_quant
debug = shared.log.trace if os.environ.get('SD_LOAD_DEBUG', None) is not None else lambda *args, **kwargs: None
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None):
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None, dtype=None):
load_args, quant_args = model_quant.get_dit_args(load_config, module='Model', device_map=True, allow_quant=allow_quant)
quant_type = model_quant.get_quant_type(quant_args)
dtype = dtype or devices.dtype
local_file = None
if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default':
@ -27,7 +28,7 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
loader = cls_name.from_single_file if hasattr(cls_name, 'from_single_file') else cls_name.from_pretrained
transformer = loader(
local_file,
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
cache_dir=shared.opts.hfcache_dir,
**load_args,
)
@ -43,6 +44,8 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
transformer = model_quant.do_post_load_quant(transformer, allow=quant_type is not None)
else:
shared.log.debug(f'Load model: transformer="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" args={load_args}')
if dtype is not None:
load_args['torch_dtype'] = dtype
if subfolder is not None:
load_args['subfolder'] = subfolder
if variant is not None:
@ -58,10 +61,11 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
return transformer
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None):
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None, dtype=None):
load_args, quant_args = model_quant.get_dit_args(load_config, module='TE', device_map=True, allow_quant=allow_quant)
quant_type = model_quant.get_quant_type(quant_args)
text_encoder = None
dtype = dtype or devices.dtype
# load from local file if specified
local_file = None
@ -79,7 +83,7 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
ggml.install_gguf()
text_encoder = cls_name.from_pretrained(
gguf_file=local_file,
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
cache_dir=shared.opts.hfcache_dir,
**load_args,
)
@ -104,12 +108,14 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="SVDQuant"')
text_encoder = nunchaku.NunchakuT5EncoderModel.from_pretrained(
repo_id,
torch_dtype=devices.dtype,
torch_dtype=dtype,
)
text_encoder.quantization_method = 'SVDQuant'
elif shared.opts.te_shared_t5:
repo_id = 'Disty0/t5-xxl'
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
if dtype is not None:
load_args['torch_dtype'] = dtype
text_encoder = cls_name.from_pretrained(
repo_id,
cache_dir=shared.opts.hfcache_dir,
@ -120,6 +126,8 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
# load from repo
if text_encoder is None:
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
if dtype is not None:
load_args['torch_dtype'] = dtype
if subfolder is not None:
load_args['subfolder'] = subfolder
if variant is not None:

View File

@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
@ -49,7 +48,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class Pipeline(DiffusionPipeline):
class MeissonicPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer
@ -212,27 +211,27 @@ class Pipeline(DiffusionPipeline):
width = self.transformer.config.sample_size * self.vae_scale_factor
if prompt_embeds is None:
input_ids = self.tokenizer(
prompt,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=77, #self.tokenizer.model_max_length,
).input_ids.to(self._execution_device)
# input_ids_t5 = self.tokenizer_t5(
# prompt,
# return_tensors="pt",
# padding="max_length",
# truncation=True,
# max_length=512,
# ).input_ids.to(self._execution_device)
input_ids = self.tokenizer(
prompt,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=77, #self.tokenizer.model_max_length,
).input_ids.to(self._execution_device)
# input_ids_t5 = self.tokenizer_t5(
# prompt,
# return_tensors="pt",
# padding="max_length",
# truncation=True,
# max_length=512,
# ).input_ids.to(self._execution_device)
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
prompt_embeds = outputs.text_embeds
encoder_hidden_states = outputs.hidden_states[-2]
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
prompt_embeds = outputs.text_embeds
encoder_hidden_states = outputs.hidden_states[-2]
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)

View File

@ -46,7 +46,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class Img2ImgPipeline(DiffusionPipeline):
class MeissonicImg2ImgPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer

View File

@ -43,7 +43,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
return latent_image_ids.to(device=device, dtype=dtype)
class InpaintPipeline(DiffusionPipeline):
class MeissonicInpaintPipeline(DiffusionPipeline):
image_processor: VaeImageProcessor
vqvae: VQModel
tokenizer: CLIPTokenizer

View File

@ -0,0 +1,65 @@
import transformers
import diffusers
from modules import shared, sd_models, devices, model_quant, sd_hijack_te
from pipelines import generic
def load_kandinsky21(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 2.1 combined (prior + decoder) pipeline for a checkpoint.

    Resolves the HF repo from *checkpoint_info*, verifies hub auth, loads the
    combined pipeline, installs the shared text-encoder hijack, and returns it.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky21 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipeline_cls = diffusers.KandinskyCombinedPipeline
    pipeline = pipeline_cls.from_pretrained(repo_id, cache_dir=shared.opts.diffusers_dir, **load_args)
    # route text-encoder handling through the project-wide hijack before use
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline
def load_kandinsky22(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 2.2 combined (prior + decoder) pipeline for a checkpoint.

    Mirrors the 2.1 loader but instantiates ``KandinskyV22CombinedPipeline``.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky22 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipeline_cls = diffusers.KandinskyV22CombinedPipeline
    pipeline = pipeline_cls.from_pretrained(repo_id, cache_dir=shared.opts.diffusers_dir, **load_args)
    # route text-encoder handling through the project-wide hijack before use
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline
def load_kandinsky3(checkpoint_info, diffusers_load_config={}):
    """Build the Kandinsky 3.0 pipeline, loading the UNet and T5 text encoder separately.

    The two large submodules go through the ``generic`` helpers (which apply the
    project quantization/offload settings) and are then handed to
    ``Kandinsky3Pipeline.from_pretrained`` so the remaining components come from
    the repo. Returns the assembled pipeline with ``task_args`` preset.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)  # may require a HF token for gated repos
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky30 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    # heavyweight submodules are loaded individually so quantization can apply
    unet = generic.load_transformer(repo_id, cls_name=diffusers.Kandinsky3UNet, load_config=diffusers_load_config, subfolder="unet", variant="fp16")
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.T5EncoderModel, load_config=diffusers_load_config, subfolder="text_encoder", variant="fp16")
    pipeline = diffusers.Kandinsky3Pipeline.from_pretrained(
        repo_id,
        unet=unet,
        text_encoder=text_encoder,
        variant="fp16",
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )
    # pipeline produces numpy output downstream; keep the preset task args
    pipeline.task_args = {
        'output_type': 'np',
    }
    # drop local references so only the pipeline keeps the submodules alive
    del unet
    del text_encoder
    sd_hijack_te.init_hijack(pipeline)
    devices.torch_gc(force=True, reason='load')
    return pipeline