diff --git a/cli/test-all-models.py b/cli/test-all-models.py index 163c193d6..a9853ebd1 100755 --- a/cli/test-all-models.py +++ b/cli/test-all-models.py @@ -1,14 +1,16 @@ #!/usr/bin/env python """ -Warning: +Warnings: - fal/AuraFlow-v0.3: layer_class_name=Linear layer_weight_shape=torch.Size([3072, 2, 1024]) weights_dtype=int8 unsupported - Kwai-Kolors/Kolors-diffusers: `set_input_embeddings` not auto‑handled for ChatGLMModel -Error: +- kandinsky-community/kandinsky-2-1: `get_input_embeddings` not auto‑handled for MultilingualCLIP +Errors: +- kandinsky-community/kandinsky-3: corrupt output - nvidia/Cosmos-Predict2-2B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x2048) - nvidia/Cosmos-Predict2-14B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x5120) - Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers: CUDA error: device-side assert triggered Other: -- HiDream-ai/HiDream-I1-Full: 30+s/it +- HiDream-ai/HiDream-I1-Full: very slow at 30+s/it """ import io @@ -54,10 +56,9 @@ models = { "OmniGen2/OmniGen2": {}, # "HiDream-ai/HiDream-I1-Full": {}, "Kwai-Kolors/Kolors-diffusers": {}, - "lodestones/Chroma1-HD": {}, - "vladmandic/chroma-unlocked-v50-annealed": {}, - "vladmandic/chroma-unlocked-v48": {}, - "vladmandic/chroma-unlocked-v48-detail-calibrated": {}, + # "kandinsky-community/kandinsky-3": {}, + "kandinsky-community/kandinsky-2-2-decoder": {}, + "kandinsky-community/kandinsky-2-1": {}, "Alpha-VLLM/Lumina-Next-SFT-diffusers": {}, "Alpha-VLLM/Lumina-Image-2.0": {}, "MeissonFlow/Meissonic": {}, @@ -68,14 +69,15 @@ models = { "Wan-AI/Wan2.1-T2V-1.3B-Diffusers": {}, "Wan-AI/Wan2.1-T2V-14B-Diffusers": {}, "stabilityai/stable-cascade": {}, + "lodestones/Chroma1-HD": {}, + "vladmandic/chroma-unlocked-v50-annealed": {}, + "vladmandic/chroma-unlocked-v48": {}, + "vladmandic/chroma-unlocked-v48-detail-calibrated": {}, } models_tbd = [ "black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-Kontext-dev", "black-forest-labs/FLUX.1-Krea-dev", - "kandinsky-community/kandinsky-3", # TODO - "kandinsky-community/kandinsky-2-2-decoder", - "kandinsky-community/kandinsky-2-1", ] styles = [ 'Fixed Astronaut', diff --git a/html/reference.json b/html/reference.json index 873dcaefb..a8c3447bb 100644 --- a/html/reference.json +++ b/html/reference.json @@ -490,7 +490,7 @@ }, "Kandinsky 2.2": { "path": "kandinsky-community/kandinsky-2-2-decoder", - "desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.", + "desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.2 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.", "preview": "kandinsky-community--kandinsky-2-2-decoder.jpg", "extras": "width: 768, height: 768, sampler: Default" }, diff --git a/modules/sd_detect.py b/modules/sd_detect.py index b5767956e..a6169e5c3 100644 --- a/modules/sd_detect.py +++ b/modules/sd_detect.py @@ -103,6 +103,12 @@ def guess_by_name(fn, current_guess): return 'Bria' elif 'qwen' in fn.lower(): return 'Qwen' + elif 'kandinsky-2-1' in fn.lower(): + return 'Kandinsky 2.1' + elif 'kandinsky-2-2' in fn.lower(): + return 'Kandinsky 2.2' + elif 'kandinsky-3' in fn.lower(): + return 'Kandinsky 3.0' return current_guess diff --git a/modules/sd_models.py b/modules/sd_models.py index da863a924..ec11f2c1e 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -381,6 +381,18 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op=' from pipelines.model_hunyuandit import load_hunyuandit sd_model = load_hunyuandit(checkpoint_info, diffusers_load_config) allow_post_quant = False + elif model_type in ['Kandinsky 2.1']: + from pipelines.model_kandinsky import load_kandinsky21 + sd_model = load_kandinsky21(checkpoint_info, diffusers_load_config) + allow_post_quant = True + elif model_type in ['Kandinsky 2.2']: + from pipelines.model_kandinsky import load_kandinsky22 + sd_model = load_kandinsky22(checkpoint_info, diffusers_load_config) + allow_post_quant = False + elif model_type in ['Kandinsky 3.0']: + from pipelines.model_kandinsky import load_kandinsky3 + sd_model = load_kandinsky3(checkpoint_info, diffusers_load_config) + allow_post_quant = False except Exception as e: shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}') if debug_load: diff --git a/modules/sd_offload.py b/modules/sd_offload.py index c6156a3e8..4a253a428 100644 --- a/modules/sd_offload.py +++ b/modules/sd_offload.py @@ -16,7 +16,7 @@ debug_move = log.trace if debug else lambda *args, **kwargs: None offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'omnigen2', 'cogview4', 'cosmos', 'chroma'] offload_post = ['h1'] offload_hook_instance = None -balanced_offload_exclude = ['CogView4Pipeline'] +balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline'] accelerate_dtype_byte_size = None diff --git a/modules/sd_vae.py b/modules/sd_vae.py index 2578ff196..7986b2568 100644 --- a/modules/sd_vae.py +++ b/modules/sd_vae.py @@ -34,7 +34,7 @@ def get_vae_scale_factor(model=None): elif hasattr(model, 'config') and hasattr(model.config, 'vae_scale_factor'): vae_scale_factor = model.config.vae_scale_factor else: - shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown') + # shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown') vae_scale_factor = 8 if hasattr(model, 'patch_size'): patch_size = model.patch_size diff --git a/modules/sd_vae_taesd.py b/modules/sd_vae_taesd.py index 7fc9c35f7..a89dff777 100644 --- a/modules/sd_vae_taesd.py +++ b/modules/sd_vae_taesd.py @@ -45,7 +45,7 @@ def warn_once(msg, variant=None): global prev_warnings # pylint: disable=global-statement if not prev_warnings: prev_warnings = True - shared.log.error(f'Decode: type="taesd" variant="{variant}": {msg}') + shared.log.warning(f'Decode: type="taesd" variant="{variant}": {msg}') return Image.new('RGB', (8, 8), color = (0, 0, 0)) diff --git a/pipelines/generic.py b/pipelines/generic.py index 3801963d5..102b5e6b8 100644 --- a/pipelines/generic.py +++ b/pipelines/generic.py @@ -8,9 +8,10 @@ from modules import shared, devices, sd_models, model_quant debug = shared.log.trace if os.environ.get('SD_LOAD_DEBUG', None) is not None else lambda *args, **kwargs: None -def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None): +def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None, dtype=None): load_args, quant_args = model_quant.get_dit_args(load_config, module='Model', device_map=True, allow_quant=allow_quant) quant_type = model_quant.get_quant_type(quant_args) + dtype = dtype or devices.dtype local_file = None if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default': @@ -27,7 +28,7 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", loader = cls_name.from_single_file if hasattr(cls_name, 'from_single_file') else cls_name.from_pretrained transformer = loader( local_file, - quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype), + quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype), cache_dir=shared.opts.hfcache_dir, **load_args, ) @@ -43,6 +44,8 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", transformer = model_quant.do_post_load_quant(transformer, allow=quant_type is not None) else: shared.log.debug(f'Load model: transformer="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" args={load_args}') + if dtype is not None: + load_args['torch_dtype'] = dtype if subfolder is not None: load_args['subfolder'] = subfolder if variant is not None: @@ -58,10 +61,11 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", return transformer -def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None): +def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None, dtype=None): load_args, quant_args = model_quant.get_dit_args(load_config, module='TE', device_map=True, allow_quant=allow_quant) quant_type = model_quant.get_quant_type(quant_args) text_encoder = None + dtype = dtype or devices.dtype # load from local file if specified local_file = None @@ -79,7 +83,7 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder ggml.install_gguf() text_encoder = cls_name.from_pretrained( gguf_file=local_file, - quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype), + quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype), cache_dir=shared.opts.hfcache_dir, **load_args, ) @@ -104,12 +108,14 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="SVDQuant"') text_encoder = nunchaku.NunchakuT5EncoderModel.from_pretrained( repo_id, - torch_dtype=devices.dtype, + torch_dtype=dtype, ) text_encoder.quantization_method = 'SVDQuant' elif shared.opts.te_shared_t5: repo_id = 'Disty0/t5-xxl' shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}') + if dtype is not None: + load_args['torch_dtype'] = dtype text_encoder = cls_name.from_pretrained( repo_id, cache_dir=shared.opts.hfcache_dir, @@ -120,6 +126,8 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder # load from repo if text_encoder is None: shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}') + if dtype is not None: + load_args['torch_dtype'] = dtype if subfolder is not None: load_args['subfolder'] = subfolder if variant is not None: diff --git a/pipelines/meissonic/pipeline.py b/pipelines/meissonic/pipeline.py index 4f1bb05a2..34b894081 100644 --- a/pipelines/meissonic/pipeline.py +++ b/pipelines/meissonic/pipeline.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sys from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -49,7 +48,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): return latent_image_ids.to(device=device, dtype=dtype) -class Pipeline(DiffusionPipeline): +class MeissonicPipeline(DiffusionPipeline): image_processor: VaeImageProcessor vqvae: VQModel tokenizer: CLIPTokenizer @@ -212,27 +211,27 @@ class Pipeline(DiffusionPipeline): width = self.transformer.config.sample_size * self.vae_scale_factor if prompt_embeds is None: - input_ids = self.tokenizer( - prompt, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=77, #self.tokenizer.model_max_length, - ).input_ids.to(self._execution_device) - # input_ids_t5 = self.tokenizer_t5( - # prompt, - # return_tensors="pt", - # padding="max_length", - # truncation=True, - # max_length=512, - # ).input_ids.to(self._execution_device) + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=77, #self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + # input_ids_t5 = self.tokenizer_t5( + # prompt, + # return_tensors="pt", + # padding="max_length", + # truncation=True, + # max_length=512, + # ).input_ids.to(self._execution_device) - outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) - # outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True) - prompt_embeds = outputs.text_embeds - encoder_hidden_states = outputs.hidden_states[-2] - # encoder_hidden_states = outputs_t5.encoder_hidden_states[-2] + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + # outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + # encoder_hidden_states = outputs_t5.encoder_hidden_states[-2] prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) diff --git a/pipelines/meissonic/pipeline_img2img.py b/pipelines/meissonic/pipeline_img2img.py index 13e5c3717..2aaf9d987 100644 --- a/pipelines/meissonic/pipeline_img2img.py +++ b/pipelines/meissonic/pipeline_img2img.py @@ -46,7 +46,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): return latent_image_ids.to(device=device, dtype=dtype) -class Img2ImgPipeline(DiffusionPipeline): +class MeissonicImg2ImgPipeline(DiffusionPipeline): image_processor: VaeImageProcessor vqvae: VQModel tokenizer: CLIPTokenizer diff --git a/pipelines/meissonic/pipeline_inpaint.py b/pipelines/meissonic/pipeline_inpaint.py index d405afa53..aa352d9b4 100644 --- a/pipelines/meissonic/pipeline_inpaint.py +++ b/pipelines/meissonic/pipeline_inpaint.py @@ -43,7 +43,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): return latent_image_ids.to(device=device, dtype=dtype) -class InpaintPipeline(DiffusionPipeline): +class MeissonicInpaintPipeline(DiffusionPipeline): image_processor: VaeImageProcessor vqvae: VQModel tokenizer: CLIPTokenizer diff --git a/pipelines/model_kandinsky.py b/pipelines/model_kandinsky.py new file mode 100644 index 000000000..0d5ad0013 --- /dev/null +++ b/pipelines/model_kandinsky.py @@ -0,0 +1,65 @@ +import transformers +import diffusers +from modules import shared, sd_models, devices, model_quant, sd_hijack_te +from pipelines import generic + + +def load_kandinsky21(checkpoint_info, diffusers_load_config={}): + repo_id = sd_models.path_to_repo(checkpoint_info) + sd_models.hf_auth_check(checkpoint_info) + + load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config) + shared.log.debug(f'Load model: type=Kandinsky21 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}') + pipe = diffusers.KandinskyCombinedPipeline.from_pretrained( + repo_id, + cache_dir=shared.opts.diffusers_dir, + **load_args, + ) + sd_hijack_te.init_hijack(pipe) + devices.torch_gc(force=True, reason='load') + return pipe + + +def load_kandinsky22(checkpoint_info, diffusers_load_config={}): + repo_id = sd_models.path_to_repo(checkpoint_info) + sd_models.hf_auth_check(checkpoint_info) + + load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config) + shared.log.debug(f'Load model: type=Kandinsky22 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}') + pipe = diffusers.KandinskyV22CombinedPipeline.from_pretrained( + repo_id, + cache_dir=shared.opts.diffusers_dir, + **load_args, + ) + sd_hijack_te.init_hijack(pipe) + devices.torch_gc(force=True, reason='load') + return pipe + + +def load_kandinsky3(checkpoint_info, diffusers_load_config={}): + repo_id = sd_models.path_to_repo(checkpoint_info) + sd_models.hf_auth_check(checkpoint_info) + + load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config) + shared.log.debug(f'Load model: type=Kandinsky30 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}') + + unet = generic.load_transformer(repo_id, cls_name=diffusers.Kandinsky3UNet, load_config=diffusers_load_config, subfolder="unet", variant="fp16") + text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.T5EncoderModel, load_config=diffusers_load_config, subfolder="text_encoder", variant="fp16") + + pipe = diffusers.Kandinsky3Pipeline.from_pretrained( + repo_id, + unet=unet, + text_encoder=text_encoder, + variant="fp16", + cache_dir=shared.opts.diffusers_dir, + **load_args, + ) + pipe.task_args = { + 'output_type': 'np', + } + + del text_encoder + del unet + sd_hijack_te.init_hijack(pipe) + devices.torch_gc(force=True, reason='load') + return pipe