mirror of https://github.com/vladmandic/automatic
parent
e42a27a0e4
commit
dc8a72947d
|
|
@ -1,14 +1,16 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Warning:
|
||||
Warnings:
|
||||
- fal/AuraFlow-v0.3: layer_class_name=Linear layer_weight_shape=torch.Size([3072, 2, 1024]) weights_dtype=int8 unsupported
|
||||
- Kwai-Kolors/Kolors-diffusers: `set_input_embeddings` not auto‑handled for ChatGLMModel
|
||||
Error:
|
||||
- kandinsky-community/kandinsky-2-1: `get_input_embeddings` not auto‑handled for MultilingualCLIP
|
||||
Errors:
|
||||
- kandinsky-community/kandinsky-3: corrupt output
|
||||
- nvidia/Cosmos-Predict2-2B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x2048)
|
||||
- nvidia/Cosmos-Predict2-14B-Text2Image: mat1 and mat2 shapes cannot be multiplied (512x4096 and 1024x5120)
|
||||
- Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers: CUDA error: device-side assert triggered
|
||||
Other:
|
||||
- HiDream-ai/HiDream-I1-Full: 30+s/it
|
||||
- HiDream-ai/HiDream-I1-Full: very slow at 30+s/it
|
||||
"""
|
||||
|
||||
import io
|
||||
|
|
@ -54,10 +56,9 @@ models = {
|
|||
"OmniGen2/OmniGen2": {},
|
||||
# "HiDream-ai/HiDream-I1-Full": {},
|
||||
"Kwai-Kolors/Kolors-diffusers": {},
|
||||
"lodestones/Chroma1-HD": {},
|
||||
"vladmandic/chroma-unlocked-v50-annealed": {},
|
||||
"vladmandic/chroma-unlocked-v48": {},
|
||||
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
|
||||
# "kandinsky-community/kandinsky-3": {},
|
||||
"kandinsky-community/kandinsky-2-2-decoder": {},
|
||||
"kandinsky-community/kandinsky-2-1": {},
|
||||
"Alpha-VLLM/Lumina-Next-SFT-diffusers": {},
|
||||
"Alpha-VLLM/Lumina-Image-2.0": {},
|
||||
"MeissonFlow/Meissonic": {},
|
||||
|
|
@ -68,14 +69,15 @@ models = {
|
|||
"Wan-AI/Wan2.1-T2V-1.3B-Diffusers": {},
|
||||
"Wan-AI/Wan2.1-T2V-14B-Diffusers": {},
|
||||
"stabilityai/stable-cascade": {},
|
||||
"lodestones/Chroma1-HD": {},
|
||||
"vladmandic/chroma-unlocked-v50-annealed": {},
|
||||
"vladmandic/chroma-unlocked-v48": {},
|
||||
"vladmandic/chroma-unlocked-v48-detail-calibrated": {},
|
||||
}
|
||||
models_tbd = [
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
"black-forest-labs/FLUX.1-Kontext-dev",
|
||||
"black-forest-labs/FLUX.1-Krea-dev",
|
||||
"kandinsky-community/kandinsky-3", # TODO
|
||||
"kandinsky-community/kandinsky-2-2-decoder",
|
||||
"kandinsky-community/kandinsky-2-1",
|
||||
]
|
||||
styles = [
|
||||
'Fixed Astronaut',
|
||||
|
|
|
|||
|
|
@ -490,7 +490,7 @@
|
|||
},
|
||||
"Kandinsky 2.2": {
|
||||
"path": "kandinsky-community/kandinsky-2-2-decoder",
|
||||
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
|
||||
"desc": "Kandinsky 2.2 is a text-conditional diffusion model (+0.1!) based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.2 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
|
||||
"preview": "kandinsky-community--kandinsky-2-2-decoder.jpg",
|
||||
"extras": "width: 768, height: 768, sampler: Default"
|
||||
},
|
||||
|
|
|
|||
|
|
@ -103,6 +103,12 @@ def guess_by_name(fn, current_guess):
|
|||
return 'Bria'
|
||||
elif 'qwen' in fn.lower():
|
||||
return 'Qwen'
|
||||
elif 'kandinsky-2-1' in fn.lower():
|
||||
return 'Kandinsky 2.1'
|
||||
elif 'kandinsky-2-2' in fn.lower():
|
||||
return 'Kandinsky 2.2'
|
||||
elif 'kandinsky-3' in fn.lower():
|
||||
return 'Kandinsky 3.0'
|
||||
return current_guess
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -381,6 +381,18 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op='
|
|||
from pipelines.model_hunyuandit import load_hunyuandit
|
||||
sd_model = load_hunyuandit(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = False
|
||||
elif model_type in ['Kandinsky 2.1']:
|
||||
from pipelines.model_kandinsky import load_kandinsky21
|
||||
sd_model = load_kandinsky21(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = True
|
||||
elif model_type in ['Kandinsky 2.2']:
|
||||
from pipelines.model_kandinsky import load_kandinsky22
|
||||
sd_model = load_kandinsky22(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = False
|
||||
elif model_type in ['Kandinsky 3.0']:
|
||||
from pipelines.model_kandinsky import load_kandinsky3
|
||||
sd_model = load_kandinsky3(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = False
|
||||
except Exception as e:
|
||||
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
|
||||
if debug_load:
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ debug_move = log.trace if debug else lambda *args, **kwargs: None
|
|||
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'omnigen2', 'cogview4', 'cosmos', 'chroma']
|
||||
offload_post = ['h1']
|
||||
offload_hook_instance = None
|
||||
balanced_offload_exclude = ['CogView4Pipeline']
|
||||
balanced_offload_exclude = ['CogView4Pipeline', 'MeissonicPipeline']
|
||||
accelerate_dtype_byte_size = None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ def get_vae_scale_factor(model=None):
|
|||
elif hasattr(model, 'config') and hasattr(model.config, 'vae_scale_factor'):
|
||||
vae_scale_factor = model.config.vae_scale_factor
|
||||
else:
|
||||
shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
|
||||
# shared.log.warning(f'VAE: cls={model.__class__.__name__ if model else "None"} scale=unknown')
|
||||
vae_scale_factor = 8
|
||||
if hasattr(model, 'patch_size'):
|
||||
patch_size = model.patch_size
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ def warn_once(msg, variant=None):
|
|||
global prev_warnings # pylint: disable=global-statement
|
||||
if not prev_warnings:
|
||||
prev_warnings = True
|
||||
shared.log.error(f'Decode: type="taesd" variant="{variant}": {msg}')
|
||||
shared.log.warning(f'Decode: type="taesd" variant="{variant}": {msg}')
|
||||
return Image.new('RGB', (8, 8), color = (0, 0, 0))
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -8,9 +8,10 @@ from modules import shared, devices, sd_models, model_quant
|
|||
debug = shared.log.trace if os.environ.get('SD_LOAD_DEBUG', None) is not None else lambda *args, **kwargs: None
|
||||
|
||||
|
||||
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None):
|
||||
def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer", allow_quant=True, variant=None, dtype=None):
|
||||
load_args, quant_args = model_quant.get_dit_args(load_config, module='Model', device_map=True, allow_quant=allow_quant)
|
||||
quant_type = model_quant.get_quant_type(quant_args)
|
||||
dtype = dtype or devices.dtype
|
||||
|
||||
local_file = None
|
||||
if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default':
|
||||
|
|
@ -27,7 +28,7 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
|
|||
loader = cls_name.from_single_file if hasattr(cls_name, 'from_single_file') else cls_name.from_pretrained
|
||||
transformer = loader(
|
||||
local_file,
|
||||
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
|
||||
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
|
||||
cache_dir=shared.opts.hfcache_dir,
|
||||
**load_args,
|
||||
)
|
||||
|
|
@ -43,6 +44,8 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
|
|||
transformer = model_quant.do_post_load_quant(transformer, allow=quant_type is not None)
|
||||
else:
|
||||
shared.log.debug(f'Load model: transformer="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" args={load_args}')
|
||||
if dtype is not None:
|
||||
load_args['torch_dtype'] = dtype
|
||||
if subfolder is not None:
|
||||
load_args['subfolder'] = subfolder
|
||||
if variant is not None:
|
||||
|
|
@ -58,10 +61,11 @@ def load_transformer(repo_id, cls_name, load_config={}, subfolder="transformer",
|
|||
return transformer
|
||||
|
||||
|
||||
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None):
|
||||
def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder", allow_quant=True, allow_shared=True, variant=None, dtype=None):
|
||||
load_args, quant_args = model_quant.get_dit_args(load_config, module='TE', device_map=True, allow_quant=allow_quant)
|
||||
quant_type = model_quant.get_quant_type(quant_args)
|
||||
text_encoder = None
|
||||
dtype = dtype or devices.dtype
|
||||
|
||||
# load from local file if specified
|
||||
local_file = None
|
||||
|
|
@ -79,7 +83,7 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
|
|||
ggml.install_gguf()
|
||||
text_encoder = cls_name.from_pretrained(
|
||||
gguf_file=local_file,
|
||||
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=devices.dtype),
|
||||
quantization_config=diffusers.GGUFQuantizationConfig(compute_dtype=dtype),
|
||||
cache_dir=shared.opts.hfcache_dir,
|
||||
**load_args,
|
||||
)
|
||||
|
|
@ -104,12 +108,14 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
|
|||
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="SVDQuant"')
|
||||
text_encoder = nunchaku.NunchakuT5EncoderModel.from_pretrained(
|
||||
repo_id,
|
||||
torch_dtype=devices.dtype,
|
||||
torch_dtype=dtype,
|
||||
)
|
||||
text_encoder.quantization_method = 'SVDQuant'
|
||||
elif shared.opts.te_shared_t5:
|
||||
repo_id = 'Disty0/t5-xxl'
|
||||
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
|
||||
if dtype is not None:
|
||||
load_args['torch_dtype'] = dtype
|
||||
text_encoder = cls_name.from_pretrained(
|
||||
repo_id,
|
||||
cache_dir=shared.opts.hfcache_dir,
|
||||
|
|
@ -120,6 +126,8 @@ def load_text_encoder(repo_id, cls_name, load_config={}, subfolder="text_encoder
|
|||
# load from repo
|
||||
if text_encoder is None:
|
||||
shared.log.debug(f'Load model: text_encoder="{repo_id}" cls={cls_name.__name__} quant="{quant_type}" shared={shared.opts.te_shared_t5}')
|
||||
if dtype is not None:
|
||||
load_args['torch_dtype'] = dtype
|
||||
if subfolder is not None:
|
||||
load_args['subfolder'] = subfolder
|
||||
if variant is not None:
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
|
@ -49,7 +48,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
|||
return latent_image_ids.to(device=device, dtype=dtype)
|
||||
|
||||
|
||||
class Pipeline(DiffusionPipeline):
|
||||
class MeissonicPipeline(DiffusionPipeline):
|
||||
image_processor: VaeImageProcessor
|
||||
vqvae: VQModel
|
||||
tokenizer: CLIPTokenizer
|
||||
|
|
@ -212,27 +211,27 @@ class Pipeline(DiffusionPipeline):
|
|||
width = self.transformer.config.sample_size * self.vae_scale_factor
|
||||
|
||||
if prompt_embeds is None:
|
||||
input_ids = self.tokenizer(
|
||||
prompt,
|
||||
return_tensors="pt",
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=77, #self.tokenizer.model_max_length,
|
||||
).input_ids.to(self._execution_device)
|
||||
# input_ids_t5 = self.tokenizer_t5(
|
||||
# prompt,
|
||||
# return_tensors="pt",
|
||||
# padding="max_length",
|
||||
# truncation=True,
|
||||
# max_length=512,
|
||||
# ).input_ids.to(self._execution_device)
|
||||
input_ids = self.tokenizer(
|
||||
prompt,
|
||||
return_tensors="pt",
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=77, #self.tokenizer.model_max_length,
|
||||
).input_ids.to(self._execution_device)
|
||||
# input_ids_t5 = self.tokenizer_t5(
|
||||
# prompt,
|
||||
# return_tensors="pt",
|
||||
# padding="max_length",
|
||||
# truncation=True,
|
||||
# max_length=512,
|
||||
# ).input_ids.to(self._execution_device)
|
||||
|
||||
|
||||
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
|
||||
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
|
||||
prompt_embeds = outputs.text_embeds
|
||||
encoder_hidden_states = outputs.hidden_states[-2]
|
||||
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
|
||||
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
|
||||
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
|
||||
prompt_embeds = outputs.text_embeds
|
||||
encoder_hidden_states = outputs.hidden_states[-2]
|
||||
# encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
|
||||
|
||||
prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1)
|
||||
encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
|||
return latent_image_ids.to(device=device, dtype=dtype)
|
||||
|
||||
|
||||
class Img2ImgPipeline(DiffusionPipeline):
|
||||
class MeissonicImg2ImgPipeline(DiffusionPipeline):
|
||||
image_processor: VaeImageProcessor
|
||||
vqvae: VQModel
|
||||
tokenizer: CLIPTokenizer
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
|||
return latent_image_ids.to(device=device, dtype=dtype)
|
||||
|
||||
|
||||
class InpaintPipeline(DiffusionPipeline):
|
||||
class MeissonicInpaintPipeline(DiffusionPipeline):
|
||||
image_processor: VaeImageProcessor
|
||||
vqvae: VQModel
|
||||
tokenizer: CLIPTokenizer
|
||||
|
|
|
|||
|
|
@ -0,0 +1,65 @@
|
|||
import transformers
|
||||
import diffusers
|
||||
from modules import shared, sd_models, devices, model_quant, sd_hijack_te
|
||||
from pipelines import generic
|
||||
|
||||
|
||||
def load_kandinsky21(checkpoint_info, diffusers_load_config=None):
    """
    Load a Kandinsky 2.1 model as a `diffusers.KandinskyCombinedPipeline`.

    Args:
        checkpoint_info: checkpoint descriptor; passed to `sd_models.path_to_repo`
            to obtain the repo id and to `sd_models.hf_auth_check`.
        diffusers_load_config: optional dict of load options forwarded to
            `model_quant.get_dit_args`. When omitted, a new empty dict is used.

    Returns:
        The pipeline object returned by `from_pretrained`, after
        `sd_hijack_te.init_hijack` has been called on it.
    """
    # a literal `{}` default is a single dict object shared by every call;
    # build a fresh one per call so nothing downstream can leak state between loads
    if diffusers_load_config is None:
        diffusers_load_config = {}

    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)

    # second return value is intentionally unused (hence the underscore name)
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky21 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipe = diffusers.KandinskyCombinedPipeline.from_pretrained(
        repo_id,
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )
    sd_hijack_te.init_hijack(pipe)
    # NOTE(review): presumably releases memory left over from loading - confirm in devices.torch_gc
    devices.torch_gc(force=True, reason='load')
    return pipe
|
||||
|
||||
|
||||
def load_kandinsky22(checkpoint_info, diffusers_load_config=None):
    """
    Load a Kandinsky 2.2 model as a `diffusers.KandinskyV22CombinedPipeline`.

    Args:
        checkpoint_info: checkpoint descriptor; passed to `sd_models.path_to_repo`
            to obtain the repo id and to `sd_models.hf_auth_check`.
        diffusers_load_config: optional dict of load options forwarded to
            `model_quant.get_dit_args`. When omitted, a new empty dict is used.

    Returns:
        The pipeline object returned by `from_pretrained`, after
        `sd_hijack_te.init_hijack` has been called on it.
    """
    # a literal `{}` default is a single dict object shared by every call;
    # build a fresh one per call so nothing downstream can leak state between loads
    if diffusers_load_config is None:
        diffusers_load_config = {}

    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)

    # second return value is intentionally unused (hence the underscore name)
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky22 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    pipe = diffusers.KandinskyV22CombinedPipeline.from_pretrained(
        repo_id,
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )
    sd_hijack_te.init_hijack(pipe)
    # NOTE(review): presumably releases memory left over from loading - confirm in devices.torch_gc
    devices.torch_gc(force=True, reason='load')
    return pipe
|
||||
|
||||
|
||||
def load_kandinsky3(checkpoint_info, diffusers_load_config=None):
    """
    Load a Kandinsky 3 model as a `diffusers.Kandinsky3Pipeline`.

    The unet and the T5 text encoder are loaded separately through the
    `generic` helpers and then handed to `from_pretrained` as ready-made
    components. All three loads request the "fp16" variant.

    Args:
        checkpoint_info: checkpoint descriptor; passed to `sd_models.path_to_repo`
            to obtain the repo id and to `sd_models.hf_auth_check`.
        diffusers_load_config: optional dict of load options forwarded to
            `model_quant.get_dit_args` and to both `generic` loaders.
            When omitted, a new empty dict is used.

    Returns:
        The pipeline object returned by `from_pretrained`, with `task_args`
        set to `{'output_type': 'np'}` and `sd_hijack_te.init_hijack` applied.
    """
    # a literal `{}` default is a single dict object shared by every call and is
    # passed to three different helpers below; build a fresh one per call instead
    if diffusers_load_config is None:
        diffusers_load_config = {}

    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)

    # second return value is intentionally unused (hence the underscore name)
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config)
    shared.log.debug(f'Load model: type=Kandinsky30 repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')

    unet = generic.load_transformer(repo_id, cls_name=diffusers.Kandinsky3UNet, load_config=diffusers_load_config, subfolder="unet", variant="fp16")
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.T5EncoderModel, load_config=diffusers_load_config, subfolder="text_encoder", variant="fp16")

    pipe = diffusers.Kandinsky3Pipeline.from_pretrained(
        repo_id,
        unet=unet,
        text_encoder=text_encoder,
        variant="fp16",
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )
    # NOTE(review): presumably picked up by the caller as default pipeline call arguments - confirm
    pipe.task_args = {
        'output_type': 'np',
    }

    # drop the local names only; the components were passed into the pipeline above
    del text_encoder
    del unet
    sd_hijack_te.init_hijack(pipe)
    # NOTE(review): presumably releases memory left over from loading - confirm in devices.torch_gc
    devices.torch_gc(force=True, reason='load')
    return pipe
|
||||
Loading…
Reference in New Issue