diffuser auto-pipeline and fix vae

pull/1985/head
Vladimir Mandic 2023-08-07 17:19:30 +00:00
parent 23f6b66bd7
commit 0a3e821067
10 changed files with 291 additions and 59 deletions

View File

@ -3,15 +3,20 @@
## Update for 2023-08-07
- diffusers:
- **pipeline autodetect**
if pipeline is set to autodetect (default for new installs), app will try to autodetect pipeline based on selected model
this should reduce user errors such as loading sd-xl model when sd pipeline is selected
- **prompt attention** for sd and sd-xl
native `compel` implementation and standard -> compel translation
thanks @ai-casanova
- advanced **lora load/apply** methods
in addition to standard lora loading that was recently added to sd-xl using diffusers, now we have
- **sequential apply** (load & apply multiple loras in sequential manner) and
- **sequential apply** (load & apply multiple loras in sequential manner) and
- **merge and apply** (load multiple loras and merge before applying to model)
see *settings -> diffusers -> lora methods*
thanks @hameerabbasi and @ai-casanova
see *settings -> diffusers -> lora methods*
thanks @hameerabbasi and @ai-casanova
- **sd-xl vae** from safetensors now applies correct config
the result is that 3rd-party VAEs can be used without washed-out colors
- general:
- updated requirements
this time it's a bigger change, so the upgrade may take longer while it installs new requirements

View File

@ -26,6 +26,7 @@ Stuff to be added, in no particular order...
- Port `p.all_hr_prompts`
- Import core repos to reduce dependencies
- Update `gradio`
- Parse StabilityAI `modelspec` metadata
- Non-technical:
- Create additional themes
- Update Wiki

98
configs/sd_xl_base.yaml Normal file
View File

@ -0,0 +1,98 @@
model:
target: sgm.models.diffusion.DiffusionEngine
params:
scale_factor: 0.13025
disable_first_stage_autocast: True
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
num_idx: 1000
weighting_config:
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
scaling_config:
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
network_config:
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
params:
adm_in_channels: 2816
num_classes: sequential
use_checkpoint: True
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [4, 2]
num_res_blocks: 2
channel_mult: [1, 2, 4]
num_head_channels: 64
use_spatial_transformer: True
use_linear_in_transformer: True
transformer_depth: [1, 2, 10] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
context_dim: 2048
spatial_transformer_attn_type: softmax-xformers
legacy: False
conditioner_config:
target: sgm.modules.GeneralConditioner
params:
emb_models:
# crossattn cond
- is_trainable: False
input_key: txt
target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
params:
layer: hidden
layer_idx: 11
# crossattn and vector cond
- is_trainable: False
input_key: txt
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
params:
arch: ViT-bigG-14
version: laion2b_s39b_b160k
freeze: True
layer: penultimate
always_return_pooled: True
legacy: False
# vector cond
- is_trainable: False
input_key: original_size_as_tuple
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: crop_coords_top_left
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: target_size_as_tuple
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
first_stage_config:
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
attn_type: vanilla-xformers
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity

View File

@ -0,0 +1,91 @@
model:
target: sgm.models.diffusion.DiffusionEngine
params:
scale_factor: 0.13025
disable_first_stage_autocast: True
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
num_idx: 1000
weighting_config:
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
scaling_config:
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
network_config:
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
params:
adm_in_channels: 2560
num_classes: sequential
use_checkpoint: True
in_channels: 4
out_channels: 4
model_channels: 384
attention_resolutions: [4, 2]
num_res_blocks: 2
channel_mult: [1, 2, 4, 4]
num_head_channels: 64
use_spatial_transformer: True
use_linear_in_transformer: True
transformer_depth: 4
context_dim: [1280, 1280, 1280, 1280] # 1280
spatial_transformer_attn_type: softmax-xformers
legacy: False
conditioner_config:
target: sgm.modules.GeneralConditioner
params:
emb_models:
# crossattn and vector cond
- is_trainable: False
input_key: txt
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
params:
arch: ViT-bigG-14
version: laion2b_s39b_b160k
legacy: False
freeze: True
layer: penultimate
always_return_pooled: True
# vector cond
- is_trainable: False
input_key: original_size_as_tuple
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: crop_coords_top_left
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: aesthetic_score
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by one
first_stage_config:
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
attn_type: vanilla-xformers
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity

View File

@ -430,8 +430,14 @@ def install_packages():
install('pi-heif', 'pi_heif', ignore=True)
tensorflow_package = os.environ.get('TENSORFLOW_PACKAGE', 'tensorflow==2.13.0')
install(tensorflow_package, 'tensorflow', ignore=True)
# bitsandbytes_package = os.environ.get('BITSANDBYTES_PACKAGE', 'bitsandbytes==0.39.1')
# install(bitsandbytes_package, 'bitsandbytes', ignore=True)
bitsandbytes_package = os.environ.get('BITSANDBYTES_PACKAGE', None)
if bitsandbytes_package is not None:
install(bitsandbytes_package, 'bitsandbytes', ignore=True)
else:
bitsandbytes_package = pkg_resources.working_set.by_key.get('bitsandbytes', None)
if bitsandbytes_package is not None:
log.warning(f'Not used, uninstalling: {bitsandbytes_package}')
pip('uninstall bitsandbytes --yes --quiet', ignore=True, quiet=True)
if args.profile:
print_profile(pr, 'Packages')

View File

@ -169,6 +169,7 @@ if __name__ == "__main__":
if installer.check_timestamp():
installer.log.info('No changes detected: Quick launch active')
installer.install_requirements()
installer.install_packages()
installer.check_extensions()
else:
installer.install_requirements()

View File

@ -1,14 +1,16 @@
import inspect
import typing
import torch
import modules.devices as devices
import modules.shared as shared
import modules.sd_samplers as sd_samplers
import modules.sd_models as sd_models
import modules.sd_vae as sd_vae
import modules.images as images
from modules.lora_diffusers import lora_state, unload_diffusers_lora
from modules.processing import StableDiffusionProcessing
import modules.prompt_parser_diffusers as prompt_parser_diffusers
import typing
try:
import diffusers
@ -16,16 +18,6 @@ except Exception as ex:
shared.log.error(f'Failed to import diffusers: {ex}')
def encode_prompt(encoder, prompt):
    """Log basic properties of the pipeline text encoder and return the prompt unchanged.

    Placeholder: the actual embedding work is currently left to the pipeline itself.
    """
    # TODO implement similar hijack for diffusers text encoder but following diffusers pipeline.encode_prompt concepts
    # from modules import sd_hijack_clip
    # model.text_encoder = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(model.text_encoder, None)
    config = encoder.config
    shared.log.debug(f'Diffuser encoder: {encoder.__class__.__name__} dict={getattr(config, "vocab_size", None)} layers={getattr(config, "num_hidden_layers", None)} tokens={getattr(config, "max_position_embeddings", None)}')
    return prompt
def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_prompts):
results = []
@ -36,7 +28,7 @@ def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_pro
def vae_decode(latents, model, output_type='np'):
if hasattr(model, 'vae') and torch.is_tensor(latents):
shared.log.debug(f'Diffusers VAE decode: name={model.vae.config.get("_name_or_path", "default")} dtype={model.vae.dtype} upcast={model.vae.config.get("force_upcast", None)}')
shared.log.debug(f'Diffusers VAE decode: name={sd_vae.loaded_vae_file} dtype={model.vae.dtype} upcast={model.vae.config.get("force_upcast", None)}')
if shared.opts.diffusers_move_unet and not model.has_accelerate:
shared.log.debug('Diffusers: Moving UNet to CPU')
unet_device = model.unet.device
@ -113,6 +105,14 @@ def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_pro
clean['prompt'] = len(clean['prompt'])
if 'negative_prompt' in clean:
clean['negative_prompt'] = len(clean['negative_prompt'])
if 'prompt_embeds' in clean:
clean['prompt_embeds'] = clean['prompt_embeds'].shape
if 'pooled_prompt_embeds' in clean:
clean['pooled_prompt_embeds'] = clean['pooled_prompt_embeds'].shape
if 'negative_prompt_embeds' in clean:
clean['negative_prompt_embeds'] = clean['negative_prompt_embeds'].shape
if 'negative_pooled_prompt_embeds' in clean:
clean['negative_pooled_prompt_embeds'] = clean['negative_pooled_prompt_embeds'].shape
clean['generator'] = generator_device
shared.log.debug(f'Diffuser pipeline: {pipeline.__class__.__name__} task={sd_models.get_diffusers_task(model)} set={clean}')
return args
@ -138,10 +138,6 @@ def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_pro
p.ops.append('inpaint')
task_specific_kwargs = {"image": p.init_images, "mask_image": p.mask, "strength": p.denoising_strength, "height": p.height, "width": p.width}
# TODO diffusers use transformers for prompt parsing
# from modules.prompt_parser import parse_prompt_attention
# parsed_prompt = [parse_prompt_attention(prompt) for prompt in prompts]
if shared.state.interrupted or shared.state.skipped:
unload_diffusers_lora()
return results

View File

@ -533,6 +533,62 @@ def change_backend():
refresh_vae_list()
def detect_pipeline(f: str, op: str = 'model'):
    """Resolve the diffusers pipeline class to use for checkpoint file *f*.

    Honors ``shared.opts.diffusers_pipeline``; when that is set to 'Autodetect'
    the model family is guessed from the checkpoint file size on disk.
    *op* is the load role ('model', 'refiner', ...) and only affects warnings
    emitted when the detected size does not match the requested role.
    Returns a ``(pipeline_class_or_None, pipeline_name)`` tuple; ``(None, None)``
    when autodetection fails outright.
    """
    guess = shared.opts.diffusers_pipeline
    if guess == 'Autodetect':
        try:
            size = round(os.path.getsize(f) / 1024 / 1024 / 1024, 2)  # checkpoint size in GB
            if size < 1:
                # implausibly small checkpoint; leave guess as-is, which falls through
                # to the 'unknown pipeline' error below
                shared.log.warning(f'Model size smaller than expected: {f} size={size} GB')
            elif size < 5:
                guess = 'Stable Diffusion'
            elif size < 6:
                # size range of an SD-XL refiner checkpoint
                if op == 'model':
                    shared.log.warning(f'Model detected as SD-XL refiner model, but attempting to load a base model: {f} size={size} GB')
                else:
                    guess = 'Stable Diffusion XL'
            elif size < 7:
                # size range of an SD-XL base checkpoint
                if op == 'refiner':
                    shared.log.warning(f'Model size matches SD-XL base model, but attempting to load a refiner model: {f} size={size} GB')
                else:
                    guess = 'Stable Diffusion XL'
            else:
                shared.log.error(f'Diffusers autodetect failed, set diffuser pipeline manually: {f}')
                return None, None
            shared.log.debug(f'Diffusers autodetect {op}: {f} pipeline={guess} size={size} GB')
        except Exception as e:
            shared.log.error(f'Error detecting diffusers pipeline: model={f} {e}')
            return None, None
    # diffusers class names aligned index-for-index with shared.pipelines;
    # shared.pipelines[0] is 'Autodetect' which has no class of its own, so both
    # lists are sliced from index 1. getattr is deferred so only the selected
    # class is ever looked up on the diffusers module.
    class_names = [
        'StableDiffusionPipeline',
        'StableDiffusionXLPipeline',
        'KandinskyPipeline',
        'KandinskyV22Pipeline',
        'IFPipeline',
        'ShapEPipeline',
        'StableDiffusionImg2ImgPipeline',
        'StableDiffusionXLImg2ImgPipeline',
        'KandinskyImg2ImgPipeline',
        'KandinskyV22Img2ImgPipeline',
        'IFImg2ImgPipeline',
        'ShapEImg2ImgPipeline',
    ]
    pipeline = None
    for name, cls_name in zip(shared.pipelines[1:], class_names):
        if guess == name:
            pipeline = getattr(diffusers, cls_name)
            break
    else:
        shared.log.error(f'Diffusers unknown pipeline: {guess}')
    return pipeline, guess
def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=None, op='model'): # pylint: disable=unused-argument
import torch # pylint: disable=reimported,redefined-outer-name
if timer is None:
@ -593,9 +649,10 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
devices.set_cuda_params()
vae = None
sd_vae.loaded_vae_file = None
if op == 'model' or op == 'refiner':
vae_file, vae_source = sd_vae.resolve_vae(checkpoint_info.filename)
vae = sd_vae.load_vae_diffusers(None, vae_file, vae_source)
vae = sd_vae.load_vae_diffusers(checkpoint_info.path, vae_file, vae_source)
if vae is not None:
diffusers_load_config["vae"] = vae
@ -609,35 +666,9 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
else:
diffusers_load_config["local_files_only "] = True
diffusers_load_config["extract_ema"] = shared.opts.diffusers_extract_ema
try:
if shared.opts.diffusers_pipeline == shared.pipelines[0]:
pipeline = diffusers.StableDiffusionPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[1]:
pipeline = diffusers.StableDiffusionXLPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[2]:
pipeline = diffusers.KandinskyPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[3]:
pipeline = diffusers.KandinskyV22Pipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[4]:
pipeline = diffusers.IFPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[5]:
pipeline = diffusers.ShapEPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[6]:
pipeline = diffusers.StableDiffusionImg2ImgPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[7]:
pipeline = diffusers.StableDiffusionXLImg2ImgPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[8]:
pipeline = diffusers.KandinskyImg2ImgPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[9]:
pipeline = diffusers.KandinskyV22Img2ImgPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[10]:
pipeline = diffusers.IFImg2ImgPipeline
elif shared.opts.diffusers_pipeline == shared.pipelines[11]:
pipeline = diffusers.ShapEImg2ImgPipeline
else:
shared.log.error(f'Diffusers {op} unknown pipeline: {shared.opts.diffusers_pipeline}')
except Exception as e:
shared.log.error(f'Diffusers {op} failed initializing pipeline: {shared.opts.diffusers_pipeline} {e}')
pipeline, _model_type = detect_pipeline(checkpoint_info.path, op)
if pipeline is None:
shared.log.error(f'Diffusers {op} pipeline not initialized: {shared.opts.diffusers_pipeline}')
return
try:
if hasattr(pipeline, 'from_single_file'):
@ -697,6 +728,8 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
else:
sd_model.disable_attention_slicing()
if hasattr(sd_model, "vae"):
if vae is not None:
sd_model.vae = vae
if shared.opts.diffusers_vae_upcast != 'default':
if shared.opts.diffusers_vae_upcast == 'true':
sd_model.vae.config["force_upcast"] = True
@ -704,7 +737,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
else:
sd_model.vae.config["force_upcast"] = False
sd_model.vae.config.force_upcast = False
shared.log.debug(f'Diffusers {op} VAE: name={sd_model.vae.config.get("_name_or_path", "default")} upcast={sd_model.vae.config.get("force_upcast", None)}')
shared.log.debug(f'Diffusers {op} VAE: name={sd_vae.loaded_vae_file} upcast={sd_model.vae.config.get("force_upcast", None)}')
if shared.opts.cross_attention_optimization == "xFormers" and hasattr(sd_model, 'enable_xformers_memory_efficient_attention'):
sd_model.enable_xformers_memory_efficient_attention()
if shared.opts.opt_channelslast:

View File

@ -3,7 +3,7 @@ import collections
import glob
from copy import deepcopy
import torch
from modules import shared, paths, devices, script_callbacks, sd_models
from modules import shared, paths, paths_internal, devices, script_callbacks, sd_models
vae_ignore_keys = {"model_ema.decay", "model_ema.num_updates"}
@ -169,7 +169,7 @@ def load_vae(model, vae_file=None, vae_source="from unknown source"):
loaded_vae_file = vae_file
def load_vae_diffusers(_model, vae_file=None, vae_source="from unknown source"):
def load_vae_diffusers(model_file, vae_file=None, vae_source="from unknown source"):
if vae_file is None:
return None
if not os.path.exists(vae_file):
@ -196,14 +196,14 @@ def load_vae_diffusers(_model, vae_file=None, vae_source="from unknown source"):
try:
import diffusers
if os.path.isfile(vae_file):
if shared.opts.diffusers_pipeline == "Stable Diffusion XL":
# load_config passed to from_single_file doesn't apply
# from_single_file by default downloads VAE1.5 config
shared.log.warning("Using SDXL VAE loaded from singular file will result in low contrast images.")
vae = diffusers.AutoencoderKL.from_single_file(vae_file)
_pipeline, model_type = sd_models.detect_pipeline(model_file, 'vae')
diffusers_load_config = { "config_file": paths_internal.sd_default_config if model_type != 'Stable Diffusion XL' else os.path.join(paths_internal.sd_configs_path, 'sd_xl_base.yaml')}
vae = diffusers.AutoencoderKL.from_single_file(vae_file, **diffusers_load_config)
vae = vae.to(devices.dtype_vae)
else:
vae = diffusers.AutoencoderKL.from_pretrained(vae_file, **diffusers_load_config)
global loaded_vae_file # pylint: disable=global-statement
loaded_vae_file = os.path.basename(vae_file)
# shared.log.debug(f'Diffusers VAE config: {vae.config}')
return vae
except Exception as e:

View File

@ -41,6 +41,7 @@ loaded_hypernetworks = []
gradio_theme = gr.themes.Base()
settings_components = None
pipelines = [
'Autodetect',
'Stable Diffusion', 'Stable Diffusion XL', 'Kandinsky V1', 'Kandinsky V2', 'DeepFloyd IF', 'Shap-E',
'Stable Diffusion Img2Img', 'Stable Diffusion XL Img2Img', 'Kandinsky V1 Img2Img', 'Kandinsky V2 Img2Img', 'DeepFloyd IF Img2Img', 'Shap-E Img2Img'
]