From 0a3e82106731c75d7a368e0b2fd667fc274a5193 Mon Sep 17 00:00:00 2001
From: Vladimir Mandic
Date: Mon, 7 Aug 2023 17:19:30 +0000
Subject: [PATCH] diffuser auto-pipeline and fix vae

---
 CHANGELOG.md                    | 11 +++-
 TODO.md                         |  1 +
 configs/sd_xl_base.yaml         | 98 +++++++++++++++++++++++++++++++++
 configs/sd_xl_refiner.yaml      | 91 ++++++++++++++++++++++++++++++
 installer.py                    | 10 +++-
 launch.py                       |  1 +
 modules/processing_diffusers.py | 28 ++------
 modules/sd_models.py            | 95 +++++++++++++++++++++-----------
 modules/sd_vae.py               | 14 ++---
 modules/shared.py               |  1 +
 10 files changed, 291 insertions(+), 59 deletions(-)
 create mode 100644 configs/sd_xl_base.yaml
 create mode 100644 configs/sd_xl_refiner.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09f1ed913..6eee36183 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,15 +3,20 @@

 ## Update for 2023-08-07

 - diffusers:
+  - **pipeline autodetect**
+    if pipeline is set to autodetect (default for new installs), app will try to autodetect pipeline based on selected model
+    this should reduce user errors such as loading sd-xl model when sd pipeline is selected
   - **prompt attention**
     for sd and sd-xl
     native `compel` implementation and standard -> compel translation
     thanks @ai-casanova
   - advanced **lora load/apply** methods
     in addition to standard lora loading that was recently added to sd-xl using diffusers, now we have
-    - **sequential apply** (load & apply multiple loras in sequential manner) and
+    - **sequential apply** (load & apply multiple loras in sequential manner) and
     - **merge and apply** (load multiple loras and merge before applying to model)
-    see *settings -> diffusers -> lora methods*
-    thanks @hameerabbasi and @ai-casanova
+      see *settings -> diffusers -> lora methods*
+      thanks @hameerabbasi and @ai-casanova
+  - **sd-xl vae** from safetensors now applies correct config
+    result is that 3rd party vaes can be used without washed-out colors
 - general:
   - updated requirements
     this time it's a bigger change so upgrade may take longer to install new requirements
diff --git a/TODO.md b/TODO.md
index 6428c9e68..c178e4c20 100644
--- a/TODO.md
+++ b/TODO.md
@@ -26,6 +26,7 @@ Stuff to be added, in no particular order...
 - Port `p.all_hr_prompts`
 - Import core repos to reduce dependencies
 - Update `gradio`
+- Parse StabilityAI `modelspec` metadata
 - Non-technical:
   - Create additional themes
   - Update Wiki
diff --git a/configs/sd_xl_base.yaml b/configs/sd_xl_base.yaml
new file mode 100644
index 000000000..8aaf5b6ec
--- /dev/null
+++ b/configs/sd_xl_base.yaml
@@ -0,0 +1,98 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        weighting_config:
+          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2816
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4]
+        num_head_channels: 64
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
+        context_dim: 2048
+        spatial_transformer_attn_type: softmax-xformers
+        legacy: False
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          # crossattn cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
+            params:
+              layer: hidden
+              layer_idx: 11
+          # crossattn and vector cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+              legacy: False
+          # vector cond
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: target_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
diff --git a/configs/sd_xl_refiner.yaml b/configs/sd_xl_refiner.yaml
new file mode 100644
index 000000000..cab5fe283
--- /dev/null
+++ b/configs/sd_xl_refiner.yaml
@@ -0,0 +1,91 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        weighting_config:
+          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2560
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 4
+        context_dim: [1280, 1280, 1280, 1280]  # 1280
+        spatial_transformer_attn_type: softmax-xformers
+        legacy: False
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          # crossattn and vector cond
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              legacy: False
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+          # vector cond
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by two
+          # vector cond
+          - is_trainable: False
+            input_key: aesthetic_score
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256  # multiplied by one
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
diff --git a/installer.py b/installer.py
index 60ba1dfb2..01283d817 100644
--- a/installer.py
+++ b/installer.py
@@ -430,8 +430,14 @@ def install_packages():
     install('pi-heif', 'pi_heif', ignore=True)
     tensorflow_package = os.environ.get('TENSORFLOW_PACKAGE', 'tensorflow==2.13.0')
     install(tensorflow_package, 'tensorflow', ignore=True)
-    # bitsandbytes_package = os.environ.get('BITSANDBYTES_PACKAGE', 'bitsandbytes==0.39.1')
-    # install(bitsandbytes_package, 'bitsandbytes', ignore=True)
+    bitsandbytes_package = os.environ.get('BITSANDBYTES_PACKAGE', None)
+    if bitsandbytes_package is not None:
+        install(bitsandbytes_package, 'bitsandbytes', ignore=True)
+    else:
+        bitsandbytes_package = pkg_resources.working_set.by_key.get('bitsandbytes', None)
+        if bitsandbytes_package is not None:
+            log.warning(f'Not used, uninstalling: {bitsandbytes_package}')
+            pip('uninstall bitsandbytes --yes --quiet', ignore=True, quiet=True)
     if args.profile:
         print_profile(pr, 'Packages')
diff --git a/launch.py b/launch.py
index a645faf9a..31a6d533c 100644
--- a/launch.py
+++ b/launch.py
@@ -169,6 +169,7 @@ if __name__ == "__main__":
         if installer.check_timestamp():
             installer.log.info('No changes detected: Quick launch active')
             installer.install_requirements()
+            installer.install_packages()
             installer.check_extensions()
         else:
             installer.install_requirements()
diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py
index bf40fa9d0..51e295e02 100644
--- a/modules/processing_diffusers.py
+++ b/modules/processing_diffusers.py
@@ -1,14 +1,16 @@
 import inspect
+import typing
 import torch
 import modules.devices as devices
 import modules.shared as shared
 import modules.sd_samplers as sd_samplers
 import modules.sd_models as sd_models
+import modules.sd_vae as sd_vae
 import modules.images as images
 from modules.lora_diffusers import lora_state, unload_diffusers_lora
 from modules.processing import StableDiffusionProcessing
 import modules.prompt_parser_diffusers as prompt_parser_diffusers
-import typing
+

 try:
     import diffusers
@@ -16,16 +18,6 @@ except Exception as ex:
     shared.log.error(f'Failed to import diffusers: {ex}')


-def encode_prompt(encoder, prompt):
-    cfg = encoder.config
-    # TODO implement similar hijack for diffusers text encoder but following diffusers pipeline.encode_prompt concepts
-    # from modules import sd_hijack_clip
-    # model.text_encoder = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(model.text_encoder, None)
-    shared.log.debug(f'Diffuser encoder: {encoder.__class__.__name__} dict={getattr(cfg, "vocab_size", None)} layers={getattr(cfg, "num_hidden_layers", None)} tokens={getattr(cfg, "max_position_embeddings", None)}')
-    embeds = prompt
-    return embeds
-
-
 def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_prompts):
     results = []
@@ -36,7 +28,7 @@ def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_pro

     def vae_decode(latents, model, output_type='np'):
         if hasattr(model, 'vae') and torch.is_tensor(latents):
-            shared.log.debug(f'Diffusers VAE decode: name={model.vae.config.get("_name_or_path", "default")} dtype={model.vae.dtype} upcast={model.vae.config.get("force_upcast", None)}')
+            shared.log.debug(f'Diffusers VAE decode: name={sd_vae.loaded_vae_file} dtype={model.vae.dtype} upcast={model.vae.config.get("force_upcast", None)}')
             if shared.opts.diffusers_move_unet and not model.has_accelerate:
                 shared.log.debug('Diffusers: Moving UNet to CPU')
                 unet_device = model.unet.device
@@ -113,6 +105,14 @@ def process_diffusers(p: StableDiffusionProcessing, seeds, prompts, negative_pro
             clean['prompt'] = len(clean['prompt'])
         if 'negative_prompt' in clean:
             clean['negative_prompt'] = len(clean['negative_prompt'])
+        if 'prompt_embeds' in clean:
+            clean['prompt_embeds'] = clean['prompt_embeds'].shape
+        if 'pooled_prompt_embeds' in clean:
+            clean['pooled_prompt_embeds'] = clean['pooled_prompt_embeds'].shape
+        if 'negative_prompt_embeds' in clean:
+            clean['negative_prompt_embeds'] = clean['negative_prompt_embeds'].shape
+        if 'negative_pooled_prompt_embeds' in clean:
+            clean['negative_pooled_prompt_embeds'] = clean['negative_pooled_prompt_embeds'].shape
         clean['generator'] = generator_device
         shared.log.debug(f'Diffuser pipeline: {pipeline.__class__.__name__} task={sd_models.get_diffusers_task(model)} set={clean}')
         return args
@@ -138,10 +138,6 @@
             p.ops.append('inpaint')
         task_specific_kwargs = {"image": p.init_images, "mask_image": p.mask, "strength": p.denoising_strength, "height": p.height, "width": p.width}

-    # TODO diffusers use transformers for prompt parsing
-    # from modules.prompt_parser import parse_prompt_attention
-    # parsed_prompt = [parse_prompt_attention(prompt) for prompt in prompts]
-
     if shared.state.interrupted or shared.state.skipped:
         unload_diffusers_lora()
         return results
diff --git a/modules/sd_models.py b/modules/sd_models.py
index f5c35b95f..07226acaa 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -533,6 +533,62 @@ def change_backend():
     refresh_vae_list()


+def detect_pipeline(f: str, op: str = 'model'):
+    guess = shared.opts.diffusers_pipeline
+    if guess == 'Autodetect':
+        try:
+            size = round(os.path.getsize(f) / 1024 / 1024 / 1024, 2)
+            if size < 1:
+                shared.log.warning(f'Model size smaller than expected: {f} size={size} GB')
+            elif size < 5:
+                guess = 'Stable Diffusion'
+            elif size < 6:
+                if op == 'model':
+                    shared.log.warning(f'Model detected as SD-XL refiner model, but attempting to load a base model: {f} size={size} GB')
+                else:
+                    guess = 'Stable Diffusion XL'
+            elif size < 7:
+                if op == 'refiner':
+                    shared.log.warning(f'Model size matches SD-XL base model, but attempting to load a refiner model: {f} size={size} GB')
+                else:
+                    guess = 'Stable Diffusion XL'
+            else:
+                shared.log.error(f'Diffusers autodetect failed, set diffuser pipeline manually: {f}')
+                return None, None
+            shared.log.debug(f'Diffusers autodetect {op}: {f} pipeline={guess} size={size} GB')
+        except Exception as e:
+            shared.log.error(f'Error detecting diffusers pipeline: model={f} {e}')
+            return None, None
+    if guess == shared.pipelines[1]:
+        pipeline = diffusers.StableDiffusionPipeline
+    elif guess == shared.pipelines[2]:
+        pipeline = diffusers.StableDiffusionXLPipeline
+    elif guess == shared.pipelines[3]:
+        pipeline = diffusers.KandinskyPipeline
+    elif guess == shared.pipelines[4]:
+        pipeline = diffusers.KandinskyV22Pipeline
+    elif guess == shared.pipelines[5]:
+        pipeline = diffusers.IFPipeline
+    elif guess == shared.pipelines[6]:
+        pipeline = diffusers.ShapEPipeline
+    elif guess == shared.pipelines[7]:
+        pipeline = diffusers.StableDiffusionImg2ImgPipeline
+    elif guess == shared.pipelines[8]:
+        pipeline = diffusers.StableDiffusionXLImg2ImgPipeline
+    elif guess == shared.pipelines[9]:
+        pipeline = diffusers.KandinskyImg2ImgPipeline
+    elif guess == shared.pipelines[10]:
+        pipeline = diffusers.KandinskyV22Img2ImgPipeline
+    elif guess == shared.pipelines[11]:
+        pipeline = diffusers.IFImg2ImgPipeline
+    elif guess == shared.pipelines[12]:
+        pipeline = diffusers.ShapEImg2ImgPipeline
+    else:
+        shared.log.error(f'Diffusers unknown pipeline: {guess}')
+        pipeline = None
+    return pipeline, guess
+
+
 def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=None, op='model'):  # pylint: disable=unused-argument
     import torch  # pylint: disable=reimported,redefined-outer-name
     if timer is None:
@@ -593,9 +649,10 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
         devices.set_cuda_params()
     vae = None
+    sd_vae.loaded_vae_file = None
     if op == 'model' or op == 'refiner':
         vae_file, vae_source = sd_vae.resolve_vae(checkpoint_info.filename)
-        vae = sd_vae.load_vae_diffusers(None, vae_file, vae_source)
+        vae = sd_vae.load_vae_diffusers(checkpoint_info.path, vae_file, vae_source)
         if vae is not None:
             diffusers_load_config["vae"] = vae

@@ -609,35 +666,9 @@
     else:
         diffusers_load_config["local_files_only "] = True
     diffusers_load_config["extract_ema"] = shared.opts.diffusers_extract_ema
-    try:
-        if shared.opts.diffusers_pipeline == shared.pipelines[0]:
-            pipeline = diffusers.StableDiffusionPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[1]:
-            pipeline = diffusers.StableDiffusionXLPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[2]:
-            pipeline = diffusers.KandinskyPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[3]:
-            pipeline = diffusers.KandinskyV22Pipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[4]:
-            pipeline = diffusers.IFPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[5]:
-            pipeline = diffusers.ShapEPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[6]:
-            pipeline = diffusers.StableDiffusionImg2ImgPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[7]:
-            pipeline = diffusers.StableDiffusionXLImg2ImgPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[8]:
-            pipeline = diffusers.KandinskyImg2ImgPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[9]:
-            pipeline = diffusers.KandinskyV22Img2ImgPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[10]:
-            pipeline = diffusers.IFImg2ImgPipeline
-        elif shared.opts.diffusers_pipeline == shared.pipelines[11]:
-            pipeline = diffusers.ShapEImg2ImgPipeline
-        else:
-            shared.log.error(f'Diffusers {op} unknown pipeline: {shared.opts.diffusers_pipeline}')
-    except Exception as e:
-        shared.log.error(f'Diffusers {op} failed initializing pipeline: {shared.opts.diffusers_pipeline} {e}')
+    pipeline, _model_type = detect_pipeline(checkpoint_info.path, op)
+    if pipeline is None:
+        shared.log.error(f'Diffusers {op} pipeline not initialized: {shared.opts.diffusers_pipeline}')
         return
     try:
         if hasattr(pipeline, 'from_single_file'):
@@ -697,6 +728,8 @@
         else:
             sd_model.disable_attention_slicing()
     if hasattr(sd_model, "vae"):
+        if vae is not None:
+            sd_model.vae = vae
         if shared.opts.diffusers_vae_upcast != 'default':
             if shared.opts.diffusers_vae_upcast == 'true':
                 sd_model.vae.config["force_upcast"] = True
@@ -704,7 +737,7 @@
             else:
                 sd_model.vae.config["force_upcast"] = False
                 sd_model.vae.config.force_upcast = False
-        shared.log.debug(f'Diffusers {op} VAE: name={sd_model.vae.config.get("_name_or_path", "default")} upcast={sd_model.vae.config.get("force_upcast", None)}')
+        shared.log.debug(f'Diffusers {op} VAE: name={sd_vae.loaded_vae_file} upcast={sd_model.vae.config.get("force_upcast", None)}')
     if shared.opts.cross_attention_optimization == "xFormers" and hasattr(sd_model, 'enable_xformers_memory_efficient_attention'):
         sd_model.enable_xformers_memory_efficient_attention()
     if shared.opts.opt_channelslast:
diff --git a/modules/sd_vae.py b/modules/sd_vae.py
index 1edd39961..317906001 100644
--- a/modules/sd_vae.py
+++ b/modules/sd_vae.py
@@ -3,7 +3,7 @@ import collections
 import glob
 from copy import deepcopy
 import torch
-from modules import shared, paths, devices, script_callbacks, sd_models
+from modules import shared, paths, paths_internal, devices, script_callbacks, sd_models


 vae_ignore_keys = {"model_ema.decay", "model_ema.num_updates"}
@@ -169,7 +169,7 @@ def load_vae(model, vae_file=None, vae_source="from unknown source"):
     loaded_vae_file = vae_file


-def load_vae_diffusers(_model, vae_file=None, vae_source="from unknown source"):
+def load_vae_diffusers(model_file, vae_file=None, vae_source="from unknown source"):
     if vae_file is None:
         return None
     if not os.path.exists(vae_file):
@@ -196,14 +196,14 @@ def load_vae_diffusers(_model, vae_file=None, vae_source="from unknown source"):
     try:
         import diffusers
        if os.path.isfile(vae_file):
-            if shared.opts.diffusers_pipeline == "Stable Diffusion XL":
-                # load_config passed to from_single_file doesn't apply
-                # from_single_file by default downloads VAE1.5 config
-                shared.log.warning("Using SDXL VAE loaded from singular file will result in low contrast images.")
-            vae = diffusers.AutoencoderKL.from_single_file(vae_file)
+            _pipeline, model_type = sd_models.detect_pipeline(model_file, 'vae')
+            diffusers_load_config = { "config_file": paths_internal.sd_default_config if model_type != 'Stable Diffusion XL' else os.path.join(paths_internal.sd_configs_path, 'sd_xl_base.yaml')}
+            vae = diffusers.AutoencoderKL.from_single_file(vae_file, **diffusers_load_config)
             vae = vae.to(devices.dtype_vae)
         else:
             vae = diffusers.AutoencoderKL.from_pretrained(vae_file, **diffusers_load_config)
+        global loaded_vae_file  # pylint: disable=global-statement
+        loaded_vae_file = os.path.basename(vae_file)
         # shared.log.debug(f'Diffusers VAE config: {vae.config}')
         return vae
     except Exception as e:
diff --git a/modules/shared.py b/modules/shared.py
index bd5444336..25993ea99 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -41,6 +41,7 @@ loaded_hypernetworks = []
 gradio_theme = gr.themes.Base()
 settings_components = None
 pipelines = [
+    'Autodetect',
     'Stable Diffusion', 'Stable Diffusion XL', 'Kandinsky V1', 'Kandinsky V2', 'DeepFloyd IF', 'Shap-E',
     'Stable Diffusion Img2Img', 'Stable Diffusion XL Img2Img', 'Kandinsky V1 Img2Img', 'Kandinsky V2 Img2Img', 'DeepFloyd IF Img2Img', 'Shap-E Img2Img'
]
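
Note on the autodetect heuristic (a standalone sketch, not part of the patch): detect_pipeline() guesses the pipeline purely from checkpoint file size and warns when the size suggests a base/refiner mix-up. Reduced to plain Python, with the GB thresholds copied from the modules/sd_models.py hunk above and exceptions standing in for the app's shared.log calls, the decision table looks like this (guess_pipeline is a hypothetical name for illustration):

    import os

    def guess_pipeline(path: str, op: str = 'model') -> str:
        # file size in GB, same rounding as the patch
        size = round(os.path.getsize(path) / 1024 / 1024 / 1024, 2)
        if size < 1:
            raise ValueError(f'model smaller than expected: {size} GB')
        if size < 5:
            return 'Stable Diffusion'        # SD 1.x/2.x checkpoints, typically 2-4 GB
        if size < 6:
            if op == 'model':
                # refiner-sized file requested as a base model; the patch warns
                # and leaves the guess unresolved rather than silently proceeding
                raise ValueError(f'refiner-sized checkpoint loaded as base: {size} GB')
            return 'Stable Diffusion XL'     # SD-XL refiner, ~5-6 GB
        if size < 7:
            if op == 'refiner':
                raise ValueError(f'base-sized checkpoint loaded as refiner: {size} GB')
            return 'Stable Diffusion XL'     # SD-XL base, ~6-7 GB
        raise ValueError(f'autodetect failed, set pipeline manually: {size} GB')

The ~5-6 GB band maps to the SD-XL refiner and ~6-7 GB to the SD-XL base, so a mismatch between the requested op and the size band is reported instead of guessed.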
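
Note on the VAE fix (a sketch under stated assumptions, not part of the patch): the washed-out colors came from AutoencoderKL.from_single_file falling back to the default SD 1.5 VAE config when an SD-XL VAE is loaded from a single safetensors file, and the patch fixes this by passing the matching config explicitly. In isolation, and assuming a diffusers build whose from_single_file accepts the config_file keyword used in the hunk above, the call reduces to (paths are hypothetical):

    from diffusers import AutoencoderKL

    vae_file = '/models/VAE/sdxl_vae.safetensors'  # hypothetical path to a 3rd-party SD-XL VAE
    config_file = 'configs/sd_xl_base.yaml'        # the SD-XL config this patch adds

    # passing the matching config prevents the default SD 1.5 VAE config
    # from being applied to an SD-XL VAE, which caused the low-contrast output
    vae = AutoencoderKL.from_single_file(vae_file, config_file=config_file)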