diff --git a/CHANGELOG.md b/CHANGELOG.md index fa564a06e..0cf4abd77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Change Log for SD.Next -## Update for 2024-10-17 +## Update for 2024-10-18 -### Highlights for 2024-10-17 +### Highlights for 2024-10-18 - **Reprocess**: New workflow options that allow you to generate at lower quality and then reprocess at higher quality for select images only or generate without hires/refine and then reprocess with hires/refine @@ -22,6 +22,8 @@ - Auto-detection of best available **device/dtype** settings for your platform and GPU reduces neeed for manual configuration - Full rewrite of **sampler options**, not far more streamlined with tons of new options to tweak scheduler behavior - Improved **LoRA** detection and handling for all supported models +- Tons of work on dynamic quantization that can be applied on-the-fly during model load to any model type + Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more... 
Oh, and we've compiled a full table with list of popular text-to-image generative models, their respective parameters and architecture overview: @@ -30,7 +32,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition [README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867) -### Details for 2024-10-17 +### Details for 2024-10-18 - **reprocess** - new top-level button: reprocess latent from your history of generated image(s) @@ -211,6 +213,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - setting `lora_load_gpu` to load LoRA directly to GPU *default*: true unless lovwram +- **torchao** + - reimplement torchao quantization + - configure in settings -> compute settings -> quantization + - can be applied to any model on-the-fly during load + - **huggingface**: - force logout/login on token change - unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths @@ -233,6 +240,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - fix update infotext on image select - fix imageviewer exif parser - selectable info view in image viewer, thanks @ZeldaMaster501 + - setting to enable browser autolaunch, thanks @brknsoul - **free-u** check if device/dtype are fft compatible and cast as necessary - **rocm** - additional gpu detection and auto-config code, thanks @lshqqytiger diff --git a/javascript/sdnext.css b/javascript/sdnext.css index 0f3765694..3c81e7d8e 100644 --- a/javascript/sdnext.css +++ b/javascript/sdnext.css @@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; } .gradio-radio { padding: 0 !important; width: max-content !important; } .gradio-slider { 
margin-right: var(--spacing-sm) !important; width: max-content !important } .gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; } +.gradio-checkboxgroup { padding: 0 !important; } +.gradio-checkbox > label { color: var(--block-title-text-color) !important; } /* custom gradio elements */ .accordion-compact { padding: 8px 0px 4px 0px !important; } diff --git a/modules/loader.py b/modules/loader.py index 18cb42893..52331672a 100644 --- a/modules/loader.py +++ b/modules/loader.py @@ -1,5 +1,6 @@ from __future__ import annotations from functools import partial +import os import re import sys import logging @@ -13,6 +14,7 @@ errors.install() logging.getLogger("DeepSpeed").disabled = True +os.environ.setdefault('TORCH_LOGS', '-all') import torch # pylint: disable=C0411 try: import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import @@ -96,7 +98,6 @@ def get_packages(): } try: - import os import math cores = os.cpu_count() affinity = len(os.sched_getaffinity(0)) diff --git a/modules/sd_models.py b/modules/sd_models.py index a8c92bc4e..47d0c5947 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -737,8 +737,8 @@ def set_diffuser_options(sd_model, vae = None, op: str = 'model', offload=True): model.eval() return model sd_model = sd_models_compile.apply_compile_to_model(sd_model, eval_model, ["Model", "VAE", "Text Encoder"], op="eval") - if shared.opts.diffusers_quantization: - sd_model = sd_models_compile.dynamic_quantization(sd_model) + if len(shared.opts.torchao_quantization) > 0: + sd_model = sd_models_compile.torchao_quantization(sd_model) if shared.opts.opt_channelslast and hasattr(sd_model, 'unet'): shared.log.debug(f'Setting {op}: channels-last=True') @@ -1193,7 +1193,7 @@ def load_diffuser_file(model_type, pipeline, checkpoint_info, diffusers_load_con from diffusers.utils import import_utils import_utils._accelerate_available = False # pylint: 
disable=protected-access if shared.opts.diffusers_to_gpu and model_type.startswith('Stable Diffusion'): - shared.log.debug(f'Diffusers accelerate: hijack={shared.opts.diffusers_to_gpu}') + shared.log.debug(f'Diffusers accelerate: direct={shared.opts.diffusers_to_gpu}') sd_hijack_accelerate.hijack_accelerate() else: sd_hijack_accelerate.restore_accelerate() diff --git a/modules/sd_models_compile.py b/modules/sd_models_compile.py index ff4fa6aff..91ed84ded 100644 --- a/modules/sd_models_compile.py +++ b/modules/sd_models_compile.py @@ -183,7 +183,7 @@ def nncf_compress_model(model, op=None, sd_model=None): def nncf_compress_weights(sd_model): try: t0 = time.time() - shared.log.info(f"NNCF Compress Weights: {shared.opts.nncf_compress_weights}") + shared.log.info(f"Quantization: type=NNCF modules={shared.opts.nncf_compress_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement install('nncf==2.7.0', quiet=True) @@ -199,9 +199,9 @@ def nncf_compress_weights(sd_model): quant_last_model_device = None t1 = time.time() - shared.log.info(f"NNCF Compress Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=NNCF time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"NNCF Compress Weights: error: {e}") + shared.log.warning(f"Quantization: type=NNCF {e}") return sd_model @@ -249,10 +249,10 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation def optimum_quanto_weights(sd_model): try: if shared.opts.diffusers_offload_mode in {"balanced", "sequential"}: - shared.log.warning(f"Optimum Quanto Weights is incompatible with {shared.opts.diffusers_offload_mode} offload!") + shared.log.warning(f"Quantization: type=Optimum.quanto offload={shared.opts.diffusers_offload_mode} not compatible") return sd_model t0 = time.time() - shared.log.info(f"Optimum Quanto Weights: {shared.opts.optimum_quanto_weights}") + shared.log.info(f"Quantization: type=Optimum.quanto 
modules={shared.opts.optimum_quanto_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement quanto = model_quant.load_quanto() quanto.tensor.qbits.QBitsTensor.create = lambda *args, **kwargs: quanto.tensor.qbits.QBitsTensor(*args, **kwargs) @@ -299,9 +299,9 @@ def optimum_quanto_weights(sd_model): devices.torch_gc(force=True) t1 = time.time() - shared.log.info(f"Optimum Quanto Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"Optimum Quanto Weights: error: {e}") + shared.log.warning(f"Quantization: type=Optimum.quanto {e}") return sd_model @@ -329,7 +329,7 @@ def compile_onediff(sd_model): from onediff.infer_compiler import oneflow_compile except Exception as e: - shared.log.warning(f"Model compile using onediff/oneflow: {e}") + shared.log.warning(f"Model compile: task=onediff {e}") return sd_model try: @@ -351,9 +351,9 @@ def compile_onediff(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task=onediff/oneflow time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=onediff time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=onediff/oneflow error: {e}") + shared.log.info(f"Model compile: task=onediff {e}") return sd_model @@ -361,7 +361,7 @@ def compile_stablefast(sd_model): try: import sfast.compilers.stable_diffusion_pipeline_compiler as sf except Exception as e: - shared.log.warning(f'Model compile using stable-fast: {e}') + shared.log.warning(f'Model compile: task=stablefast {e}') return sd_model config = sf.CompilationConfig.Default() try: @@ -390,9 +390,9 @@ def compile_stablefast(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task='Stable-fast' config={config.__dict__} time={t1-t0:.2f}") + shared.log.info(f"Model compile: 
task=stablefast config={config.__dict__} time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=Stable-fast error: {e}") + shared.log.info(f"Model compile: task=stablefast {e}") return sd_model @@ -401,7 +401,7 @@ def compile_torch(sd_model): t0 = time.time() import torch._dynamo # pylint: disable=unused-import,redefined-outer-name torch._dynamo.reset() # pylint: disable=protected-access - shared.log.debug(f"Model compile available backends: {torch._dynamo.list_backends()}") # pylint: disable=protected-access + shared.log.debug(f"Model compile: task=torch backends={torch._dynamo.list_backends()}") # pylint: disable=protected-access def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused-argument if hasattr(model, "device") and model.device.type != "meta": @@ -442,7 +442,7 @@ def compile_torch(sd_model): torch._inductor.config.use_mixed_mm = True # pylint: disable=protected-access # torch._inductor.config.force_fuse_int_mm_with_mul = True # pylint: disable=protected-access except Exception as e: - shared.log.error(f"Torch inductor config error: {e}") + shared.log.error(f"Model compile: torch inductor config error: {e}") sd_model = apply_compile_to_model(sd_model, function=torch_compile_model, options=shared.opts.cuda_compile, op="compile") @@ -450,9 +450,9 @@ def compile_torch(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=torch time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"Model compile error: {e}") + shared.log.warning(f"Model compile: task=torch {e}") return sd_model @@ -467,19 +467,19 @@ def check_deepcache(enable: bool): def compile_deepcache(sd_model): global deepcache_worker # pylint: disable=global-statement if not hasattr(sd_model, 'unet'): - shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported') + 
shared.log.warning(f'Model compile: task=deepcache pipeline={sd_model.__class__} not supported') return sd_model try: from DeepCache import DeepCacheSDHelper except Exception as e: - shared.log.warning(f'Model compile using deep-cache: {e}') + shared.log.warning(f'Model compile: task=deepcache {e}') return sd_model t0 = time.time() check_deepcache(False) deepcache_worker = DeepCacheSDHelper(pipe=sd_model) deepcache_worker.set_params(cache_interval=shared.opts.deep_cache_interval, cache_branch_id=0) t1 = time.time() - shared.log.info(f"Model compile: task='DeepCache' config={deepcache_worker.params} time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=deepcache config={deepcache_worker.params} time={t1-t0:.2f}") # config={'cache_interval': 3, 'cache_layer_id': 0, 'cache_block_id': 0, 'skip_mode': 'uniform'} time=0.00 return sd_model @@ -503,40 +503,56 @@ def compile_diffusers(sd_model): return sd_model -def dynamic_quantization(sd_model): +def torchao_quantization(sd_model): try: install('torchao', quiet=True) - from torchao.quantization import autoquant + from torchao import quantization as q except Exception as e: - shared.log.error(f"Model dynamic quantization not supported: {e}") + shared.log.error(f"Quantization: type=TorchAO quantization not supported: {e}") return sd_model - - """ - from torchao.quantization import quant_api - def dynamic_quant_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Linear) and mod.in_features > 16 and (mod.in_features, mod.out_features) - not in [(1280, 640), (1920, 1280), (1920, 640), (2048, 1280), (2048, 2560), (2560, 1280), (256, 128), (2816, 1280), (320, 640), (512, 1536), (512, 256), (512, 512), (640, 1280), (640, 1920), (640, 320), (640, 5120), (640, 640), (960, 320), (960, 640)]) - - def conv_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels]) - - 
quant_api.swap_conv2d_1x1_to_linear(sd_model.unet, conv_filter_fn) - quant_api.swap_conv2d_1x1_to_linear(sd_model.vae, conv_filter_fn) - quant_api.apply_dynamic_quant(sd_model.unet, dynamic_quant_filter_fn) - quant_api.apply_dynamic_quant(sd_model.vae, dynamic_quant_filter_fn) - """ - - shared.log.info(f"Model dynamic quantization: pipeline={sd_model.__class__.__name__}") + if shared.opts.torchao_quantization_type == "int8+act": + fn = q.int8_dynamic_activation_int8_weight + elif shared.opts.torchao_quantization_type == "int8": + fn = q.int8_weight_only + elif shared.opts.torchao_quantization_type == "int4": + fn = q.int4_weight_only + elif shared.opts.torchao_quantization_type == "fp8+act": + fn = q.float8_dynamic_activation_float8_weight + elif shared.opts.torchao_quantization_type == "fp8": + fn = q.float8_weight_only + elif shared.opts.torchao_quantization_type == "fpx": + fn = q.fpx_weight_only + else: + shared.log.error(f"Quantization: type=TorchAO type={shared.opts.torchao_quantization_type} not supported") + return sd_model + shared.log.info(f"Quantization: type=TorchAO pipe={sd_model.__class__.__name__} quant={shared.opts.torchao_quantization_type} fn={fn} targets={shared.opts.torchao_quantization}") try: - if shared.sd_model_type == 'sd' or shared.sd_model_type == 'sdxl': - sd_model.unet = sd_model.unet.to(devices.device) - sd_model.unet = autoquant(sd_model.unet, error_on_unseen=False) - elif shared.sd_model_type == 'f1': - sd_model.transformer = autoquant(sd_model.transformer, error_on_unseen=False) - else: - shared.log.error(f"Model dynamic quantization not supported: {shared.sd_model_type}") + t0 = time.time() + modules = [] + if hasattr(sd_model, 'unet') and 'Model' in shared.opts.torchao_quantization: + modules.append('unet') + q.quantize_(sd_model.unet, fn(), device=devices.device) + if hasattr(sd_model, 'transformer') and 'Model' in shared.opts.torchao_quantization: + modules.append('transformer') + q.quantize_(sd_model.transformer, fn(), 
device=devices.device) + # sd_model.transformer = q.autoquant(sd_model.transformer, error_on_unseen=False) + if hasattr(sd_model, 'vae') and 'VAE' in shared.opts.torchao_quantization: + modules.append('vae') + q.quantize_(sd_model.vae, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te1') + q.quantize_(sd_model.text_encoder, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_2') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te2') + q.quantize_(sd_model.text_encoder_2, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_3') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te3') + q.quantize_(sd_model.text_encoder_3, fn(), device=devices.device) + t1 = time.time() + shared.log.info(f"Quantization: type=TorchAO modules={modules} time={t1-t0:.2f}") except Exception as e: - shared.log.error(f"Model dynamic quantization: {e}") + shared.log.error(f"Quantization: type=TorchAO {e}") + setup_logging() # torchao uses dynamo which messes with logging so reset is needed return sd_model diff --git a/modules/shared.py b/modules/shared.py index d54fd65b3..69574ff92 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -426,7 +426,6 @@ startup_offload_mode, startup_cross_attention, startup_sdp_options = get_default options_templates.update(options_section(('sd', "Execution & Models"), { "sd_backend": OptionInfo(default_backend, "Execution backend", gr.Radio, {"choices": ["diffusers", "original"] }), - "autolaunch": OptionInfo(False, "Autolaunch browser upon startup"), "sd_model_checkpoint": OptionInfo(default_checkpoint, "Base model", gr.Dropdown, lambda: {"choices": list_checkpoint_tiles()}, refresh=refresh_checkpoints), "sd_model_refiner": OptionInfo('None', "Refiner model", gr.Dropdown, lambda: {"choices": ['None'] + list_checkpoint_tiles()}, refresh=refresh_checkpoints), "sd_vae": 
OptionInfo("Automatic", "VAE model", gr.Dropdown, lambda: {"choices": shared_items.sd_vae_items()}, refresh=shared_items.refresh_vae_list), @@ -493,11 +492,12 @@ options_templates.update(options_section(('cuda', "Compute Settings"), { "quant_sep": OptionInfo("

Model Quantization

", "", gr.HTML, {"visible": native}), "quant_shuffle_weights": OptionInfo(False, "Shuffle the weights between GPU and CPU when quantizing", gr.Checkbox, {"visible": native}), - "diffusers_quantization": OptionInfo(False, "Dynamic quantization with TorchAO", gr.Checkbox, {"visible": native}), - "nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF INT8", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights": OptionInfo([], "Quantize Model weights with Optimum Quanto", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights_type": OptionInfo("qint8", "Weights type for Optimum Quanto", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), - "optimum_quanto_activations_type": OptionInfo("none", "Activations type for Optimum Quanto", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "nncf_compress_weights": OptionInfo([], "NNCF int8 compression enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights": OptionInfo([], "Optimum.quanto quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights_type": OptionInfo("qint8", "Optimum.quanto quantization type", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), + "optimum_quanto_activations_type": OptionInfo("none", "Optimum.quanto quantization activations", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "torchao_quantization": OptionInfo([], "TorchAO quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}), + "torchao_quantization_type": 
OptionInfo("int8", "TorchAO quantization type", gr.Radio, {"choices": ["int8+act", "int8", "int4", "fp8+act", "fp8", "fpx"], "visible": native}), "ipex_sep": OptionInfo("

IPEX

", "", gr.HTML, {"visible": devices.backend == "ipex"}), "ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}), @@ -713,6 +713,7 @@ options_templates.update(options_section(('ui', "User Interface Options"), { "theme_type": OptionInfo("Standard", "Theme type", gr.Radio, {"choices": ["Modern", "Standard", "None"]}), "theme_style": OptionInfo("Auto", "Theme mode", gr.Radio, {"choices": ["Auto", "Dark", "Light"]}), "gradio_theme": OptionInfo("black-teal", "UI theme", gr.Dropdown, lambda: {"choices": theme.list_themes()}, refresh=theme.refresh_themes), + "autolaunch": OptionInfo(False, "Autolaunch browser upon startup"), "font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}), "tooltips": OptionInfo("UI Tooltips", "UI tooltips", gr.Radio, {"choices": ["None", "Browser default", "UI tooltips"], "visible": False}), "aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"),