reimplement torchao quantization

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/3495/head
Vladimir Mandic 2024-10-18 09:34:01 -04:00
parent 1d51ae3baa
commit ae4591ac0b
6 changed files with 89 additions and 61 deletions


@ -1,8 +1,8 @@
# Change Log for SD.Next
## Update for 2024-10-17
## Update for 2024-10-18
### Highlights for 2024-10-17
### Highlights for 2024-10-18
- **Reprocess**: New workflow options that allow you to generate at lower quality and then
reprocess at higher quality for select images only, or generate without hires/refine and then reprocess with hires/refine
@ -22,6 +22,8 @@
- Auto-detection of best available **device/dtype** settings for your platform and GPU reduces the need for manual configuration
- Full rewrite of **sampler options**, now far more streamlined with tons of new options to tweak scheduler behavior
- Improved **LoRA** detection and handling for all supported models
- Tons of work on dynamic quantization that can be applied on-the-fly during model load to any model type
Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more...
Oh, and we've compiled a full table listing popular text-to-image generative models, their respective parameters, and an architecture overview: <https://github.com/vladmandic/automatic/wiki/Models>
@ -30,7 +32,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
[README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867)
### Details for 2024-10-17
### Details for 2024-10-18
- **reprocess**
- new top-level button: reprocess latent from your history of generated image(s)
@ -211,6 +213,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- setting `lora_load_gpu` to load LoRA directly to GPU
*default*: true unless lowvram
- **torchao**
- reimplement torchao quantization
- configure in settings -> compute settings -> quantization
- can be applied to any model on-the-fly during load (see the sketch after this list)
- **huggingface**:
- force logout/login on token change
- unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths
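
For context on the torchao entry above, the new path boils down to torchao's `quantize_` applied in place to selected pipeline components; a minimal sketch, where `pipe` is assumed to be an already-loaded diffusers pipeline:

```python
from torchao.quantization import quantize_, int8_weight_only

# int8 weight-only quantization applied in place to the denoiser;
# `pipe` is a stand-in for whichever diffusers pipeline is loaded
quantize_(pipe.unet, int8_weight_only())
```

The full dispatch over quantization types and pipeline components is in the diff below.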
@ -233,6 +240,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- fix update infotext on image select
- fix imageviewer exif parser
- selectable info view in image viewer, thanks @ZeldaMaster501
- setting to enable browser autolaunch, thanks @brknsoul
- **free-u**: check if device/dtype are FFT-compatible and cast as necessary (see the sketch after this list)
- **rocm**
- additional gpu detection and auto-config code, thanks @lshqqytiger
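
Regarding the free-u fix above: torch.fft kernels reject some device/dtype combinations, so values are upcast before the transform and restored afterwards; a minimal sketch of that pattern, assumptions mine:

```python
import torch

def fft_safe(x: torch.Tensor) -> torch.Tensor:
    # torch.fft does not cover every device/dtype combination (notably
    # float16 on some backends), so upcast, transform, then restore
    dtype = x.dtype
    if dtype not in (torch.float32, torch.float64):
        x = x.to(torch.float32)
    x = torch.fft.fftn(x, dim=(-2, -1))
    x = torch.fft.ifftn(x, dim=(-2, -1)).real
    return x.to(dtype)
```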


@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; }
.gradio-radio { padding: 0 !important; width: max-content !important; }
.gradio-slider { margin-right: var(--spacing-sm) !important; width: max-content !important }
.gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; }
.gradio-checkboxgroup { padding: 0 !important; }
.gradio-checkbox > label { color: var(--block-title-text-color) !important; }
/* custom gradio elements */
.accordion-compact { padding: 8px 0px 4px 0px !important; }


@ -1,5 +1,6 @@
from __future__ import annotations
from functools import partial
import os
import re
import sys
import logging
@ -13,6 +14,7 @@ errors.install()
logging.getLogger("DeepSpeed").disabled = True
os.environ.setdefault('TORCH_LOGS', '-all')
import torch # pylint: disable=C0411
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
@ -96,7 +98,6 @@ def get_packages():
}
try:
import os
import math
cores = os.cpu_count()
affinity = len(os.sched_getaffinity(0))
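
Worth noting: `os.sched_getaffinity` is Linux-only, so callers typically guard it; a minimal portable sketch:

```python
import os

cores = os.cpu_count() or 1
try:
    # cores the current process is actually allowed to run on
    affinity = len(os.sched_getaffinity(0))
except AttributeError:
    # not available on Windows/macOS; fall back to the full core count
    affinity = cores
```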


@ -737,8 +737,8 @@ def set_diffuser_options(sd_model, vae = None, op: str = 'model', offload=True):
model.eval()
return model
sd_model = sd_models_compile.apply_compile_to_model(sd_model, eval_model, ["Model", "VAE", "Text Encoder"], op="eval")
if shared.opts.diffusers_quantization:
sd_model = sd_models_compile.dynamic_quantization(sd_model)
if len(shared.opts.torchao_quantization) > 0:
sd_model = sd_models_compile.torchao_quantization(sd_model)
if shared.opts.opt_channelslast and hasattr(sd_model, 'unet'):
shared.log.debug(f'Setting {op}: channels-last=True')
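
The channels-last option above is a one-line memory-format cast; a minimal sketch, with `unet` standing in for the pipeline's denoiser:

```python
import torch

# channels-last stores NCHW tensors in NHWC memory order, which cuDNN
# convolution kernels often execute faster; weights convert once
unet = unet.to(memory_format=torch.channels_last)
```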
@ -1193,7 +1193,7 @@ def load_diffuser_file(model_type, pipeline, checkpoint_info, diffusers_load_con
from diffusers.utils import import_utils
import_utils._accelerate_available = False # pylint: disable=protected-access
if shared.opts.diffusers_to_gpu and model_type.startswith('Stable Diffusion'):
shared.log.debug(f'Diffusers accelerate: hijack={shared.opts.diffusers_to_gpu}')
shared.log.debug(f'Diffusers accelerate: direct={shared.opts.diffusers_to_gpu}')
sd_hijack_accelerate.hijack_accelerate()
else:
sd_hijack_accelerate.restore_accelerate()


@ -183,7 +183,7 @@ def nncf_compress_model(model, op=None, sd_model=None):
def nncf_compress_weights(sd_model):
try:
t0 = time.time()
shared.log.info(f"NNCF Compress Weights: {shared.opts.nncf_compress_weights}")
shared.log.info(f"Quantization: type=NNCF modules={shared.opts.nncf_compress_weights}")
global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement
install('nncf==2.7.0', quiet=True)
@ -199,9 +199,9 @@ def nncf_compress_weights(sd_model):
quant_last_model_device = None
t1 = time.time()
shared.log.info(f"NNCF Compress Weights: time={t1-t0:.2f}")
shared.log.info(f"Quantization: type=NNCF time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"NNCF Compress Weights: error: {e}")
shared.log.warning(f"Quantization: type=NNCF {e}")
return sd_model
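
For reference, the NNCF path amounts to int8 weight-only compression of the selected modules; a minimal sketch, assuming nncf's `compress_weights` entry point as pinned above (nncf==2.7.0):

```python
import torch
import nncf

model = torch.nn.Sequential(torch.nn.Linear(512, 512))
# int8 weight-only compression; activations keep their original dtype
model = nncf.compress_weights(model)
```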
@ -249,10 +249,10 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
def optimum_quanto_weights(sd_model):
try:
if shared.opts.diffusers_offload_mode in {"balanced", "sequential"}:
shared.log.warning(f"Optimum Quanto Weights is incompatible with {shared.opts.diffusers_offload_mode} offload!")
shared.log.warning(f"Quantization: type=Optimum.quanto offload={shared.opts.diffusers_offload_mode} not compatible")
return sd_model
t0 = time.time()
shared.log.info(f"Optimum Quanto Weights: {shared.opts.optimum_quanto_weights}")
shared.log.info(f"Quantization: type=Optimum.quanto: modules={shared.opts.optimum_quanto_weights}")
global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement
quanto = model_quant.load_quanto()
quanto.tensor.qbits.QBitsTensor.create = lambda *args, **kwargs: quanto.tensor.qbits.QBitsTensor(*args, **kwargs)
@ -299,9 +299,9 @@ def optimum_quanto_weights(sd_model):
devices.torch_gc(force=True)
t1 = time.time()
shared.log.info(f"Optimum Quanto Weights: time={t1-t0:.2f}")
shared.log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"Optimum Quanto Weights: error: {e}")
shared.log.warning(f"Quantization: type=Optimum.quanto {e}")
return sd_model
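
The Optimum.quanto path uses the library's quantize/freeze pair; a minimal sketch, with `model` standing in for any torch module selected above:

```python
from optimum.quanto import quantize, freeze, qint8

# tag weights for qint8 quantization, then freeze to materialize the
# quantized tensors and discard the float originals
quantize(model, weights=qint8)
freeze(model)
```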
@ -329,7 +329,7 @@ def compile_onediff(sd_model):
from onediff.infer_compiler import oneflow_compile
except Exception as e:
shared.log.warning(f"Model compile using onediff/oneflow: {e}")
shared.log.warning(f"Model compile: task=onediff {e}")
return sd_model
try:
@ -351,9 +351,9 @@ def compile_onediff(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: task=onediff/oneflow time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=onediff time={t1-t0:.2f}")
except Exception as e:
shared.log.info(f"Model compile: task=onediff/oneflow error: {e}")
shared.log.info(f"Model compile: task=onediff {e}")
return sd_model
@ -361,7 +361,7 @@ def compile_stablefast(sd_model):
try:
import sfast.compilers.stable_diffusion_pipeline_compiler as sf
except Exception as e:
shared.log.warning(f'Model compile using stable-fast: {e}')
shared.log.warning(f'Model compile: task=stablefast {e}')
return sd_model
config = sf.CompilationConfig.Default()
try:
@ -390,9 +390,9 @@ def compile_stablefast(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: task='Stable-fast' config={config.__dict__} time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=stablefast config={config.__dict__} time={t1-t0:.2f}")
except Exception as e:
shared.log.info(f"Model compile: task=Stable-fast error: {e}")
shared.log.info(f"Model compile: task=stablefast {e}")
return sd_model
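
For reference, the stable-fast path builds a `CompilationConfig` and compiles the whole pipeline; a minimal sketch following the module imported above, with feature toggles as assumptions:

```python
from sfast.compilers.stable_diffusion_pipeline_compiler import compile, CompilationConfig

config = CompilationConfig.Default()
config.enable_xformers = False  # optional integrations are toggled per environment
# returns a compiled copy of the diffusers pipeline `pipe`
pipe = compile(pipe, config)
```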
@ -401,7 +401,7 @@ def compile_torch(sd_model):
t0 = time.time()
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
torch._dynamo.reset() # pylint: disable=protected-access
shared.log.debug(f"Model compile available backends: {torch._dynamo.list_backends()}") # pylint: disable=protected-access
shared.log.debug(f"Model compile: task=torch backends={torch._dynamo.list_backends()}") # pylint: disable=protected-access
def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused-argument
if hasattr(model, "device") and model.device.type != "meta":
@ -442,7 +442,7 @@ def compile_torch(sd_model):
torch._inductor.config.use_mixed_mm = True # pylint: disable=protected-access
# torch._inductor.config.force_fuse_int_mm_with_mul = True # pylint: disable=protected-access
except Exception as e:
shared.log.error(f"Torch inductor config error: {e}")
shared.log.error(f"Model compile: torch inductor config error: {e}")
sd_model = apply_compile_to_model(sd_model, function=torch_compile_model, options=shared.opts.cuda_compile, op="compile")
@ -450,9 +450,9 @@ def compile_torch(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=torch time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"Model compile error: {e}")
shared.log.warning(f"Model compile: task=torch {e}")
return sd_model
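
The torch path above reduces to a `torch.compile` call parameterized by the cuda_compile settings; a minimal sketch, with backend and mode as example values:

```python
import torch

# pick a dynamo backend and trade longer first-run warmup for faster
# steady-state inference; fullgraph=False tolerates graph breaks
unet = torch.compile(unet, backend="inductor", mode="reduce-overhead", fullgraph=False)
```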
@ -467,19 +467,19 @@ def check_deepcache(enable: bool):
def compile_deepcache(sd_model):
global deepcache_worker # pylint: disable=global-statement
if not hasattr(sd_model, 'unet'):
shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported')
shared.log.warning(f'Model compile: task=deepcache pipeline={sd_model.__class__} not supported')
return sd_model
try:
from DeepCache import DeepCacheSDHelper
except Exception as e:
shared.log.warning(f'Model compile using deep-cache: {e}')
shared.log.warning(f'Model compile: task=deepcache {e}')
return sd_model
t0 = time.time()
check_deepcache(False)
deepcache_worker = DeepCacheSDHelper(pipe=sd_model)
deepcache_worker.set_params(cache_interval=shared.opts.deep_cache_interval, cache_branch_id=0)
t1 = time.time()
shared.log.info(f"Model compile: task='DeepCache' config={deepcache_worker.params} time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=deepcache config={deepcache_worker.params} time={t1-t0:.2f}")
# config={'cache_interval': 3, 'cache_layer_id': 0, 'cache_block_id': 0, 'skip_mode': 'uniform'} time=0.00
return sd_model
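
DeepCache usage mirrors the helper calls above; a minimal sketch, with `pipe` assumed to be a loaded U-Net pipeline:

```python
from DeepCache import DeepCacheSDHelper

helper = DeepCacheSDHelper(pipe=pipe)
# reuse cached deep U-Net features for cache_interval steps, recomputing
# only the shallow branch selected by cache_branch_id
helper.set_params(cache_interval=3, cache_branch_id=0)
helper.enable()
```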
@ -503,40 +503,56 @@ def compile_diffusers(sd_model):
return sd_model
def dynamic_quantization(sd_model):
def torchao_quantization(sd_model):
try:
install('torchao', quiet=True)
from torchao.quantization import autoquant
from torchao import quantization as q
except Exception as e:
shared.log.error(f"Model dynamic quantization not supported: {e}")
shared.log.error(f"Quantization: type=TorchAO quantization not supported: {e}")
return sd_model
"""
from torchao.quantization import quant_api
def dynamic_quant_filter_fn(mod, *args): # pylint: disable=unused-argument
return (isinstance(mod, torch.nn.Linear) and mod.in_features > 16 and (mod.in_features, mod.out_features)
not in [(1280, 640), (1920, 1280), (1920, 640), (2048, 1280), (2048, 2560), (2560, 1280), (256, 128), (2816, 1280), (320, 640), (512, 1536), (512, 256), (512, 512), (640, 1280), (640, 1920), (640, 320), (640, 5120), (640, 640), (960, 320), (960, 640)])
def conv_filter_fn(mod, *args): # pylint: disable=unused-argument
return (isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels])
quant_api.swap_conv2d_1x1_to_linear(sd_model.unet, conv_filter_fn)
quant_api.swap_conv2d_1x1_to_linear(sd_model.vae, conv_filter_fn)
quant_api.apply_dynamic_quant(sd_model.unet, dynamic_quant_filter_fn)
quant_api.apply_dynamic_quant(sd_model.vae, dynamic_quant_filter_fn)
"""
shared.log.info(f"Model dynamic quantization: pipeline={sd_model.__class__.__name__}")
if shared.opts.torchao_quantization_type == "int8+act":
fn = q.int8_dynamic_activation_int8_weight
elif shared.opts.torchao_quantization_type == "int8":
fn = q.int8_weight_only
elif shared.opts.torchao_quantization_type == "int4":
fn = q.int4_weight_only
elif shared.opts.torchao_quantization_type == "fp8+act":
fn = q.float8_dynamic_activation_float8_weight
elif shared.opts.torchao_quantization_type == "fp8":
fn = q.float8_weight_only
elif shared.opts.torchao_quantization_type == "fpx":
fn = q.fpx_weight_only
else:
shared.log.error(f"Quantization: type=TorchAO type={shared.opts.torchao_quantization_type} not supported")
return sd_model
shared.log.info(f"Quantization: type=TorchAO pipe={sd_model.__class__.__name__} quant={shared.opts.torchao_quantization_type} fn={fn} targets={shared.opts.torchao_quantization}")
try:
if shared.sd_model_type == 'sd' or shared.sd_model_type == 'sdxl':
sd_model.unet = sd_model.unet.to(devices.device)
sd_model.unet = autoquant(sd_model.unet, error_on_unseen=False)
elif shared.sd_model_type == 'f1':
sd_model.transformer = autoquant(sd_model.transformer, error_on_unseen=False)
else:
shared.log.error(f"Model dynamic quantization not supported: {shared.sd_model_type}")
t0 = time.time()
modules = []
if hasattr(sd_model, 'unet') and 'Model' in shared.opts.torchao_quantization:
modules.append('unet')
q.quantize_(sd_model.unet, fn(), device=devices.device)
if hasattr(sd_model, 'transformer') and 'Model' in shared.opts.torchao_quantization:
modules.append('transformer')
q.quantize_(sd_model.transformer, fn(), device=devices.device)
# sd_model.transformer = q.autoquant(sd_model.transformer, error_on_unseen=False)
if hasattr(sd_model, 'vae') and 'VAE' in shared.opts.torchao_quantization:
modules.append('vae')
q.quantize_(sd_model.vae, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te1')
q.quantize_(sd_model.text_encoder, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder_2') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te2')
q.quantize_(sd_model.text_encoder_2, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder_3') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te3')
q.quantize_(sd_model.text_encoder_3, fn(), device=devices.device)
t1 = time.time()
shared.log.info(f"Quantization: type=TorchAO modules={modules} time={t1-t0:.2f}")
except Exception as e:
shared.log.error(f"Model dynamic quantization: {e}")
shared.log.error(f"Quantization: type=TorchAO {e}")
setup_logging() # torchao uses dynamo which messes with logging so reset is needed
return sd_model
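
The commented-out `autoquant` call above is torchao's self-tuning alternative to an explicit `quantize_` config; a minimal sketch of that route:

```python
import torch
from torchao.quantization import autoquant

# autoquant wraps linear layers in self-benchmarking proxies: the first
# forward pass times candidate kernels and locks in the fastest, while
# error_on_unseen=False leaves never-exercised layers at full precision
model = autoquant(torch.nn.Sequential(torch.nn.Linear(64, 64)), error_on_unseen=False)
model(torch.randn(1, 64))
```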


@ -426,7 +426,6 @@ startup_offload_mode, startup_cross_attention, startup_sdp_options = get_default
options_templates.update(options_section(('sd', "Execution & Models"), {
"sd_backend": OptionInfo(default_backend, "Execution backend", gr.Radio, {"choices": ["diffusers", "original"] }),
"autolaunch": OptionInfo(False, "Autolaunch browser upon startup"),
"sd_model_checkpoint": OptionInfo(default_checkpoint, "Base model", gr.Dropdown, lambda: {"choices": list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_model_refiner": OptionInfo('None', "Refiner model", gr.Dropdown, lambda: {"choices": ['None'] + list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_vae": OptionInfo("Automatic", "VAE model", gr.Dropdown, lambda: {"choices": shared_items.sd_vae_items()}, refresh=shared_items.refresh_vae_list),
@ -493,11 +492,12 @@ options_templates.update(options_section(('cuda', "Compute Settings"), {
"quant_sep": OptionInfo("<h2>Model Quantization</h2>", "", gr.HTML, {"visible": native}),
"quant_shuffle_weights": OptionInfo(False, "Shuffle the weights between GPU and CPU when quantizing", gr.Checkbox, {"visible": native}),
"diffusers_quantization": OptionInfo(False, "Dynamic quantization with TorchAO", gr.Checkbox, {"visible": native}),
"nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF INT8", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights": OptionInfo([], "Quantize Model weights with Optimum Quanto", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights_type": OptionInfo("qint8", "Weights type for Optimum Quanto", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}),
"optimum_quanto_activations_type": OptionInfo("none", "Activations type for Optimum Quanto", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}),
"nncf_compress_weights": OptionInfo([], "NNCF int8 compression enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights": OptionInfo([], "Optimum.quanto quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights_type": OptionInfo("qint8", "Optimum.quanto quantization type", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}),
"optimum_quanto_activations_type": OptionInfo("none", "Optimum.quanto quantization activations ", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}),
"torchao_quantization": OptionInfo([], "TorchAO quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}),
"torchao_quantization_type": OptionInfo("int8", "TorchAO quantization type", gr.Radio, {"choices": ["int8+act", "int8", "int4", "fp8+act", "fp8", "fpx"], "visible": native}),
"ipex_sep": OptionInfo("<h2>IPEX</h2>", "", gr.HTML, {"visible": devices.backend == "ipex"}),
"ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}),
@ -713,6 +713,7 @@ options_templates.update(options_section(('ui', "User Interface Options"), {
"theme_type": OptionInfo("Standard", "Theme type", gr.Radio, {"choices": ["Modern", "Standard", "None"]}),
"theme_style": OptionInfo("Auto", "Theme mode", gr.Radio, {"choices": ["Auto", "Dark", "Light"]}),
"gradio_theme": OptionInfo("black-teal", "UI theme", gr.Dropdown, lambda: {"choices": theme.list_themes()}, refresh=theme.refresh_themes),
"autolaunch": OptionInfo(False, "Autolaunch browser upon startup"),
"font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}),
"tooltips": OptionInfo("UI Tooltips", "UI tooltips", gr.Radio, {"choices": ["None", "Browser default", "UI tooltips"], "visible": False}),
"aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"),