reimplement torchao quantization

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/3495/head
Vladimir Mandic 2024-10-18 09:34:01 -04:00
parent 1d51ae3baa
commit ae4591ac0b
6 changed files with 89 additions and 61 deletions


@ -1,8 +1,8 @@
# Change Log for SD.Next
## Update for 2024-10-17
## Update for 2024-10-18
### Highlights for 2024-10-17
### Highlights for 2024-10-18
- **Reprocess**: New workflow options that allow you to generate at lower quality and then
reprocess at higher quality for select images only, or generate without hires/refine and then reprocess with hires/refine
@ -22,6 +22,8 @@
- Auto-detection of best available **device/dtype** settings for your platform and GPU reduces the need for manual configuration
- Full rewrite of **sampler options**, now far more streamlined with tons of new options to tweak scheduler behavior
- Improved **LoRA** detection and handling for all supported models
- Tons of work on dynamic quantization that can be applied on-the-fly during model load to any model type
Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more...
Oh, and we've compiled a full table listing popular text-to-image generative models, their respective parameters, and an architecture overview: <https://github.com/vladmandic/automatic/wiki/Models>
@ -30,7 +32,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
[README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867)
### Details for 2024-10-17
### Details for 2024-10-18
- **reprocess**
- new top-level button: reprocess latent from your history of generated image(s)
@ -211,6 +213,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- setting `lora_load_gpu` to load LoRA directly to GPU
*default*: true unless lowvram
- **torchao**
- reimplement torchao quantization
- configure in settings -> compute settings -> quantization
- can be applied to any model on-the-fly during load (see the sketch after this list)
- **huggingface**:
- force logout/login on token change
- unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths
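
For context on the torchao entry above, the new path boils down to torchao's `quantize_` applied in place to selected pipeline components; a minimal sketch, where `pipe` is assumed to be an already-loaded diffusers pipeline:

```python
from torchao.quantization import quantize_, int8_weight_only

# int8 weight-only quantization applied in place to the denoiser;
# `pipe` is a stand-in for whichever diffusers pipeline is loaded
quantize_(pipe.unet, int8_weight_only())
```

The full dispatch over quantization types and pipeline components is in the diff below.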
@ -233,6 +240,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- fix update infotext on image select
- fix imageviewer exif parser
- selectable info view in image viewer, thanks @ZeldaMaster501
- setting to enable browser autolaunch, thanks @brknsoul
- **free-u**: check if device/dtype are FFT-compatible and cast as necessary (see the sketch after this list)
- **rocm**
- additional gpu detection and auto-config code, thanks @lshqqytiger
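
Regarding the free-u fix above: torch.fft kernels reject some device/dtype combinations, so values are upcast before the transform and restored afterwards; a minimal sketch of that pattern, assumptions mine:

```python
import torch

def fft_safe(x: torch.Tensor) -> torch.Tensor:
    # torch.fft does not cover every device/dtype combination (notably
    # float16 on some backends), so upcast, transform, then restore
    dtype = x.dtype
    if dtype not in (torch.float32, torch.float64):
        x = x.to(torch.float32)
    x = torch.fft.fftn(x, dim=(-2, -1))
    x = torch.fft.ifftn(x, dim=(-2, -1)).real
    return x.to(dtype)
```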


@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; }
.gradio-radio { padding: 0 !important; width: max-content !important; }
.gradio-slider { margin-right: var(--spacing-sm) !important; width: max-content !important }
.gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; }
.gradio-checkboxgroup { padding: 0 !important; }
.gradio-checkbox > label { color: var(--block-title-text-color) !important; }
/* custom gradio elements */
.accordion-compact { padding: 8px 0px 4px 0px !important; }


@ -1,5 +1,6 @@
from __future__ import annotations
from functools import partial
import os
import re
import sys
import logging
@ -13,6 +14,7 @@ errors.install()
logging.getLogger("DeepSpeed").disabled = True
os.environ.setdefault('TORCH_LOGS', '-all')
import torch # pylint: disable=C0411
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
@ -96,7 +98,6 @@ def get_packages():
}
try:
import os
import math
cores = os.cpu_count()
affinity = len(os.sched_getaffinity(0))
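
Worth noting: `os.sched_getaffinity` is Linux-only, so callers typically guard it; a minimal portable sketch:

```python
import os

cores = os.cpu_count() or 1
try:
    # cores the current process is actually allowed to run on
    affinity = len(os.sched_getaffinity(0))
except AttributeError:
    # not available on Windows/macOS; fall back to the full core count
    affinity = cores
```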


@ -737,8 +737,8 @@ def set_diffuser_options(sd_model, vae = None, op: str = 'model', offload=True):
model.eval()
return model
sd_model = sd_models_compile.apply_compile_to_model(sd_model, eval_model, ["Model", "VAE", "Text Encoder"], op="eval")
if shared.opts.diffusers_quantization:
sd_model = sd_models_compile.dynamic_quantization(sd_model)
if len(shared.opts.torchao_quantization) > 0:
sd_model = sd_models_compile.torchao_quantization(sd_model)
if shared.opts.opt_channelslast and hasattr(sd_model, 'unet'):
shared.log.debug(f'Setting {op}: channels-last=True')
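
The channels-last option above is a one-line memory-format cast; a minimal sketch, with `unet` standing in for the pipeline's denoiser:

```python
import torch

# channels-last stores NCHW tensors in NHWC memory order, which cuDNN
# convolution kernels often execute faster; weights convert once
unet = unet.to(memory_format=torch.channels_last)
```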
@ -1193,7 +1193,7 @@ def load_diffuser_file(model_type, pipeline, checkpoint_info, diffusers_load_con
from diffusers.utils import import_utils
import_utils._accelerate_available = False # pylint: disable=protected-access
if shared.opts.diffusers_to_gpu and model_type.startswith('Stable Diffusion'):
shared.log.debug(f'Diffusers accelerate: hijack={shared.opts.diffusers_to_gpu}')
shared.log.debug(f'Diffusers accelerate: direct={shared.opts.diffusers_to_gpu}')
sd_hijack_accelerate.hijack_accelerate()
else:
sd_hijack_accelerate.restore_accelerate()


@ -183,7 +183,7 @@ def nncf_compress_model(model, op=None, sd_model=None):
def nncf_compress_weights(sd_model):
try:
t0 = time.time()
shared.log.info(f"NNCF Compress Weights: {shared.opts.nncf_compress_weights}")
shared.log.info(f"Quantization: type=NNCF modules={shared.opts.nncf_compress_weights}")
global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement
install('nncf==2.7.0', quiet=True)
@ -199,9 +199,9 @@ def nncf_compress_weights(sd_model):
quant_last_model_device = None
t1 = time.time()
shared.log.info(f"NNCF Compress Weights: time={t1-t0:.2f}")
shared.log.info(f"Quantization: type=NNCF time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"NNCF Compress Weights: error: {e}")
shared.log.warning(f"Quantization: type=NNCF {e}")
return sd_model
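
For reference, the NNCF path amounts to int8 weight-only compression of the selected modules; a minimal sketch, assuming nncf's `compress_weights` entry point as pinned above (nncf==2.7.0):

```python
import torch
import nncf

model = torch.nn.Sequential(torch.nn.Linear(512, 512))
# int8 weight-only compression; activations keep their original dtype
model = nncf.compress_weights(model)
```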
@ -249,10 +249,10 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation
def optimum_quanto_weights(sd_model):
try:
if shared.opts.diffusers_offload_mode in {"balanced", "sequential"}:
shared.log.warning(f"Optimum Quanto Weights is incompatible with {shared.opts.diffusers_offload_mode} offload!")
shared.log.warning(f"Quantization: type=Optimum.quanto offload={shared.opts.diffusers_offload_mode} not compatible")
return sd_model
t0 = time.time()
shared.log.info(f"Optimum Quanto Weights: {shared.opts.optimum_quanto_weights}")
shared.log.info(f"Quantization: type=Optimum.quanto: modules={shared.opts.optimum_quanto_weights}")
global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement
quanto = model_quant.load_quanto()
quanto.tensor.qbits.QBitsTensor.create = lambda *args, **kwargs: quanto.tensor.qbits.QBitsTensor(*args, **kwargs)
@ -299,9 +299,9 @@ def optimum_quanto_weights(sd_model):
devices.torch_gc(force=True)
t1 = time.time()
shared.log.info(f"Optimum Quanto Weights: time={t1-t0:.2f}")
shared.log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"Optimum Quanto Weights: error: {e}")
shared.log.warning(f"Quantization: type=Optimum.quanto {e}")
return sd_model
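
The Optimum.quanto path uses the library's quantize/freeze pair; a minimal sketch, with `model` standing in for any torch module selected above:

```python
from optimum.quanto import quantize, freeze, qint8

# tag weights for qint8 quantization, then freeze to materialize the
# quantized tensors and discard the float originals
quantize(model, weights=qint8)
freeze(model)
```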
@ -329,7 +329,7 @@ def compile_onediff(sd_model):
from onediff.infer_compiler import oneflow_compile
except Exception as e:
shared.log.warning(f"Model compile using onediff/oneflow: {e}")
shared.log.warning(f"Model compile: task=onediff {e}")
return sd_model
try:
@ -351,9 +351,9 @@ def compile_onediff(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: task=onediff/oneflow time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=onediff time={t1-t0:.2f}")
except Exception as e:
shared.log.info(f"Model compile: task=onediff/oneflow error: {e}")
shared.log.info(f"Model compile: task=onediff {e}")
return sd_model
@ -361,7 +361,7 @@ def compile_stablefast(sd_model):
try:
import sfast.compilers.stable_diffusion_pipeline_compiler as sf
except Exception as e:
shared.log.warning(f'Model compile using stable-fast: {e}')
shared.log.warning(f'Model compile: task=stablefast {e}')
return sd_model
config = sf.CompilationConfig.Default()
try:
@ -390,9 +390,9 @@ def compile_stablefast(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: task='Stable-fast' config={config.__dict__} time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=stablefast config={config.__dict__} time={t1-t0:.2f}")
except Exception as e:
shared.log.info(f"Model compile: task=Stable-fast error: {e}")
shared.log.info(f"Model compile: task=stablefast {e}")
return sd_model
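
For reference, the stable-fast path builds a `CompilationConfig` and compiles the whole pipeline; a minimal sketch following the module imported above, with feature toggles as assumptions:

```python
from sfast.compilers.stable_diffusion_pipeline_compiler import compile, CompilationConfig

config = CompilationConfig.Default()
config.enable_xformers = False  # optional integrations are toggled per environment
# returns a compiled copy of the diffusers pipeline `pipe`
pipe = compile(pipe, config)
```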
@ -401,7 +401,7 @@ def compile_torch(sd_model):
t0 = time.time()
import torch._dynamo # pylint: disable=unused-import,redefined-outer-name
torch._dynamo.reset() # pylint: disable=protected-access
shared.log.debug(f"Model compile available backends: {torch._dynamo.list_backends()}") # pylint: disable=protected-access
shared.log.debug(f"Model compile: task=torch backends={torch._dynamo.list_backends()}") # pylint: disable=protected-access
def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused-argument
if hasattr(model, "device") and model.device.type != "meta":
@ -442,7 +442,7 @@ def compile_torch(sd_model):
torch._inductor.config.use_mixed_mm = True # pylint: disable=protected-access
# torch._inductor.config.force_fuse_int_mm_with_mul = True # pylint: disable=protected-access
except Exception as e:
shared.log.error(f"Torch inductor config error: {e}")
shared.log.error(f"Model compile: torch inductor config error: {e}")
sd_model = apply_compile_to_model(sd_model, function=torch_compile_model, options=shared.opts.cuda_compile, op="compile")
@ -450,9 +450,9 @@ def compile_torch(sd_model):
if shared.opts.cuda_compile_precompile:
sd_model("dummy prompt")
t1 = time.time()
shared.log.info(f"Model compile: time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=torch time={t1-t0:.2f}")
except Exception as e:
shared.log.warning(f"Model compile error: {e}")
shared.log.warning(f"Model compile: task=torch {e}")
return sd_model
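
The torch path above reduces to a `torch.compile` call parameterized by the cuda_compile settings; a minimal sketch, with backend and mode as example values:

```python
import torch

# pick a dynamo backend and trade longer first-run warmup for faster
# steady-state inference; fullgraph=False tolerates graph breaks
unet = torch.compile(unet, backend="inductor", mode="reduce-overhead", fullgraph=False)
```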
@ -467,19 +467,19 @@ def check_deepcache(enable: bool):
def compile_deepcache(sd_model):
global deepcache_worker # pylint: disable=global-statement
if not hasattr(sd_model, 'unet'):
shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported')
shared.log.warning(f'Model compile: task=deepcache pipeline={sd_model.__class__} not supported')
return sd_model
try:
from DeepCache import DeepCacheSDHelper
except Exception as e:
shared.log.warning(f'Model compile using deep-cache: {e}')
shared.log.warning(f'Model compile: task=deepcache {e}')
return sd_model
t0 = time.time()
check_deepcache(False)
deepcache_worker = DeepCacheSDHelper(pipe=sd_model)
deepcache_worker.set_params(cache_interval=shared.opts.deep_cache_interval, cache_branch_id=0)
t1 = time.time()
shared.log.info(f"Model compile: task='DeepCache' config={deepcache_worker.params} time={t1-t0:.2f}")
shared.log.info(f"Model compile: task=deepcache config={deepcache_worker.params} time={t1-t0:.2f}")
# config={'cache_interval': 3, 'cache_layer_id': 0, 'cache_block_id': 0, 'skip_mode': 'uniform'} time=0.00
return sd_model
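
DeepCache usage mirrors the helper calls above; a minimal sketch, with `pipe` assumed to be a loaded U-Net pipeline:

```python
from DeepCache import DeepCacheSDHelper

helper = DeepCacheSDHelper(pipe=pipe)
# reuse cached deep U-Net features for cache_interval steps, recomputing
# only the shallow branch selected by cache_branch_id
helper.set_params(cache_interval=3, cache_branch_id=0)
helper.enable()
```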
@ -503,40 +503,56 @@ def compile_diffusers(sd_model):
return sd_model
def dynamic_quantization(sd_model):
def torchao_quantization(sd_model):
try:
install('torchao', quiet=True)
from torchao.quantization import autoquant
from torchao import quantization as q
except Exception as e:
shared.log.error(f"Model dynamic quantization not supported: {e}")
shared.log.error(f"Quantization: type=TorchAO quantization not supported: {e}")
return sd_model
"""
from torchao.quantization import quant_api
def dynamic_quant_filter_fn(mod, *args): # pylint: disable=unused-argument
return (isinstance(mod, torch.nn.Linear) and mod.in_features > 16 and (mod.in_features, mod.out_features)
not in [(1280, 640), (1920, 1280), (1920, 640), (2048, 1280), (2048, 2560), (2560, 1280), (256, 128), (2816, 1280), (320, 640), (512, 1536), (512, 256), (512, 512), (640, 1280), (640, 1920), (640, 320), (640, 5120), (640, 640), (960, 320), (960, 640)])
def conv_filter_fn(mod, *args): # pylint: disable=unused-argument
return (isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels])
quant_api.swap_conv2d_1x1_to_linear(sd_model.unet, conv_filter_fn)
quant_api.swap_conv2d_1x1_to_linear(sd_model.vae, conv_filter_fn)
quant_api.apply_dynamic_quant(sd_model.unet, dynamic_quant_filter_fn)
quant_api.apply_dynamic_quant(sd_model.vae, dynamic_quant_filter_fn)
"""
shared.log.info(f"Model dynamic quantization: pipeline={sd_model.__class__.__name__}")
if shared.opts.torchao_quantization_type == "int8+act":
fn = q.int8_dynamic_activation_int8_weight
elif shared.opts.torchao_quantization_type == "int8":
fn = q.int8_weight_only
elif shared.opts.torchao_quantization_type == "int4":
fn = q.int4_weight_only
elif shared.opts.torchao_quantization_type == "fp8+act":
fn = q.float8_dynamic_activation_float8_weight
elif shared.opts.torchao_quantization_type == "fp8":
fn = q.float8_weight_only
elif shared.opts.torchao_quantization_type == "fpx":
fn = q.fpx_weight_only
else:
shared.log.error(f"Quantization: type=TorchAO type={shared.opts.torchao_quantization_type} not supported")
return sd_model
shared.log.info(f"Quantization: type=TorchAO pipe={sd_model.__class__.__name__} quant={shared.opts.torchao_quantization_type} fn={fn} targets={shared.opts.torchao_quantization}")
try:
if shared.sd_model_type == 'sd' or shared.sd_model_type == 'sdxl':
sd_model.unet = sd_model.unet.to(devices.device)
sd_model.unet = autoquant(sd_model.unet, error_on_unseen=False)
elif shared.sd_model_type == 'f1':
sd_model.transformer = autoquant(sd_model.transformer, error_on_unseen=False)
else:
shared.log.error(f"Model dynamic quantization not supported: {shared.sd_model_type}")
t0 = time.time()
modules = []
if hasattr(sd_model, 'unet') and 'Model' in shared.opts.torchao_quantization:
modules.append('unet')
q.quantize_(sd_model.unet, fn(), device=devices.device)
if hasattr(sd_model, 'transformer') and 'Model' in shared.opts.torchao_quantization:
modules.append('transformer')
q.quantize_(sd_model.transformer, fn(), device=devices.device)
# sd_model.transformer = q.autoquant(sd_model.transformer, error_on_unseen=False)
if hasattr(sd_model, 'vae') and 'VAE' in shared.opts.torchao_quantization:
modules.append('vae')
q.quantize_(sd_model.vae, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te1')
q.quantize_(sd_model.text_encoder, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder_2') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te2')
q.quantize_(sd_model.text_encoder_2, fn(), device=devices.device)
if hasattr(sd_model, 'text_encoder_3') and 'Text Encoder' in shared.opts.torchao_quantization:
modules.append('te3')
q.quantize_(sd_model.text_encoder_3, fn(), device=devices.device)
t1 = time.time()
shared.log.info(f"Quantization: type=TorchAO modules={modules} time={t1-t0:.2f}")
except Exception as e:
shared.log.error(f"Model dynamic quantization: {e}")
shared.log.error(f"Quantization: type=TorchAO {e}")
setup_logging() # torchao uses dynamo which messes with logging so reset is needed
return sd_model
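
The commented-out `autoquant` call above is torchao's self-tuning alternative to an explicit `quantize_` config; a minimal sketch of that route:

```python
import torch
from torchao.quantization import autoquant

# autoquant wraps linear layers in self-benchmarking proxies: the first
# forward pass times candidate kernels and locks in the fastest, while
# error_on_unseen=False leaves never-exercised layers at full precision
model = autoquant(torch.nn.Sequential(torch.nn.Linear(64, 64)), error_on_unseen=False)
model(torch.randn(1, 64))
```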


@ -426,7 +426,6 @@ startup_offload_mode, startup_cross_attention, startup_sdp_options = get_default
options_templates.update(options_section(('sd', "Execution & Models"), {
"sd_backend": OptionInfo(default_backend, "Execution backend", gr.Radio, {"choices": ["diffusers", "original"] }),
"autolaunch": OptionInfo(False, "Autolaunch browser upon startup"),
"sd_model_checkpoint": OptionInfo(default_checkpoint, "Base model", gr.Dropdown, lambda: {"choices": list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_model_refiner": OptionInfo('None', "Refiner model", gr.Dropdown, lambda: {"choices": ['None'] + list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_vae": OptionInfo("Automatic", "VAE model", gr.Dropdown, lambda: {"choices": shared_items.sd_vae_items()}, refresh=shared_items.refresh_vae_list),
@ -493,11 +492,12 @@ options_templates.update(options_section(('cuda', "Compute Settings"), {
"quant_sep": OptionInfo("<h2>Model Quantization</h2>", "", gr.HTML, {"visible": native}),
"quant_shuffle_weights": OptionInfo(False, "Shuffle the weights between GPU and CPU when quantizing", gr.Checkbox, {"visible": native}),
"diffusers_quantization": OptionInfo(False, "Dynamic quantization with TorchAO", gr.Checkbox, {"visible": native}),
"nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF INT8", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights": OptionInfo([], "Quantize Model weights with Optimum Quanto", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights_type": OptionInfo("qint8", "Weights type for Optimum Quanto", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}),
"optimum_quanto_activations_type": OptionInfo("none", "Activations type for Optimum Quanto", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}),
"nncf_compress_weights": OptionInfo([], "NNCF int8 compression enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights": OptionInfo([], "Optimum.quanto quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"optimum_quanto_weights_type": OptionInfo("qint8", "Optimum.quanto quantization type", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}),
"optimum_quanto_activations_type": OptionInfo("none", "Optimum.quanto quantization activations ", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}),
"torchao_quantization": OptionInfo([], "TorchAO quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}),
"torchao_quantization_type": OptionInfo("int8", "TorchAO quantization type", gr.Radio, {"choices": ["int8+act", "int8", "int4", "fp8+act", "fp8", "fpx"], "visible": native}),
"ipex_sep": OptionInfo("<h2>IPEX</h2>", "", gr.HTML, {"visible": devices.backend == "ipex"}),
"ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}),
@ -713,6 +713,7 @@ options_templates.update(options_section(('ui', "User Interface Options"), {
"theme_type": OptionInfo("Standard", "Theme type", gr.Radio, {"choices": ["Modern", "Standard", "None"]}),
"theme_style": OptionInfo("Auto", "Theme mode", gr.Radio, {"choices": ["Auto", "Dark", "Light"]}),
"gradio_theme": OptionInfo("black-teal", "UI theme", gr.Dropdown, lambda: {"choices": theme.list_themes()}, refresh=theme.refresh_themes),
"autolaunch": OptionInfo(False, "Autolaunch browser upon startup"),
"font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}),
"tooltips": OptionInfo("UI Tooltips", "UI tooltips", gr.Radio, {"choices": ["None", "Browser default", "UI tooltips"], "visible": False}),
"aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"),