Merge pull request #3275 from vladmandic/dev

merge dev to master
pull/3283/head 2024-06-23
Vladimir Mandic 2024-06-23 11:21:46 -04:00 committed by GitHub
commit 8deb6a6e43
108 changed files with 1313 additions and 2115 deletions

View File

@ -1,5 +1,96 @@
# Change Log for SD.Next
## Update for 2024-06-23
### Highlights for 2024-06-23
Following the zero-day **SD3** release, here's a refresh 10 days later with 10+ improvements,
including full prompt attention, support for compressed weights, and additional text-encoder quantization modes.
But there's more than SD3:
- support for quantized **T5** text encoder *FP16/FP8/FP4/INT8* in all models that use T5: SD3, PixArt-Σ, etc.
- support for **PixArt-Sigma** in small/medium/large variants
- support for **HunyuanDiT 1.1**
- additional **NNCF weights compression** support: SD3, PixArt, ControlNet, Lora
- integration of **MS Florence** VLM/VQA *Base* and *Large* models
- (finally) new release of **Torch-DirectML**
- additional efficiencies for users with low VRAM GPUs
- over 20 overall fixes
### Model Improvements
- **SD3**: enable tiny-VAE (TAESD) preview and non-full quality mode
- SD3: enable base LoRA support
- SD3: add support for FP4 quantized T5 text encoder
simply select in *settings -> model -> text encoder*
*note* for SD3 with T5, set SD.Next to use FP16 precision, not BF16 precision
- SD3: add support for INT8 quantized T5 text encoder, thanks @Disty0!
- SD3: enable cpu-offloading for T5 text encoder, thanks @Disty0!
- SD3: simplified loading of model in single-file safetensors format
model load can now be performed fully offline
- SD3: full support for prompt parsing and attention, thanks @AI-Casanova!
- SD3: ability to target different prompts to each of text-encoders, thanks @AI-Casanova!
example: `dog TE2: cat TE3: bird`
- SD3: add support for sampler shift for Euler FlowMatch
see *settings -> samplers*, also available as param in xyz grid
higher shift means the model will spend more time on overall structure and less on fine details (see the sketch after this list)
- SD3: add support for selecting T5 text encoder variant in XYZ grid
- **Pixart-Σ**: Add *small* (512px) and *large* (2k) variations, in addition to existing *medium* (1k)
- Pixart-Σ: Add support for 4/8bit quantized t5 text encoder
*note*: by default PixArt-Σ uses the full FP16 T5 encoder, which has a large memory footprint
simply select in *settings -> model -> text encoder* before or after model load
- **HunyuanDiT**: support for model version 1.1
- **MS Florence**: integration of Microsoft Florence VLM/VQA Base and Large models
simply select in *process -> visual query*!
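
As a rough illustration of what the new Euler FlowMatch sampler shift corresponds to at the plain diffusers level, here is a minimal sketch; this is not SD.Next internals, and the model id, shift value, and prompt are illustrative assumptions:

```python
# Sketch only: FlowMatch Euler sampler shift in plain diffusers.
# Model id, shift value and prompt are illustrative assumptions.
import torch
from diffusers import StableDiffusion3Pipeline, FlowMatchEulerDiscreteScheduler

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # assumed model id
    torch_dtype=torch.float16,
).to("cuda")
# higher shift biases sampling toward overall structure, lower toward fine detail
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(
    pipe.scheduler.config, shift=3.0
)
image = pipe("photo of a dog", num_inference_steps=28).images[0]
image.save("shifted.png")
```

In SD.Next the same control is exposed directly in *settings -> samplers* and as an xyz grid parameter.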
### General Improvements
- support FP4 quantized T5 text encoder, in addition to existing FP8 and FP16
- support for T5 text-encoder loader in **all** models that use T5
*example*: load FP4 or FP8 quantized T5 text-encoder into PixArt Sigma! (see the sketch at the end of this section)
- support for `torch-directml` **0.2.2**, thanks @lshqqytiger!
*note*: new directml is finally based on modern `torch` 2.3.1!
- xyz grid: add support for LoRA selector
- vae load: store original vae so it can be restored when set to none
- extra networks: info display now contains a link to the model's source url when it is known
works for civitai and huggingface models
- force gc for lowvram users and improve gc logging
- improved google.colab support
- css tweaks for standardui
- css tweaks for modernui
- additional torch gc checks, thanks @Disty0!
**Improvements: NNCF**, thanks @Disty0!
- SD3 and PixArt support
- moved the first compression step to CPU
- sequential cpu offload (lowvram) support
- Lora support without reloading the model
- ControlNet compression support
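
For the T5 text-encoder loader described above, a minimal diffusers-level sketch follows; it uses 8-bit bitsandbytes quantization as a stand-in for SD.Next's FP8/FP4/NNCF modes, and the model id is an assumption:

```python
# Sketch only: loads PixArt-Sigma with an 8-bit quantized T5 text encoder.
# bitsandbytes 8-bit stands in for SD.Next's FP8/FP4/NNCF paths; model id is
# an assumption, and a CUDA device plus the bitsandbytes package are required.
import torch
from transformers import T5EncoderModel, BitsAndBytesConfig
from diffusers import PixArtSigmaPipeline

repo = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
text_encoder = T5EncoderModel.from_pretrained(
    repo,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
pipe = PixArtSigmaPipeline.from_pretrained(
    repo,
    text_encoder=text_encoder,
    torch_dtype=torch.float16,
)
pipe.to("cuda")  # the quantized text encoder is already placed by device_map
image = pipe("a corgi wearing sunglasses").images[0]
image.save("pixart.png")
```

Within SD.Next itself this is just a matter of selecting the desired text-encoder variant in *settings -> model -> text encoder*.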
### Fixes
- fix unsaturated outputs, force apply vae config on model load
- fix hidiffusion handling of non-square aspect ratios, thanks @ShenZhang-Shin!
- fix control second pass resize
- fix hunyuandit set attention processor
- fix civitai download without name
- fix compatibility with latest adetailer
- fix invalid sampler warning
- fix starting from non git repo
- fix control api negative prompt handling
- fix saving style without name provided
- fix t2i-color adapter
- fix sdxl "has been incorrectly initialized"
- fix api face-hires
- fix api ip-adapter
- fix memory exceptions with ROCm, thanks @Disty0!
- fix face-hires with lowvram, thanks @Disty0!
- fix pag incorrectly resetting pipeline
- cleanup image metadata
- restructure api examples: `cli/api-*`
- handle theme fallback when invalid theme is specified
- remove obsolete training code leftovers
## Update for 2024-06-13
### Highlights for 2024-06-13

View File

@ -11,6 +11,7 @@ Main ToDo list can be found at [GitHub projects](https://github.com/users/vladma
- diffusers public callbacks
- include reference styles
- lora: sc lora, dora, etc
- sd3 controlnet: <https://github.com/huggingface/diffusers/pull/8566>
## Experimental

View File

@ -132,7 +132,7 @@ def generate(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-img2img')
parser = argparse.ArgumentParser(description = 'api-img2img')
parser.add_argument('--init', required=False, default=None, help='init image')
parser.add_argument('--input', required=False, default=None, help='input image')
parser.add_argument('--mask', required=False, help='mask image')

cli/api-faceid.py (new executable file, 116 lines)
View File

@ -0,0 +1,116 @@
#!/usr/bin/env python
import os
import io
import time
import base64
import logging
import argparse
import requests
import urllib3
from PIL import Image
sd_url = os.environ.get('SDAPI_URL', "http://127.0.0.1:7860")
sd_username = os.environ.get('SDAPI_USR', None)
sd_password = os.environ.get('SDAPI_PWD', None)
logging.basicConfig(level = logging.INFO, format = '%(asctime)s %(levelname)s: %(message)s')
log = logging.getLogger(__name__)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
options = {
"save_images": False,
"send_images": True,
}
def auth():
if sd_username is not None and sd_password is not None:
return requests.auth.HTTPBasicAuth(sd_username, sd_password)
return None
def post(endpoint: str, dct: dict = None):
req = requests.post(f'{sd_url}{endpoint}', json = dct, timeout=300, verify=False, auth=auth())
if req.status_code != 200:
return { 'error': req.status_code, 'reason': req.reason, 'url': req.url }
else:
return req.json()
def encode(f):
image = Image.open(f)
if image.mode == 'RGBA':
image = image.convert('RGB')
with io.BytesIO() as stream:
image.save(stream, 'JPEG')
image.close()
values = stream.getvalue()
encoded = base64.b64encode(values).decode()
return encoded
def generate(args): # pylint: disable=redefined-outer-name
t0 = time.time()
if args.model is not None:
post('/sdapi/v1/options', { 'sd_model_checkpoint': args.model })
post('/sdapi/v1/reload-checkpoint') # needed if running in api-only to trigger new model load
options['prompt'] = args.prompt
options['negative_prompt'] = args.negative
options['steps'] = int(args.steps)
options['seed'] = int(args.seed)
options['sampler_name'] = args.sampler
options['width'] = args.width
options['height'] = args.height
options['face'] = {
'mode': 'FaceID',
'ip_model': 'FaceID Base',
'source_images': [encode(args.face)],
}
data = post('/sdapi/v1/txt2img', options)
t1 = time.time()
if 'images' in data:
for i in range(len(data['images'])):
b64 = data['images'][i].split(',',1)[0]
info = data['info']
image = Image.open(io.BytesIO(base64.b64decode(b64)))
log.info(f'received image: size={image.size} time={t1-t0:.2f} info="{info}"')
if args.output:
image.save(args.output)
log.info(f'image saved: size={image.size} filename={args.output}')
else:
log.warning(f'no images received: {data}')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'api-faceid')
parser.add_argument('--width', required=False, default=512, help='image width')
parser.add_argument('--height', required=False, default=512, help='image height')
parser.add_argument('--face', required=False, help='face image')
parser.add_argument('--prompt', required=False, default='', help='prompt text')
parser.add_argument('--negative', required=False, default='', help='negative prompt text')
parser.add_argument('--steps', required=False, default=20, help='number of steps')
parser.add_argument('--seed', required=False, default=-1, help='initial seed')
parser.add_argument('--sampler', required=False, default='Euler a', help='sampler name')
parser.add_argument('--output', required=False, default=None, help='output image file')
parser.add_argument('--model', required=False, help='model name')
args = parser.parse_args()
log.info(f'faceid: {args}')
generate(args)
"""
request.face.mode,
request.face.source_images,
request.face.ip_model,
request.face.ip_override_sampler,
request.face.ip_cache_model,
request.face.ip_strength,
request.face.ip_structure,
request.face.id_strength,
request.face.id_conditioning,
request.face.id_cache,
request.face.pm_trigger,
request.face.pm_strength,
request.face.pm_start,
request.face.fs_cache
"""

View File

@ -83,7 +83,7 @@ def generate(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-img2img')
parser = argparse.ArgumentParser(description = 'api-img2img')
parser.add_argument('--init', required=True, help='init image')
parser.add_argument('--mask', required=False, help='mask image')
parser.add_argument('--prompt', required=False, default='', help='prompt text')

View File

@ -50,7 +50,7 @@ def info(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-info')
parser = argparse.ArgumentParser(description = 'api-info')
parser.add_argument('--input', required=True, help='input image')
args = parser.parse_args()
log.info(f'info: {args}')

cli/api-json.py (new executable file, 52 lines)
View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
# curl -vX POST http://localhost:7860/sdapi/v1/txt2img --header "Content-Type: application/json" -d @3261.json
import os
import json
import logging
import argparse
import requests
import urllib3
sd_url = os.environ.get('SDAPI_URL', "http://127.0.0.1:7860")
sd_username = os.environ.get('SDAPI_USR', None)
sd_password = os.environ.get('SDAPI_PWD', None)
options = {
"save_images": True,
"send_images": True,
}
logging.basicConfig(level = logging.INFO, format = '%(asctime)s %(levelname)s: %(message)s')
log = logging.getLogger(__name__)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def auth():
if sd_username is not None and sd_password is not None:
return requests.auth.HTTPBasicAuth(sd_username, sd_password)
return None
def post(endpoint: str, payload: dict = None):
if 'sdapi' not in endpoint:
endpoint = f'sdapi/v1/{endpoint}'
if 'http' not in endpoint:
endpoint = f'{sd_url}/{endpoint}'
req = requests.post(endpoint, json = payload, timeout=300, verify=False, auth=auth())
return { 'error': req.status_code, 'reason': req.reason, 'url': req.url } if req.status_code != 200 else req.json()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'api-json')
parser.add_argument('endpoint', nargs=1, help='endpoint')
parser.add_argument('json', nargs=1, help='json data or file')
args = parser.parse_args()
log.info(f'api-json: {args}')
if os.path.isfile(args.json[0]):
with open(args.json[0], 'r', encoding='ascii') as f:
dct = json.load(f) # TODO fails with b64 encoded images inside json due to string encoding
else:
dct = json.loads(args.json[0])
res = post(endpoint=args.endpoint[0], payload=dct)
print(res)

View File

@ -73,7 +73,7 @@ def info(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-info')
parser = argparse.ArgumentParser(description = 'api-mask')
parser.add_argument('--input', required=True, help='input image')
parser.add_argument('--mask', required=False, help='input mask')
parser.add_argument('--type', required=False, help='output mask type')

View File

@ -67,7 +67,7 @@ def info(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-info')
parser = argparse.ArgumentParser(description = 'api-preprocess')
parser.add_argument('--input', required=True, help='input image')
parser.add_argument('--model', required=True, help='preprocessing model')
parser.add_argument('--output', required=False, help='output image')

View File

@ -48,7 +48,10 @@ def generate(args): # pylint: disable=redefined-outer-name
options['sampler_name'] = args.sampler
options['width'] = int(args.width)
options['height'] = int(args.height)
options['restore_faces'] = args.faces
if args.faces:
options['restore_faces'] = args.faces
options['denoising_strength'] = 0.5
options['hr_sampler_name'] = args.sampler
data = post('/sdapi/v1/txt2img', options)
t1 = time.time()
if 'images' in data:
@ -65,7 +68,7 @@ def generate(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-txt2img')
parser = argparse.ArgumentParser(description = 'api-txt2img')
parser.add_argument('--prompt', required=False, default='', help='prompt text')
parser.add_argument('--negative', required=False, default='', help='negative prompt text')
parser.add_argument('--width', required=False, default=512, help='image width')

View File

@ -80,7 +80,7 @@ def upscale(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-upscale')
parser = argparse.ArgumentParser(description = 'api-upscale')
parser.add_argument('--input', required=True, help='input image')
parser.add_argument('--output', required=True, help='output image')
parser.add_argument('--upscaler', required=False, default='Nearest', help='upscaler name')

View File

@ -55,7 +55,7 @@ def info(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'simple-info')
parser = argparse.ArgumentParser(description = 'api-vqa')
parser.add_argument('--input', required=True, help='input image')
parser.add_argument('--model', required=False, help='vqa model')
parser.add_argument('--question', required=False, help='question')

cli/image-encode.py (new executable file, 32 lines)
View File

@ -0,0 +1,32 @@
#!/usr/bin/env python
import io
import os
import sys
import base64
from PIL import Image
from rich import print # pylint: disable=redefined-builtin
def encode(file: str):
image = Image.open(file) if os.path.exists(file) else None
print(f'Input: file={file} image={image}')
if image is None:
return None
if image.mode != 'RGB':
image = image.convert('RGB')
with io.BytesIO() as stream:
image.save(stream, 'JPEG')
image.close()
values = stream.getvalue()
encoded = base64.b64encode(values).decode()
return encoded
if __name__ == "__main__":
sys.argv.pop(0)
fn = sys.argv[0] if len(sys.argv) > 0 else ''
b64 = encode(fn)
print('=== BEGIN ===')
print(f'{b64}')
print('=== END ===')

View File

@ -1,170 +0,0 @@
#!/usr/bin/env python
import os
import sys
import json
import pathlib
import argparse
import warnings
import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from util import Map
from rich.pretty import install as pretty_install
from rich.traceback import install as traceback_install
from rich.console import Console
console = Console(log_time=True, log_time_format='%H:%M:%S-%f')
pretty_install(console=console)
traceback_install(console=console, extra_lines=1, width=console.width, word_wrap=False, indent_guides=False)
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'modules', 'lora'))
import library.model_util as model_util
import library.train_util as train_util
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
options = Map({
'batch': 1,
'input': '',
'json': '',
'max': 1024,
'min': 256,
'noupscale': False,
'precision': 'fp32',
'resolution': '512,512',
'steps': 64,
'vae': 'stabilityai/sd-vae-ft-mse'
})
vae = None
def get_latents(local_vae, images, weight_dtype):
image_transforms = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.5], [0.5]) ])
img_tensors = [image_transforms(image) for image in images]
img_tensors = torch.stack(img_tensors)
img_tensors = img_tensors.to(device, weight_dtype)
with torch.no_grad():
latents = local_vae.encode(img_tensors).latent_dist.sample().float().to('cpu').numpy()
return latents, [images[0].shape[0], images[0].shape[1]]
def get_npz_filename_wo_ext(data_dir, image_key):
return os.path.join(data_dir, os.path.splitext(os.path.basename(image_key))[0])
def create_vae_latents(local_params):
args = Map({**options, **local_params})
console.log(f'create vae latents args: {args}')
image_paths = train_util.glob_images(args.input)
if os.path.exists(args.json):
with open(args.json, 'rt', encoding='utf-8') as f:
metadata = json.load(f)
else:
return
if args.precision == 'fp16':
weight_dtype = torch.float16
elif args.precision == 'bf16':
weight_dtype = torch.bfloat16
else:
weight_dtype = torch.float32
global vae # pylint: disable=global-statement
if vae is None:
vae = model_util.load_vae(args.vae, weight_dtype)
vae.eval()
vae.to(device, dtype=weight_dtype)
max_reso = tuple([int(t) for t in args.resolution.split(',')])
assert len(max_reso) == 2, f'illegal resolution: {args.resolution}'
bucket_manager = train_util.BucketManager(args.noupscale, max_reso, args.min, args.max, args.steps)
if not args.noupscale:
bucket_manager.make_buckets()
img_ar_errors = []
def process_batch(is_last):
for bucket in bucket_manager.buckets:
if (is_last and len(bucket) > 0) or len(bucket) >= args.batch:
latents, original_size = get_latents(vae, [img for _, img in bucket], weight_dtype)
assert latents.shape[2] == bucket[0][1].shape[0] // 8 and latents.shape[3] == bucket[0][1].shape[1] // 8, f'latent shape {latents.shape}, {bucket[0][1].shape}'
for (image_key, _), latent in zip(bucket, latents):
npz_file_name = get_npz_filename_wo_ext(args.input, image_key)
# np.savez(npz_file_name, latent)
kwargs = {}
np.savez(
npz_file_name,
latents=latent,
original_size=np.array(original_size),
crop_ltrb=np.array([0, 0]),
**kwargs,
)
bucket.clear()
data = [[(None, ip)] for ip in image_paths]
bucket_counts = {}
for data_entry in tqdm(data, smoothing=0.0):
if data_entry[0] is None:
continue
img_tensor, image_path = data_entry[0]
if img_tensor is not None:
image = transforms.functional.to_pil_image(img_tensor)
else:
image = Image.open(image_path)
image_key = os.path.basename(image_path)
image_key = os.path.join(os.path.basename(pathlib.Path(image_path).parent), pathlib.Path(image_path).stem)
if image_key not in metadata:
metadata[image_key] = {}
reso, resized_size, ar_error = bucket_manager.select_bucket(image.width, image.height)
img_ar_errors.append(abs(ar_error))
bucket_counts[reso] = bucket_counts.get(reso, 0) + 1
metadata[image_key]['train_resolution'] = (reso[0] - reso[0] % 8, reso[1] - reso[1] % 8)
if not args.noupscale:
assert resized_size[0] == reso[0] or resized_size[1] == reso[1], f'internal error, resized size not match: {reso}, {resized_size}, {image.width}, {image.height}'
assert resized_size[0] >= reso[0] and resized_size[1] >= reso[1], f'internal error, resized size too small: {reso}, {resized_size}, {image.width}, {image.height}'
assert resized_size[0] >= reso[0] and resized_size[1] >= reso[1], f'internal error resized size is small: {resized_size}, {reso}'
image = np.array(image)
if resized_size[0] != image.shape[1] or resized_size[1] != image.shape[0]:
image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA)
if resized_size[0] > reso[0]:
trim_size = resized_size[0] - reso[0]
image = image[:, trim_size//2:trim_size//2 + reso[0]]
if resized_size[1] > reso[1]:
trim_size = resized_size[1] - reso[1]
image = image[trim_size//2:trim_size//2 + reso[1]]
assert image.shape[0] == reso[1] and image.shape[1] == reso[0], f'internal error, illegal trimmed size: {image.shape}, {reso}'
bucket_manager.add_image(reso, (image_key, image))
process_batch(False)
process_batch(True)
vae.to('cpu')
bucket_manager.sort()
img_ar_errors = np.array(img_ar_errors)
for i, reso in enumerate(bucket_manager.resos):
count = bucket_counts.get(reso, 0)
if count > 0:
console.log(f'vae latents bucket: {i+1}/{len(bucket_manager.resos)} resolution: {reso} images: {count} mean-ar-error: {np.mean(img_ar_errors)}')
with open(args.json, 'wt', encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
def unload_vae():
global vae # pylint: disable=global-statement
vae = None
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('input', type=str, help='directory for train images')
parser.add_argument('--json', type=str, required=True, help='metadata file to input')
parser.add_argument('--vae', type=str, required=True, help='model name or path to encode latents')
parser.add_argument('--batch', type=int, default=1, help='batch size in inference')
parser.add_argument('--resolution', type=str, default='512,512', help='max resolution in fine tuning (width,height)')
parser.add_argument('--min', type=int, default=256, help='minimum resolution for buckets')
parser.add_argument('--max', type=int, default=1024, help='maximum resolution for buckets')
parser.add_argument('--steps', type=int, default=64, help='steps of resolution for buckets, divisible by 8')
parser.add_argument('--noupscale', action='store_true', help='make bucket for each image without upscaling')
parser.add_argument('--precision', type=str, default='fp32', choices=['fp32', 'fp16', 'bf16'], help='use precision')
params = parser.parse_args()
create_vae_latents(vars(params))

View File

@ -1,176 +0,0 @@
#!/usr/bin/env python
import os
import time
import functools
import argparse
import logging
import warnings
from dataclasses import dataclass
logging.getLogger("DeepSpeed").disabled = True
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
import torch
import diffusers
n_warmup = 5
n_traces = 10
n_runs = 100
args = {}
pipe = None
log = logging.getLogger("sd")
def setup_logging():
from rich.theme import Theme
from rich.logging import RichHandler
from rich.console import Console
from rich.traceback import install
log.setLevel(logging.DEBUG)
console = Console(log_time=True, log_time_format='%H:%M:%S-%f', theme=Theme({ "traceback.border": "black", "traceback.border.syntax_error": "black", "inspect.value.border": "black" }))
logging.basicConfig(level=logging.ERROR, format='%(asctime)s | %(name)s | %(levelname)s | %(module)s | %(message)s', handlers=[logging.NullHandler()]) # redirect default logger to null
rh = RichHandler(show_time=True, omit_repeated_times=False, show_level=True, show_path=False, markup=False, rich_tracebacks=True, log_time_format='%H:%M:%S-%f', level=logging.DEBUG, console=console)
rh.setLevel(logging.DEBUG)
log.addHandler(rh)
logging.getLogger("diffusers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
warnings.filterwarnings(action="ignore", category=torch.jit.TracerWarning)
install(console=console, extra_lines=1, max_frames=10, width=console.width, word_wrap=False, indent_guides=False, suppress=[])
def generate_inputs():
if args.type == 'sd15':
sample = torch.randn(2, 4, 64, 64).half().cuda()
timestep = torch.rand(1).half().cuda() * 999
encoder_hidden_states = torch.randn(2, 77, 768).half().cuda()
return sample, timestep, encoder_hidden_states
if args.type == 'sdxl':
sample = torch.randn(2, 4, 64, 64).half().cuda()
timestep = torch.rand(1).half().cuda() * 999
encoder_hidden_states = torch.randn(2, 77, 768).half().cuda()
text_embeds = torch.randn(1, 77, 2048).half().cuda()
return sample, timestep, encoder_hidden_states, text_embeds
def load_model():
log.info(f'versions: torch={torch.__version__} diffusers={diffusers.__version__}')
diffusers_load_config = {
"low_cpu_mem_usage": True,
"torch_dtype": torch.float16,
"safety_checker": None,
"requires_safety_checker": False,
"load_connected_pipeline": True,
"use_safetensors": True,
}
pipeline = diffusers.StableDiffusionPipeline if args.type == 'sd15' else diffusers.StableDiffusionXLPipeline
global pipe # pylint: disable=global-statement
t0 = time.time()
pipe = pipeline.from_single_file(args.model, **diffusers_load_config).to('cuda')
size = os.path.getsize(args.model)
log.info(f'load: model={args.model} type={args.type} time={time.time() - t0:.3f}s size={size / 1024 / 1024:.3f}mb')
def load_trace(fn: str):
@dataclass
class UNet2DConditionOutput:
sample: torch.FloatTensor
class TracedUNet(torch.nn.Module):
def __init__(self):
super().__init__()
self.in_channels = pipe.unet.in_channels
self.device = pipe.unet.device
def forward(self, latent_model_input, t, encoder_hidden_states):
sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
return UNet2DConditionOutput(sample=sample)
t0 = time.time()
unet_traced = torch.jit.load(fn)
pipe.unet = TracedUNet()
size = os.path.getsize(fn)
log.info(f'load: optimized={fn} time={time.time() - t0:.3f}s size={size / 1024 / 1024:.3f}mb')
def trace_model():
log.info(f'tracing model: {args.model}')
torch.set_grad_enabled(False)
unet = pipe.unet
unet.eval()
# unet.to(memory_format=torch.channels_last) # use channels_last memory format
unet.forward = functools.partial(unet.forward, return_dict=False) # set return_dict=False as default
# warmup
t0 = time.time()
for _ in range(n_warmup):
with torch.inference_mode():
inputs = generate_inputs()
_output = unet(*inputs)
log.info(f'warmup: time={time.time() - t0:.3f}s passes={n_warmup}')
# trace
t0 = time.time()
unet_traced = torch.jit.trace(unet, inputs, check_trace=True)
unet_traced.eval()
log.info(f'trace: time={time.time() - t0:.3f}s')
# optimize graph
t0 = time.time()
for _ in range(n_traces):
with torch.inference_mode():
inputs = generate_inputs()
_output = unet_traced(*inputs)
log.info(f'optimize: time={time.time() - t0:.3f}s passes={n_traces}')
# save the model
if args.save:
t0 = time.time()
basename, _ext = os.path.splitext(args.model)
fn = f"{basename}.pt"
unet_traced.save(fn)
size = os.path.getsize(fn)
log.info(f'save: optimized={fn} time={time.time() - t0:.3f}s size={size / 1024 / 1024:.3f}mb')
return fn
pipe.unet = unet_traced
return None
def benchmark_model(msg: str):
with torch.inference_mode():
inputs = generate_inputs()
torch.cuda.synchronize()
for n in range(n_runs):
if n > n_runs / 10:
t0 = time.time()
_output = pipe.unet(*inputs)
torch.cuda.synchronize()
t1 = time.time()
log.info(f"benchmark unet: {t1 - t0:.3f}s passes={n_runs} type={msg}")
return t1 - t0
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = 'SD.Next')
parser.add_argument('--model', type=str, default='', required=True, help='model path')
parser.add_argument('--type', type=str, default='sd15', choices=['sd15', 'sdxl'], required=False, help='model type, default: %(default)s')
parser.add_argument('--benchmark', default = False, action='store_true', help = "run benchmarks, default: %(default)s")
parser.add_argument('--trace', default = True, action='store_true', help = "run jit tracing, default: %(default)s")
parser.add_argument('--save', default = False, action='store_true', help = "save optimized unet, default: %(default)s")
args = parser.parse_args()
setup_logging()
log.info('sdnext model jit tracing')
if not os.path.isfile(args.model):
log.error(f"invalid model path: {args.model}")
exit(1)
load_model()
if args.benchmark:
time0 = benchmark_model('original')
unet_saved = trace_model()
if unet_saved is not None:
load_trace(unet_saved)
if args.benchmark:
time1 = benchmark_model('traced')
log.info(f'benchmark speedup: {100 * (time0 - time1) / time0:.3f}%')

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
# pylint: disable=cell-var-from-loop
"""
Test Torch Dynamo functionality and backends
"""
import json
import warnings
import numpy as np
import torch
from torchvision.models import resnet18
print('torch:', torch.__version__)
try:
# must be imported explicitly or namespace is not found
import torch._dynamo as dynamo # pylint: disable=ungrouped-imports
except Exception as err:
print('torch without dynamo support', err)
N_ITERS = 20
torch._dynamo.config.verbose=True # pylint: disable=protected-access
warnings.filterwarnings('ignore', category=UserWarning) # disable those for now as many backends reports tons
# torch.set_float32_matmul_precision('high') # enable to test in fp32
def timed(fn): # returns the result of running `fn()` and the time it took for `fn()` to run in ms using CUDA events
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
result = fn()
end.record()
torch.cuda.synchronize()
return result, start.elapsed_time(end)
def generate_data(b):
return (
torch.randn(b, 3, 128, 128).to(torch.float32).cuda(),
torch.randint(1000, (b,)).cuda(),
)
def init_model():
return resnet18().to(torch.float32).cuda()
def evaluate(mod, val):
return mod(val)
if __name__ == '__main__':
# first pass, dynamo is going to be slower as it compiles
model = init_model()
inp = generate_data(16)[0]
# repeat test
results = {}
times = []
print('eager initial eval:', timed(lambda: evaluate(model, inp))[1])
for _i in range(N_ITERS):
inp = generate_data(16)[0]
_res, time = timed(lambda: evaluate(model, inp)) # noqa: B023
times.append(time)
results['default'] = np.median(times)
print('dynamo available backends:', dynamo.list_backends())
for backend in dynamo.list_backends():
try:
# required before changing backends
torch._dynamo.reset() # pylint: disable=protected-access
eval_dyn = dynamo.optimize(backend)(evaluate)
print('dynamo initial eval:', backend, timed(lambda: eval_dyn(model, inp))[1]) # noqa: B023
times = []
for _i in range(N_ITERS):
inp = generate_data(16)[0]
_res, time = timed(lambda: eval_dyn(model, inp)) # noqa: B023
times.append(time)
results[backend] = np.median(times)
except Exception as err:
lines = str(err).split('\n')
print('dynamo backend failed:', backend, lines[0]) # print just first error line as backtraces can be quite long
results[backend] = 'error'
# print stats
print(json.dumps(results, indent = 4))
"""
Reference: <https://github.com/pytorch/pytorch/blob/4f4b62e4a255708e928445b6502139d5962974fa/docs/source/dynamo/get-started.rst>
Training & Inference backends:
dynamo.optimize("inductor") - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels
dynamo.optimize("aot_nvfuser") - nvFuser with AotAutograd
dynamo.optimize("aot_cudagraphs") - cudagraphs with AotAutograd
Inference-only backends:
dynamo.optimize("ofi") - Uses Torchscript optimize_for_inference
dynamo.optimize("fx2trt") - Uses Nvidia TensorRT for inference optimizations
dynamo.optimize("onnxrt") - Uses ONNXRT for inference on CPU/GPU
"""

View File

@ -1,443 +0,0 @@
#!/usr/bin/env python
"""
Examples:
- sd15: train.py --type lora --tag girl --comments sdnext --input ~/generative/Input/mia --process original,interrogate,resize --name mia
- sdxl: train.py --type lora --tag girl --comments sdnext --input ~/generative/Input/mia --process original,interrogate,resize --precision fp32 --optimizer Adafactor --sdxl --name miaxl
- offline: train.py --type lora --tag girl --comments sdnext --input ~/generative/Input/mia --model /home/vlado/dev/sdnext/models/Stable-diffusion/sdxl/miaanimeSFWNSFWSDXL_v40.safetensors --dir /home/vlado/dev/sdnext/models/Lora/ --precision fp32 --optimizer Adafactor --sdxl --name miaxl
"""
# system imports
import os
import re
import gc
import sys
import json
import shutil
import pathlib
import asyncio
import logging
import tempfile
import argparse
# local imports
import util
import sdapi
import options
# globals
args = None
log = logging.getLogger('train')
valid_steps = ['original', 'face', 'body', 'blur', 'range', 'upscale', 'restore', 'interrogate', 'resize', 'square', 'segment']
log_file = os.path.join(os.path.dirname(__file__), 'train.log')
server_ok = False
# methods
def setup_logging():
from rich.theme import Theme
from rich.logging import RichHandler
from rich.console import Console
from rich.pretty import install as pretty_install
from rich.traceback import install as traceback_install
console = Console(log_time=True, log_time_format='%H:%M:%S-%f', theme=Theme({
"traceback.border": "black",
"traceback.border.syntax_error": "black",
"inspect.value.border": "black",
}))
# logging.getLogger("urllib3").setLevel(logging.ERROR)
# logging.getLogger("httpx").setLevel(logging.ERROR)
level = logging.DEBUG if args.debug else logging.INFO
logging.basicConfig(level=logging.ERROR, format='%(asctime)s | %(name)s | %(levelname)s | %(module)s | %(message)s', filename=log_file, filemode='a', encoding='utf-8', force=True)
log.setLevel(logging.DEBUG) # log to file is always at level debug for facility `sd`
pretty_install(console=console)
traceback_install(console=console, extra_lines=1, width=console.width, word_wrap=False, indent_guides=False, suppress=[])
rh = RichHandler(show_time=True, omit_repeated_times=False, show_level=True, show_path=False, markup=False, rich_tracebacks=True, log_time_format='%H:%M:%S-%f', level=level, console=console)
rh.set_name(level)
while log.hasHandlers() and len(log.handlers) > 0:
log.removeHandler(log.handlers[0])
log.addHandler(rh)
def mem_stats():
gc.collect()
import torch
if torch.cuda.is_available():
with torch.no_grad():
torch.cuda.empty_cache()
with torch.cuda.device('cuda'):
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
mem = util.get_memory()
peak = { 'active': mem['gpu-active']['peak'], 'allocated': mem['gpu-allocated']['peak'], 'reserved': mem['gpu-reserved']['peak'] }
log.debug(f"memory cpu: {mem.ram} gpu current: {mem.gpu} gpu peak: {peak}")
def parse_args():
global args # pylint: disable=global-statement
parser = argparse.ArgumentParser(description = 'SD.Next Train')
group_server = parser.add_argument_group('Server')
group_server.add_argument('--server', type=str, default='http://127.0.0.1:7860', required=False, help='server url, default: %(default)s')
group_server.add_argument('--user', type=str, default=None, required=False, help='server username, default: %(default)s')
group_server.add_argument('--password', type=str, default=None, required=False, help='server password, default: %(default)s')
group_server.add_argument('--dir', type=str, default=None, required=False, help='folder with trained networks, default: use server setting')
group_main = parser.add_argument_group('Main')
group_main.add_argument('--type', type=str, choices=['embedding', 'ti', 'lora', 'lyco', 'dreambooth', 'hypernetwork'], default=None, required=True, help='training type')
group_main.add_argument('--model', type=str, default='', required=False, help='base model to use for training, default: current loaded model')
group_main.add_argument('--name', type=str, default=None, required=True, help='output filename')
group_main.add_argument('--tag', type=str, default='person', required=False, help='primary tags, default: %(default)s')
group_main.add_argument('--comments', type=str, default='', required=False, help='comments to be added to trained model metadata, default: %(default)s')
group_data = parser.add_argument_group('Dataset')
group_data.add_argument('--input', type=str, default=None, required=True, help='input folder with training images')
group_data.add_argument('--interim', type=str, default='', required=False, help='where to store processed images, default is system temp/train')
group_data.add_argument('--process', type=str, default='original,interrogate,resize,square', required=False, help=f'list of possible processing steps: {valid_steps}, default: %(default)s')
group_train = parser.add_argument_group('Train')
group_train.add_argument('--gradient', type=int, default=1, required=False, help='gradient accumulation steps, default: %(default)s')
group_train.add_argument('--steps', type=int, default=2500, required=False, help='training steps, default: %(default)s')
group_train.add_argument('--batch', type=int, default=1, required=False, help='batch size, default: %(default)s')
group_train.add_argument('--lr', type=float, default=1e-04, required=False, help='model learning rate, default: %(default)s')
group_train.add_argument('--dim', type=int, default=32, required=False, help='network dimension or number of vectors, default: %(default)s')
# lora params
group_train.add_argument('--repeats', type=int, default=1, required=False, help='number of repeats per image, default: %(default)s')
group_train.add_argument('--alpha', type=float, default=0, required=False, help='lora/lyco alpha for weights scaling, default: dim/2')
group_train.add_argument('--algo', type=str, default=None, choices=['locon', 'loha', 'lokr', 'ia3'], required=False, help='alternative lyco algorithm, default: %(default)s')
group_train.add_argument('--args', type=str, default=None, required=False, help='lora/lyco additional network arguments, default: %(default)s')
group_train.add_argument('--optimizer', type=str, default='AdamW', required=False, help='optimizer type, default: %(default)s')
group_train.add_argument('--precision', type=str, choices=['fp16', 'fp32'], default='fp16', required=False, help='training precision, default: %(default)s')
group_train.add_argument('--sdxl', default = False, action='store_true', help = "run sdxl training, default: %(default)s")
# AdamW (default), AdamW8bit, PagedAdamW8bit, Lion8bit, PagedLion8bit, Lion, SGDNesterov, SGDNesterov8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, AdaFactor
group_other = parser.add_argument_group('Other')
group_other.add_argument('--overwrite', default = False, action='store_true', help = "overwrite existing training, default: %(default)s")
group_other.add_argument('--experimental', default = False, action='store_true', help = "enable experimental options, default: %(default)s")
group_other.add_argument('--debug', default = False, action='store_true', help = "enable debug level logging, default: %(default)s")
args = parser.parse_args()
def prepare_server():
global server_ok # pylint: disable=global-statement
try:
server_status = util.Map(sdapi.progresssync())
server_state = server_status['state']
server_ok = True
except Exception:
log.warning(f'sdnext server error: {server_status}')
server_ok = False
if server_ok and server_state['job_count'] > 0:
log.error(f'sdnext server not idle: {server_state}')
exit(1)
if server_ok:
server_options = util.Map(sdapi.options())
server_options.options.save_training_settings_to_txt = False
server_options.options.training_enable_tensorboard = False
server_options.options.training_tensorboard_save_images = False
server_options.options.pin_memory = True
server_options.options.save_optimizer_state = False
server_options.options.training_image_repeats_per_epoch = args.repeats
server_options.options.training_write_csv_every = 0
sdapi.postsync('/sdapi/v1/options', server_options.options)
log.info('updated server options')
def verify_args():
server_options = util.Map(sdapi.options())
if args.model != '':
if not os.path.isfile(args.model):
log.error(f'cannot find loaded model: {args.model}')
exit(1)
if server_ok:
server_options.options.sd_model_checkpoint = args.model
sdapi.postsync('/sdapi/v1/options', server_options.options)
elif server_ok:
args.model = server_options.options.sd_model_checkpoint.split(' [')[0]
if args.sdxl and (server_options.sd_backend != 'diffusers' or server_options.diffusers_pipeline != 'Stable Diffusion XL'):
log.warning('server checkpoint is not sdxl')
else:
log.error('no model specified')
exit(1)
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if args.type == 'lora' and not server_ok and not args.dir:
log.error('offline lora training requires --dir <lora folder>')
exit(1)
if args.type == 'lora':
import transformers
if transformers.__version__ != '4.30.2':
log.error(f'lora training requires specific transformers version: current {transformers.__version__} required transformers==4.30.2')
exit(1)
args.lora_dir = server_options.options.lora_dir or args.dir
if not os.path.isabs(args.lora_dir):
args.lora_dir = os.path.join(base_dir, args.lora_dir)
args.lyco_dir = server_options.options.lyco_dir or args.dir
if not os.path.isabs(args.lyco_dir):
args.lyco_dir = os.path.join(base_dir, args.lyco_dir)
args.embeddings_dir = server_options.options.embeddings_dir or args.dir
if not os.path.isfile(args.model):
args.ckpt_dir = server_options.options.ckpt_dir
if not os.path.isabs(args.ckpt_dir):
args.ckpt_dir = os.path.join(base_dir, args.ckpt_dir)
attempt = os.path.abspath(os.path.join(args.ckpt_dir, args.model))
args.model = attempt if os.path.isfile(attempt) else args.model
if not os.path.isfile(args.model):
attempt = os.path.abspath(os.path.join(args.ckpt_dir, args.model + '.safetensors'))
args.model = attempt if os.path.isfile(attempt) else args.model
if not os.path.isfile(args.model):
log.error(f'cannot find loaded model: {args.model}')
exit(1)
if not os.path.exists(args.input) or not os.path.isdir(args.input):
log.error(f'cannot find training folder: {args.input}')
exit(1)
if not os.path.exists(args.lora_dir) or not os.path.isdir(args.lora_dir):
log.error(f'cannot find lora folder: {args.lora_dir}')
exit(1)
if not os.path.exists(args.lyco_dir) or not os.path.isdir(args.lyco_dir):
log.error(f'cannot find lyco folder: {args.lyco_dir}')
exit(1)
if args.interim != '':
args.process_dir = args.interim
else:
args.process_dir = os.path.join(tempfile.gettempdir(), 'train', args.name)
log.debug(f'args: {vars(args)}')
log.debug(f'server flags: {server_options.flags}')
log.debug(f'server options: {server_options.options}')
async def training_loop():
async def async_train():
res = await sdapi.post('/sdapi/v1/train/embedding', options.embedding)
log.info(f'train embedding result: {res}')
async def async_monitor():
from tqdm.rich import tqdm
await asyncio.sleep(3)
res = util.Map(sdapi.progress())
with tqdm(desc='train embedding', total=res.state.job_count) as pbar:
while res.state.job_no < res.state.job_count and not res.state.interrupted and not res.state.skipped:
await asyncio.sleep(2)
prev_job = res.state.job_no
res = util.Map(sdapi.progress())
loss = re.search(r"Loss: (.*?)(?=\<)", res.textinfo)
if loss:
pbar.set_postfix({ 'loss': loss.group(0) })
pbar.update(res.state.job_no - prev_job)
a = asyncio.create_task(async_train())
b = asyncio.create_task(async_monitor())
await asyncio.gather(a, b) # wait for both pipeline and monitor to finish
def train_embedding():
log.info(f'{args.type} options: {options.embedding}')
create_options = util.Map({
"name": args.name,
"num_vectors_per_token": args.dim,
"overwrite_old": False,
"init_text": args.tag,
})
fn = os.path.join(args.embeddings_dir, args.name) + '.pt'
if os.path.exists(fn) and args.overwrite:
log.warning(f'delete existing embedding {fn}')
os.remove(fn)
else:
log.error(f'embedding exists {fn}')
return
log.info(f'create embedding {create_options}')
res = sdapi.postsync('/sdapi/v1/create/embedding', create_options)
if 'info' in res and 'error' in res['info']: # formatted error
log.error(res.info)
elif 'info' in res: # no error
asyncio.run(training_loop())
else: # unknown error
log.error(f'create embedding error {res}')
def train_lora():
fn = os.path.join(options.lora.output_dir, args.name)
for ext in ['.ckpt', '.pt', '.safetensors']:
if os.path.exists(fn + ext):
if args.overwrite:
log.warning(f'delete existing lora: {fn + ext}')
os.remove(fn + ext)
else:
log.error(f'lora exists: {fn + ext}')
return
log.info(f'{args.type} options: {options.lora}')
# lora imports
lora_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'modules', 'lora'))
lycoris_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'modules', 'lycoris'))
sys.path.append(lora_path)
if args.type == 'lyco':
sys.path.append(lycoris_path)
log.debug('importing lora lib')
if not args.sdxl:
import train_network
trainer = train_network.NetworkTrainer()
trainer.train(options.lora)
else:
import sdxl_train_network
trainer = sdxl_train_network.SdxlNetworkTrainer()
trainer.train(options.lora)
if args.type == 'lyco':
log.debug('importing lycoris lib')
import importlib
_network_module = importlib.import_module(options.lora.network_module)
def prepare_options():
if args.type == 'embedding':
log.info('train embedding')
options.lora.in_json = None
if args.type == 'dreambooth':
log.info('train using dreambooth style training')
options.lora.vae_batch_size = args.batch
options.lora.in_json = None
if args.type == 'lora':
log.info('train using lora style training')
options.lora.output_dir = args.lora_dir
options.lora.in_json = os.path.join(args.process_dir, args.name + '.json')
if args.type == 'lyco':
log.info('train using lycoris network')
options.lora.output_dir = args.lora_dir
options.lora.network_module = 'lycoris.kohya'
options.lora.in_json = os.path.join(args.process_dir, args.name + '.json')
# lora specific
options.lora.save_model_as = 'safetensors'
options.lora.pretrained_model_name_or_path = args.model
options.lora.output_name = args.name
options.lora.max_train_steps = args.steps
options.lora.network_dim = args.dim
options.lora.network_alpha = args.dim // 2 if args.alpha == 0 else args.alpha
options.lora.network_args = []
options.lora.training_comment = args.comments
options.lora.sdpa = True
options.lora.optimizer_type = args.optimizer
if args.algo is not None:
options.lora.network_args.append(f'algo={args.algo}')
if args.args is not None:
for net_arg in args.args:
options.lora.network_args.append(net_arg)
options.lora.gradient_accumulation_steps = args.gradient
options.lora.learning_rate = args.lr
options.lora.train_batch_size = args.batch
options.lora.train_data_dir = args.process_dir
options.lora.no_half_vae = args.precision == 'fp16'
# embedding specific
options.embedding.embedding_name = args.name
options.embedding.learn_rate = str(args.lr)
options.embedding.batch_size = args.batch
options.embedding.steps = args.steps
options.embedding.data_root = args.process_dir
options.embedding.log_directory = os.path.join(args.process_dir, 'log')
options.embedding.gradient_step = args.gradient
def process_inputs():
import process
import filetype
pathlib.Path(args.process_dir).mkdir(parents=True, exist_ok=True)
processing_options = args.process.split(',') if isinstance(args.process, str) else args.process
processing_options = [opt.strip() for opt in re.split(',| ', args.process)]
log.info(f'processing steps: {processing_options}')
for step in processing_options:
if step not in valid_steps:
log.error(f'invalid processing step: {[step]}')
exit(1)
for root, _sub_dirs, folder in os.walk(args.input):
files = [os.path.join(root, f) for f in folder if filetype.is_image(os.path.join(root, f))]
log.info(f'processing input images: {len(files)}')
if os.path.exists(args.process_dir):
if args.overwrite:
log.warning(f'removing existing processed folder: {args.process_dir}')
shutil.rmtree(args.process_dir, ignore_errors=True)
else:
log.info(f'processed folder exists: {args.process_dir}')
steps = [step for step in processing_options if step in ['face', 'body', 'original']]
process.reset()
options.process.target_size = 1024 if args.sdxl else 512
metadata = {}
for step in steps:
if step == 'face':
opts = [step for step in processing_options if step not in ['body', 'original']]
if step == 'body':
opts = [step for step in processing_options if step not in ['face', 'original', 'upscale', 'restore']] # body does not perform upscale or restore
if step == 'original':
opts = [step for step in processing_options if step not in ['face', 'body', 'upscale', 'restore', 'blur', 'range', 'segment']] # original does not perform most steps
log.info(f'processing current step: {opts}')
tag = step
if tag == 'original' and args.tag is not None:
concept = args.tag.split(',')[0].strip()
else:
concept = step
if args.type in ['lora', 'lyco', 'dreambooth']:
folder = os.path.join(args.process_dir, str(args.repeats) + '_' + concept) # separate concepts per folder
if args.type in ['embedding']:
folder = os.path.join(args.process_dir) # everything into same folder
log.info(f'processing concept: {concept}')
log.info(f'processing output folder: {folder}')
pathlib.Path(folder).mkdir(parents=True, exist_ok=True)
results = {}
if server_ok:
for f in files:
res = process.file(filename = f, folder = folder, tag = args.tag, requested = opts)
if res.image: # valid result
results[res.type] = results.get(res.type, 0) + 1
results['total'] = results.get('total', 0) + 1
rel_path = res.basename.replace(os.path.commonpath([res.basename, args.process_dir]), '')
if rel_path.startswith(os.path.sep):
rel_path = rel_path[1:]
metadata[rel_path] = { 'caption': res.caption, 'tags': ','.join(res.tags) }
if options.lora.in_json is None:
with open(res.output.replace(options.process.format, '.txt'), "w", encoding='utf-8') as outfile:
outfile.write(res.caption)
log.info(f"processing {'saved' if res.image is not None else 'skipped'}: {f} => {res.output} {res.ops} {res.message}")
else:
log.info('processing skipped: offline')
folders = [os.path.join(args.process_dir, folder) for folder in os.listdir(args.process_dir) if os.path.isdir(os.path.join(args.process_dir, folder))]
log.info(f'input datasets {folders}')
if options.lora.in_json is not None:
with open(options.lora.in_json, "w", encoding='utf-8') as outfile: # write json at the end only
outfile.write(json.dumps(metadata, indent=2))
for folder in folders: # create latents
import latents
latents.create_vae_latents(util.Map({ 'input': folder, 'json': options.lora.in_json }))
latents.unload_vae()
r = { 'inputs': len(files), 'outputs': results, 'metadata': options.lora.in_json }
log.info(f'processing steps result: {r}')
if args.gradient < 0:
log.info(f"setting gradient accumulation to number of images: {results['total']}")
options.lora.gradient_accumulation_steps = results['total']
options.embedding.gradient_step = results['total']
process.unload()
if __name__ == '__main__':
parse_args()
setup_logging()
log.info('SD.Next Train')
sdapi.sd_url = args.server
if args.user is not None:
sdapi.sd_username = args.user
if args.password is not None:
sdapi.sd_password = args.password
prepare_server()
verify_args()
prepare_options()
mem_stats()
process_inputs()
mem_stats()
try:
if args.type == 'embedding':
train_embedding()
if args.type == 'lora' or args.type == 'lyco' or args.type == 'dreambooth':
train_lora()
except KeyboardInterrupt:
log.error('interrupt requested')
sdapi.interrupt()
mem_stats()
log.info('done')

View File

@ -13,7 +13,7 @@ class Interpreter:
def execute(self, s: str):
try:
exec(s, self.env_globals, self.env_locals)
exec(s, self.env_globals, self.env_locals) # pylint: disable=exec-used
except Exception as e:
print(f'{e.__class__.__name__}: {e}')

View File

@ -24,7 +24,8 @@ class NetworkModuleLora(network.NetworkModule):
weight = weights.get(key)
if weight is None and none_ok:
return None
is_linear = type(self.sd_module) in [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear, torch.nn.MultiheadAttention, diffusers_lora.LoRACompatibleLinear]
linear_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear, torch.nn.MultiheadAttention, diffusers_lora.LoRACompatibleLinear]
is_linear = type(self.sd_module) in linear_modules or self.sd_module.__class__.__name__ == "NNCFLinear"
is_conv = type(self.sd_module) in [torch.nn.Conv2d, diffusers_lora.LoRACompatibleConv]
if is_linear:
weight = weight.reshape(weight.shape[0], -1)

View File

@ -1,7 +1,7 @@
from modules import shared
maybe_diffusers = [
maybe_diffusers = [ # forced if lora_maybe_diffusers is enabled
'aaebf6360f7d', # sd15-lcm
'3d18b05e4f56', # sdxl-lcm
'b71dcb732467', # sdxl-tcd
@ -19,15 +19,26 @@ maybe_diffusers = [
'8cca3706050b', # hyper-sdxl-1step
]
force_diffusers = [
force_diffusers = [ # forced always
'816d0eed49fd', # flash-sdxl
'c2ec22757b46', # flash-sd15
]
def check_override(shorthash):
force_models = [ # forced always
'sd3',
]
force_classes = [ # forced always
]
def check_override(shorthash=''):
force = False
force = force or (shared.sd_model_type in force_models)
force = force or (shared.sd_model.__class__.__name__ in force_classes)
if len(shorthash) < 4:
return False
force = any(x.startswith(shorthash) for x in maybe_diffusers) if shared.opts.lora_maybe_diffusers else False
return force
force = force or (any(x.startswith(shorthash) for x in maybe_diffusers) if shared.opts.lora_maybe_diffusers else False)
force = force or any(x.startswith(shorthash) for x in force_diffusers)
if force and shared.opts.lora_maybe_diffusers:
shared.log.debug('LoRA override: force diffusers')

View File

@ -49,6 +49,7 @@ def assign_network_names_to_compvis_modules(sd_model):
network_layer_mapping = {}
if shared.native:
if not hasattr(shared.sd_model, 'text_encoder') or not hasattr(shared.sd_model, 'unet'):
sd_model.network_layer_mapping = {}
return
for name, module in shared.sd_model.text_encoder.named_modules():
prefix = "lora_te1_" if shared.sd_model_type == "sdxl" else "lora_te_"
@ -66,6 +67,7 @@ def assign_network_names_to_compvis_modules(sd_model):
module.network_layer_name = network_name
else:
if not hasattr(shared.sd_model, 'cond_stage_model'):
sd_model.network_layer_mapping = {}
return
for name, module in shared.sd_model.cond_stage_model.wrapped.named_modules():
network_name = name.replace(".", "_")
@ -87,7 +89,14 @@ def load_diffusers(name, network_on_disk, lora_scale=1.0) -> network.Network:
return cached
if not shared.native:
return None
shared.sd_model.load_lora_weights(network_on_disk.filename)
if not hasattr(shared.sd_model, 'load_lora_weights'):
shared.log.error(f"LoRA load failed: class={shared.sd_model.__class__} does not implement load lora")
return None
try:
shared.sd_model.load_lora_weights(network_on_disk.filename)
except Exception as e:
errors.display(e, "LoRA")
return None
if shared.opts.lora_fuse_diffusers:
shared.sd_model.fuse_lora(lora_scale=lora_scale)
net = network.Network(name, network_on_disk)
@ -159,7 +168,6 @@ def load_networks(names, te_multipliers=None, unet_multipliers=None, dyn_dims=No
list_available_networks()
networks_on_disk = [available_network_aliases.get(name, None) for name in names]
failed_to_load_networks = []
recompile_model = False
if shared.compiled_model_state is not None and shared.compiled_model_state.is_compiled:
if len(names) == len(shared.compiled_model_state.lora_model):
@ -177,13 +185,10 @@ def load_networks(names, te_multipliers=None, unet_multipliers=None, dyn_dims=No
shared.compiled_model_state.lora_model = []
if recompile_model:
backup_cuda_compile = shared.opts.cuda_compile
backup_nncf_compress_weights = shared.opts.nncf_compress_weights
sd_models.unload_model_weights(op='model')
shared.opts.cuda_compile = False
shared.opts.nncf_compress_weights = []
sd_models.reload_model_weights(op='model')
shared.opts.cuda_compile = backup_cuda_compile
shared.opts.nncf_compress_weights = backup_nncf_compress_weights
loaded_networks.clear()
for i, (network_on_disk, name) in enumerate(zip(networks_on_disk, names)):
@ -227,8 +232,6 @@ def load_networks(names, te_multipliers=None, unet_multipliers=None, dyn_dims=No
if recompile_model:
shared.log.info("LoRA recompiling model")
backup_lora_model = shared.compiled_model_state.lora_model
if shared.opts.nncf_compress_weights and not (shared.opts.cuda_compile and shared.opts.cuda_compile_backend == "openvino_fx"):
shared.sd_model = sd_models_compile.nncf_compress_weights(shared.sd_model)
if shared.opts.cuda_compile:
shared.sd_model = sd_models_compile.compile_diffusers(shared.sd_model)

View File

@ -102,7 +102,7 @@ class ExtraNetworksPageLora(ui_extra_networks.ExtraNetworksPage):
return item
except Exception as e:
shared.log.debug(f"Extra networks error: type=lora file={name} {e}")
shared.log.debug(f"Networks error: type=lora file={name} {e}")
from modules import errors
errors.display(e, 'Lora')
return None

@ -1 +1 @@
Subproject commit 285743a83f251ae23e3a4120d15badcead4eab33
Subproject commit dae2c67d826b631dcc343c028c60f478b0437877

View File

@ -230,7 +230,7 @@
{"id":"","label":"Control Options","localized":"","hint":"Settings related the Control tab"},
{"id":"","label":"Training","localized":"","hint":"Settings related to model training configuration and directories"},
{"id":"","label":"Interrogate","localized":"","hint":"Settings related to interrogation configuration"},
{"id":"","label":"Extra Networks","localized":"","hint":"Settings related to extra networks user interface, extra networks multiplier defaults, and configuration"},
{"id":"","label":"Networks","localized":"","hint":"Settings related to networks user interface, networks multiplier defaults, and configuration"},
{"id":"","label":"Licenses","localized":"","hint":"View licenses of all additional included libraries"},
{"id":"","label":"Show all pages","localized":"","hint":"Show all settings pages"}
],

View File

@ -48,7 +48,7 @@
{"id":"","label":"Interrogate\nDeepBooru","localized":"DeepBooru 모델 사용","hint":"DeepBooru 모델을 사용해 이미지에서 설명을 추출한다."}
],
"extra networks": [
{"id":"","label":"Extra networks tab order","localized":"엑스트라 네트워크 탭 순서","hint":"Comma-separated list of tab names; tabs listed here will appear in the extra networks UI first and in order lsited"},
{"id":"","label":"Networks tab order","localized":"엑스트라 네트워크 탭 순서","hint":"Comma-separated list of tab names; tabs listed here will appear in the extra networks UI first and in order lsited"},
{"id":"","label":"UI position","localized":"UI 위치","hint":""},
{"id":"","label":"UI height (%)","localized":"UI 높이 (%)","hint":""},
{"id":"","label":"UI sidebar width (%)","localized":"UI 사이드바 너비 (%)","hint":""},

View File

@ -160,15 +160,30 @@
"preview": "PixArt-alpha--PixArt-XL-2-1024-MS.jpg",
"extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"
},
"Pixart-Σ": {
"path": "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
"Pixart-Σ Small": {
"path": "huggingface/PixArt-alpha/PixArt-Sigma-XL-2-512-MS",
"desc": "PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts.",
"preview": "PixArt-alpha--pixart_sigma_sdxlvae_T5_diffusers.jpg",
"skip": true,
"extras": "width: 512, height: 512, sampler: Default, cfg_scale: 2.0"
},
"Pixart-Σ Medium": {
"path": "huggingface/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
"desc": "PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts.",
"preview": "PixArt-alpha--pixart_sigma_sdxlvae_T5_diffusers.jpg",
"skip": true,
"extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"
},
"Pixart-Σ Large": {
"path": "huggingface/PixArt-alpha/PixArt-Sigma-XL-2-2K-MS",
"desc": "PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts.",
"preview": "PixArt-alpha--pixart_sigma_sdxlvae_T5_diffusers.jpg",
"skip": true,
"extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"
},
"Tencent HunyuanDiT": {
"path": "Tencent-Hunyuan/HunyuanDiT-Diffusers",
"Tencent HunyuanDiT 1.1": {
"path": "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers",
"desc": "Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding.",
"preview": "Tencent-Hunyuan-HunyuanDiT.jpg",
"extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"

View File

@ -275,9 +275,12 @@ def install(package, friendly: str = None, ignore: bool = False, reinstall: bool
# execute git command
@lru_cache()
def git(arg: str, folder: str = None, ignore: bool = False):
def git(arg: str, folder: str = None, ignore: bool = False, optional: bool = False):
if args.skip_git:
return ''
if optional:
if 'google.colab' in sys.modules:
return ''
git_cmd = os.environ.get('GIT', "git")
if git_cmd != "git":
git_cmd = os.path.abspath(git_cmd)
@ -306,7 +309,7 @@ def branch(folder=None):
return None
branches = []
try:
b = git('branch --show-current', folder)
b = git('branch --show-current', folder, optional=True)
if b == '':
branches = git('branch', folder).split('\n')
if len(branches) > 0:
@ -315,7 +318,7 @@ def branch(folder=None):
b = branches[1].strip()
log.debug(f'Git detached head detected: folder="{folder}" reattach={b}')
except Exception:
b = git('git rev-parse --abbrev-ref HEAD', folder)
b = git('git rev-parse --abbrev-ref HEAD', folder, optional=True)
if 'main' in b:
b = 'main'
elif 'master' in b:
@ -323,7 +326,7 @@ def branch(folder=None):
else:
b = b.split('\n')[0].replace('*', '').strip()
log.debug(f'Submodule: {folder} / {b}')
git(f'checkout {b}', folder, ignore=True)
git(f'checkout {b}', folder, ignore=True, optional=True)
return b
@ -396,6 +399,12 @@ def check_python(supported_minors=[9, 10, 11, 12], reason=None):
if args.quick:
return
log.info(f'Python version={platform.python_version()} platform={platform.system()} bin="{sys.executable}" venv="{sys.prefix}"')
if int(sys.version_info.major) == 3 and int(sys.version_info.minor) == 12 and int(sys.version_info.micro) > 3: # TODO python 3.12.4 or higher cause a mess with pydantic
log.error(f"Incompatible Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro} required 3.12.3 or lower")
if reason is not None:
log.error(reason)
if not args.ignore:
sys.exit(1)
if not (int(sys.version_info.major) == 3 and int(sys.version_info.minor) in supported_minors):
log.error(f"Incompatible Python version: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro} required 3.{supported_minors}")
if reason is not None:
@ -434,7 +443,7 @@ def check_onnx():
def install_rocm_zluda(torch_command):
check_python(supported_minors=[10,11], reason='RocM or Zluda backends require Python 3.10 or 3.11')
check_python(supported_minors=[10, 11], reason='ROCm or ZLUDA backends require Python 3.10 or 3.11')
is_windows = platform.system() == 'Windows'
log.info('AMD ROCm toolkit detected')
os.environ.setdefault('PYTORCH_HIP_ALLOC_CONF', 'garbage_collection_threshold:0.8,max_split_size_mb:512')
@ -515,14 +524,7 @@ def install_rocm_zluda(torch_command):
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
# conceal ROCm installed
os.environ.pop("ROCM_HOME", None)
os.environ.pop("ROCM_PATH", None)
paths = os.environ["PATH"].split(";")
paths_no_rocm = []
for path in paths:
if "ROCm" not in path:
paths_no_rocm.append(path)
os.environ["PATH"] = ";".join(paths_no_rocm)
conceal_rocm()
else:
if rocm_ver is None: # assume the latest if version check fails
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0')
@ -541,6 +543,17 @@ def install_rocm_zluda(torch_command):
return torch_command
def conceal_rocm():
os.environ.pop("ROCM_HOME", None)
os.environ.pop("ROCM_PATH", None)
paths = os.environ["PATH"].split(";")
paths_no_rocm = []
for path in paths:
if "ROCm" not in path:
paths_no_rocm.append(path)
os.environ["PATH"] = ";".join(paths_no_rocm)
def install_ipex(torch_command):
check_python(supported_minors=[10,11], reason='IPEX backend requires Python 3.10 or 3.11')
args.use_ipex = True # pylint: disable=attribute-defined-outside-init
@ -677,11 +690,11 @@ def check_torch():
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
elif allow_directml and args.use_directml and ('arm' not in machine and 'aarch' not in machine):
log.info('Using DirectML Backend')
check_python(supported_minors=[10], reason='DirectML backend requires Python 3.10')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.0.0 torchvision torch-directml')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.3.1 torchvision torch-directml')
if 'torch' in torch_command and not args.version:
install(torch_command, 'torch torchvision')
install('onnxruntime-directml', 'onnxruntime-directml', ignore=True)
conceal_rocm()
else:
if args.use_zluda:
log.warning("ZLUDA failed to initialize: no HIP SDK found")
@ -881,6 +894,7 @@ def install_submodules(force=True):
branch(name)
except Exception:
log.error(f'Error updating submodule: {submodule}')
setup_logging()
if args.profile:
print_profile(pr, 'Submodule')
return '\n'.join(res)
@ -1015,7 +1029,7 @@ def get_version(force=False):
'url': origin.replace('\n', '') + '/tree/' + branch_name.replace('\n', '')
}
except Exception:
version = { 'app': 'sd.next', 'version': 'unknown' }
version = { 'app': 'sd.next', 'version': 'unknown', 'branch': 'unknown' }
try:
cwd = os.getcwd()
os.chdir('extensions-builtin/sdnext-modernui')
@ -1031,21 +1045,24 @@ def get_version(force=False):
def check_ui(ver):
if ver is None:
return
if ver['branch'] == ver['ui']:
return
log.debug(f'Branch mismatch: sdnext={ver["branch"]} ui={ver["ui"]}')
def same(ver):
core = ver['branch'] if ver is not None and 'branch' in ver else 'unknown'
ui = ver['ui'] if ver is not None and 'ui' in ver else 'unknown'
return core == ui or (core == 'master' and ui == 'main')
if not same(ver):
log.debug(f'Branch mismatch: sdnext={ver["branch"]} ui={ver["ui"]}')
cwd = os.getcwd()
try:
os.chdir('extensions-builtin/sdnext-modernui')
git('checkout ' + ver['branch'], ignore=True)
target = 'dev' if 'dev' in ver['branch'] else 'main'
git('checkout ' + target, ignore=True, optional=True)
os.chdir(cwd)
ver = get_version(force=True)
if ver['branch'] == ver['ui']:
if same(ver):
log.debug(f'Branch synchronized: {ver["branch"]}')
else:
log.debug(f'Branch synch failed: sdnext={ver["branch"]} ui={ver["ui"]}')
log.debug(f'Branch sync failed: sdnext={ver["branch"]} ui={ver["ui"]}')
except Exception as e:
log.debug(f'Branch switch: {e}')
os.chdir(cwd)
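Several of the hunks above route git housekeeping through the new `optional` flag; a minimal standalone sketch of that behaviour (this is not the installer module itself, and the `subprocess` wrapper is an assumption for illustration):

```python
# Minimal sketch of the optional-git behaviour, not the installer implementation.
import subprocess
import sys

def git(arg: str, folder: str = None, optional: bool = False) -> str:
    if optional and 'google.colab' in sys.modules:
        return ''  # skip housekeeping calls entirely when running under Colab
    result = subprocess.run(f'git {arg}', cwd=folder, shell=True,
                            capture_output=True, text=True, check=False)
    return result.stdout.strip()

current = git('branch --show-current', optional=True)  # '' on Colab, branch name elsewhere
```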

View File

@ -17,6 +17,7 @@
.tooltip-show { opacity: 0.9; }
.tooltip-left { right: unset; left: 1em; }
.toolbutton-selected { background: var(--background-fill-primary) !important; }
.input-accordion-checkbox { display: none; }
/* live preview */
.progressDiv { position: relative; height: 20px; background: #b4c0cc; margin-bottom: -3px; }

View File

@ -30,7 +30,7 @@
--inactive-color: var(--primary--800);
--body-text-color: var(--neutral-100);
--body-text-color-subdued: var(--neutral-300);
--background-color: black;
--background-color: var(--neutral-950);
--background-fill-primary: var(--neutral-700);
--input-padding: 4px;
--input-background-fill: var(--neutral-800);

View File

@ -461,7 +461,7 @@ function setupExtraNetworksForTab(tabname) {
en.style.position = 'absolute';
en.style.right = '0';
en.style.top = '13em';
en.style.height = '-webkit-fill-available';
en.style.height = 'auto';
en.style.transition = 'width 0.3s ease';
en.style.width = `${window.opts.extra_networks_sidebar_width}vw`;
gradioApp().getElementById(`${tabname}_settings`).parentNode.style.width = `${100 - 2 - window.opts.extra_networks_sidebar_width}vw`;

View File

@ -0,0 +1,55 @@
function inputAccordionChecked(id, checked) {
const accordion = gradioApp().getElementById(id);
accordion.visibleCheckbox.checked = checked;
accordion.onVisibleCheckboxChange();
}
function setupAccordion(accordion) {
const labelWrap = accordion.querySelector('.label-wrap');
const gradioCheckbox = gradioApp().querySelector(`#${accordion.id}-checkbox input`);
const extra = gradioApp().querySelector(`#${accordion.id}-extra`);
const span = labelWrap.querySelector('span');
let linked = true;
const isOpen = () => labelWrap.classList.contains('open');
const observerAccordionOpen = new MutationObserver((mutations) => {
mutations.forEach((mutationRecord) => {
accordion.classList.toggle('input-accordion-open', isOpen());
if (linked) {
accordion.visibleCheckbox.checked = isOpen();
accordion.onVisibleCheckboxChange();
}
});
});
observerAccordionOpen.observe(labelWrap, { attributes: true, attributeFilter: ['class'] });
if (extra) labelWrap.insertBefore(extra, labelWrap.lastElementChild);
accordion.onChecked = (checked) => {
if (isOpen() !== checked) labelWrap.click();
};
const visibleCheckbox = document.createElement('INPUT');
visibleCheckbox.type = 'checkbox';
visibleCheckbox.checked = isOpen();
visibleCheckbox.id = `${accordion.id}-visible-checkbox`;
visibleCheckbox.className = `${gradioCheckbox.className} input-accordion-checkbox`;
span.insertBefore(visibleCheckbox, span.firstChild);
accordion.visibleCheckbox = visibleCheckbox;
accordion.onVisibleCheckboxChange = () => {
if (linked && isOpen() !== visibleCheckbox.checked) labelWrap.click();
gradioCheckbox.checked = visibleCheckbox.checked;
updateInput(gradioCheckbox);
};
visibleCheckbox.addEventListener('click', (event) => {
linked = false;
event.stopPropagation();
});
visibleCheckbox.addEventListener('input', accordion.onVisibleCheckboxChange);
}
// onUiLoaded(() => {
// for (const accordion of gradioApp().querySelectorAll('.input-accordion')) setupAccordion(accordion);
// });
function initAccordions() {
for (const accordion of gradioApp().querySelectorAll('.input-accordion')) setupAccordion(accordion);
}

View File

@ -220,7 +220,7 @@ table.settings-value-table td { padding: 0.4em; border: 1px solid #ccc; max-widt
.extra-network-cards .card .preview { box-shadow: var(--button-shadow); min-height: 30px; }
.extra-network-cards .card:hover .overlay { background: rgba(0, 0, 0, 0.70); }
.extra-network-cards .card:hover .preview { box-shadow: none; filter: grayscale(100%); }
.extra-network-cards .card .overlay .tags { display: none; overflow-wrap: anywhere; position: absolute; top: 100%; z-index: 20; background: var(--body-background-fill); }
.extra-network-cards .card .overlay .tags { display: none; overflow-wrap: anywhere; position: absolute; top: 100%; z-index: 20; background: var(--body-background-fill); overflow-x: hidden; overflow-y: auto; max-height: 333px; }
.extra-network-cards .card .overlay .tag { padding: 2px; margin: 2px; background: rgba(70, 70, 70, 0.60); font-size: var(--text-md); cursor: pointer; display: inline-block; }
.extra-network-cards .card .actions>span { padding: 4px; font-size: 34px !important; }
.extra-network-cards .card .actions>span:hover { color: var(--highlight-color); }
@ -240,7 +240,7 @@ table.settings-value-table td { padding: 0.4em; border: 1px solid #ccc; max-widt
.extra-details > div { overflow-y: auto; min-height: 40vh; max-height: 80vh; align-self: flex-start; }
.extra-details td:first-child { font-weight: bold; vertical-align: top; }
.extra-details .gradio-image { max-height: 50vh; }
.input-accordion-checkbox { display: none !important; }
/* specific elements */
#modelmerger_interp_description { margin-top: 1em; margin-bottom: 1em; }

View File

@ -12,6 +12,7 @@ async function initStartup() {
initLogMonitor();
initContextMenu();
initDragDrop();
initAccordions();
initSettings();
initImageViewer();
initGallery();

View File

@ -424,6 +424,7 @@ function selectVAE(name) {
}
function selectReference(name) {
log(`Select reference: ${name}`);
desiredCheckpointName = name;
gradioApp().getElementById('change_reference').click();
}
@ -471,19 +472,23 @@ function toggleCompact(val, old) {
function previewTheme() {
let name = gradioApp().getElementById('setting_gradio_theme').querySelectorAll('input')?.[0].value || '';
fetch('/file=html/themes.json').then((res) => {
res.json().then((themes) => {
const theme = themes.find((t) => t.id === name);
if (theme) {
window.open(theme.subdomain, '_blank');
} else {
const el = document.getElementById('theme-preview') || createThemeElement();
el.style.display = el.style.display === 'block' ? 'none' : 'block';
name = name.replace('/', '-');
el.src = `/file=html/${name}.jpg`;
}
});
});
fetch('/file=html/themes.json')
.then((res) => {
res.json()
.then((themes) => {
const theme = Array.isArray(themes) ? themes.find((t) => t.id === name) : null;
if (theme) {
window.open(theme.subdomain, '_blank');
} else {
const el = document.getElementById('theme-preview') || createThemeElement();
el.style.display = el.style.display === 'block' ? 'none' : 'block';
name = name.replace('/', '-');
el.src = `/file=html/${name}.jpg`;
}
})
.catch((e) => console.error('previewTheme:', e));
})
.catch((e) => console.error('previewTheme:', e));
}
async function browseFolder() {

View File

@ -152,8 +152,8 @@ class ItemIPAdapter(BaseModel):
adapter: str = Field(title="Adapter", default="Base", description="")
images: List[str] = Field(title="Image", default=[], description="")
masks: Optional[List[str]] = Field(title="Mask", default=[], description="")
scale: float = Field(title="Scale", default=0.5, gt=0, le=1, description="")
start: float = Field(title="Start", default=0.0, gt=0, le=1, description="")
scale: float = Field(title="Scale", default=0.5, ge=0, le=1, description="")
start: float = Field(title="Start", default=0.0, ge=0, le=1, description="")
end: float = Field(title="End", default=1.0, gt=0, le=1, description="")
class ItemFace(BaseModel):
@ -313,7 +313,7 @@ class ResInterrogate(BaseModel):
class ReqVQA(BaseModel):
image: str = Field(default="", title="Image", description="Image to work on, must be a Base64 string containing the image's data.")
model: str = Field(default="Moondream 2", title="Model", description="The interrogate model used.")
model: str = Field(default="MS Florence 2 Base", title="Model", description="The interrogate model used.")
question: str = Field(default="describe the image", title="Question", description="Question to ask the model.")
class ResVQA(BaseModel):
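The `gt` to `ge` change above matters because both fields treat 0 as a legal value: with `gt=0`, a request that sends `scale=0` (to disable the adapter) or the documented `start=0.0` default fails validation. A minimal reproduction, not SD.Next code:

```python
from pydantic import BaseModel, Field, ValidationError

class Before(BaseModel):
    start: float = Field(default=0.0, gt=0, le=1)   # old constraint

class After(BaseModel):
    start: float = Field(default=0.0, ge=0, le=1)   # new constraint

After(start=0.0)            # accepted
try:
    Before(start=0.0)       # rejected: 0.0 is not strictly greater than 0
except ValidationError as e:
    print(e.errors()[0]['msg'])
```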

View File

@ -139,7 +139,7 @@ class Processor():
self.model = None
self.processor_id = processor_id
# self.override = None
devices.torch_gc()
# devices.torch_gc()
self.load_config = { 'cache_dir': cache_dir }
def config(self, processor_id = None):

View File

@ -55,7 +55,7 @@ def control_set(kwargs):
def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], inits: List[Image.Image] = [], mask: Image.Image = None, unit_type: str = None, is_generator: bool = True,
input_type: int = 0,
prompt: str = '', negative: str = '', styles: List[str] = [],
prompt: str = '', negative_prompt: str = '', styles: List[str] = [],
steps: int = 20, sampler_index: int = None,
seed: int = -1, subseed: int = -1, subseed_strength: float = 0, seed_resize_from_h: int = -1, seed_resize_from_w: int = -1,
cfg_scale: float = 6.0, clip_skip: float = 1.0, image_cfg_scale: float = 6.0, diffusers_guidance_rescale: float = 0.7, pag_scale: float = 0.0, pag_adaptive: float = 0.5, cfg_end: float = 1.0,
@ -90,12 +90,11 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
shared.log.warning('Sampler: invalid')
sampler_index = 0
if hr_sampler_index is None:
shared.log.warning('Sampler: invalid')
hr_sampler_index = 0
hr_sampler_index = sampler_index
p = StableDiffusionProcessingControl(
prompt = prompt,
negative_prompt = negative,
negative_prompt = negative_prompt,
styles = styles,
steps = steps,
n_iter = batch_count,
@ -192,7 +191,9 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
p.refiner_prompt = refiner_prompt
p.refiner_negative = refiner_negative
if p.enable_hr and (p.hr_resize_x == 0 or p.hr_resize_y == 0):
p.hr_upscale_to_x, p.hr_upscale_to_y = 8 * int(p.width * p.hr_scale / 8), 8 * int(p.height * p.hr_scale / 8)
p.hr_upscale_to_x, p.hr_upscale_to_y = 8 * int(width_before * p.hr_scale / 8), 8 * int(height_before * p.hr_scale / 8)
elif p.enable_hr and (p.hr_upscale_to_x == 0 or p.hr_upscale_to_y == 0):
p.hr_upscale_to_x, p.hr_upscale_to_y = 8 * int(p.hr_resize_x / 8), 8 * int(p.hr_resize_y / 8)
global p_extra_args # pylint: disable=global-statement
for k, v in p_extra_args.items():
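A small worked example of the rounding used above, with hypothetical pre-processing dimensions: the upscale target is computed from the original control input size and snapped down to a multiple of 8, keeping it aligned to the latent grid.

```python
# Hypothetical input size and scale, chosen so the snap-to-8 step is visible.
width_before, height_before, hr_scale = 832, 1216, 1.3

hr_upscale_to_x = 8 * int(width_before * hr_scale / 8)    # 1081.6 -> 1080
hr_upscale_to_y = 8 * int(height_before * hr_scale / 8)   # 1580.8 -> 1576
print(hr_upscale_to_x, hr_upscale_to_y)                    # 1080 1576
```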

View File

@ -172,10 +172,19 @@ class ControlNet():
self.load_safetensors(model_path)
else:
self.model = ControlNetModel.from_pretrained(model_path, **self.load_config)
if self.device is not None:
self.model.to(self.device)
if self.dtype is not None:
self.model.to(self.dtype)
if "ControlNet" in opts.nncf_compress_weights:
try:
log.debug(f'Control {what} model NNCF Compress: id="{model_id}"')
from installer import install
install('nncf==2.7.0', quiet=True)
from modules.sd_models_compile import nncf_compress_model
self.model = nncf_compress_model(self.model)
except Exception as e:
log.error(f'Control {what} model NNCF Compression failed: id="{model_id}" error={e}')
if self.device is not None:
self.model.to(self.device)
t1 = time.time()
self.model_id = model_id
log.debug(f'Control {what} model loaded: id="{model_id}" path="{model_path}" time={t1-t0:.2f}')

View File

@ -74,7 +74,7 @@ class Adapter():
self.model_id: str = model_id
self.device = device
self.dtype = dtype
self.load_config = { 'cache_dir': cache_dir }
self.load_config = { 'cache_dir': cache_dir, 'use_safetensors': False }
if load_config is not None:
self.load_config.update(load_config)
if model_id is not None:
@ -101,7 +101,7 @@ class Adapter():
log.error(f'Control {what} model load failed: id="{model_id}" error=unknown model id')
return
log.debug(f'Control {what} model loading: id="{model_id}" path="{model_path}"')
if model_path.endswith('.pth') or model_path.endswith('.pt') or model_path.endswith('.safetensors'):
if model_path.endswith('.pth') or model_path.endswith('.pt') or model_path.endswith('.safetensors') or model_path.endswith('.bin'):
from huggingface_hub import hf_hub_download
parts = model_path.split('/')
repo_id = f'{parts[0]}/{parts[1]}'

View File

@ -1048,7 +1048,7 @@ class StableDiffusionXLControlNetXSPipeline(
self.upcast_vae()
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
if not output_type == "latent":
if output_type != "latent":
# make sure the VAE is in float32 mode, as it overflows in float16
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
@ -1064,7 +1064,7 @@ class StableDiffusionXLControlNetXSPipeline(
else:
image = latents
if not output_type == "latent":
if output_type != "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
@ -1907,7 +1907,7 @@ class StableDiffusionControlNetXSPipeline(
self.controlnet.to("cpu")
torch.cuda.empty_cache()
if not output_type == "latent":
if output_type != "latent":
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
0
]

View File

@ -140,13 +140,13 @@ def torch_gc(force=False):
used_gpu = round(100 * gpu.get('used', 0) / gpu.get('total', 1)) if gpu.get('total', 1) > 1 else 0
used_ram = round(100 * ram.get('used', 0) / ram.get('total', 1)) if ram.get('total', 1) > 1 else 0
global previous_oom # pylint: disable=global-statement
threshold = 0 if (shared.cmd_opts.lowvram and not shared.cmd_opts.use_zluda) else shared.opts.torch_gc_threshold
if force or threshold == 0 or used_gpu >= threshold or used_ram >= threshold:
force = True
if oom > previous_oom:
previous_oom = oom
log.warning(f'GPU out-of-memory error: {mem}')
force = True
if used_gpu >= shared.opts.torch_gc_threshold or used_ram >= shared.opts.torch_gc_threshold:
log.info(f'High memory utilization: GPU={used_gpu}% RAM={used_ram}% {mem}')
force = True
if not force:
return
@ -160,7 +160,13 @@ def torch_gc(force=False):
except Exception:
pass
t1 = time.time()
log.debug(f'GC: collected={collected} device={torch.device(get_optimal_device_name())} {memstats.memory_stats()} time={round(t1 - t0, 2)}')
mem = memstats.memory_stats()
saved = round(gpu.get('used', 0) - mem.get('gpu', {}).get('used', 0), 2)
before = { 'gpu': gpu.get('used', 0), 'ram': ram.get('used', 0) }
after = { 'gpu': mem.get('gpu', {}).get('used', 0), 'ram': mem.get('ram', {}).get('used', 0), 'retries': mem.get('retries', 0), 'oom': mem.get('oom', 0) }
utilization = { 'gpu': used_gpu, 'ram': used_ram, 'threshold': threshold }
results = { 'collected': collected, 'saved': saved }
log.debug(f'GC: utilization={utilization} gc={results} before={before} after={after} device={torch.device(get_optimal_device_name())} fn={sys._getframe(1).f_code.co_name} time={round(t1 - t0, 2)}') # pylint: disable=protected-access
def set_cuda_sync_mode(mode):
@ -175,7 +181,7 @@ def set_cuda_sync_mode(mode):
return
try:
import ctypes
log.info(f'Set cuda synch: mode={mode}')
log.info(f'Set cuda sync: mode={mode}')
torch.cuda.set_device(torch.device(get_optimal_device_name()))
ctypes.CDLL('libcudart.so').cudaSetDeviceFlags({'auto': 0, 'spin': 1, 'yield': 2, 'block': 4}[mode])
except Exception:
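A simplified sketch of the trigger logic above, assuming a percentage threshold option: lowvram users (threshold forced to 0) always collect, otherwise collection runs only when GPU or RAM utilization crosses the threshold or an out-of-memory event was recorded. This is not the `devices.torch_gc` implementation itself.

```python
import gc
import time
from typing import Optional

def maybe_gc(used_gpu: float, used_ram: float, threshold: float,
             oom_seen: bool = False, force: bool = False) -> Optional[dict]:
    if not (force or threshold == 0 or oom_seen
            or used_gpu >= threshold or used_ram >= threshold):
        return None
    t0 = time.time()
    collected = gc.collect()  # the real code also empties the CUDA cache and logs before/after usage
    return {'collected': collected, 'time': round(time.time() - t0, 2)}

print(maybe_gc(used_gpu=92, used_ram=40, threshold=80))  # triggers: GPU over threshold
```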

View File

@ -78,6 +78,3 @@ class DirectML:
def reset_peak_memory_stats(device: Optional[rDevice]=None):
return
def synchronize_tensor(tensor: torch.Tensor) -> None:
tensor.__str__()

View File

@ -1,8 +1,4 @@
import modules.dml.hijack.kdiffusion
import modules.dml.hijack.stablediffusion
import modules.dml.hijack.torch
import modules.dml.hijack.realesrgan_model
import modules.dml.hijack.plms
import modules.dml.hijack.diffusers
import modules.dml.hijack.transformers
import modules.dml.hijack.tomesd

View File

@ -1,227 +0,0 @@
from typing import Optional, Union, Tuple
import torch
import diffusers
import diffusers.utils.torch_utils
# copied from diffusers.PNDMScheduler._get_prev_sample
def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, prev_timestep, model_output):
torch.dml.synchronize_tensor(sample) # DML synchronize
alpha_prod_t = self.alphas_cumprod[timestep]
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
if self.config.prediction_type == "v_prediction":
model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
elif self.config.prediction_type != "epsilon":
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
)
sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
alpha_prod_t * beta_prod_t * alpha_prod_t_prev
) ** (0.5)
# full formula (9)
prev_sample = (
sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
)
return prev_sample
diffusers.PNDMScheduler._get_prev_sample = PNDMScheduler__get_prev_sample # pylint: disable=protected-access
# copied from diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update
def UniPCMultistepScheduler_multistep_uni_p_bh_update(
self: diffusers.UniPCMultistepScheduler,
model_output: torch.FloatTensor,
*args,
sample: torch.FloatTensor = None,
order: int = None,
**_,
) -> torch.FloatTensor:
if sample is None:
if len(args) > 1:
sample = args[1]
else:
raise ValueError(" missing `sample` as a required keyward argument")
if order is None:
if len(args) > 2:
order = args[2]
else:
raise ValueError(" missing `order` as a required keyward argument")
model_output_list = self.model_outputs
s0 = self.timestep_list[-1]
m0 = model_output_list[-1]
x = sample
if self.solver_p:
x_t = self.solver_p.step(model_output, s0, x).prev_sample
return x_t
torch.dml.synchronize_tensor(sample) # DML synchronize
sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
h = lambda_t - lambda_s0
device = sample.device
rks = []
D1s = []
for i in range(1, order):
si = self.step_index - i
mi = model_output_list[-(i + 1)]
alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
rk = (lambda_si - lambda_s0) / h
rks.append(rk)
D1s.append((mi - m0) / rk)
rks.append(1.0)
rks = torch.tensor(rks, device=device)
R = []
b = []
hh = -h if self.predict_x0 else h
h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
h_phi_k = h_phi_1 / hh - 1
factorial_i = 1
if self.config.solver_type == "bh1":
B_h = hh
elif self.config.solver_type == "bh2":
B_h = torch.expm1(hh)
else:
raise NotImplementedError
for i in range(1, order + 1):
R.append(torch.pow(rks, i - 1))
b.append(h_phi_k * factorial_i / B_h)
factorial_i *= i + 1
h_phi_k = h_phi_k / hh - 1 / factorial_i
R = torch.stack(R)
b = torch.tensor(b, device=device)
rhos_p = None
if len(D1s) > 0:
D1s = torch.stack(D1s, dim=1) # (B, K)
# for order 2, we use a simplified version
if order == 2:
rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
else:
rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
else:
D1s = None
if self.predict_x0:
x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
if D1s is not None:
pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
else:
pred_res = 0
x_t = x_t_ - alpha_t * B_h * pred_res
else:
x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
if D1s is not None:
pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)
else:
pred_res = 0
x_t = x_t_ - sigma_t * B_h * pred_res
x_t = x_t.to(x.dtype)
return x_t
diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update = UniPCMultistepScheduler_multistep_uni_p_bh_update
# copied from diffusers.LCMScheduler.step
def LCMScheduler_step(
self: diffusers.LCMScheduler,
model_output: torch.FloatTensor,
timestep: int,
sample: torch.FloatTensor,
generator: Optional[torch.Generator] = None,
return_dict: bool = True,
) -> Union[diffusers.schedulers.scheduling_lcm.LCMSchedulerOutput, Tuple]:
if self.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
if self.step_index is None:
self._init_step_index(timestep)
# 1. get previous step value
prev_step_index = self.step_index + 1
if prev_step_index < len(self.timesteps):
prev_timestep = self.timesteps[prev_step_index]
else:
prev_timestep = timestep
# 2. compute alphas, betas
torch.dml.synchronize_tensor(sample) # DML synchronize
alpha_prod_t = self.alphas_cumprod[timestep]
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev
# 3. Get scalings for boundary conditions
c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
# 4. Compute the predicted original sample x_0 based on the model parameterization
if self.config.prediction_type == "epsilon": # noise-prediction
predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
elif self.config.prediction_type == "sample": # x-prediction
predicted_original_sample = model_output
elif self.config.prediction_type == "v_prediction": # v-prediction
predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
" `v_prediction` for `LCMScheduler`."
)
# 5. Clip or threshold "predicted x_0"
if self.config.thresholding:
predicted_original_sample = self._threshold_sample(predicted_original_sample)
elif self.config.clip_sample:
predicted_original_sample = predicted_original_sample.clamp(
-self.config.clip_sample_range, self.config.clip_sample_range
)
# 6. Denoise model output using boundary conditions
denoised = c_out * predicted_original_sample + c_skip * sample
# 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
# Noise is not used for one-step sampling.
if len(self.timesteps) > 1:
noise = diffusers.utils.torch_utils.randn_tensor(model_output.shape, generator=generator, device=model_output.device)
prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
else:
prev_sample = denoised
# upon completion increase step index by one
self._step_index += 1
if not return_dict:
return (prev_sample, denoised)
return diffusers.schedulers.scheduling_lcm.LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
diffusers.LCMScheduler.step = LCMScheduler_step

View File

@ -1,89 +0,0 @@
import torch
from tqdm.auto import tqdm
from k_diffusion import sampling
import modules.devices as devices
def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None):
noise_sampler = sampling.default_noise_sampler(x) if noise_sampler is None else noise_sampler
if order not in {2, 3}:
raise ValueError('order should be 2 or 3')
forward = t_end > t_start
if not forward and eta:
raise ValueError('eta must be 0 for reverse sampling')
h_init = abs(h_init) * (1 if forward else -1)
atol = torch.tensor(atol, device=devices.device)
rtol = torch.tensor(rtol, device=devices.device)
s = t_start
x_prev = x
accept = True
pid = sampling.PIDStepSizeController(h_init, pcoeff, icoeff, dcoeff, 1.5 if eta else order, accept_safety)
info = {'steps': 0, 'nfe': 0, 'n_accept': 0, 'n_reject': 0}
while s < t_end - 1e-5 if forward else s > t_end + 1e-5:
eps_cache = {}
t = torch.minimum(t_end, s + pid.h) if forward else torch.maximum(t_end, s + pid.h)
if eta:
sd, su = sampling.get_ancestral_step(self.sigma(s), self.sigma(t), eta)
t_ = torch.minimum(t_end, self.t(sd))
su = (self.sigma(t) ** 2 - self.sigma(t_) ** 2) ** 0.5
else:
t_, su = t, 0.
eps, eps_cache = self.eps(eps_cache, 'eps', x, s)
denoised = x - self.sigma(s) * eps
if order == 2:
x_low, eps_cache = self.dpm_solver_1_step(x, s, t_, eps_cache=eps_cache)
x_high, eps_cache = self.dpm_solver_2_step(x, s, t_, eps_cache=eps_cache)
else:
x_low, eps_cache = self.dpm_solver_2_step(x, s, t_, r1=1 / 3, eps_cache=eps_cache)
x_high, eps_cache = self.dpm_solver_3_step(x, s, t_, eps_cache=eps_cache)
delta = torch.maximum(atol, rtol * torch.maximum(x_low.abs(), x_prev.abs()))
error = torch.linalg.norm((x_low - x_high) / delta) / x.numel() ** 0.5
accept = pid.propose_step(error)
if accept:
x_prev = x_low
x = x_high + su * s_noise * noise_sampler(self.sigma(s), self.sigma(t))
s = t
info['n_accept'] += 1
else:
info['n_reject'] += 1
info['nfe'] += order
info['steps'] += 1
if self.info_callback is not None:
self.info_callback({'x': x, 'i': info['steps'] - 1, 't': s, 't_up': s, 'denoised': denoised, 'error': error, 'h': pid.h, **info})
return x, info
@devices.inference_context()
def sample_dpm_fast(model, x, sigma_min, sigma_max, n, extra_args=None, callback=None, disable=None, eta=0., s_noise=1., noise_sampler=None):
"""DPM-Solver-Fast (fixed step size). See https://arxiv.org/abs/2206.00927."""
if sigma_min <= 0 or sigma_max <= 0:
raise ValueError('sigma_min and sigma_max must not be 0')
with tqdm(total=n, disable=disable) as pbar:
dpm_solver = sampling.DPMSolver(model, extra_args, eps_callback=pbar.update)
if callback is not None:
dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})
return dpm_solver.dpm_solver_fast(x, dpm_solver.t(torch.tensor(sigma_max, device=devices.device)), dpm_solver.t(torch.tensor(sigma_min, device=devices.device)), n, eta, s_noise, noise_sampler)
@devices.inference_context()
def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callback=None, disable=None, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None, return_info=False):
"""DPM-Solver-12 and 23 (adaptive step size). See https://arxiv.org/abs/2206.00927."""
if sigma_min <= 0 or sigma_max <= 0:
raise ValueError('sigma_min and sigma_max must not be 0')
with tqdm(disable=disable) as pbar:
dpm_solver = sampling.DPMSolver(model, extra_args, eps_callback=pbar.update)
if callback is not None:
dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})
x, info = dpm_solver.dpm_solver_adaptive(x, dpm_solver.t(torch.tensor(sigma_max, device=devices.device)), dpm_solver.t(torch.tensor(sigma_min, device=devices.device)), order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise, noise_sampler)
if return_info:
return x, info
return x
sampling.DPMSolver.dpm_solver_adaptive = dpm_solver_adaptive
sampling.sample_dpm_fast = sample_dpm_fast
sampling.sample_dpm_adaptive = sample_dpm_adaptive

View File

@ -1,90 +0,0 @@
import torch
from ldm.models.diffusion.ddim import noise_like
import modules.sd_hijack_inpainting as plms_hijack
import modules.devices as devices
@devices.inference_context()
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, dynamic_threshold=None):
b, *_, device = *x.shape, x.device
def get_model_output(x, t):
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
e_t = self.model.apply_model(x, t, c)
else:
x_in = torch.cat([x] * 2)
t_in = torch.cat([t] * 2)
if isinstance(c, dict):
assert isinstance(unconditional_conditioning, dict)
c_in = {}
for k in c:
if isinstance(c[k], list):
c_in[k] = [
torch.cat([unconditional_conditioning[k][i], c[k][i]])
for i in range(len(c[k]))
]
else:
c_in[k] = torch.cat([unconditional_conditioning[k], c[k]])
else:
c_in = torch.cat([unconditional_conditioning, c])
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
if score_corrector is not None:
assert self.model.parameterization == "eps"
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
return e_t
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
def get_x_prev_and_pred_x0(e_t, index):
# select parameters corresponding to the currently considered timestep
torch.dml.synchronize_tensor(alphas[index]) # DML synchronize
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
# current prediction for x_0
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
if quantize_denoised:
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
if dynamic_threshold is not None:
from ldm.models.diffusion.sampling_util import norm_thresholding
pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
# direction pointing to x_t
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
if noise_dropout > 0.:
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
return x_prev, pred_x0
e_t = get_model_output(x, t)
if len(old_eps) == 0:
# Pseudo Improved Euler (2nd order)
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
e_t_next = get_model_output(x_prev, t_next)
e_t_prime = (e_t + e_t_next) / 2
elif len(old_eps) == 1:
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (3 * e_t - old_eps[-1]) / 2
elif len(old_eps) == 2:
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
elif len(old_eps) >= 3:
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
return x_prev, pred_x0, e_t
plms_hijack.p_sample_plms = p_sample_plms

View File

@ -1,81 +0,0 @@
import torch
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.modules.diffusionmodules.util import noise_like
import modules.devices as devices
@devices.inference_context()
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
unconditional_guidance_scale=1., unconditional_conditioning=None,
dynamic_threshold=None):
b, *_, device = *x.shape, x.device
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
model_output = self.model.apply_model(x, t, c)
else:
x_in = torch.cat([x] * 2)
t_in = torch.cat([t] * 2)
if isinstance(c, dict):
assert isinstance(unconditional_conditioning, dict)
c_in = dict()
for k in c:
if isinstance(c[k], list):
c_in[k] = [torch.cat([
unconditional_conditioning[k][i],
c[k][i]]) for i in range(len(c[k]))]
else:
c_in[k] = torch.cat([
unconditional_conditioning[k],
c[k]])
elif isinstance(c, list):
c_in = list()
assert isinstance(unconditional_conditioning, list)
for i in range(len(c)):
c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
else:
c_in = torch.cat([unconditional_conditioning, c])
model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
if self.model.parameterization == "v":
e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
else:
e_t = model_output
if score_corrector is not None:
assert self.model.parameterization == "eps", 'not implemented'
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
# select parameters corresponding to the currently considered timestep
torch.dml.synchronize_tensor(alphas[index]) # DML synchronize
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
# current prediction for x_0
if self.model.parameterization != "v":
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
else:
pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
if quantize_denoised:
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
if dynamic_threshold is not None:
raise NotImplementedError
# direction pointing to x_t
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
if noise_dropout > 0.:
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
return x_prev, pred_x0
DDIMSampler.p_sample_ddim = p_sample_ddim

View File

@ -7,41 +7,6 @@ CondFunc('torchsde._brownian.brownian_interval._randn', lambda _, size, dtype, d
CondFunc('torch.Tensor.new', lambda orig, self, *args, **kwargs: orig(self.cpu(), *args, **kwargs).to(self.device), lambda orig, self, *args, **kwargs: torch.dml.is_directml_device(self.device))
_lerp = torch.lerp
def lerp(*args, **kwargs) -> torch.Tensor:
rep = None
for i in range(0, len(args)):
if torch.is_tensor(args[i]):
rep = args[i]
break
if rep is None:
for key in kwargs:
if torch.is_tensor(kwargs[key]):
rep = kwargs[key]
break
if torch.dml.is_directml_device(rep.device):
args = list(args)
if rep.dtype == torch.float16:
for i in range(len(args)):
if torch.is_tensor(args[i]):
args[i] = args[i].float()
for i in range(len(args)):
if torch.is_tensor(args[i]):
args[i] = args[i].cpu()
if rep.dtype == torch.float16:
for kwarg in kwargs:
if torch.is_tensor(kwargs[kwarg]):
kwargs[kwarg] = kwargs[kwarg].float()
for kwarg in kwargs:
if torch.is_tensor(kwargs[kwarg]):
kwargs[kwarg] = kwargs[kwarg].cpu()
return _lerp(*args, **kwargs).to(rep.device).type(rep.dtype)
return _lerp(*args, **kwargs)
torch.lerp = lerp
# https://github.com/lshqqytiger/stable-diffusion-webui-directml/issues/436
_pow_ = torch.Tensor.pow_
def pow_(self: torch.Tensor, *args, **kwargs):

View File

@ -1,4 +1,5 @@
from typing import Type, Dict, Any, Tuple, Optional
import math
import torch
import torch.nn.functional as F
from diffusers.utils.torch_utils import is_torch_version
@ -100,15 +101,18 @@ def make_diffusers_transformer_block(block_class: Type[torch.nn.Module]) -> Type
# reference: https://github.com/microsoft/Swin-Transformer
def window_partition(x, window_size, shift_size, H, W):
B, _N, C = x.shape
# H, W = int(N**0.5), int(N**0.5)
x = x.view(B,H,W,C)
if H % 2 != 0 or W % 2 != 0:
from modules.errors import log
log.warning('HiDiffusion: The feature size is not divisible by 2')
x = F.interpolate(x.permute(0,3,1,2).contiguous(), size=(window_size[0]*2, window_size[1]*2), mode='bicubic').permute(0,2,3,1).contiguous()
if type(shift_size) == list or type(shift_size) == tuple:
if shift_size[0] > 0:
x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))
else:
if shift_size > 0:
x = torch.roll(x, shifts=(-shift_size, -shift_size), dims=(1, 2))
x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C)
x = x.view(B, 2, window_size[0], 2, window_size[1], C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C)
windows = windows.view(-1, window_size[0] * window_size[1], C)
return windows
@ -116,15 +120,17 @@ def make_diffusers_transformer_block(block_class: Type[torch.nn.Module]) -> Type
def window_reverse(windows, window_size, H, W, shift_size):
B, _N, C = windows.shape
windows = windows.view(-1, window_size[0], window_size[1], C)
B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1]))
x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
B = int(windows.shape[0] / 4) # 2x2
x = windows.view(B, 2, 2, window_size[0], window_size[1], -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, window_size[0]*2, window_size[1]*2, -1)
if type(shift_size) == list or type(shift_size) == tuple:
if shift_size[0] > 0:
x = torch.roll(x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))
else:
if shift_size > 0:
x = torch.roll(x, shifts=(shift_size, shift_size), dims=(1, 2))
if H % 2 != 0 or W % 2 != 0:
x = F.interpolate(x.permute(0,3,1,2).contiguous(), size=(H, W), mode='bicubic').permute(0,2,3,1).contiguous()
x = x.view(B, H*W, C)
return x
@ -152,9 +158,9 @@ def make_diffusers_transformer_block(block_class: Type[torch.nn.Module]) -> Type
rand_num = torch.rand(1)
_B, N, _C = hidden_states.shape
ori_H, ori_W = self.info['size']
downsample_ratio = int(((ori_H*ori_W) // N)**0.5)
H, W = (ori_H//downsample_ratio, ori_W//downsample_ratio)
widow_size = (H//2, W//2)
downsample_ratio = round(((ori_H*ori_W) / N)**0.5)
H, W = (math.ceil(ori_H/downsample_ratio), math.ceil(ori_W/downsample_ratio))
widow_size = (math.ceil(H/2), math.ceil(W/2))
if rand_num <= 0.25:
shift_size = (0,0)
if rand_num > 0.25 and rand_num <= 0.5:
@ -351,9 +357,11 @@ def make_diffusers_cross_attn_down_block(block_class: Type[torch.nn.Module]) ->
if i == 0:
if self.aggressive_raunet and self.timestep >= self.T1_start and self.timestep < self.T1_end:
hidden_states = F.avg_pool2d(hidden_states, kernel_size=(2,2))
self.info["upsample_size"] = (hidden_states.shape[2], hidden_states.shape[3])
hidden_states = F.avg_pool2d(hidden_states, kernel_size=(2,2),ceil_mode=True)
elif self.timestep < self.T1:
hidden_states = F.avg_pool2d(hidden_states, kernel_size=(2,2))
self.info["upsample_size"] = (hidden_states.shape[2], hidden_states.shape[3])
hidden_states = F.avg_pool2d(hidden_states, kernel_size=(2,2),ceil_mode=True)
output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
@ -458,11 +466,9 @@ def make_diffusers_cross_attn_up_block(block_class: Type[torch.nn.Module]) -> Ty
)[0]
if i == 1:
if self.aggressive_raunet and self.timestep >= self.T1_start and self.timestep < self.T1_end:
re_size = (int(hidden_states.shape[-2] * 2), int(hidden_states.shape[-1] * 2))
hidden_states = F.interpolate(hidden_states, size=re_size, mode='bicubic')
hidden_states = F.interpolate(hidden_states, size=self.info["upsample_size"], mode='bicubic')
elif self.timestep < self.T1:
re_size = (int(hidden_states.shape[-2] * 2), int(hidden_states.shape[-1] * 2))
hidden_states = F.interpolate(hidden_states, size=re_size, mode='bicubic')
hidden_states = F.interpolate(hidden_states, size=self.info["upsample_size"], mode='bicubic')
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@ -589,9 +595,6 @@ def make_diffusers_upsampler_block(block_class: Type[torch.nn.Module]) -> Type[t
self.T1 = int(aggressive_step/50 * self.max_timestep)
else:
self.T1 = int(self.max_timestep * self.T1_ratio)
if self.timestep < self.T1:
if ori_H != hidden_states.shape[2] and ori_W != hidden_states.shape[3]:
hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode='bicubic')
self.timestep += 1
if self.timestep == self.max_timestep:
self.timestep = 0
@ -629,9 +632,10 @@ def apply_hidiffusion(
make_block_fn = make_diffusers_unet_2d_condition
model.unet.__class__ = make_block_fn(model.unet.__class__)
diffusion_model = model.unet if hasattr(model, "unet") else model
diffusion_model.num_upsamplers += 2
diffusion_model.num_upsamplers += 12
diffusion_model.info = {
'size': None,
'upsample_size': None,
'hooks': [],
'text_to_img_controlnet': hasattr(model, 'controlnet'),
'is_inpainting_task': model.__class__ in auto_pipeline.AUTO_INPAINT_PIPELINES_MAPPING.values(),
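The down/up block changes above exist to handle feature maps whose height or width is odd: `ceil_mode=True` keeps the trailing row/column during pooling, and the recorded `upsample_size` lets the up block restore the exact original shape instead of blindly doubling. A standalone illustration with a hypothetical 63x63 feature map:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 4, 63, 63)                     # odd spatial size, as the warning above describes
upsample_size = (x.shape[2], x.shape[3])          # recorded before pooling: (63, 63)

down = F.avg_pool2d(x, kernel_size=(2, 2), ceil_mode=True)    # (1, 4, 32, 32), last row/col kept
up = F.interpolate(down, size=upsample_size, mode='bicubic')  # back to (1, 4, 63, 63)

assert up.shape[-2:] == (63, 63)
```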

View File

@ -54,6 +54,9 @@ def image_grid(imgs, batch_size=1, rows=None):
cols = math.ceil(len(imgs) / rows)
params = script_callbacks.ImageGridLoopParams(imgs, cols, rows)
script_callbacks.image_grid_callback(params)
imgs = [i for i in imgs if i is not None] if imgs is not None else []
if len(imgs) == 0:
return None
w, h = imgs[0].size
grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color=shared.opts.grid_background)
for i, img in enumerate(params.imgs):

View File

@ -70,8 +70,8 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods
def __init__(self, slice_size):
self.slice_size = slice_size
def __call__(self, attn: Attention, hidden_states: torch.FloatTensor,
encoder_hidden_states=None, attention_mask=None) -> torch.FloatTensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
def __call__(self, attn: Attention, hidden_states: torch.Tensor,
encoder_hidden_states=None, attention_mask=None) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
residual = hidden_states
@ -188,14 +188,11 @@ class AttnProcessor:
Default processor for performing attention-related computations.
"""
def __call__(self, attn: Attention, hidden_states: torch.FloatTensor,
encoder_hidden_states=None, attention_mask=None,
temb=None, scale: float = 1.0) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
def __call__(self, attn, hidden_states: torch.Tensor, encoder_hidden_states=None, attention_mask=None,
temb=None, *args, **kwargs) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
residual = hidden_states
args = () if USE_PEFT_BACKEND else (scale,)
if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)
@ -213,15 +210,15 @@ class AttnProcessor:
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
query = attn.to_q(hidden_states, *args)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states, *args)
value = attn.to_v(encoder_hidden_states, *args)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
query = attn.head_to_batch_dim(query)
key = attn.head_to_batch_dim(key)
@ -292,7 +289,7 @@ class AttnProcessor:
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
hidden_states = attn.to_out[0](hidden_states, *args)
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)

30
modules/model_pixart.py Normal file
View File

@ -0,0 +1,30 @@
import diffusers
def load_pixart(checkpoint_info, diffusers_load_config={}):
from modules import shared, devices, modelloader, model_t5
modelloader.hf_login()
# shared.opts.data['cuda_dtype'] = 'FP32' # override
# shared.opts.data['diffusers_model_cpu_offload'] = True # override
# devices.set_cuda_params()
fn = checkpoint_info.path.replace('huggingface/', '')
t5 = model_t5.load_t5(shared.opts.sd_text_encoder, cache_dir=shared.opts.diffusers_dir)
transformer = diffusers.PixArtTransformer2DModel.from_pretrained(
fn,
subfolder = 'transformer',
cache_dir = shared.opts.diffusers_dir,
**diffusers_load_config,
)
transformer.to(devices.device)
kwargs = { 'transformer': transformer }
if t5 is not None:
kwargs['text_encoder'] = t5
diffusers_load_config.pop('variant', None)
pipe = diffusers.PixArtSigmaPipeline.from_pretrained(
'PixArt-alpha/PixArt-Sigma-XL-2-1024-MS',
cache_dir = shared.opts.diffusers_dir,
**kwargs,
**diffusers_load_config,
)
devices.torch_gc()
return pipe

View File

@ -1,35 +1,11 @@
import io
import os
import contextlib
import warnings
import torch
import diffusers
import transformers
import rich.traceback
rich.traceback.install()
warnings.filterwarnings(action="ignore", category=FutureWarning)
loggedin = False
def hf_login():
global loggedin # pylint: disable=global-statement
import huggingface_hub as hf
from modules import shared
if shared.opts.huggingface_token is not None and len(shared.opts.huggingface_token) > 2 and not loggedin:
stdout = io.StringIO()
with contextlib.redirect_stdout(stdout):
hf.login(shared.opts.huggingface_token)
text = stdout.getvalue() or ''
line = [l for l in text.split('\n') if 'Token' in l]
shared.log.info(f'HF login: {line[0] if len(line) > 0 else text}')
loggedin = True
def load_sd3(te3=None, fn=None, cache_dir=None, config=None):
from modules import devices
hf_login()
def load_sd3(fn=None, cache_dir=None, config=None):
from modules import devices, modelloader
repo_id = 'stabilityai/stable-diffusion-3-medium-diffusers'
model_id = 'stabilityai/stable-diffusion-3-medium-diffusers'
dtype = torch.float16
@ -37,140 +13,52 @@ def load_sd3(te3=None, fn=None, cache_dir=None, config=None):
if fn is not None and fn.endswith('.safetensors') and os.path.exists(fn):
model_id = fn
loader = diffusers.StableDiffusion3Pipeline.from_single_file
kwargs = {
'text_encoder': transformers.CLIPTextModelWithProjection.from_pretrained(
repo_id,
subfolder='text_encoder',
cache_dir=cache_dir,
torch_dtype=dtype,
),
'text_encoder_2': transformers.CLIPTextModelWithProjection.from_pretrained(
repo_id,
subfolder='text_encoder_2',
cache_dir=cache_dir,
torch_dtype=dtype,
),
'tokenizer': transformers.CLIPTokenizer.from_pretrained(
repo_id,
subfolder='tokenizer',
cache_dir=cache_dir,
),
'tokenizer_2': transformers.CLIPTokenizer.from_pretrained(
repo_id,
subfolder='tokenizer_2',
cache_dir=cache_dir,
),
}
diffusers_minor = int(diffusers.__version__.split('.')[1])
fn_size = os.path.getsize(fn)
if diffusers_minor < 30 or fn_size < 5e9: # te1/te2 do not get loaded correctly in diffusers 0.29.0 or model is without te1/te2
kwargs = {
'text_encoder': transformers.CLIPTextModelWithProjection.from_pretrained(
repo_id,
subfolder='text_encoder',
cache_dir=cache_dir,
torch_dtype=dtype,
),
'text_encoder_2': transformers.CLIPTextModelWithProjection.from_pretrained(
repo_id,
subfolder='text_encoder_2',
cache_dir=cache_dir,
torch_dtype=dtype,
),
'tokenizer': transformers.CLIPTokenizer.from_pretrained(
repo_id,
subfolder='tokenizer',
cache_dir=cache_dir,
),
'tokenizer_2': transformers.CLIPTokenizer.from_pretrained(
repo_id,
subfolder='tokenizer_2',
cache_dir=cache_dir,
),
'text_encoder_3': None,
}
elif fn_size < 1e10: # if model is below 10gb it does not have te3
kwargs = {
'text_encoder_3': None,
}
else:
kwargs = {}
else:
modelloader.hf_login()
model_id = repo_id
loader = diffusers.StableDiffusion3Pipeline.from_pretrained
if te3 == 'fp16':
text_encoder_3 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
torch_dtype=dtype,
cache_dir=cache_dir,
)
pipe = loader(
model_id,
torch_dtype=dtype,
text_encoder_3=text_encoder_3,
cache_dir=cache_dir,
config=config,
**kwargs,
)
elif te3 == 'fp8':
quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
text_encoder_3 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
quantization_config=quantization_config,
cache_dir=cache_dir,
config=config,
)
pipe = loader(
model_id,
text_encoder_3=text_encoder_3,
device_map='balanced',
torch_dtype=dtype,
cache_dir=cache_dir,
config=config,
**kwargs,
)
else:
pipe = loader(
model_id,
torch_dtype=dtype,
text_encoder_3=None,
cache_dir=cache_dir,
config=config,
**kwargs,
)
pipe = loader(
model_id,
torch_dtype=dtype,
cache_dir=cache_dir,
config=config,
**kwargs,
)
diffusers.pipelines.auto_pipeline.AUTO_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = diffusers.StableDiffusion3Pipeline
diffusers.pipelines.auto_pipeline.AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = diffusers.StableDiffusion3Img2ImgPipeline
devices.torch_gc(force=True)
devices.torch_gc()
return pipe
def load_te3(pipe, te3=None, cache_dir=None):
from modules import devices
hf_login()
repo_id = 'stabilityai/stable-diffusion-3-medium-diffusers'
if pipe is None or not hasattr(pipe, 'text_encoder_3'):
return pipe
if 'fp16' in te3.lower():
pipe.text_encoder_3 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
# torch_dtype=dtype,
cache_dir=cache_dir,
torch_dtype=pipe.text_encoder.dtype,
)
elif 'fp8' in te3.lower():
from installer import install
install('bitsandbytes', quiet=True)
quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
pipe.text_encoder_3 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
quantization_config=quantization_config,
cache_dir=cache_dir,
torch_dtype=pipe.text_encoder.dtype,
)
else:
pipe.text_encoder_3 = None
if getattr(pipe, 'text_encoder_3', None) is not None and getattr(pipe, 'tokenizer_3', None) is None:
pipe.tokenizer_3 = transformers.T5TokenizerFast.from_pretrained(
repo_id,
subfolder='tokenizer_3',
cache_dir=cache_dir,
)
devices.torch_gc(force=True)
if __name__ == '__main__':
model_fn = '/mnt/models/stable-diffusion/sd3/sd3_medium_incl_clips.safetensors'
import time
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('sd')
t0 = time.time()
pipeline = load_sd3(te3='fp16', fn='')
# pipeline.to('cuda')
t1 = time.time()
log.info(f'Loaded: time={t1-t0:.3f}')
# pipeline.scheduler = diffusers.schedulers.EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
log.info(f'Scheduler, {pipeline.scheduler}')
image = pipeline(
prompt='a photo of a cute robot holding a sign above his head that says sdnext, high detailed',
negative_prompt='',
num_inference_steps=50,
height=1024,
width=1024,
guidance_scale=7.0,
).images[0]
t2 = time.time()
log.info(f'Generated: time={t2-t1:.3f}')
image.save("/tmp/sd3.png")

77
modules/model_t5.py Normal file
View File

@ -0,0 +1,77 @@
import transformers
def load_t5(t5=None, cache_dir=None):
from modules import devices, modelloader
repo_id = 'stabilityai/stable-diffusion-3-medium-diffusers'
if 'fp16' in t5.lower():
modelloader.hf_login()
t5 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
# torch_dtype=dtype,
cache_dir=cache_dir,
torch_dtype=devices.dtype,
)
elif 'fp4' in t5.lower():
modelloader.hf_login()
from installer import install
install('bitsandbytes', quiet=True)
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True)
t5 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
quantization_config=quantization_config,
cache_dir=cache_dir,
torch_dtype=devices.dtype,
)
elif 'fp8' in t5.lower():
modelloader.hf_login()
from installer import install
install('bitsandbytes', quiet=True)
quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
t5 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
quantization_config=quantization_config,
cache_dir=cache_dir,
torch_dtype=devices.dtype,
)
elif 'int8' in t5.lower():
modelloader.hf_login()
from installer import install
install('nncf==2.7.0', quiet=True)
from modules.sd_models_compile import nncf_compress_model
from modules.sd_hijack import NNCF_T5DenseGatedActDense # T5DenseGatedActDense uses fp32
t5 = transformers.T5EncoderModel.from_pretrained(
repo_id,
subfolder='text_encoder_3',
cache_dir=cache_dir,
torch_dtype=devices.dtype,
)
for i in range(len(t5.encoder.block)):
t5.encoder.block[i].layer[1].DenseReluDense = NNCF_T5DenseGatedActDense(
t5.encoder.block[i].layer[1].DenseReluDense
)
t5 = nncf_compress_model(t5)
else:
t5 = None
return t5
def set_t5(pipe, module, t5=None, cache_dir=None):
from modules import devices, shared
if pipe is None or not hasattr(pipe, module):
return pipe
t5 = load_t5(t5=t5, cache_dir=cache_dir)
setattr(pipe, module, t5)
if shared.cmd_opts.lowvram or shared.opts.diffusers_seq_cpu_offload:
from accelerate import cpu_offload
getattr(pipe, module).to("cpu")
cpu_offload(getattr(pipe, module), devices.device, offload_buffers=len(getattr(pipe, module)._parameters) > 0) # pylint: disable=protected-access
elif shared.cmd_opts.medvram or shared.opts.diffusers_model_cpu_offload:
if not hasattr(pipe, "_all_hooks") or len(pipe._all_hooks) == 0: # pylint: disable=protected-access
pipe.enable_model_cpu_offload(device=devices.device)
else:
pipe.maybe_free_model_hooks()
devices.torch_gc()
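For reference, a minimal usage sketch of `set_t5` (assumptions: `shared.sd_model` holds a loaded SD3 pipeline and `'FP8'` is the selected text-encoder value), mirroring the call made by `reload_text_encoder` in sd_models.py:

```python
from modules import shared
from modules.model_t5 import set_t5

# swap the pipeline's third text encoder for an 8-bit quantized T5, honoring the configured offload mode
set_t5(pipe=shared.sd_model, module='text_encoder_3', t5='FP8', cache_dir=shared.opts.diffusers_dir)
```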

View File

@ -83,7 +83,7 @@ class Shared(sys.modules[__name__].__class__):
return model_type
if not shared.native:
model_type = 'ldm'
elif "StableDiffusion3" in self.sd_refiner.__class__.__name__:
elif "StableDiffusion3" in self.sd_model.__class__.__name__:
model_type = 'sd3'
elif "StableDiffusionXL" in self.sd_model.__class__.__name__:
model_type = 'sdxl'
@ -97,6 +97,8 @@ class Shared(sys.modules[__name__].__class__):
model_type = 'sd' # sd is compatible with sd
elif "Kandinsky" in self.sd_model.__class__.__name__:
model_type = 'kandinsky'
elif "HunyuanDiT" in self.sd_model.__class__.__name__:
model_type = 'hunyuandit'
elif "Cascade" in self.sd_model.__class__.__name__:
model_type = 'sc'
else:

View File

@ -1,8 +1,10 @@
import io
import os
import time
import json
import shutil
import importlib
import contextlib
from typing import Dict
from urllib.parse import urlparse
from PIL import Image
@ -12,10 +14,25 @@ from modules.upscaler import Upscaler, UpscalerLanczos, UpscalerNearest, Upscale
from modules.paths import script_path, models_path
loggedin = False
diffuser_repos = []
debug = shared.log.trace if os.environ.get('SD_DOWNLOAD_DEBUG', None) is not None else lambda *args, **kwargs: None
def hf_login(token=None):
global loggedin # pylint: disable=global-statement
import huggingface_hub as hf
token = token or shared.opts.huggingface_token
if token is not None and len(token) > 2 and not loggedin:
stdout = io.StringIO()
with contextlib.redirect_stdout(stdout):
hf.login(shared.opts.huggingface_token)
text = stdout.getvalue() or ''
line = [l for l in text.split('\n') if 'Token' in l]
shared.log.info(f'HF login: {line[0] if len(line) > 0 else text}')
loggedin = True
def download_civit_meta(model_path: str, model_id):
fn = os.path.splitext(model_path)[0] + '.json'
url = f'https://civitai.com/api/v1/models/{model_id}'
@ -152,6 +169,10 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t
def download_civit_model(model_url: str, model_name: str, model_path: str, model_type: str, token: str = None):
import threading
if model_name is None or len(model_name) == 0:
err = 'Model download: no target model name provided'
shared.log.error(err)
return err
thread = threading.Thread(target=download_civit_model_thread, args=(model_name, model_url, model_path, model_type, token))
thread.start()
return f'Model download: name={model_name} url={model_url} path={model_path}'
@ -183,8 +204,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config
shared.log.debug(f'Diffusers downloading: id="{hub_id}" args={download_config}')
token = token or shared.opts.huggingface_token
if token is not None and len(token) > 2:
shared.log.debug(f"Diffusers authentication: {token}")
hf.login(token)
hf_login(token)
pipeline_dir = None
ok = False
@ -297,6 +317,10 @@ def get_reference_opts(name: str, quiet=False):
if k == name or model_name == name:
model_opts = v
break
model_name = model_name.replace('huggingface/', '')
if k == name or model_name == name:
model_opts = v
break
if not model_opts:
# shared.log.error(f'Reference: model="{name}" not found')
return {}

View File

@ -15,9 +15,11 @@ def apply(p: processing.StableDiffusionProcessing): # pylint: disable=arguments-
c = shared.sd_model.__class__ if shared.sd_loaded else None
if c == StableDiffusionPAGPipeline or c == StableDiffusionXLPAGPipeline:
unapply()
return None
if p.pag_scale == 0:
return
if sd_models.get_diffusers_task(shared.sd_model) != sd_models.DiffusersTaskType.TEXT_2_IMAGE:
shared.log.warning(f'PAG: pipeline={c} not implemented')
return None
if detect.is_sd15(c):
orig_pipeline = shared.sd_model
shared.sd_model = sd_models.switch_pipe(StableDiffusionPAGPipeline, shared.sd_model)

View File

@ -446,6 +446,7 @@ class StableDiffusionXLPAGPipeline(
feature_extractor: CLIPImageProcessor = None,
force_zeros_for_empty_prompt: bool = True,
add_watermarker: Optional[bool] = None,
requires_aesthetics_score: Optional[bool] = None, # todo: patch SDXLPAG pipeline
):
super().__init__()
@ -460,13 +461,13 @@ class StableDiffusionXLPAGPipeline(
image_encoder=image_encoder,
feature_extractor=feature_extractor,
)
if 'requires_aesthetics_score' in self.config:
self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.default_sample_size = self.unet.config.sample_size
add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
add_watermarker = False
if add_watermarker:
self.watermark = StableDiffusionXLWatermarker()
@ -1500,7 +1501,7 @@ class StableDiffusionXLPAGPipeline(
else:
replace_processor = PAGIdentitySelfAttnProcessor()
if(self.pag_applied_layers_index):
if self.pag_applied_layers_index:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
layer_number = int(drop_layer[1:])
@ -1517,7 +1518,7 @@ class StableDiffusionXLPAGPipeline(
raise ValueError(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
elif(self.pag_applied_layers):
elif self.pag_applied_layers:
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
@ -1621,7 +1622,7 @@ class StableDiffusionXLPAGPipeline(
if XLA_AVAILABLE:
xm.mark_step()
if not output_type == "latent":
if output_type != "latent":
# make sure the VAE is in float32 mode, as it overflows in float16
needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
@ -1656,7 +1657,7 @@ class StableDiffusionXLPAGPipeline(
else:
image = latents
if not output_type == "latent":
if output_type != "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
@ -1671,7 +1672,7 @@ class StableDiffusionXLPAGPipeline(
# Change the attention layers back to the original ones after PAG was applied
if self.do_adversarial_guidance:
if(self.pag_applied_layers_index):
if self.pag_applied_layers_index:
drop_layers = self.pag_applied_layers_index
for drop_layer in drop_layers:
layer_number = int(drop_layer[1:])
@ -1685,26 +1686,22 @@ class StableDiffusionXLPAGPipeline(
else:
raise ValueError(f"Invalid layer type: {drop_layer[0]}")
except IndexError:
raise ValueError(
f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers."
)
elif(self.pag_applied_layers):
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
if drop_full_layer == "down":
for down_layer in down_layers:
down_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "mid":
for mid_layer in mid_layers:
mid_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "up":
for up_layer in up_layers:
up_layer.processor = AttnProcessor2_0()
else:
raise ValueError(f"Invalid layer type: {drop_full_layer}")
except IndexError:
raise ValueError(
f"Invalid layer index: {drop_full_layer}. Available layers are: down, mid and up. If you need to specify each layer index, you can use `pag_applied_layers_index`"
)
raise ValueError(f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers.")
elif self.pag_applied_layers:
drop_full_layers = self.pag_applied_layers
for drop_full_layer in drop_full_layers:
try:
if drop_full_layer == "down":
for down_layer in down_layers:
down_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "mid":
for mid_layer in mid_layers:
mid_layer.processor = AttnProcessor2_0()
elif drop_full_layer == "up":
for up_layer in up_layers:
up_layer.processor = AttnProcessor2_0()
else:
raise ValueError(f"Invalid layer type: {drop_full_layer}")
except IndexError:
raise ValueError(f"Invalid layer index: {drop_full_layer}. Available layers are: down, mid and up. If you need to specify each layer index, you can use `pag_applied_layers_index`")
return StableDiffusionXLPipelineOutput(images=image)

View File

@ -158,7 +158,6 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
shared.prompt_styles.apply_styles_to_extra(p)
shared.prompt_styles.extract_comments(p)
pag.apply(p)
if shared.opts.cuda_compile_backend == 'none':
sd_models.apply_token_merging(p.sd_model)
sd_hijack_freeu.apply_freeu(p, not shared.native)
@ -273,6 +272,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
extra_network_data = None
debug(f'Processing inner: args={vars(p)}')
for n in range(p.n_iter):
pag.apply(p)
debug(f'Processing inner: iteration={n+1}/{p.n_iter}')
p.iteration = n
if shared.state.skipped:

View File

@ -27,6 +27,8 @@ def task_specific_kwargs(p, model):
'height': 8 * math.ceil(p.height / 8),
}
elif (sd_models.get_diffusers_task(model) == sd_models.DiffusersTaskType.IMAGE_2_IMAGE or is_img2img_model) and len(getattr(p, 'init_images', [])) > 0:
if shared.sd_model_type == 'sdxl':
model.register_to_config(requires_aesthetics_score = False)
p.ops.append('img2img')
task_args = {
'image': p.init_images,
@ -41,6 +43,8 @@ def task_specific_kwargs(p, model):
'strength': p.denoising_strength,
}
elif (sd_models.get_diffusers_task(model) == sd_models.DiffusersTaskType.INPAINTING or is_img2img_model) and len(getattr(p, 'init_images', [])) > 0:
if shared.sd_model_type == 'sdxl':
model.register_to_config(requires_aesthetics_score = False)
p.ops.append('inpaint')
width, height = processing_helpers.resize_init_images(p)
task_args = {
@ -106,7 +110,7 @@ def set_pipeline_args(p, model, prompts: list, negative_prompts: list, prompts_2
shared.log.error(f'Sampler timesteps: {e}')
else:
shared.log.warning(f'Sampler: sampler={model.scheduler.__class__.__name__} timesteps not supported')
if shared.opts.prompt_attention != 'Fixed attention' and ('StableDiffusion' in model.__class__.__name__ or 'StableCascade' in model.__class__.__name__) and 'Onnx' not in model.__class__.__name__ and 'StableDiffusion3' not in model.__class__.__name__:
if shared.opts.prompt_attention != 'Fixed attention' and ('StableDiffusion' in model.__class__.__name__ or 'StableCascade' in model.__class__.__name__) and 'Onnx' not in model.__class__.__name__:
try:
prompt_parser_diffusers.encode_prompts(model, p, prompts, negative_prompts, steps=steps, clip_skip=clip_skip)
parser = shared.opts.prompt_attention
@ -126,6 +130,8 @@ def set_pipeline_args(p, model, prompts: list, negative_prompts: list, prompts_2
args['prompt_embeds_pooled'] = p.positive_pooleds[0].unsqueeze(0)
elif 'XL' in model.__class__.__name__ and len(getattr(p, 'positive_pooleds', [])) > 0:
args['pooled_prompt_embeds'] = p.positive_pooleds[0]
elif 'StableDiffusion3' in model.__class__.__name__ and len(getattr(p, 'positive_pooleds', [])) > 0:
args['pooled_prompt_embeds'] = p.positive_pooleds[0]
else:
args['prompt'] = prompts
if 'negative_prompt' in possible:
@ -135,6 +141,8 @@ def set_pipeline_args(p, model, prompts: list, negative_prompts: list, prompts_2
args['negative_prompt_embeds_pooled'] = p.negative_pooleds[0].unsqueeze(0)
if 'XL' in model.__class__.__name__ and len(getattr(p, 'negative_pooleds', [])) > 0:
args['negative_pooled_prompt_embeds'] = p.negative_pooleds[0]
if 'StableDiffusion3' in model.__class__.__name__ and len(getattr(p, 'negative_pooleds', [])) > 0:
args['negative_pooled_prompt_embeds'] = p.negative_pooleds[0]
else:
if 'PixArtSigmaPipeline' in model.__class__.__name__: # pixart-sigma pipeline throws list-of-list for negative prompt
args['negative_prompt'] = negative_prompts[0]

View File

@ -37,8 +37,6 @@ def diffusers_callback(pipe, step: int, timestep: int, kwargs: dict):
if p is None:
return kwargs
latents = kwargs.get('latents', None)
if torch.is_tensor(latents) and latents.device.type == "privateuseone":
torch.dml.synchronize_tensor(latents) # DML synchronize
debug_callback(f'Callback: step={step} timestep={timestep} latents={latents.shape if latents is not None else None} kwargs={list(kwargs)}')
shared.state.sampling_step = step
if shared.state.interrupted or shared.state.skipped:

View File

@ -516,6 +516,8 @@ def switch_class(p: StableDiffusionProcessing, new_class: type, dct: dict = None
for k, v in dct.items():
if k in possible:
kwargs[k] = v
if new_class == StableDiffusionProcessingTxt2Img:
sd_models.clean_diffuser_pipe(shared.sd_model)
debug(f"Switching class: {p.__class__.__name__} -> {new_class.__name__} fn={sys._getframe(1).f_code.co_name}") # pylint: disable=protected-access
p.__class__ = new_class
p.__init__(**kwargs)

View File

@ -105,7 +105,6 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
desc='Base',
)
shared.state.sampling_steps = base_args.get('prior_num_inference_steps', None) or base_args.get('num_inference_steps', None) or p.steps
p.extra_generation_params['Pipeline'] = shared.sd_model.__class__.__name__
if shared.opts.scheduler_eta is not None and shared.opts.scheduler_eta > 0 and shared.opts.scheduler_eta < 1:
p.extra_generation_params["Sampler Eta"] = shared.opts.scheduler_eta
output = None

View File

@ -351,11 +351,10 @@ def validate_sample(tensor):
cast = sample.astype(np.uint8)
if len(w) > 0:
nans = np.isnan(sample).sum()
shared.log.error(f'Failed to validate samples: sample={sample.shape} invalid={nans}')
cast = np.nan_to_num(sample)
minimum, maximum, mean = np.min(cast), np.max(cast), np.mean(cast)
cast = cast.astype(np.uint8)
shared.log.warning(f'Attempted to correct samples: min={minimum:.2f} max={maximum:.2f} mean={mean:.2f}')
shared.log.error(f'Failed to validate samples: sample={sample.shape} min={minimum:.2f} max={maximum:.2f} mean={mean:.2f} invalid={nans}')
return cast
@ -390,6 +389,9 @@ def resize_hires(p, latents): # input=latents output=pil if not latent_upscaler
if latent_upscaler is not None:
return torch.nn.functional.interpolate(latents, size=(p.hr_upscale_to_y // 8, p.hr_upscale_to_x // 8), mode=latent_upscaler["mode"], antialias=latent_upscaler["antialias"])
first_pass_images = processing_vae.vae_decode(latents=latents, model=shared.sd_model, full_quality=p.full_quality, output_type='pil')
if p.hr_upscale_to_x == 0 or p.hr_upscale_to_y == 0 and hasattr(p, 'init_hr'):
shared.log.error('Hires: missing upscaling dimensions')
return first_pass_images
resized_images = []
for img in first_pass_images:
if latent_upscaler is None:
@ -397,6 +399,7 @@ def resize_hires(p, latents): # input=latents output=pil if not latent_upscaler
else:
resized_image = img
resized_images.append(resized_image)
devices.torch_gc()
return resized_images

View File

@ -63,6 +63,10 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No
"Comment": comment,
"Operations": '; '.join(ops).replace('"', '') if len(p.ops) > 0 else 'none',
}
# native
if shared.native:
args['Pipeline'] = shared.sd_model.__class__.__name__
args['T5'] = None if (not shared.opts.add_model_name_to_info or shared.opts.sd_text_encoder is None or shared.opts.sd_text_encoder == 'None') else shared.opts.sd_text_encoder
if 'txt2img' in p.ops:
args["Variation seed"] = all_subseeds[index] if p.subseed_strength > 0 else None
args["Variation strength"] = p.subseed_strength if p.subseed_strength > 0 else None
@ -143,12 +147,20 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No
args['Sampler sigma uncond'] = shared.opts.s_churn if shared.opts.s_churn != shared.opts.data_labels.get('s_churn').default else None
args['Sampler sigma noise'] = shared.opts.s_noise if shared.opts.s_noise != shared.opts.data_labels.get('s_noise').default else None
args['Sampler sigma tmin'] = shared.opts.s_tmin if shared.opts.s_tmin != shared.opts.data_labels.get('s_tmin').default else None
# tome
args['ToMe'] = shared.opts.tome_ratio if shared.opts.tome_ratio != 0 else None
args['ToDo'] = shared.opts.todo_ratio if shared.opts.todo_ratio != 0 else None
# tome/todo
if shared.opts.token_merging_method == 'ToMe':
args['ToMe'] = shared.opts.tome_ratio if shared.opts.tome_ratio != 0 else None
else:
args['ToDo'] = shared.opts.todo_ratio if shared.opts.todo_ratio != 0 else None
args.update(p.extra_generation_params)
params_text = ", ".join([k if k == v else f'{k}: {generation_parameters_copypaste.quote(v)}' for k, v in args.items() if v is not None])
for k, v in args.copy().items():
if v is None:
del args[k]
if isinstance(v, str):
if len(v) == 0 or v == '0x0':
del args[k]
params_text = ", ".join([k if k == v else f'{k}: {generation_parameters_copypaste.quote(v)}' for k, v in args.items()])
negative_prompt_text = f"\nNegative prompt: {all_negative_prompts[index]}" if all_negative_prompts[index] else ""
infotext = f"{all_prompts[index]}{negative_prompt_text}\n{params_text}".strip()
return infotext

View File

@ -140,6 +140,7 @@ def vae_decode(latents, model, output_type='np', full_quality=True):
if shared.cmd_opts.profile:
t1 = time.time()
shared.log.debug(f'Profile: VAE decode: {t1-t0:.2f}')
devices.torch_gc()
return imgs
@ -155,4 +156,5 @@ def vae_encode(image, model, full_quality=True): # pylint: disable=unused-variab
latents = full_vae_encode(image=tensor, model=shared.sd_model)
else:
latents = taesd_vae_encode(image=tensor)
devices.torch_gc()
return latents

View File

@ -12,10 +12,9 @@ debug_enabled = os.environ.get('SD_PROMPT_DEBUG', None)
debug = shared.log.trace if os.environ.get('SD_PROMPT_DEBUG', None) is not None else lambda *args, **kwargs: None
debug('Trace: PROMPT')
orig_encode_token_ids_to_embeddings = EmbeddingsProvider._encode_token_ids_to_embeddings # pylint: disable=protected-access
token_dict = None
token_type = None
token_dict = None # used by helper get_tokens
token_type = None # used by helper get_tokens
cache = {}
cache_type = None
def compel_hijack(self, token_ids: torch.Tensor,
@ -41,8 +40,26 @@ def compel_hijack(self, token_ids: torch.Tensor,
return hidden_state
EmbeddingsProvider._encode_token_ids_to_embeddings = compel_hijack # pylint: disable=protected-access
def sd3_compel_hijack(self, token_ids: torch.Tensor,
attention_mask: typing.Optional[torch.Tensor] = None) -> torch.Tensor:
needs_hidden_states = True
text_encoder_output = self.text_encoder(token_ids, attention_mask, output_hidden_states=needs_hidden_states, return_dict=True)
clip_skip = int(self.returned_embeddings_type)
hidden_state = text_encoder_output.hidden_states[-(clip_skip+1)]
return hidden_state
def insert_parser_highjack(pipename):
if "StableDiffusion3" in pipename:
EmbeddingsProvider._encode_token_ids_to_embeddings = sd3_compel_hijack # pylint: disable=protected-access
debug("Loading SD3 Parser hijack")
else:
EmbeddingsProvider._encode_token_ids_to_embeddings = compel_hijack # pylint: disable=protected-access
debug("Loading Standard Parser hijack")
insert_parser_highjack("Initialize")
# from https://github.com/damian0815/compel/blob/main/src/compel/diffusers_textual_inversion_manager.py
class DiffusersTextualInversionManager(BaseTextualInversionManager):
@ -126,14 +143,14 @@ def get_tokens(msg, prompt):
except Exception:
tokens.append(f'UNK_{i}')
token_count = len(ids) - int(has_bos_token) - int(has_eos_token)
shared.log.trace(f'Prompt tokenizer: type={msg} tokens={token_count} {tokens}')
debug(f'Prompt tokenizer: type={msg} tokens={token_count} {tokens}')
def encode_prompts(pipe, p, prompts: list, negative_prompts: list, steps: int, clip_skip: typing.Optional[int] = None):
if 'StableDiffusion' not in pipe.__class__.__name__ and 'DemoFusion' not in pipe.__class__.__name__ and 'StableCascade' not in pipe.__class__.__name__:
shared.log.warning(f"Prompt parser not supported: {pipe.__class__.__name__}")
return
elif prompts == cache.get('prompts', None) and negative_prompts == cache.get('negative_prompts', None) and clip_skip == cache.get('clip_skip', None) and cache.get('model_type', None) == shared.sd_model_type and steps == cache.get('steps', None):
elif shared.opts.sd_textencoder_cache and prompts == cache.get('prompts', None) and negative_prompts == cache.get('negative_prompts', None) and clip_skip == cache.get('clip_skip', None) and cache.get('model_type', None) == shared.sd_model_type and steps == cache.get('steps', None):
p.prompt_embeds = cache.get('prompt_embeds', None)
p.positive_pooleds = cache.get('positive_pooleds', None)
p.negative_embeds = cache.get('negative_embeds', None)
@ -151,6 +168,11 @@ def encode_prompts(pipe, p, prompts: list, negative_prompts: list, steps: int, c
p.negative_embeds = []
p.negative_pooleds = []
if (shared.cmd_opts.medvram or shared.opts.diffusers_model_cpu_offload) and hasattr(pipe, "_all_hooks") and hasattr(pipe, "maybe_free_model_hooks"):
# if the last job was interrupted, the model will stay in VRAM and cause OOM; send everything back to CPU before continuing
pipe.maybe_free_model_hooks()
devices.torch_gc()
for i in range(max(len(positive_schedule), len(negative_schedule))):
positive_prompt = positive_schedule[i % len(positive_schedule)]
negative_prompt = negative_schedule[i % len(negative_schedule)]
@ -164,22 +186,29 @@ def encode_prompts(pipe, p, prompts: list, negative_prompts: list, steps: int, c
if negative_pooled is not None:
p.negative_pooleds.append(torch.cat([negative_pooled] * len(negative_prompts), dim=0))
cache.update({
'prompt_embeds': p.prompt_embeds,
'negative_embeds': p.negative_embeds,
'positive_pooleds': p.positive_pooleds,
'negative_pooleds': p.negative_pooleds,
'scheduled_prompt': p.scheduled_prompt,
'prompts': prompts,
'negative_prompts': negative_prompts,
'clip_skip': clip_skip,
'steps': steps,
'model_type': shared.sd_model_type
})
if shared.opts.sd_textencoder_cache:
cache.update({
'prompt_embeds': p.prompt_embeds,
'negative_embeds': p.negative_embeds,
'positive_pooleds': p.positive_pooleds,
'negative_pooleds': p.negative_pooleds,
'scheduled_prompt': p.scheduled_prompt,
'prompts': prompts,
'negative_prompts': negative_prompts,
'clip_skip': clip_skip,
'steps': steps,
'model_type': shared.sd_model_type
})
else:
cache.clear()
if debug_enabled:
get_tokens('positive', prompts[0])
get_tokens('negative', negative_prompts[0])
if (shared.cmd_opts.medvram or shared.opts.diffusers_model_cpu_offload) and hasattr(pipe, "_all_hooks") and hasattr(pipe, "maybe_free_model_hooks"):
# the text encoder will stay in VRAM and cause OOM; send everything back to CPU before continuing
pipe.maybe_free_model_hooks()
debug(f"Prompt encode: time={(time.time() - t0):.3f}")
devices.torch_gc()
return
@ -237,7 +266,7 @@ def pad_to_same_length(pipe, embeds):
if not hasattr(pipe, 'encode_prompt') and 'StableCascade' not in pipe.__class__.__name__:
return embeds
device = pipe.device if str(pipe.device) != 'meta' else devices.device
if shared.opts.diffusers_zeros_prompt_pad:
if shared.opts.diffusers_zeros_prompt_pad or 'StableDiffusion3' in pipe.__class__.__name__:
empty_embed = [torch.zeros((1, 77, embeds[0].shape[2]), device=device, dtype=embeds[0].dtype)]
else:
try:
@ -257,15 +286,34 @@ def pad_to_same_length(pipe, embeds):
embeds[i] = embed
return embeds
def split_prompts(prompt, SD3 = False):
if prompt.find("TE2:") != -1:
prompt, prompt2 = prompt.split("TE2:")
else:
prompt2 = prompt
if prompt.find("TE3:") != -1:
prompt, prompt3 = prompt.split("TE3:")
elif prompt2.find("TE3:") != -1:
prompt2, prompt3 = prompt2.split("TE3:")
else:
prompt3 = prompt
prompt = prompt.strip()
prompt2 = " " if prompt2.strip() == "" else prompt2.strip()
prompt3 = " " if prompt3.strip() == "" else prompt3.strip()
if SD3 and prompt3 != " ":
ps, _ws = get_prompts_with_weights(prompt3)
prompt3 = " ".join(ps)
return prompt, prompt2, prompt3
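As an illustration (the example prompt is hypothetical), the `TE2:`/`TE3:` markers route separate text to the second and third encoders; a sketch of what `split_prompts` returns for SD3:

```python
prompt = "a stone castle on a hill TE2: watercolor painting TE3: soft morning light"
p1, p2, p3 = split_prompts(prompt, SD3=True)
# p1 -> "a stone castle on a hill"   (CLIP-L)
# p2 -> "watercolor painting"        (CLIP-G)
# p3 -> "soft morning light"         (T5; weight syntax is stripped when SD3=True)
```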
def get_weighted_text_embeddings(pipe, prompt: str = "", neg_prompt: str = "", clip_skip: int = None):
device = pipe.device if str(pipe.device) != 'meta' else devices.device
prompt_split = prompt.split("TE2:")
prompt = prompt_split[0]
prompt_2 = prompt_split[-1]
neg_prompt_split = neg_prompt.split("TE2:")
neg_prompt_2 = neg_prompt_split[-1]
neg_prompt = neg_prompt_split[0]
SD3 = hasattr(pipe, 'text_encoder_3')
prompt, prompt_2, prompt_3 = split_prompts(prompt, SD3)
neg_prompt, neg_prompt_2, neg_prompt_3 = split_prompts(neg_prompt, SD3)
if prompt != prompt_2:
ps = [get_prompts_with_weights(p) for p in [prompt, prompt_2]]
@ -285,8 +333,8 @@ def get_weighted_text_embeddings(pipe, prompt: str = "", neg_prompt: str = "", c
embedding_providers = prepare_embedding_providers(pipe, clip_skip)
prompt_embeds = []
negative_prompt_embeds = []
pooled_prompt_embeds = None
negative_pooled_prompt_embeds = None
pooled_prompt_embeds = []
negative_pooled_prompt_embeds = []
for i in range(len(embedding_providers)):
t0 = time.time()
text = list(positives[i])
@ -310,22 +358,30 @@ def get_weighted_text_embeddings(pipe, prompt: str = "", neg_prompt: str = "", c
embed, ntokens = embedding_providers[i].get_embeddings_for_weighted_prompt_fragments(text_batch=[negatives[i]], fragment_weights_batch=[negative_weights[i]], device=device, should_return_tokens=True)
negative_prompt_embeds.append(embed)
debug(f'Prompt: unpadded shape={prompt_embeds[0].shape} TE{i+1} ptokens={torch.count_nonzero(ptokens)} ntokens={torch.count_nonzero(ntokens)} time={(time.time() - t0):.3f}')
if prompt_embeds[-1].shape[-1] > 768:
if SD3:
t0 = time.time()
pooled_prompt_embeds.append(embedding_providers[0].get_pooled_embeddings(texts=positives[0] if len(positives[0]) == 1 else [" ".join(positives[0])], device=device))
pooled_prompt_embeds.append(embedding_providers[1].get_pooled_embeddings(texts=positives[-1] if len(positives[-1]) == 1 else [" ".join(positives[-1])], device=device))
negative_pooled_prompt_embeds.append(embedding_providers[0].get_pooled_embeddings(texts=negatives[0] if len(negatives[0]) == 1 else [" ".join(negatives[0])], device=device))
negative_pooled_prompt_embeds.append(embedding_providers[1].get_pooled_embeddings(texts=negatives[-1] if len(negatives[-1]) == 1 else [" ".join(negatives[-1])], device=device))
pooled_prompt_embeds = torch.cat(pooled_prompt_embeds, dim=-1)
negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds, dim=-1)
debug(f'Prompt: pooled shape={pooled_prompt_embeds[0].shape} time={(time.time() - t0):.3f}')
elif prompt_embeds[-1].shape[-1] > 768:
t0 = time.time()
if shared.opts.diffusers_pooled == "weighted":
pooled_prompt_embeds = prompt_embeds[-1][
pooled_prompt_embeds = embedding_providers[-1].text_encoder.text_projection(prompt_embeds[-1][
torch.arange(prompt_embeds[-1].shape[0], device=device),
(ptokens.to(dtype=torch.int, device=device) == 49407)
.int()
.argmax(dim=-1),
]
negative_pooled_prompt_embeds = negative_prompt_embeds[-1][
])
negative_pooled_prompt_embeds = embedding_providers[-1].text_encoder.text_projection(negative_prompt_embeds[-1][
torch.arange(negative_prompt_embeds[-1].shape[0], device=device),
(ntokens.to(dtype=torch.int, device=device) == 49407)
.int()
.argmax(dim=-1),
]
])
else:
try:
pooled_prompt_embeds = embedding_providers[-1].get_pooled_embeddings(texts=[prompt_2], device=device) if prompt_embeds[-1].shape[-1] > 768 else None
@ -338,7 +394,31 @@ def get_weighted_text_embeddings(pipe, prompt: str = "", neg_prompt: str = "", c
prompt_embeds = torch.cat(prompt_embeds, dim=-1) if len(prompt_embeds) > 1 else prompt_embeds[0]
negative_prompt_embeds = torch.cat(negative_prompt_embeds, dim=-1) if len(negative_prompt_embeds) > 1 else \
negative_prompt_embeds[0]
if pooled_prompt_embeds == []:
pooled_prompt_embeds = None
if negative_pooled_prompt_embeds == []:
negative_pooled_prompt_embeds = None
debug(f'Prompt: positive={prompt_embeds.shape if prompt_embeds is not None else None} pooled={pooled_prompt_embeds.shape if pooled_prompt_embeds is not None else None} negative={negative_prompt_embeds.shape if negative_prompt_embeds is not None else None} pooled={negative_pooled_prompt_embeds.shape if negative_pooled_prompt_embeds is not None else None}')
if prompt_embeds.shape[1] != negative_prompt_embeds.shape[1]:
[prompt_embeds, negative_prompt_embeds] = pad_to_same_length(pipe, [prompt_embeds, negative_prompt_embeds])
if SD3:
device = pipe.device if str(pipe.device) != 'meta' else devices.device
t5_prompt_embed = pipe._get_t5_prompt_embeds( # pylint: disable=protected-access
prompt=prompt_3,
num_images_per_prompt=prompt_embeds.shape[0],
device=device,
)
prompt_embeds = torch.nn.functional.pad(
prompt_embeds, (0, t5_prompt_embed.shape[-1] - prompt_embeds.shape[-1])
).to(device)
prompt_embeds = torch.cat([prompt_embeds, t5_prompt_embed], dim=-2)
t5_negative_prompt_embed = pipe._get_t5_prompt_embeds( # pylint: disable=protected-access
prompt=neg_prompt_3,
num_images_per_prompt=prompt_embeds.shape[0],
device=device,
)
negative_prompt_embeds = torch.nn.functional.pad(
negative_prompt_embeds, (0, t5_negative_prompt_embed.shape[-1] - negative_prompt_embeds.shape[-1])
).to(device)
negative_prompt_embeds = torch.cat([negative_prompt_embeds, t5_negative_prompt_embed], dim=-2)
return prompt_embeds, pooled_prompt_embeds, negative_prompt_embeds, negative_pooled_prompt_embeds
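For orientation, the joint CLIP embedding (TE1+TE2) is zero-padded up to the T5 hidden size and then stacked along the sequence axis; a standalone sketch of the shape arithmetic (dimensions assumed for SD3: 768+1280=2048 for the two CLIP encoders, 4096 for T5-XXL, sequence lengths illustrative):

```python
import torch

clip_embeds = torch.randn(1, 77, 2048)  # CLIP-L (768) and CLIP-G (1280) hidden states concatenated on the last dim
t5_embeds = torch.randn(1, 256, 4096)   # T5-XXL hidden states
clip_embeds = torch.nn.functional.pad(clip_embeds, (0, t5_embeds.shape[-1] - clip_embeds.shape[-1]))
joint = torch.cat([clip_embeds, t5_embeds], dim=-2)
print(joint.shape)  # torch.Size([1, 333, 4096])
```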

View File

@ -489,8 +489,10 @@ class ScriptRunner:
s = ScriptSummary('before-process')
for script in self.alwayson_scripts:
try:
script_args = p.script_args[script.args_from:script.args_to]
script.before_process(p, *script_args, **kwargs)
args = p.script_args[script.args_from:script.args_to]
if len(args) == 0:
continue
script.before_process(p, *args, **kwargs)
except Exception as e:
errors.display(e, f"Error running before process: {script.filename}")
s.record(script.title())
@ -501,6 +503,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.process(p, *args, **kwargs)
except Exception as e:
errors.display(e, f'Running script process: {script.filename}')
@ -513,6 +517,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
processed = script.process_images(p, *args, **kwargs)
except Exception as e:
errors.display(e, f'Running script process images: {script.filename}')
@ -525,6 +531,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.before_process_batch(p, *args, **kwargs)
except Exception as e:
errors.display(e, f'Running script before process batch: {script.filename}')
@ -536,6 +544,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.process_batch(p, *args, **kwargs)
except Exception as e:
errors.display(e, f'Running script process batch: {script.filename}')
@ -547,6 +557,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.postprocess(p, processed, *args)
except Exception as e:
errors.display(e, f'Running script postprocess: {script.filename}')
@ -558,6 +570,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.postprocess_batch(p, *args, images=images, **kwargs)
except Exception as e:
errors.display(e, f'Running script before postprocess batch: {script.filename}')
@ -569,6 +583,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.postprocess_batch_list(p, pp, *args, **kwargs)
except Exception as e:
errors.display(e, f'Running script before postprocess batch list: {script.filename}')
@ -580,6 +596,8 @@ class ScriptRunner:
for script in self.alwayson_scripts:
try:
args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
if len(args) == 0:
continue
script.postprocess_image(p, pp, *args)
except Exception as e:
errors.display(e, f'Running script postprocess image: {script.filename}')

View File

@ -283,6 +283,25 @@ class EmbeddingsWithFixes(torch.nn.Module):
return torch.stack(vecs)
class NNCF_T5DenseGatedActDense(torch.nn.Module): # wrapped in a class so forward() can reference self
def __init__(self, T5DenseGatedActDense):
super().__init__()
self.wi_0 = T5DenseGatedActDense.wi_0
self.wi_1 = T5DenseGatedActDense.wi_1
self.wo = T5DenseGatedActDense.wo
self.dropout = T5DenseGatedActDense.dropout
self.act = T5DenseGatedActDense.act
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.to(torch.float32) # this line needs to be forced to fp32
hidden_states = self.wo(hidden_states)
return hidden_states
def add_circular_option_to_conv_2d():
conv2d_constructor = torch.nn.Conv2d.__init__

View File

@ -110,8 +110,8 @@ class DynamicAttnProcessorSDP:
if not hasattr(F, "scaled_dot_product_attention"):
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
def __call__(
self, attn, hidden_states: torch.FloatTensor, encoder_hidden_states=None, attention_mask=None, temb=None, scale: float = 1.0) -> torch.FloatTensor:
def __call__(self, attn, hidden_states: torch.Tensor, encoder_hidden_states=None, attention_mask=None, temb=None, *args, **kwargs) -> torch.Tensor:
residual = hidden_states
if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)
@ -135,16 +135,15 @@ class DynamicAttnProcessorSDP:
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
args = () if USE_PEFT_BACKEND else (scale,)
query = attn.to_q(hidden_states, *args)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states, *args)
value = attn.to_v(encoder_hidden_states, *args)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
@ -167,7 +166,7 @@ class DynamicAttnProcessorSDP:
hidden_states = hidden_states.to(query.dtype)
# linear proj
hidden_states = attn.to_out[0](hidden_states, *args)
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
@ -190,13 +189,11 @@ class DynamicAttnProcessorBMM:
based on AttnProcessor V1
"""
def __call__(self, attn, hidden_states: torch.FloatTensor, encoder_hidden_states=None, attention_mask=None,
temb=None, scale: float = 1.0) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
def __call__(self, attn, hidden_states: torch.Tensor, encoder_hidden_states=None, attention_mask=None,
temb=None, *args, **kwargs) -> torch.Tensor: # pylint: disable=too-many-statements, too-many-locals, too-many-branches
residual = hidden_states
args = () if USE_PEFT_BACKEND else (scale,)
if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)
@ -214,15 +211,15 @@ class DynamicAttnProcessorBMM:
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
query = attn.to_q(hidden_states, *args)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states, *args)
value = attn.to_v(encoder_hidden_states, *args)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
query = attn.head_to_batch_dim(query)
key = attn.head_to_batch_dim(key)
@ -294,7 +291,7 @@ class DynamicAttnProcessorBMM:
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
hidden_states = attn.to_out[0](hidden_states, *args)
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)

View File

@ -38,6 +38,7 @@ sd_metadata_pending = 0
sd_metadata_timer = 0
debug_move = shared.log.trace if os.environ.get('SD_MOVE_DEBUG', None) is not None else lambda *args, **kwargs: None
debug_load = os.environ.get('SD_LOAD_DEBUG', None)
debug_process = shared.log.trace if os.environ.get('SD_PROCESS_DEBUG', None) is not None else lambda *args, **kwargs: None
diffusers_version = int(diffusers.__version__.split('.')[1])
@ -202,11 +203,17 @@ def get_closet_checkpoint_match(search_string):
if checkpoint_info is not None:
return checkpoint_info
found = sorted([info for info in checkpoints_list.values() if search_string in info.title], key=lambda x: len(x.title))
if found:
if found and len(found) > 0:
return found[0]
found = sorted([info for info in checkpoints_list.values() if search_string.split(' ')[0] in info.title], key=lambda x: len(x.title))
if found:
if found and len(found) > 0:
return found[0]
for v in shared.reference_models.values():
if search_string in v['path'] or os.path.basename(search_string) in v['path']:
model_name = search_string.replace('huggingface/', '')
checkpoint_info = CheckpointInfo(v['path']) # create a virtual model info
checkpoint_info.type = 'huggingface'
return checkpoint_info
return None
@ -249,15 +256,16 @@ def select_checkpoint(op='model'):
shared.log.info(" or use --ckpt-dir <path-to-folder> to specify folder with sd models")
shared.log.info(" or use --ckpt <path-to-checkpoint> to force using specific model")
return None
checkpoint_info = next(iter(checkpoints_list.values()))
# checkpoint_info = next(iter(checkpoints_list.values()))
if model_checkpoint is not None:
if model_checkpoint != 'model.ckpt' and model_checkpoint != 'runwayml/stable-diffusion-v1-5':
shared.log.warning(f"Selected checkpoint not found: {model_checkpoint}")
shared.log.warning(f'Selected: {op}="{model_checkpoint}" not found')
else:
shared.log.info("Selecting first available checkpoint")
# shared.log.warning(f"Loading fallback checkpoint: {checkpoint_info.title}")
shared.opts.data['sd_model_checkpoint'] = checkpoint_info.title
shared.log.info(f'Select: {op}="{checkpoint_info.title if checkpoint_info is not None else None}"')
# shared.opts.data['sd_model_checkpoint'] = checkpoint_info.title
else:
shared.log.info(f'Select: {op}="{checkpoint_info.title if checkpoint_info is not None else None}"')
return checkpoint_info
@ -545,7 +553,7 @@ def change_backend():
refresh_vae_list()
def detect_pipeline(f: str, op: str = 'model', warning=True):
def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
guess = shared.opts.diffusers_pipeline
warn = shared.log.warning if warning else lambda *args, **kwargs: None
size = 0
@ -560,39 +568,25 @@ def detect_pipeline(f: str, op: str = 'model', warning=True):
elif (size >= 316 and size <= 324) or (size >= 156 and size <= 164): # 320 or 160
warn(f'Model detected as VAE model, but attempting to load as model: {op}={f} size={size} MB')
guess = 'VAE'
elif size >= 4970 and size <= 4976: # 4973
elif (size >= 4970 and size <= 4976): # 4973
guess = 'Stable Diffusion 2' # SD v2 but could be eps or v-prediction
# elif size < 0: # unknown
# guess = 'Stable Diffusion 2B'
elif size >= 5791 and size <= 5799: # 5795
if not shared.native:
warn(f'Model detected as SD-XL refiner model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size >= 5791 and size <= 5799): # 5795
if op == 'model':
warn(f'Model detected as SD-XL refiner model, but attempting to load a base model: {op}={f} size={size} MB')
guess = 'Stable Diffusion XL Refiner'
elif (size >= 6611 and size <= 7220): # 6617, HassakuXL is 6776, monkrenRealisticINT_v10 is 7217
if not shared.native:
warn(f'Model detected as SD-XL base model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'Stable Diffusion XL'
elif size >= 3361 and size <= 3369: # 3368
if not shared.native:
warn(f'Model detected as SD upscale model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size >= 3361 and size <= 3369): # 3368
guess = 'Stable Diffusion Upscale'
elif size >= 4891 and size <= 4899: # 4897
if not shared.native:
warn(f'Model detected as SD XL inpaint model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size >= 4891 and size <= 4899): # 4897
guess = 'Stable Diffusion XL Inpaint'
elif size >= 9791 and size <= 9799: # 9794
if not shared.native:
warn(f'Model detected as SD XL instruct pix2pix model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size >= 9791 and size <= 9799): # 9794
guess = 'Stable Diffusion XL Instruct'
elif size > 3138 and size < 3142: #3140
if not shared.native:
warn(f'Model detected as Segmind Vega model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size > 3138 and size < 3142): #3140
guess = 'Stable Diffusion XL'
elif size > 5692 and size < 5698 or size > 4134 and size < 4138:
if not shared.native:
warn(f'Model detected as Stable Diffusion 3 model, but attempting to load using backend=original: {op}={f} size={size} MB')
elif (size > 5692 and size < 5698) or (size > 4134 and size < 4138) or (size > 10362 and size < 10366):
guess = 'Stable Diffusion 3'
# guess by name
"""
@ -602,34 +596,20 @@ def detect_pipeline(f: str, op: str = 'model', warning=True):
guess = 'Latent Consistency Model'
"""
if 'instaflow' in f.lower():
if not shared.native:
warn(f'Model detected as InstaFlow model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'InstaFlow'
if 'segmoe' in f.lower():
if not shared.native:
warn(f'Model detected as SegMoE model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'SegMoE'
if 'hunyuandit' in f.lower():
if not shared.native:
warn(f'Model detected as Tencent HunyuanDiT model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'HunyuanDiT'
if 'pixart-xl' in f.lower():
if not shared.native:
warn(f'Model detected as PixArt Alpha model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'PixArt-Alpha'
if 'stable-diffusion-3' in f.lower():
if not shared.native:
warn(f'Model detected as Stable Diffusion 3 model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'Stable Diffusion 3'
if 'stable-cascade' in f.lower() or 'stablecascade' in f.lower() or 'wuerstchen3' in f.lower():
if not shared.native:
warn(f'Model detected as Stable Cascade model, but attempting to load using backend=original: {op}={f} size={size} MB')
if devices.dtype == torch.float16:
warn('Stable Cascade does not support Float16')
guess = 'Stable Cascade'
if 'pixart-sigma' in f.lower():
if not shared.native:
warn(f'Model detected as PixArt-Sigma model, but attempting to load using backend=original: {op}={f} size={size} MB')
guess = 'PixArt-Sigma'
# switch for specific variant
if guess == 'Stable Diffusion' and 'inpaint' in f.lower():
@ -642,7 +622,8 @@ def detect_pipeline(f: str, op: str = 'model', warning=True):
guess = 'Stable Diffusion XL Instruct'
# get actual pipeline
pipeline = shared_items.get_pipelines().get(guess, None)
shared.log.info(f'Autodetect: {op}="{guess}" class={pipeline.__name__} file="{f}" size={size}MB')
if not quiet:
shared.log.info(f'Autodetect: {op}="{guess}" class={pipeline.__name__} file="{f}" size={size}MB')
except Exception as e:
shared.log.error(f'Error detecting diffusers pipeline: model={f} {e}')
return None, None
@ -650,7 +631,8 @@ def detect_pipeline(f: str, op: str = 'model', warning=True):
try:
size = round(os.path.getsize(f) / 1024 / 1024)
pipeline = shared_items.get_pipelines().get(guess, None)
shared.log.info(f'Diffusers: {op}="{guess}" class={pipeline.__name__} file="{f}" size={size}MB')
if not quiet:
shared.log.info(f'Diffusers: {op}="{guess}" class={pipeline.__name__} file="{f}" size={size}MB')
except Exception as e:
shared.log.error(f'Error loading diffusers pipeline: model={f} {e}')
@ -673,15 +655,10 @@ def copy_diffuser_options(new_pipe, orig_pipe):
new_pipe.is_sd1 = getattr(orig_pipe, 'is_sd1', True)
def set_diffuser_options(sd_model, vae = None, op: str = 'model'):
def set_diffuser_options(sd_model, vae = None, op: str = 'model', offload=True):
if sd_model is None:
shared.log.warning(f'{op} is not loaded')
return
if (shared.opts.diffusers_model_cpu_offload or shared.cmd_opts.medvram) and (shared.opts.diffusers_seq_cpu_offload or shared.cmd_opts.lowvram):
shared.log.warning(f'Setting {op}: Model CPU offload and Sequential CPU offload are not compatible')
shared.log.debug(f'Setting {op}: disabling model CPU offload')
shared.opts.diffusers_model_cpu_offload=False
shared.cmd_opts.medvram=False
if hasattr(sd_model, "watermark"):
sd_model.watermark = NoWatermark()
@ -737,6 +714,20 @@ def set_diffuser_options(sd_model, vae = None, op: str = 'model'):
shared.log.debug(f'Setting {op}: enable channels last')
sd_model.unet.to(memory_format=torch.channels_last)
if offload:
set_diffuser_offload(sd_model, op)
def set_diffuser_offload(sd_model, op: str = 'model'):
if sd_model is None:
shared.log.warning(f'{op} is not loaded')
return
if (shared.opts.diffusers_model_cpu_offload or shared.cmd_opts.medvram) and (shared.opts.diffusers_seq_cpu_offload or shared.cmd_opts.lowvram):
shared.log.warning(f'Setting {op}: Model CPU offload and Sequential CPU offload are not compatible')
shared.log.debug(f'Setting {op}: disabling model CPU offload')
shared.opts.diffusers_model_cpu_offload=False
shared.cmd_opts.medvram=False
if not (hasattr(sd_model, "has_accelerate") and sd_model.has_accelerate):
sd_model.has_accelerate = False
if hasattr(sd_model, "enable_model_cpu_offload"):
if shared.cmd_opts.medvram or shared.opts.diffusers_model_cpu_offload:
shared.log.debug(f'Setting {op}: enable model CPU offload')
@ -774,7 +765,7 @@ def move_model(model, device=None, force=False):
if model is None or device is None:
return
if getattr(model, 'vae', None) is not None and get_diffusers_task(model) != DiffusersTaskType.TEXT_2_IMAGE:
if device == devices.device: # force vae back to gpu if not in txt2img mode
if device == devices.device and model.vae.device.type != "meta": # force vae back to gpu if not in txt2img mode
model.vae.to(device)
if hasattr(model.vae, '_hf_hook'):
debug_move(f'Model move: to={device} class={model.vae.__class__} fn={sys._getframe(1).f_code.co_name}') # pylint: disable=protected-access
@ -994,14 +985,8 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
return
elif model_type in ['PixArt-Sigma']: # forced pipeline
try:
# shared.opts.data['cuda_dtype'] = 'FP32' # override
shared.opts.data['diffusers_model_cpu_offload'] = True # override
devices.set_cuda_params()
sd_model = diffusers.PixArtSigmaPipeline.from_pretrained(
checkpoint_info.path,
use_safetensors=True,
cache_dir=shared.opts.diffusers_dir,
**diffusers_load_config)
from modules.model_pixart import load_pixart
sd_model = load_pixart(checkpoint_info, diffusers_load_config)
except Exception as e:
shared.log.error(f'Diffusers Failed loading {op}: {checkpoint_info.path} {e}')
if debug_load:
@ -1156,8 +1141,17 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
sd_model.embedding_db.load_textual_inversion_embeddings(force_reload=True)
timer.record("embeddings")
set_diffuser_options(sd_model, vae, op)
from modules.prompt_parser_diffusers import insert_parser_highjack
insert_parser_highjack(sd_model.__class__.__name__)
set_diffuser_options(sd_model, vae, op, offload=False)
if shared.opts.nncf_compress_weights and not (shared.opts.cuda_compile and shared.opts.cuda_compile_backend == "openvino_fx"):
sd_model = sd_models_compile.nncf_compress_weights(sd_model) # run this before move model so it can be compressed in CPU
timer.record("options")
set_diffuser_offload(sd_model, op)
if op == 'model':
sd_vae.apply_vae_config(shared.sd_model.sd_checkpoint_info.filename, vae_file, sd_model)
if op == 'refiner' and shared.opts.diffusers_move_refiner:
shared.log.debug('Moving refiner model to CPU')
move_model(sd_model, devices.cpu)
@ -1165,14 +1159,11 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
move_model(sd_model, devices.device)
timer.record("move")
reload_text_encoder()
reload_text_encoder(initial=True)
if shared.opts.ipex_optimize:
sd_model = sd_models_compile.ipex_optimize(sd_model)
if shared.opts.nncf_compress_weights and not (shared.opts.cuda_compile and shared.opts.cuda_compile_backend == "openvino_fx"):
sd_model = sd_models_compile.nncf_compress_weights(sd_model)
if (shared.opts.cuda_compile and shared.opts.cuda_compile_backend != 'none'):
sd_model = sd_models_compile.compile_diffusers(sd_model)
timer.record("compile")
@ -1305,11 +1296,25 @@ def switch_pipe(cls: diffusers.DiffusionPipeline, pipeline: diffusers.DiffusionP
return pipeline
def clean_diffuser_pipe(pipe):
if pipe is not None and shared.sd_model_type == 'sdxl' and 'requires_aesthetics_score' in pipe.config and hasattr(pipe, '_internal_dict'):
debug_process(f'Pipeline clean: {pipe.__class__.__name__}')
# diffusers adds requires_aesthetics_score with img2img and complains if requires_aesthetics_score exists in txt2img
internal_dict = dict(pipe._internal_dict) # pylint: disable=protected-access
internal_dict.pop('requires_aesthetics_score', None)
del pipe._internal_dict
pipe.register_to_config(**internal_dict)
def set_diffuser_pipe(pipe, new_pipe_type):
n = getattr(pipe.__class__, '__name__', '')
if new_pipe_type == DiffusersTaskType.TEXT_2_IMAGE:
clean_diffuser_pipe(pipe)
if get_diffusers_task(pipe) == new_pipe_type:
return pipe
# skip specific pipelines
n = getattr(pipe.__class__, '__name__', '')
if n in ['StableDiffusionReferencePipeline', 'StableDiffusionAdapterPipeline', 'AnimateDiffPipeline', 'AnimateDiffSDXLPipeline']:
return pipe
if 'Onnx' in pipe.__class__.__name__:
@ -1378,8 +1383,10 @@ def set_diffusers_attention(pipe):
modules = [getattr(pipe, n, None) for n in module_names]
modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attn_processor")]
for module in modules:
if 'SD3Transformer2DModel' in module.__class__.__name__: # TODO SD3
if module.__class__.__name__ in ['SD3Transformer2DModel']:
module.set_attn_processor(p.JointAttnProcessor2_0())
elif module.__class__.__name__ in ['HunyuanDiT2DModel']:
pass
else:
module.set_attn_processor(attention)
@ -1522,11 +1529,19 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None, timer=None,
shared.log.info(f'Model load finished: {memory_stats()} cached={len(checkpoints_loaded.keys())}')
def reload_text_encoder():
if hasattr(shared.sd_model, 'text_encoder_3'):
from modules.model_sd3 import load_te3
shared.log.debug(f'Load: TE3={shared.opts.sd_te3}')
load_te3(shared.sd_model, shared.opts.sd_te3, cache_dir=shared.opts.diffusers_dir)
def reload_text_encoder(initial=False):
if initial and (shared.opts.sd_text_encoder is None or shared.opts.sd_text_encoder == 'None'):
return # dont unload
signature = inspect.signature(shared.sd_model.__class__.__init__, follow_wrapped=True, eval_str=True).parameters
t5 = [k for k, v in signature.items() if 'T5EncoderModel' in str(v)]
if len(t5) > 0:
from modules.model_t5 import set_t5
shared.log.debug(f'Load: t5={shared.opts.sd_text_encoder} module="{t5[0]}"')
set_t5(pipe=shared.sd_model, module=t5[0], t5=shared.opts.sd_text_encoder, cache_dir=shared.opts.diffusers_dir)
elif hasattr(shared.sd_model, 'text_encoder_3'):
from modules.model_t5 import set_t5
shared.log.debug(f'Load: t5={shared.opts.sd_text_encoder} module="text_encoder_3"')
set_t5(pipe=shared.sd_model, module='text_encoder_3', t5=shared.opts.sd_text_encoder, cache_dir=shared.opts.diffusers_dir)
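The generic branch above works for any pipeline whose constructor annotates a parameter with `T5EncoderModel`; a minimal sketch of the signature inspection, using a hypothetical `ExamplePipeline` class:

```python
import inspect
import transformers

class ExamplePipeline:  # hypothetical stand-in for a diffusers pipeline
    def __init__(self, text_encoder_3: transformers.T5EncoderModel = None):
        self.text_encoder_3 = text_encoder_3

params = inspect.signature(ExamplePipeline.__init__).parameters
t5 = [k for k, v in params.items() if 'T5EncoderModel' in str(v)]
print(t5)  # ['text_encoder_3']
```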
def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model', force=False):

View File

@ -58,9 +58,23 @@ def apply_compile_to_model(sd_model, function, options, op=None):
sd_model.text_encoder = None
sd_model.text_encoder = sd_model.decoder_pipe.text_encoder = function(sd_model.decoder_pipe.text_encoder)
else:
if op == "nncf" and sd_model.text_encoder.__class__.__name__ == "T5EncoderModel":
from modules.sd_hijack import NNCF_T5DenseGatedActDense # T5DenseGatedActDense uses fp32
for i in range(len(sd_model.text_encoder.encoder.block)):
sd_model.text_encoder.encoder.block[i].layer[1].DenseReluDense = NNCF_T5DenseGatedActDense(
sd_model.text_encoder.encoder.block[i].layer[1].DenseReluDense
)
sd_model.text_encoder = function(sd_model.text_encoder)
if hasattr(sd_model, 'text_encoder_2') and hasattr(sd_model.text_encoder_2, 'config'):
sd_model.text_encoder_2 = function(sd_model.text_encoder_2)
if hasattr(sd_model, 'text_encoder_3') and hasattr(sd_model.text_encoder_3, 'config'):
if op == "nncf" and sd_model.text_encoder_3.__class__.__name__ == "T5EncoderModel":
from modules.sd_hijack import NNCF_T5DenseGatedActDense # T5DenseGatedActDense uses fp32
for i in range(len(sd_model.text_encoder_3.encoder.block)):
sd_model.text_encoder_3.encoder.block[i].layer[1].DenseReluDense = NNCF_T5DenseGatedActDense(
sd_model.text_encoder_3.encoder.block[i].layer[1].DenseReluDense
)
sd_model.text_encoder_3 = function(sd_model.text_encoder_3)
if hasattr(sd_model, 'prior_pipe') and hasattr(sd_model, 'prior_text_encoder'):
sd_model.prior_text_encoder = None
sd_model.prior_text_encoder = sd_model.prior_pipe.text_encoder = function(sd_model.prior_pipe.text_encoder)
@ -100,29 +114,31 @@ def ipex_optimize(sd_model):
shared.log.warning(f"IPEX Optimize: error: {e}")
return sd_model
def nncf_send_to_device(model):
for child in model.children():
if child.__class__.__name__ == "WeightsDecompressor":
child.scale = child.scale.to(devices.device)
child.zero_point = child.zero_point.to(devices.device)
nncf_send_to_device(child)
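# descriptive note: nncf.compress_weights() attaches WeightsDecompressor child modules whose
# scale/zero_point tensors may still live on CPU, so they are walked recursively here and moved
# to the active device to match the rest of the model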
def nncf_compress_model(model):
import nncf
model.eval()
backup_embeddings = None
if hasattr(model, "get_input_embeddings"):
backup_embeddings = copy.deepcopy(model.get_input_embeddings())
model = nncf.compress_weights(model)
nncf_send_to_device(model)
if hasattr(model, "set_input_embeddings") and backup_embeddings is not None:
model.set_input_embeddings(backup_embeddings)
devices.torch_gc(force=True)
return model
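# the input embeddings are deep-copied before compression and restored afterwards so the embedding
# table itself stays uncompressed; torch_gc(force=True) then reclaims the temporary buffers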
def nncf_compress_weights(sd_model):
try:
t0 = time.time()
if sd_model.device.type == "meta":
shared.log.warning("Compress Weights is not compatible with Sequential CPU offload")
return sd_model
def nncf_compress_model(model):
return_device = model.device
model.eval()
backup_embeddings = None
if hasattr(model, "get_input_embeddings"):
backup_embeddings = copy.deepcopy(model.get_input_embeddings())
model = nncf.compress_weights(model.to(devices.device)).to(return_device)
if hasattr(model, "set_input_embeddings") and backup_embeddings is not None:
model.set_input_embeddings(backup_embeddings)
devices.torch_gc(force=True)
return model
import nncf
shared.compiled_model_state = CompiledModelState()
shared.compiled_model_state.is_compiled = True
from installer import install
install('nncf==2.7.0', quiet=True)
sd_model = apply_compile_to_model(sd_model, nncf_compress_model, shared.opts.nncf_compress_weights, op="nncf")

View File

@ -40,15 +40,17 @@ def single_sample_to_image(sample, approximation=None):
warn_once('Unknown decode type')
approximation = 0
# normal sample is [4,64,64]
if sample.dtype == torch.bfloat16:
sample = sample.to(torch.float16)
try:
if sample.dtype == torch.bfloat16:
sample = sample.to(torch.float16)
except Exception as e:
warn_once(f'live preview: {e}')
if len(sample.shape) > 4: # likely unknown video latent (e.g. svd)
return Image.new(mode="RGB", size=(512, 512))
if len(sample) == 16: # sd_cascade
sd_cascade = True
if len(sample.shape) == 4 and sample.shape[0]: # likely animatediff latent
sample = sample.permute(1, 0, 2, 3)[0]
if shared.native: # [-x,x] to [-5,5]
sample_max = torch.max(sample)
if sample_max > 5:
@ -56,7 +58,10 @@ def single_sample_to_image(sample, approximation=None):
sample_min = torch.min(sample)
if sample_min < -5:
sample = sample * (5 / abs(sample_min))
if sd_cascade:
if approximation == 2: # TAESD
x_sample = sd_vae_taesd.decode(sample)
x_sample = (1.0 + x_sample) / 2.0 # preview requires smaller range
elif sd_cascade:
x_sample = sd_vae_stablecascade.decode(sample)
elif approximation == 0: # Simple
x_sample = sd_vae_approx.cheap_approximation(sample) * 0.5 + 0.5
@ -64,9 +69,6 @@ def single_sample_to_image(sample, approximation=None):
x_sample = sd_vae_approx.nn_approximation(sample) * 0.5 + 0.5
if shared.sd_model_type == "sdxl":
x_sample = x_sample[[2,1,0], :, :] # BGR to RGB
elif approximation == 2: # TAESD
x_sample = sd_vae_taesd.decode(sample)
x_sample = (1.0 + x_sample) / 2.0 # preview requires smaller range
elif approximation == 3: # Full VAE
x_sample = processing.decode_first_stage(shared.sd_model, sample.unsqueeze(0))[0] * 0.5 + 0.5
else:

View File

@ -66,7 +66,7 @@ config = {
'Euler EDM': { },
'DPM++ 2M EDM': { 'solver_order': 2, 'solver_type': 'midpoint', 'final_sigmas_type': 'zero', 'algorithm_type': 'dpmsolver++' },
'CMSI': { }, #{ 'sigma_min': 0.002, 'sigma_max': 80.0, 'sigma_data': 0.5, 's_noise': 1.0, 'rho': 7.0, 'clip_denoised': True },
'Euler FlowMatch': { },
'Euler FlowMatch': { 'shift': 1, },
'IPNDM': { },
}
@ -156,6 +156,8 @@ class DiffusionSampler:
self.config['beta_start'] = shared.opts.schedulers_beta_start
if 'beta_end' in self.config and shared.opts.schedulers_beta_end > 0:
self.config['beta_end'] = shared.opts.schedulers_beta_end
if 'shift' in self.config and shared.opts.schedulers_shift != 1:
self.config['shift'] = shared.opts.schedulers_shift
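# only schedulers whose config declares a 'shift' key (currently Euler FlowMatch) are affected;
# a minimal sketch, assuming diffusers' FlowMatchEulerDiscreteScheduler:
#   scheduler = diffusers.FlowMatchEulerDiscreteScheduler(shift=shared.opts.schedulers_shift)
# the settings / XYZ-grid value overrides the default shift of 1 declared in the config table above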
if 'rescale_betas_zero_snr' in self.config:
self.config['rescale_betas_zero_snr'] = shared.opts.schedulers_rescale_betas
if 'timestep_spacing' in self.config and shared.opts.schedulers_timestep_spacing != 'default' and shared.opts.schedulers_timestep_spacing is not None:

View File

@ -155,8 +155,6 @@ def load_vae(model, vae_file=None, vae_source="unknown-source"):
except Exception as e:
shared.log.error(f"Loading VAE failed: model={vae_file} source={vae_source} {e}")
restore_base_vae(model)
# If vae used is not in dict, update it
# It will be removed on refresh though
vae_opt = get_filename(vae_file)
if vae_opt not in vae_dict:
vae_dict[vae_opt] = vae_file
@ -165,6 +163,26 @@ def load_vae(model, vae_file=None, vae_source="unknown-source"):
loaded_vae_file = vae_file
def apply_vae_config(model_file, vae_file, sd_model):
def get_vae_config():
config_file = os.path.join(paths.sd_configs_path, os.path.splitext(os.path.basename(model_file))[0] + '_vae.json')
if config_file is not None and os.path.exists(config_file):
return shared.readfile(config_file)
config_file = os.path.join(paths.sd_configs_path, os.path.splitext(os.path.basename(vae_file))[0] + '.json') if vae_file else None
if config_file is not None and os.path.exists(config_file):
return shared.readfile(config_file)
config_file = os.path.join(paths.sd_configs_path, shared.sd_model_type, 'vae', 'config.json')
if config_file is not None and os.path.exists(config_file):
return shared.readfile(config_file)
return {}
if hasattr(sd_model, 'vae') and hasattr(sd_model.vae, 'config'):
config = get_vae_config()
for k, v in config.items():
if k in sd_model.vae.config and not k.startswith('_'):
sd_model.vae.config[k] = v
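# resolution order used by get_vae_config(): the model-specific <model-name>_vae.json in the configs
# folder, then <vae-file-name>.json, then the per-model-type default <sd_configs_path>/<model-type>/vae/config.json;
# matching non-private keys are copied onto the loaded VAE config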
def load_vae_diffusers(model_file, vae_file=None, vae_source="unknown-source"):
if vae_file is None:
return None
@ -241,6 +259,11 @@ def reload_vae_weights(sd_model=None, vae_file=unspecified):
vae_file, vae_source = resolve_vae(checkpoint_file)
else:
vae_source = "function-argument"
if vae_file is None or vae_file == 'None':
if hasattr(sd_model, 'original_vae'):
sd_models.set_diffuser_options(sd_model, vae=sd_model.original_vae, op='vae')
shared.log.info("VAE restored")
return None
if loaded_vae_file == vae_file:
return None
if not shared.native and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
@ -258,10 +281,14 @@ def reload_vae_weights(sd_model=None, vae_file=unspecified):
if vae_file is not None:
shared.log.info(f"VAE weights loaded: {vae_file}")
else:
if hasattr(shared.sd_model, "vae") and hasattr(shared.sd_model, "sd_checkpoint_info"):
vae = load_vae_diffusers(shared.sd_model.sd_checkpoint_info.filename, vae_file, vae_source)
if hasattr(sd_model, "vae") and hasattr(sd_model, "sd_checkpoint_info"):
vae = load_vae_diffusers(sd_model.sd_checkpoint_info.filename, vae_file, vae_source)
if vae is not None:
if not hasattr(sd_model, 'original_vae'):
sd_model.original_vae = sd_model.vae
sd_models.move_model(sd_model.original_vae, devices.cpu)
sd_models.set_diffuser_options(sd_model, vae=vae, op='vae')
apply_vae_config(sd_model.sd_checkpoint_info.filename, vae_file, sd_model)
if not shared.cmd_opts.lowvram and not shared.cmd_opts.medvram:
sd_models.move_model(sd_model, devices.device)
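# the stock VAE is stashed once as sd_model.original_vae (parked on CPU) so that selecting
# VAE=None later restores it through the early-return branch added above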

View File

@ -34,21 +34,24 @@ class VAEApprox(nn.Module):
def nn_approximation(sample): # Approximate NN
global sd_vae_approx_model # pylint: disable=global-statement
# ROCm throws memory exceptions and can crash the GPU if the approx model is run on the GPU
device = devices.device if devices.backend != "rocm" else "cpu"
dtype = devices.dtype_vae if devices.backend != "rocm" else torch.float32
if sd_vae_approx_model is None:
model_path = os.path.join(paths.models_path, "VAE-approx", "model.pt")
sd_vae_approx_model = VAEApprox()
if not os.path.exists(model_path):
model_path = os.path.join(paths.script_path, "models", "VAE-approx", "model.pt")
approx_weights = torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' else None)
approx_weights = torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' or devices.backend == "rocm" else None)
sd_vae_approx_model.load_state_dict(approx_weights)
sd_vae_approx_model.eval()
sd_vae_approx_model.to(devices.device, sample.dtype)
sd_vae_approx_model.to(device, dtype)
shared.log.debug(f'VAE load: type=approximate model={model_path}')
try:
in_sample = sample.to(devices.device).unsqueeze(0)
sd_vae_approx_model.to(devices.device, devices.dtype)
in_sample = sample.to(device, dtype).unsqueeze(0)
sd_vae_approx_model.to(device, dtype)
x_sample = sd_vae_approx_model(in_sample)
x_sample = x_sample[0].detach().cpu()
x_sample = x_sample[0].to(torch.float32).detach().cpu()
return x_sample
except Exception as e:
shared.log.error(f'VAE decode approximate: {e}')

View File

@ -11,7 +11,14 @@ import torch.nn as nn
from modules import devices, paths
taesd_models = { 'sd-decoder': None, 'sd-encoder': None, 'sdxl-decoder': None, 'sdxl-encoder': None }
taesd_models = {
'sd-decoder': None,
'sd-encoder': None,
'sdxl-decoder': None,
'sdxl-encoder': None,
'sd3-decoder': None,
'sd3-encoder': None,
}
previous_warnings = False
@ -31,33 +38,63 @@ class Block(nn.Module):
def forward(self, x):
return self.fuse(self.conv(x) + self.skip(x))
def Encoder():
def Encoder(latent_channels=4):
return nn.Sequential(
conv(3, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 4),
conv(64, latent_channels),
)
def Decoder():
def Decoder(latent_channels=4):
return nn.Sequential(
Clamp(), conv(4, 64), nn.ReLU(),
Clamp(), conv(latent_channels, 64), nn.ReLU(),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), conv(64, 3),
)
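# latent_channels stays at 4 for SD/SDXL latents; SD3 latents carry 16 channels, which is why the
# taesd3 checkpoints construct Encoder/Decoder with latent_channels=16 below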
class TAESD2(nn.Module): # pylint: disable=abstract-method
latent_magnitude = 3
latent_shift = 0.5
def __init__(self, encoder_path="taesd_encoder.pth", decoder_path="taesd_decoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
if latent_channels is None:
latent_channels = 16 if "taesd3" in str(encoder_path) else 4
self.encoder = Encoder(latent_channels)
self.decoder = Decoder(latent_channels)
if encoder_path is not None:
self.encoder.load_state_dict(torch.load(encoder_path, map_location="cpu"))
if decoder_path is not None:
self.decoder.load_state_dict(torch.load(decoder_path, map_location="cpu"))
@staticmethod
def scale_latents(x):
"""raw latents -> [0, 1]"""
return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1)
@staticmethod
def unscale_latents(x):
"""[0, 1] -> raw latents"""
return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude)
class TAESD(nn.Module): # pylint: disable=abstract-method
latent_magnitude = 3
latent_shift = 0.5
def __init__(self, encoder_path="taesd_encoder.pth", decoder_path="taesd_decoder.pth"):
def __init__(self, encoder_path="taesd_encoder.pth", decoder_path="taesd_decoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
self.encoder = Encoder()
self.decoder = Decoder()
if latent_channels is None:
latent_channels = 16 if "taesd3" in str(encoder_path) or "taesd3" in str(decoder_path) else 4
self.encoder = Encoder(latent_channels)
self.decoder = Decoder(latent_channels)
if encoder_path is not None:
self.encoder.load_state_dict(torch.load(encoder_path, map_location="cpu"))
if decoder_path is not None:
@ -105,13 +142,16 @@ def model(model_class = 'sd', model_type = 'decoder'):
def decode(latents):
global previous_warnings # pylint: disable=global-statement
from modules import shared
model_class = shared.sd_model_type
if model_class == 'ldm':
model_class = 'sd'
dtype = devices.dtype_vae if devices.dtype_vae != torch.bfloat16 else torch.float16 # taesd does not support bf16
if 'sd' not in model_class:
shared.log.warning(f'TAESD unsupported model type: {model_class}')
if not previous_warnings:
previous_warnings = True
shared.log.warning(f'TAESD unsupported model type: {model_class}')
return Image.new('RGB', (8, 8), color = (0, 0, 0))
vae = taesd_models[f'{model_class}-decoder']
if vae is None:

View File

@ -41,7 +41,7 @@ hide_dirs = {"visible": not cmd_opts.hide_ui_dir_config}
xformers_available = False
locking_available = True
clip_model = None
interrogator = modules.interrogate.InterrogateModels("interrogate")
interrogator = modules.interrogate.InterrogateModels(os.path.join("models", "interrogate"))
sd_upscalers = []
face_restorers = []
tab_names = []
@ -330,8 +330,9 @@ def temp_disable_extensions():
modules.shared.opts.data['theme_type'] = 'None'
modules.shared.opts.data['gradio_theme'] = theme_name
else:
modules.shared.opts.data['theme_type'] = 'None'
modules.shared.opts.data['gradio_theme'] = theme_name
modules.shared.log.error(f'UI theme invalid: theme="{theme_name}" available={["standard/*", "modern/*", "none/*"]} fallback="standard/black-teal"')
modules.shared.opts.data['theme_type'] = 'Standard'
modules.shared.opts.data['gradio_theme'] = 'black-teal'
for ext in disable_themes:
if ext.lower() not in opts.disabled_extensions:
@ -385,19 +386,20 @@ else:
sdp_options_default = ['Flash attention', 'Memory attention', 'Math attention']
options_templates.update(options_section(('sd', "Execution & Models"), {
"sd_backend": OptionInfo(default_backend, "Execution backend", gr.Radio, {"choices": ["original", "diffusers"] }),
"sd_backend": OptionInfo(default_backend, "Execution backend", gr.Radio, {"choices": ["diffusers", "original"] }),
"sd_model_checkpoint": OptionInfo(default_checkpoint, "Base model", gr.Dropdown, lambda: {"choices": list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_model_refiner": OptionInfo('None', "Refiner model", gr.Dropdown, lambda: {"choices": ['None'] + list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_vae": OptionInfo("Automatic", "VAE model", gr.Dropdown, lambda: {"choices": shared_items.sd_vae_items()}, refresh=shared_items.refresh_vae_list),
"sd_unet": OptionInfo("None", "UNET model", gr.Dropdown, lambda: {"choices": shared_items.sd_unet_items()}, refresh=shared_items.refresh_unet_list),
"sd_te3": OptionInfo('None', "Text encoder model", gr.Dropdown, lambda: {"choices": ['None', 'T5 FP8', 'T5 FP16']}),
"sd_checkpoint_autoload": OptionInfo(True, "Model autoload on start"),
"sd_text_encoder": OptionInfo('None', "Text encoder model", gr.Dropdown, lambda: {"choices": ['None', 'T5 FP4', 'T5 FP8', 'T5 INT8', 'T5 FP16']}),
"sd_model_dict": OptionInfo('None', "Use separate base dict", gr.Dropdown, lambda: {"choices": ['None'] + list_checkpoint_tiles()}, refresh=refresh_checkpoints),
"sd_checkpoint_autoload": OptionInfo(True, "Model autoload on start"),
"sd_textencoder_cache": OptionInfo(True, "Cache text encoder results"),
"stream_load": OptionInfo(False, "Load models using stream loading method", gr.Checkbox, {"visible": not native }),
"model_reuse_dict": OptionInfo(False, "Reuse loaded model dictionary", gr.Checkbox, {"visible": False}),
"prompt_attention": OptionInfo("Full parser", "Prompt attention parser", gr.Radio, {"choices": ["Full parser", "Compel parser", "A1111 parser", "Fixed attention"] }),
"prompt_mean_norm": OptionInfo(False, "Prompt attention normalization", gr.Checkbox),
"comma_padding_backtrack": OptionInfo(20, "Prompt padding", gr.Slider, {"minimum": 0, "maximum": 74, "step": 1, "visible": not native }),
"prompt_attention": OptionInfo("Full parser", "Prompt attention parser", gr.Radio, {"choices": ["Full parser", "Compel parser", "A1111 parser", "Fixed attention"] }),
"sd_checkpoint_cache": OptionInfo(0, "Cached models", gr.Slider, {"minimum": 0, "maximum": 10, "step": 1, "visible": not native }),
"sd_vae_checkpoint_cache": OptionInfo(0, "Cached VAEs", gr.Slider, {"minimum": 0, "maximum": 10, "step": 1, "visible": False}),
"sd_disable_ckpt": OptionInfo(False, "Disallow models in ckpt format", gr.Checkbox, {"visible": False}),
@ -448,7 +450,7 @@ options_templates.update(options_section(('cuda', "Compute Settings"), {
"deep_cache_interval": OptionInfo(3, "DeepCache cache interval", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}),
"nncf_sep": OptionInfo("<h2>Model Compress</h2>", "", gr.HTML),
"nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}),
"nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}),
"ipex_sep": OptionInfo("<h2>IPEX</h2>", "", gr.HTML, {"visible": devices.backend == "ipex"}),
"ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}),
@ -715,6 +717,7 @@ options_templates.update(options_section(('sampler-params', "Sampler Settings"),
'schedulers_timesteps_range': OptionInfo(1000, "Timesteps range", gr.Slider, {"minimum": 250, "maximum": 4000, "step": 1}),
'schedulers_timesteps': OptionInfo('', "Timesteps"),
"schedulers_rescale_betas": OptionInfo(False, "Rescale betas with zero terminal SNR", gr.Checkbox),
'schedulers_shift': OptionInfo(1, "Sampler shift", gr.Slider, {"minimum": 0.1, "maximum": 10, "step": 0.1}),
# managed from ui.py for backend original k-diffusion
"schedulers_sep_kdiffusers": OptionInfo("<h2>K-Diffusion specific config</h2>", "", gr.HTML),
@ -774,19 +777,19 @@ options_templates.update(options_section(('control', "Control Options"), {
"control_unload_processor": OptionInfo(False, "Processor unload after use"),
}))
options_templates.update(options_section(('training', "Training"), {
"unload_models_when_training": OptionInfo(False, "Move VAE and CLIP to RAM when training"),
"pin_memory": OptionInfo(True, "Pin training dataset to memory"),
"save_optimizer_state": OptionInfo(False, "Save resumable optimizer state when training"),
"save_training_settings_to_txt": OptionInfo(True, "Save training settings to a text file"),
"dataset_filename_word_regex": OptionInfo("", "Filename word regex"),
"dataset_filename_join_string": OptionInfo(" ", "Filename join string"),
"embeddings_templates_dir": OptionInfo(os.path.join(paths.script_path, 'train', 'templates'), "Embeddings train templates directory", folder=True),
"training_image_repeats_per_epoch": OptionInfo(1, "Image repeats per epoch", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1}),
"training_write_csv_every": OptionInfo(0, "Save loss CSV file every n steps"),
"training_enable_tensorboard": OptionInfo(False, "Enable tensorboard logging"),
"training_tensorboard_save_images": OptionInfo(False, "Save generated images within tensorboard"),
"training_tensorboard_flush_every": OptionInfo(120, "Tensorboard flush period"),
options_templates.update(options_section(('interrogate', "Interrogate"), { # "Training" section disabled so just a placeholder
"unload_models_when_training": OptionInfo(False, "Move VAE and CLIP to RAM when training", gr.Checkbox, { "visible": False }),
"pin_memory": OptionInfo(True, "Pin training dataset to memory", gr.Checkbox, { "visible": False }),
"save_optimizer_state": OptionInfo(False, "Save resumable optimizer state when training", gr.Checkbox, { "visible": False }),
"save_training_settings_to_txt": OptionInfo(True, "Save training settings to a text file", gr.Checkbox, { "visible": False }),
"dataset_filename_word_regex": OptionInfo("", "Filename word regex", gr.Textbox, { "visible": False }),
"dataset_filename_join_string": OptionInfo(" ", "Filename join string", gr.Textbox, { "visible": False }),
"embeddings_templates_dir": OptionInfo("", "Embeddings train templates directory", gr.Textbox, { "visible": False }),
"training_image_repeats_per_epoch": OptionInfo(1, "Image repeats per epoch", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1, "visible": False }),
"training_write_csv_every": OptionInfo(0, "Save loss CSV file every n steps", gr.Number, { "visible": False }),
"training_enable_tensorboard": OptionInfo(False, "Enable tensorboard logging", gr.Checkbox, { "visible": False }),
"training_tensorboard_save_images": OptionInfo(False, "Save generated images within tensorboard", gr.Checkbox, { "visible": False }),
"training_tensorboard_flush_every": OptionInfo(120, "Tensorboard flush period", gr.Number, { "visible": False }),
}))
options_templates.update(options_section(('interrogate', "Interrogate"), {
@ -804,13 +807,13 @@ options_templates.update(options_section(('interrogate', "Interrogate"), {
"deepbooru_filter_tags": OptionInfo("", "Filter out tags from deepbooru output"),
}))
options_templates.update(options_section(('extra_networks', "Extra Networks"), {
options_templates.update(options_section(('extra_networks', "Networks"), {
"extra_networks_sep1": OptionInfo("<h2>Extra networks UI</h2>", "", gr.HTML),
"extra_networks": OptionInfo(["All"], "Extra networks", gr.Dropdown, lambda: {"multiselect":True, "choices": ['All'] + [en.title for en in extra_networks]}),
"extra_networks": OptionInfo(["All"], "Networks", gr.Dropdown, lambda: {"multiselect":True, "choices": ['All'] + [en.title for en in extra_networks]}),
"extra_networks_sort": OptionInfo("Default", "Sort order", gr.Dropdown, {"choices": ['Default', 'Name [A-Z]', 'Name [Z-A]', 'Date [Newest]', 'Date [Oldest]', 'Size [Largest]', 'Size [Smallest]']}),
"extra_networks_view": OptionInfo("gallery", "UI view", gr.Radio, {"choices": ["gallery", "list"]}),
"extra_networks_card_cover": OptionInfo("sidebar", "UI position", gr.Radio, {"choices": ["cover", "inline", "sidebar"]}),
"extra_networks_height": OptionInfo(53, "UI height (%)", gr.Slider, {"minimum": 10, "maximum": 100, "step": 1}),
"extra_networks_height": OptionInfo(55, "UI height (%)", gr.Slider, {"minimum": 10, "maximum": 100, "step": 1}),
"extra_networks_sidebar_width": OptionInfo(35, "UI sidebar width (%)", gr.Slider, {"minimum": 10, "maximum": 80, "step": 1}),
"extra_networks_card_size": OptionInfo(160, "UI card size (px)", gr.Slider, {"minimum": 20, "maximum": 2000, "step": 1}),
"extra_networks_card_square": OptionInfo(True, "UI disable variable aspect ratio"),
@ -818,7 +821,7 @@ options_templates.update(options_section(('extra_networks', "Extra Networks"), {
"extra_networks_sep2": OptionInfo("<h2>Extra networks general</h2>", "", gr.HTML),
"extra_network_reference": OptionInfo(False, "Use reference values when available", gr.Checkbox),
"extra_network_skip_indexing": OptionInfo(False, "Build info on first access", gr.Checkbox),
"extra_networks_default_multiplier": OptionInfo(1.0, "Default multiplier for extra networks", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
"extra_networks_default_multiplier": OptionInfo(1.0, "Default strength for extra networks", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}),
"diffusers_convert_embed": OptionInfo(False, "Auto-convert SD 1.5 embeddings to SDXL ", gr.Checkbox, {"visible": native}),
"extra_networks_sep3": OptionInfo("<h2>Extra networks settings</h2>", "", gr.HTML),
"extra_networks_styles": OptionInfo(True, "Show built-in styles"),

View File

@ -41,10 +41,12 @@ class State:
log.debug(f'Requested {"pause" if self.paused else "continue"}')
def nextjob(self):
import modules.devices
self.do_set_current_image()
self.job_no += 1
self.sampling_step = 0
self.current_image_sampling_step = 0
modules.devices.torch_gc()
def dict(self):
obj = {

View File

@ -328,7 +328,7 @@ class StyleDatabase:
"preview": "",
}
keepcharacters = (' ','.','_')
fn = "".join(c for c in name if c.isalnum() or c in keepcharacters).rstrip()
fn = "".join(c for c in name if c.isalnum() or c in keepcharacters).strip()
fn = os.path.join(path, fn + ".json")
try:
with open(fn, 'w', encoding='utf-8') as f:

View File

@ -13,17 +13,6 @@ from modules.files_cache import directory_files, directory_mtime, extension_filt
debug = shared.log.trace if os.environ.get('SD_TI_DEBUG', None) is not None else lambda *args, **kwargs: None
debug('Trace: TEXTUAL INVERSION')
TokenToAdd = namedtuple("TokenToAdd", ["clip_l", "clip_g"])
TextualInversionTemplate = namedtuple("TextualInversionTemplate", ["name", "path"])
textual_inversion_templates = {}
def list_textual_inversion_templates():
textual_inversion_templates.clear()
for root, _dirs, fns in os.walk(shared.opts.embeddings_templates_dir):
for fn in fns:
path = os.path.join(root, fn)
textual_inversion_templates[fn] = TextualInversionTemplate(fn, path)
return textual_inversion_templates
def list_embeddings(*dirs):

View File

@ -91,7 +91,6 @@ def reload_gradio_theme():
'font_mono':['IBM Plex Mono', 'ui-monospace', 'Consolas', 'monospace']
}
gradio_theme = gr.themes.Base(**default_font_params)
available_themes = list_themes()
if theme_name not in available_themes:
modules.shared.log.error(f'UI theme invalid: type={modules.shared.opts.theme_type} theme="{theme_name}" available={available_themes}')
@ -99,6 +98,9 @@ def reload_gradio_theme():
theme_name = 'black-teal'
elif modules.shared.opts.theme_type == 'Modern':
theme_name = 'Default'
else:
modules.shared.opts.theme_type = 'Standard'
theme_name = 'black-teal'
modules.shared.opts.data['gradio_theme'] = theme_name

View File

@ -35,8 +35,7 @@ def txt2img(id_task,
shared.log.warning('Sampler: invalid')
sampler_index = 0
if hr_sampler_index is None:
shared.log.warning('Sampler: invalid')
hr_sampler_index = 0
hr_sampler_index = sampler_index
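# when no hires sampler is selected, fall back to the base sampler instead of forcing sampler index 0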
p = processing.StableDiffusionProcessingTxt2Img(
sd_model=shared.sd_model,

View File

@ -71,7 +71,7 @@ def init_api(app):
metadata = page.metadata.get(item, 'none')
if metadata is None:
metadata = ''
# shared.log.debug(f"Extra networks metadata: page='{page}' item={item} len={len(metadata)}")
# shared.log.debug(f"Networks metadata: page='{page}' item={item} len={len(metadata)}")
return JSONResponse({"metadata": metadata})
def get_info(page: str = "", item: str = ""):
@ -84,7 +84,7 @@ def init_api(app):
info = page.find_info(item['filename'])
if info is None:
info = {}
# shared.log.debug(f"Extra networks info: page='{page.name}' item={item['name']} len={len(info)}")
# shared.log.debug(f"Networks info: page='{page.name}' item={item['name']} len={len(info)}")
return JSONResponse({"info": info})
def get_desc(page: str = "", item: str = ""):
@ -97,7 +97,7 @@ def init_api(app):
desc = page.find_description(item['filename'])
if desc is None:
desc = ''
# shared.log.debug(f"Extra networks desc: page='{page.name}' item={item['name']} len={len(desc)}")
# shared.log.debug(f"Networks desc: page='{page.name}' item={item['name']} len={len(desc)}")
return JSONResponse({"description": desc})
app.add_api_route("/sd_extra_networks/thumb", fetch_file, methods=["GET"])
@ -186,7 +186,7 @@ class ExtraNetworksPage:
except Exception as e:
shared.log.warning(f'Extra network error creating thumbnail: {f} {e}')
if created > 0:
shared.log.info(f"Extra network thumbnails: {self.name} created={created}")
shared.log.info(f"Network thumbnails: {self.name} created={created}")
self.missing_thumbs.clear()
def create_items(self, tabname):
@ -235,7 +235,7 @@ class ExtraNetworksPage:
continue
# if not self.is_empty(tgt):
subdirs[subdir] = 1
debug(f"Extra networks: page='{self.name}' subfolders={list(subdirs)}")
debug(f"Networks: page='{self.name}' subfolders={list(subdirs)}")
subdirs = OrderedDict(sorted(subdirs.items()))
if self.name == 'model':
subdirs['Reference'] = 1
@ -272,7 +272,7 @@ class ExtraNetworksPage:
self.html += ''.join(htmls)
self.page_time = time.time()
self.html = f"<div id='{tabname}_{self_name_id}_subdirs' class='extra-network-subdirs'>{subdirs_html}</div><div id='{tabname}_{self_name_id}_cards' class='extra-network-cards'>{self.html}</div>"
shared.log.debug(f"Extra networks: page='{self.name}' items={len(self.items)} subfolders={len(subdirs)} tab={tabname} folders={self.allowed_directories_for_previews()} list={self.list_time:.2f} thumb={self.preview_time:.2f} desc={self.desc_time:.2f} info={self.info_time:.2f} workers={shared.max_workers} sort={shared.opts.extra_networks_sort}")
shared.log.debug(f"Networks: page='{self.name}' items={len(self.items)} subfolders={len(subdirs)} tab={tabname} folders={self.allowed_directories_for_previews()} list={self.list_time:.2f} thumb={self.preview_time:.2f} desc={self.desc_time:.2f} info={self.info_time:.2f} workers={shared.max_workers} sort={shared.opts.extra_networks_sort}")
if len(self.missing_thumbs) > 0:
threading.Thread(target=self.create_thumb).start()
return self.patch(self.html, tabname)
@ -570,7 +570,7 @@ def create_ui(container, button_parent, tabname, skip_indexing = False):
with gr.Group(elem_id=f"{tabname}_extra_details_tabs", visible=False) as ui.details_tabs:
with gr.Tabs():
with gr.Tab('Description', elem_classes=['extra-details-tabs']):
desc = gr.Textbox('', show_label=False, lines=8, placeholder="Extra network description...")
desc = gr.Textbox('', show_label=False, lines=8, placeholder="Network description...")
ui.details_components.append(desc)
with gr.Row():
btn_save_desc = gr.Button('Save', elem_classes=['small-button'], elem_id=f'{tabname}_extra_details_save_desc')
@ -812,15 +812,22 @@ def create_ui(container, button_parent, tabname, skip_indexing = False):
<tr><td>Preview Embedded</td><td>{item.preview.startswith('data:')}</td></tr>
'''
# desc = f'Name: {os.path.basename(item.name)}\nDescription: {item.description}\nPrompt: {item.prompt}\nNegative: {item.negative}\nExtra: {item.extra}\n'
if item.name.startswith('Diffusers'):
url = item.name.replace('Diffusers/', '')
url = f'<a href="https://huggingface.co/{url}" target="_blank">https://huggingface.co/{url}</a>' if url is not None else 'N/A'
else:
url = info.get('id', None) if info is not None else None
url = f'<a href="https://civitai.com/models/{url}" target="_blank">civitai.com/models/{url}</a>' if url is not None else 'N/A'
text = f'''
<h2 style="border-bottom: 1px solid var(--button-primary-border-color); margin: 0em 0px 1em 0 !important">{item.name}</h2>
<table style="width: 100%; line-height: 1.3em;"><tbody>
<table style="width: 100%; line-height: 1.5em;"><tbody>
<tr><td>Type</td><td>{page.title}</td></tr>
<tr><td>Alias</td><td>{getattr(item, 'alias', 'N/A')}</td></tr>
<tr><td>Filename</td><td>{item.filename}</td></tr>
<tr><td>Hash</td><td>{getattr(item, 'hash', 'N/A')}</td></tr>
<tr><td>Size</td><td>{round(stat.st_size/1024/1024, 2) if stat is not None else 'N/A'} MB</td></tr>
<tr><td>Last modified</td><td>{datetime.fromtimestamp(stat.st_mtime) if stat is not None else 'N/A'}</td></tr>
<tr><td>Source URL</td><td>{url}</td></tr>
<tr><td style="border-top: 1px solid var(--button-primary-border-color);"></td><td></td></tr>
{lora}
{model}
@ -888,7 +895,8 @@ def create_ui(container, button_parent, tabname, skip_indexing = False):
return res
def ui_quicksave_click(name):
if name is None:
if name is None or len(name) < 1:
shared.log.warning("Network quick save style: no name provided")
return
fn = os.path.join(paths.data_path, "params.txt")
if os.path.exists(fn):
@ -908,9 +916,9 @@ def create_ui(container, button_parent, tabname, skip_indexing = False):
}
shared.writefile(item, fn, silent=True)
if len(prompt) > 0:
shared.log.debug(f"Extra network quick save style: item={name} filename='{fn}'")
shared.log.debug(f"Network quick save style: item={name} filename='{fn}'")
else:
shared.log.warning(f"Extra network quick save model: item={name} filename='{fn}' prompt is empty")
shared.log.warning(f"Network quick save model: item={name} filename='{fn}' prompt is empty")
def ui_sort_cards(sort_order):
if shared.opts.extra_networks_sort != sort_order:

View File

@ -64,7 +64,7 @@ class ExtraNetworksPageCheckpoints(ui_extra_networks.ExtraNetworksPage):
record["info"] = self.find_info(checkpoint.filename)
record["description"] = self.find_description(checkpoint.filename, record["info"])
except Exception as e:
shared.log.debug(f"Extra networks error: type=model file={name} {e}")
shared.log.debug(f"Networks error: type=model file={name} {e}")
return record
def list_items(self):

View File

@ -27,7 +27,7 @@ class ExtraNetworksPageHypernetworks(ui_extra_networks.ExtraNetworksPage):
"size": os.path.getsize(path),
}
except Exception as e:
shared.log.debug(f"Extra networks error: type=hypernetwork file={path} {e}")
shared.log.debug(f"Networks error: type=hypernetwork file={path} {e}")
def allowed_directories_for_previews(self):
return [shared.opts.hypernetwork_dir]

View File

@ -93,11 +93,12 @@ class ExtraNetworksPageStyles(ui_extra_networks.ExtraNetworksPage):
"size": os.path.getsize(style.filename),
}
except Exception as e:
shared.log.debug(f"Extra networks error: type=style file={k} {e}")
shared.log.debug(f"Networks error: type=style file={k} {e}")
return item
def list_items(self):
items = [self.create_item(k) for k in list(shared.prompt_styles.styles)]
items = [item for item in items if item is not None]
self.update_all_previews(items)
return items

View File

@ -37,7 +37,7 @@ class ExtraNetworksPageTextualInversion(ui_extra_networks.ExtraNetworksPage):
record["info"] = self.find_info(embedding.filename)
record["description"] = self.find_description(embedding.filename, record["info"])
except Exception as e:
shared.log.debug(f"Extra networks error: type=embedding file={embedding.filename} {e}")
shared.log.debug(f"Networks error: type=embedding file={embedding.filename} {e}")
return record
def list_items(self):

View File

@ -31,7 +31,7 @@ class ExtraNetworksPageVAEs(ui_extra_networks.ExtraNetworksPage):
record["description"] = self.find_description(filename, record["info"])
yield record
except Exception as e:
shared.log.debug(f"Extra networks error: type=vae file={filename} {e}")
shared.log.debug(f"Networks error: type=vae file={filename} {e}")
def allowed_directories_for_previews(self):
return [v for v in [shared.opts.vae_dir] if v is not None]

View File

@ -431,7 +431,7 @@ def create_ui():
r = req(url)
log.debug(f'CivitAI search: name="{name}" tag={tag or "none"} url="{url}" status={r.status_code}')
if r.status_code != 200:
return [], [], []
return [], gr.update(visible=False, value=[]), gr.update(visible=False, value=None), gr.update(visible=False, value=None)
body = r.json()
nonlocal data
data = body.get('items', [])

View File

@ -80,7 +80,7 @@ def create_ui():
with gr.Row():
vqa_answer = gr.Textbox(label="Answer", lines=3)
with gr.Row(elem_id='interrogate_buttons_query'):
vqa_model = gr.Dropdown(list(vqa.MODELS), value='Moondream 2', label='VQA Model')
vqa_model = gr.Dropdown(list(vqa.MODELS), value='MS Florence 2 Base', label='VQA Model')
vqa_submit = gr.Button("Interrogate", elem_id="interrogate_btn_interrogate", variant='primary')
vqa_submit.click(vqa.interrogate, inputs=[vqa_question, vqa_image, vqa_model], outputs=[vqa_answer])

View File

@ -46,60 +46,3 @@ def refresh_styles():
class UiPromptStyles:
def __init__(self, tabname, main_ui_prompt, main_ui_negative_prompt): # pylint: disable=unused-argument
self.dropdown = gr.Dropdown(label="Styles", elem_id=f"{tabname}_styles", choices=[style.name for style in shared.prompt_styles.styles.values()], value=[], multiselect=True)
"""
def __init__(self, tabname, main_ui_prompt, main_ui_negative_prompt):
self.tabname = tabname
with gr.Row(elem_id=f"{tabname}_styles_row"):
self.dropdown = gr.Dropdown(label="Styles", show_label=False, elem_id=f"{tabname}_styles", choices=list(shared.prompt_styles.styles), value=[], multiselect=True, tooltip="Styles")
edit_button = ui_components.ToolButton(value=styles_edit_symbol, elem_id=f"{tabname}_styles_edit_button", tooltip="Edit styles")
with gr.Box(elem_id=f"{tabname}_styles_dialog", elem_classes="popup-dialog") as styles_dialog:
with gr.Row():
self.selection = gr.Dropdown(label="Styles", elem_id=f"{tabname}_styles_edit_select", choices=list(shared.prompt_styles.styles), value=[], allow_custom_value=True, info="Styles allow you to add custom text to prompt. Use the {prompt} token in style text, and it will be replaced with user's prompt when applying style. Otherwise, style's text will be added to the end of the prompt.")
ui_common.create_refresh_button([self.dropdown, self.selection], shared.prompt_styles.reload, lambda: {"choices": list(shared.prompt_styles.styles)}, f"refresh_{tabname}_styles")
self.materialize = ui_components.ToolButton(value=styles_materialize_symbol, elem_id=f"{tabname}_style_apply", tooltip="Apply all selected styles from the style selection dropdown in main UI to the prompt.")
with gr.Row():
self.prompt = gr.Textbox(label="Prompt", show_label=True, elem_id=f"{tabname}_edit_style_prompt", lines=3)
with gr.Row():
self.neg_prompt = gr.Textbox(label="Negative prompt", show_label=True, elem_id=f"{tabname}_edit_style_neg_prompt", lines=3)
with gr.Row():
self.save = gr.Button('Save', variant='primary', elem_id=f'{tabname}_edit_style_save', visible=False)
self.delete = gr.Button('Delete', variant='primary', elem_id=f'{tabname}_edit_style_delete', visible=False)
self.close = gr.Button('Close', variant='secondary', elem_id=f'{tabname}_edit_style_close')
self.selection.change(
fn=select_style,
inputs=[self.selection],
outputs=[self.prompt, self.neg_prompt, self.delete, self.save],
show_progress=False,
)
self.save.click(
fn=save_style,
inputs=[self.selection, self.prompt, self.neg_prompt],
outputs=[self.delete],
show_progress=False,
).then(refresh_styles, outputs=[self.dropdown, self.selection], show_progress=False)
self.delete.click(
fn=delete_style,
_js='function(name){ if(name == "") return ""; return confirm("Delete style " + name + "?") ? name : ""; }',
inputs=[self.selection],
outputs=[self.selection, self.prompt, self.neg_prompt],
show_progress=False,
).then(refresh_styles, outputs=[self.dropdown, self.selection], show_progress=False)
self.materialize.click(
fn=materialize_styles,
inputs=[main_ui_prompt, main_ui_negative_prompt, self.dropdown],
outputs=[main_ui_prompt, main_ui_negative_prompt, self.dropdown],
show_progress=False,
).then(fn=None, _js="function(){update_"+tabname+"_tokens(); closePopup();}", show_progress=False)
ui_common.setup_dialog(button_show=edit_button, dialog=styles_dialog, button_close=self.close)
"""

View File

@ -16,7 +16,7 @@ def create_toprow(is_img2img: bool = False, id_part: str = None):
if id_part is None:
id_part = "img2img" if is_img2img else "txt2img"
with gr.Row(elem_id=f"{id_part}_toprow", variant="compact"):
with gr.Column(elem_id=f"{id_part}_prompt_container", scale=6):
with gr.Column(elem_id=f"{id_part}_prompt_container", scale=5):
with gr.Row():
with gr.Column(scale=80):
with gr.Row():

View File

@ -14,86 +14,6 @@ class NoiseScheduleVP:
continuous_beta_0=0.1,
continuous_beta_1=20.,
):
"""Create a wrapper class for the forward SDE (VP type).
***
Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
***
The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
log_alpha_t = self.marginal_log_mean_coeff(t)
sigma_t = self.marginal_std(t)
lambda_t = self.marginal_lambda(t)
Moreover, as lambda(t) is an invertible function, we also support its inverse function:
t = self.inverse_lambda(lambda_t)
===============================================================
We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
1. For discrete-time DPMs:
For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
t_i = (i + 1) / N
e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
Args:
betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
Note that we always have alphas_cumprod = cumprod(betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
**Important**: Please pay special attention for the args for `alphas_cumprod`:
The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
alpha_{t_n} = \sqrt{\hat{alpha_n}},
and
log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
2. For continuous-time DPMs:
We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
schedule are the default settings in DDPM and improved-DDPM:
Args:
beta_min: A `float` number. The smallest beta for the linear schedule.
beta_max: A `float` number. The largest beta for the linear schedule.
cosine_s: A `float` number. The hyperparameter in the cosine schedule.
cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
T: A `float` number. The ending time of the forward process.
===============================================================
Args:
schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
'linear' or 'cosine' for continuous-time DPMs.
Returns:
A wrapper object of the forward SDE (VP type).
===============================================================
Example:
# For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
>>> ns = NoiseScheduleVP('discrete', betas=betas)
# For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
>>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
# For continuous-time DPMs (VPSDE), linear schedule:
>>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
"""
if schedule not in ['discrete', 'linear', 'cosine']:
raise ValueError(f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'")

View File

@ -8,6 +8,8 @@ processor = None
model = None
loaded: str = None
MODELS = {
"MS Florence 2 Base": "microsoft/Florence-2-base", # 0.5GB
"MS Florence 2 Large": "microsoft/Florence-2-large", # 1.5GB
"Moondream 2": "vikhyatk/moondream2", # 3.7GB
"GIT TextCaps Base": "microsoft/git-base-textcaps", # 0.7GB
"GIT VQA Base": "microsoft/git-base-vqav2", # 0.7GB
@ -124,7 +126,44 @@ def moondream(question: str, image: Image.Image, repo: str = None):
return response
def florence(question: str, image: Image.Image, repo: str = None):
global processor, model, loaded # pylint: disable=global-statement
if model is None or loaded != repo:
model = transformers.AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
processor = transformers.AutoProcessor.from_pretrained(repo, trust_remote_code=True)
loaded = repo
model.eval()
model.to(devices.device, devices.dtype)
shared.log.debug(f'VQA: class={model.__class__.__name__} processor={processor.__class__} model={repo}')
if question.startswith('<'):
task = question.split('>', 1)[0] + '>'
else:
task = '<MORE_DETAILED_CAPTION>'
question = task + question
inputs = processor(text=question, images=image, return_tensors="pt")
input_ids = inputs['input_ids'].to(devices.device)
pixel_values = inputs['pixel_values'].to(devices.device, devices.dtype)
with devices.inference_context():
generated_ids = model.generate(
input_ids=input_ids,
pixel_values=pixel_values,
max_new_tokens=1024,
num_beams=3,
do_sample=False
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
response = processor.post_process_generation(generated_text, task=task, image_size=(image.width, image.height))
if task in response:
response = response[task]
shared.log.debug(f'VQA: task={task} response="{response}"')
return response
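# usage sketch (task tokens as documented for Florence-2, e.g. '<CAPTION>', '<OCR>'):
#   florence('<OCR>', image, repo='microsoft/Florence-2-base')
# questions without a leading task token fall back to '<MORE_DETAILED_CAPTION>'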
def interrogate(vqa_question, vqa_image, vqa_model_req):
from installer import install
install('flash_attn', quiet=True)
vqa_model = MODELS.get(vqa_model_req, None)
shared.log.debug(f'VQA: model="{vqa_model}" question="{vqa_question}" image={vqa_image}')
if vqa_image is None:
@ -146,6 +185,8 @@ def interrogate(vqa_question, vqa_image, vqa_model_req):
answer = pix(vqa_question, vqa_image, vqa_model)
if 'moondream2' in vqa_model.lower():
answer = moondream(vqa_question, vqa_image, vqa_model)
elif 'florence' in vqa_model.lower():
answer = florence(vqa_question, vqa_image, vqa_model)
else:
answer = 'unknown model'
if model is not None:

View File

@ -27,7 +27,7 @@ fasteners
orjson
invisible-watermark
pi-heif
diffusers==0.29.0
diffusers==0.29.1
safetensors==0.4.3
tensordict==0.1.2
peft==0.11.1
@ -54,7 +54,7 @@ protobuf==4.25.3
pytorch_lightning==1.9.4
tokenizers==0.19.1
transformers==4.41.2
urllib3==1.26.18
urllib3==1.26.19
Pillow==10.3.0
timm==0.9.16
pydantic==1.10.15

View File

@ -104,11 +104,12 @@ class FaceRestorerYolo(FaceRestoration):
return np_image
self.load()
if self.model is None:
shared.log.error(f"Model load: type=FaceHires model='{self.model_name}' dir={self.model_dir} url={self.model_url}")
shared.log.debug('Face HiRes: model not loaded')
return np_image
image = Image.fromarray(np_image)
faces = self.predict(image)
if len(faces) == 0:
shared.log.debug('Face HiRes: no faces detected')
return np_image
# create backups
@ -140,6 +141,7 @@ class FaceRestorerYolo(FaceRestoration):
if args['denoising_strength'] == 0:
shared.log.debug('Face HiRes skip: strength=0')
control_pipeline = None
orig_class = shared.sd_model.__class__
if getattr(p, 'is_control', False):
from modules.control import run
control_pipeline = shared.sd_model
@ -177,6 +179,8 @@ class FaceRestorerYolo(FaceRestoration):
# restore pipeline
if control_pipeline is not None:
shared.sd_model = control_pipeline
else:
shared.sd_model.__class__ = orig_class
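# when not running under control, the img2img switch made for the face pass is undone here by
# restoring the pipeline class saved in orig_class earlier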
p = processing_class.switch_class(p, orig_cls, orig_p)
p.init_images = getattr(orig_p, 'init_images', None)
p.image_mask = getattr(orig_p, 'image_mask', None)

View File

@ -65,6 +65,7 @@ def apply_sampler(p, x, xs):
else:
p.sampler_name = sampler_name
def apply_hr_sampler_name(p, x, xs):
hr_sampler_name = sd_samplers.samplers_map.get(x.lower(), None)
if hr_sampler_name is None:
@ -72,6 +73,7 @@ def apply_hr_sampler_name(p, x, xs):
else:
p.hr_sampler_name = hr_sampler_name
def confirm_samplers(p, xs):
for x in xs:
if x.lower() not in sd_samplers.samplers_map:
@ -138,6 +140,24 @@ def apply_vae(p, x, xs):
sd_vae.reload_vae_weights(shared.sd_model, vae_file=find_vae(x))
def list_lora():
import sys
lora = [v for k, v in sys.modules.items() if k == 'networks'][0]
loras = [v.name for v in lora.available_networks.values()]
return ['None'] + loras
def apply_lora(p, x, xs):
if x == 'None':
return
p.prompt = p.prompt + f" <lora:{x}:{shared.opts.extra_networks_default_multiplier}>"
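# example with a hypothetical LoRA name: selecting "my-lora" on the grid axis appends
# " <lora:my-lora:{multiplier}>" to the prompt, where the multiplier comes from
# shared.opts.extra_networks_default_multiplier (1.0 by default)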
def apply_te(p, x, xs):
shared.opts.data["sd_text_encoder"] = x
sd_models.reload_text_encoder()
def apply_styles(p: processing.StableDiffusionProcessingTxt2Img, x: str, _):
p.styles.extend(x.split(','))
@ -230,6 +250,8 @@ axis_options = [
AxisOption("Prompt S/R", str, apply_prompt, fmt=format_value),
AxisOption("Model", str, apply_checkpoint, fmt=format_value, cost=1.0, choices=lambda: sorted(sd_models.checkpoints_list)),
AxisOption("VAE", str, apply_vae, cost=0.7, choices=lambda: ['None'] + list(sd_vae.vae_dict)),
AxisOption("LoRA", str, apply_lora, cost=0.5, choices=list_lora),
AxisOption("Text encoder", str, apply_te, cost=0.7, choices=lambda: ['None', 'T5 FP4', 'T5 FP8', 'T5 FP16']),
AxisOption("Styles", str, apply_styles, choices=lambda: [s.name for s in shared.prompt_styles.styles.values()]),
AxisOption("Seed", int, apply_field("seed")),
AxisOption("Steps", int, apply_field("steps")),
@ -251,6 +273,7 @@ axis_options = [
AxisOption("[Sampler] Sigma tmax", float, apply_field("s_tmax")),
AxisOption("[Sampler] Sigma Churn", float, apply_field("s_churn")),
AxisOption("[Sampler] Sigma noise", float, apply_field("s_noise")),
AxisOption("[Sampler] Shift", float, apply_setting("schedulers_shift")),
AxisOption("[Sampler] ETA", float, apply_setting("scheduler_eta")),
AxisOption("[Sampler] Solver order", int, apply_setting("schedulers_solver_order")),
AxisOption("[Second pass] Upscaler", str, apply_field("hr_upscaler"), choices=lambda: [*shared.latent_upscale_modes, *[x.name for x in shared.sd_upscalers]]),

Some files were not shown because too many files have changed in this diff.