diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 98a27b4ce..d56e113ba 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -26,14 +26,15 @@ body:
         Easiest is to include top part of console log, for example:  
         ```log
         Starting SD.Next
-        Python 3.10.6 on Linux
-        Version: abd7d160 Sat Jun 10 07:37:42 2023 -0400
-        nVidia CUDA toolkit detected
-        Torch 2.1.0.dev20230519+cu121
-        Torch backend: nVidia CUDA 12.1 cuDNN 8801
-        Torch detected GPU: NVIDIA GeForce RTX 3060 VRAM 12288 Arch (8, 6) Cores 28
-        Enabled extensions-builtin: [...]
-        Enabled extensions: [...]
+        Version: app=sd.next updated=2024-06-28 hash=1fc20e72 branch=dev url=https://github.com/vladmandic/automatic/tree/dev ui=dev
+        Branch sync failed: sdnext=dev ui=dev
+        Platform: arch=x86_64 cpu=x86_64 system=Linux release=5.15.153.1-microsoft-standard-WSL2 python=3.12.3
+        Torch allocator: "garbage_collection_threshold:0.80,max_split_size_mb:512"
+        Load packages: {'torch': '2.3.1+cu121', 'diffusers': '0.29.1', 'gradio': '3.43.2'}
+        Engine: backend=Backend.DIFFUSERS compute=cuda device=cuda attention="Scaled-Dot-Product" mode=no_grad
+        Device: device=NVIDIA GeForce RTX 4090 n=1 arch=sm_90 cap=(8, 9) cuda=12.1 cudnn=8902 driver=555.99
+        Extensions: enabled=['sd-webui-agent-scheduler', 'sd-extension-chainner', 'sd-extension-system-info', 'sdnext-modernui', 'Lora'] extensions-builtin
+        Extensions: enabled=[] extensions
         ```
   - type: markdown
     attributes:
@@ -73,6 +74,18 @@ body:
       default: 0
     validations:
       required: true
+  - type: dropdown
+    id: ui
+    attributes:
+      label: UI
+      description: Which UI are you using?
+      options:
+        - N/A  # "None" is a reserved word in GitHub issue-form dropdown options
+        - Standard
+        - ModernUI
+      default: 1  # zero-based index into options, i.e. Standard
+    validations:
+      required: true
   - type: dropdown
     id: branch
     attributes:
@@ -90,11 +103,12 @@ body:
       label: Model
       description: What is the model type you're using?
       options:
-        - SD 1.5
-        - SD 2.1
-        - SD-XL
+        - StableDiffusion 1.5
+        - StableDiffusion 2.1
+        - StableDiffusion XL
+        - StableDiffusion 3
         - PixArt
-        - Stable Cascade
+        - StableCascade
         - Kandinsky
         - Other
       default: 0
diff --git a/.github/workflows/on_pull_request.yaml b/.github/workflows/on_pull_request.yaml
index 7abade093..5b00eec65 100644
--- a/.github/workflows/on_pull_request.yaml
+++ b/.github/workflows/on_pull_request.yaml
@@ -7,6 +7,12 @@ on:
 jobs:
   lint:
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        flags:
+          - --debug --test --uv
+          - --debug --test
     steps:
       - name: checkout-code
         uses: actions/checkout@main
@@ -27,5 +33,5 @@ jobs:
           msg: apply code formatting and linting auto-fixes
       - name: test-startup
         run: |
-          export COMMANDLINE_ARGS="--debug --test"
+          export COMMANDLINE_ARGS="${{ matrix.flags }}"
           python launch.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2b10a416..5729e62b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,82 @@
 # Change Log for SD.Next
 
+## Update for 2024-07-09: WiP
+
+### Pending
+
+- Requires `diffusers==0.30.0`
+- [AuraFlow](https://github.com/huggingface/diffusers/pull/8796) (previously known as LavenderFlow)
+- [Kolors](https://github.com/huggingface/diffusers/pull/8812)
+- [ControlNet Union](https://huggingface.co/xinsir/controlnet-union-sdxl-1.0) pipeline
+- FlowMatchHeunDiscreteScheduler enable
+
+### Highlights
+
+Massive update to Wiki with over 20 new pages and articles, now includes guides for nearly all major features  
+Support for new models:
+- [AlphaVLLM Lumina-Next-SFT](https://huggingface.co/Alpha-VLLM/Lumina-Next-SFT-diffusers)
+- [Kwai Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+- [HunyuanDiT 1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers)
+
+What else? Just a bit... ;)
+New **fast-install** mode, new **controlnet-union** *all-in-one* model, support for **DoRA** networks, additional **VLM** models, new **AuraSR** upscaler, and more...
+
+### New Models
+
+- [AlphaVLLM Lumina-Next-SFT](https://huggingface.co/Alpha-VLLM/Lumina-Next-SFT-diffusers)  
+  to use, simply select from *networks -> reference*  
+  use scheduler: default or euler flowmatch or heun flowmatch  
+  note: this model uses T5 XXL variation of text encoder  
+  (previous version of Lumina used Gemma 2B as text encoder)  
+- [Kwai Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
+  to use, simply select from *networks -> reference*  
+  note: this is an SDXL style model that replaces standard CLiP-L and CLiP-G text encoders with a massive `chatglm3-6b` encoder  
+  however, this new encoder does support both English and Chinese prompting  
+- [HunyuanDiT 1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers)
+  to use, simply select from *networks -> reference*
+
+## Update for 2024-07-08
+
+This release is primarily a service release with cumulative fixes and several improvements, but no breaking changes.
+
+**New features...**
+- massive updates to [Wiki](https://github.com/vladmandic/automatic/wiki)  
+  with over 20 new pages and articles, now includes guides for nearly all major features  
+  *note*: this is work-in-progress, if you have any feedback or suggestions, please let us know!
+  thanks @GenesisArtemis!  
+- support for **DoRA** networks, thanks @AI-Casanova!
+- support for [uv](https://pypi.org/project/uv/), extremely fast installer, thanks @Yoinky3000!  
+  to use, simply add `--uv` to your command line params  
+- [Xinsir ControlNet++ Union](https://huggingface.co/xinsir/controlnet-union-sdxl-1.0)  
+  new SDXL *all-in-one* controlnet that can process any kind of preprocessors!
+- [CogFlorence 2 Large](https://huggingface.co/thwri/CogFlorence-2-Large-Freeze) VLM model  
+  to use, simply select in process -> visual query  
+- [AuraSR](https://huggingface.co/fal/AuraSR) high-quality 4x GAN-style upscaling model  
+  note: this is a large upscaler at 2.5GB  
+
+**And fixes...**
+- enable **Florence VLM**  for all platforms, thanks @lshqqytiger!  
+- improve ROCm detection under WSL2, thanks @lshqqytiger!  
+- add SD3 with FP16 T5 to list of detected models
+- fix executing extensions with zero params  
+- add support for embeddings bundled in LoRA, thanks @AI-Casanova!
+- fix executing extensions with zero params  
+- fix nncf for lora, thanks @Disty0!
+- fix diffusers version detection for SD3
+- fix current step for higher order samplers
+- fix control input type video  
+- fix reset pipeline at the end of each iteration  
+- fix faceswap when no faces detected  
+- multiple ModernUI fixes
+
 ## Update for 2024-06-23
 
 ### Highlights for 2024-06-23
 
-Following zero-day **SD3** release, a 10 days later here's a refresh with 10+ improvements  
+Following zero-day **SD3** release, 10 days later here's a refresh with 10+ improvements  
 including full prompt attention, support for compressed weights, additional text-encoder quantization modes.  
 
-But there's more than SD3:  
+But there's more than SD3:  
 - support for quantized **T5** text encoder *FP16/FP8/FP4/INT8* in all models that use T5: SD3, PixArt-Σ, etc.  
 - support for **PixArt-Sigma** in small/medium/large variants  
 - support for **HunyuanDiT 1.1**  
@@ -17,7 +86,7 @@ But there's more than SD3:
 - additional efficiencies for users with low VRAM GPUs  
 - over 20 overall fixes  
 
-### Model Improvements
+### Model Improvements for 2024-06-23
 
 - **SD3**: enable tiny-VAE (TAESD) preview and non-full quality mode  
 - SD3: enable base LoRA support  
@@ -43,9 +112,9 @@ But there's more than SD3:
 - **MS Florence**: integration of Microsoft Florence VLM/VQA Base and Large models  
   simply select in *process -> visual query*!
 
-### General Improvements
+### General Improvements for 2024-06-23
 
-- support FP4 quantized T5 text encoder, in addtion to existing FP8 and FP16
+- support FP4 quantized T5 text encoder, in addition to existing FP8 and FP16
 - support for T5 text-encoder loader in **all** models that use T5  
   *example*: load FP4 or FP8 quantized T5 text-encoder into PixArt Sigma!
 - support for `torch-directml` **0.2.2**, thanks @lshqqytiger!  
@@ -67,7 +136,7 @@ But there's more than SD3:
 - Lora support without reloading the model  
 - ControlNet compression support  
 
-### Fixes
+### Fixes for 2024-06-23
 
 - fix unsaturated outputs, force apply vae config on model load  
 - fix hidiffusion handling of non-square aspect ratios, thanks @ShenZhang-Shin!
@@ -105,7 +174,7 @@ Plus tons of minor features such as optimized initial install experience, **T-Ga
 
 ### Full Changelog for 2024-06-13
 
-#### New Models
+#### New Models for 2024-06-13
 
 - [StabilityAI Stable Diffusion 3 Medium](https://stability.ai/news/stable-diffusion-3-medium)  
   yup, supported!  
@@ -116,7 +185,7 @@ Plus tons of minor features such as optimized initial install experience, **T-Ga
   note: this is a very large model at ~17GB, but can be used with less VRAM using model offloading  
   simply select from networks -> models -> reference, model will be auto-downloaded on first use  
 
-#### New Functionality
+#### New Functionality for 2024-06-13
 
 - [MuLan](https://github.com/mulanai/MuLan) Multi-language prompts
   write your prompts in ~110 auto-detected languages!  
@@ -153,7 +222,7 @@ Plus tons of minor features such as optimized initial install experience, **T-Ga
   typical differences are not large and its disabled by default as it does have some performance impact  
 - new sampler: **Euler FlowMatch**  
 
-#### Improvements
+#### Improvements for 2024-06-13
 
 - additional modernui themes
 - reintroduce prompt attention normalization, disabled by default, enable in settings -> execution  
@@ -173,7 +242,7 @@ Plus tons of minor features such as optimized initial install experience, **T-Ga
 - auto-synchronize modernui and core branches  
 - add option to pad prompt with zeros, thanks @Disty
 
-#### Fixes
+#### Fixes for 2024-06-13
 
 - cumulative fixes since the last release  
 - fix apply/unapply hidiffusion for sd15  
diff --git a/README.md b/README.md
index 52853ad3a..a0ec2a07b 100644
--- a/README.md
+++ b/README.md
@@ -64,31 +64,31 @@ For screenshots and informations on other available themes, see [Themes Wiki](ht
 
 Additional models will be added as they become available and there is public interest in them
 
-- [RunwayML Stable Diffusion](https://github.com/Stability-AI/stablediffusion/) 1.x and 2.x *(all variants)*  
-- [StabilityAI Stable Diffusion XL](https://github.com/Stability-AI/generative-models)  
-- [StabilityAI Stable Diffusion 3 Medium](https://stability.ai/news/stable-diffusion-3-medium)  
+- [RunwayML Stable Diffusion](https://github.com/Stability-AI/stablediffusion/) 1.x and 2.x *(all variants)*
+- [StabilityAI Stable Diffusion XL](https://github.com/Stability-AI/generative-models)
+- [StabilityAI Stable Diffusion 3 Medium](https://stability.ai/news/stable-diffusion-3-medium)
 - [StabilityAI Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) Base, XT 1.0, XT 1.1
-- [LCM: Latent Consistency Models](https://github.com/openai/consistency_models)  
-- [Playground](https://huggingface.co/playgroundai/playground-v2-256px-base) *v1, v2 256, v2 512, v2 1024 and latest v2.5*  
+- [LCM: Latent Consistency Models](https://github.com/openai/consistency_models)
+- [Playground](https://huggingface.co/playgroundai/playground-v2-256px-base) *v1, v2 256, v2 512, v2 1024 and latest v2.5*
 - [Stable Cascade](https://github.com/Stability-AI/StableCascade) *Full* and *Lite*
 - [aMUSEd 256](https://huggingface.co/amused/amused-256) 256 and 512
-- [Segmind Vega](https://huggingface.co/segmind/Segmind-Vega)  
-- [Segmind SSD-1B](https://huggingface.co/segmind/SSD-1B)  
-- [Segmind SegMoE](https://github.com/segmind/segmoe) *SD and SD-XL*  
-- [Kandinsky](https://github.com/ai-forever/Kandinsky-2) *2.1 and 2.2 and latest 3.0*  
-- [PixArt-α XL 2](https://github.com/PixArt-alpha/PixArt-alpha) *Medium and Large*  
-- [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma)  
-- [Warp Wuerstchen](https://huggingface.co/blog/wuertschen)  
+- [Segmind Vega](https://huggingface.co/segmind/Segmind-Vega)
+- [Segmind SSD-1B](https://huggingface.co/segmind/SSD-1B)
+- [Segmind SegMoE](https://github.com/segmind/segmoe) *SD and SD-XL*
+- [Kandinsky](https://github.com/ai-forever/Kandinsky-2) *2.1 and 2.2 and latest 3.0*
+- [PixArt-α XL 2](https://github.com/PixArt-alpha/PixArt-alpha) *Medium and Large*
+- [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma)
+- [Warp Wuerstchen](https://huggingface.co/blog/wuertschen)
 - [Tenecent HunyuanDiT](https://github.com/Tencent/HunyuanDiT)
 - [Tsinghua UniDiffusion](https://github.com/thu-ml/unidiffuser)
 - [DeepFloyd IF](https://github.com/deep-floyd/IF) *Medium and Large*
 - [ModelScope T2V](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b)
 - [Segmind SD Distilled](https://huggingface.co/blog/sd_distillation) *(all variants)*
-- [BLIP-Diffusion](https://dxli94.github.io/BLIP-Diffusion-website/)  
+- [BLIP-Diffusion](https://dxli94.github.io/BLIP-Diffusion-website/)
 - [KOALA 700M](https://github.com/youngwanLEE/sdxl-koala)
-- [VGen](https://huggingface.co/ali-vilab/i2vgen-xl)  
+- [VGen](https://huggingface.co/ali-vilab/i2vgen-xl)
 - [SDXS](https://github.com/IDKiro/sdxs)
-- [Hyper-SD](https://huggingface.co/ByteDance/Hyper-SD) 
+- [Hyper-SD](https://huggingface.co/ByteDance/Hyper-SD)
 
 
 Also supported are modifiers such as:
@@ -226,6 +226,7 @@ List of available parameters, run `webui --help` for the full & up-to-date list:
       --version                                          Print version information
       --ignore                                           Ignore any errors and attempt to continue
       --safe                                             Run in safe mode with no user extensions
+      --uv                                               Use uv as installer, default: False
 
     Logging options:
       --log LOG                                          Set log file, default: None
diff --git a/TODO.md b/TODO.md
index 0647494dc..1a63122ec 100644
--- a/TODO.md
+++ b/TODO.md
@@ -10,8 +10,7 @@ Main ToDo list can be found at [GitHub projects](https://github.com/users/vladma
 - init latents: variations, img2img
 - diffusers public callbacks  
 - include reference styles
-- lora: sc lora, dora, etc
-- sd3 controlnet: <https://github.com/huggingface/diffusers/pull/8566>
+- lora: sc lora, etc
 
 ## Experimental
 
diff --git a/extensions-builtin/Lora/lora_convert.py b/extensions-builtin/Lora/lora_convert.py
index 827f97e3d..8432d8208 100644
--- a/extensions-builtin/Lora/lora_convert.py
+++ b/extensions-builtin/Lora/lora_convert.py
@@ -164,6 +164,8 @@ class KeyConvert:
 
     def diffusers(self, key):
         if self.is_sdxl:
+            if "diffusion_model" in key:  # Fix NTC Slider naming error
+                key = key.replace("diffusion_model", "lora_unet")
             map_keys = list(self.UNET_CONVERSION_MAP.keys())  # prefix of U-Net modules
             map_keys.sort()
             search_key = key.replace(self.LORA_PREFIX_UNET, "").replace(self.OFT_PREFIX_UNET, "").replace(self.LORA_PREFIX_TEXT_ENCODER1, "").replace(self.LORA_PREFIX_TEXT_ENCODER2, "")
diff --git a/extensions-builtin/Lora/network.py b/extensions-builtin/Lora/network.py
index a6579ae90..dc9ec4c8a 100644
--- a/extensions-builtin/Lora/network.py
+++ b/extensions-builtin/Lora/network.py
@@ -65,6 +65,7 @@ class Network:  # LoraModule
         self.unet_multiplier = [1.0] * 3
         self.dyn_dim = None
         self.modules = {}
+        self.bundle_embeddings = {}
         self.mtime = None
         self.mentioned_name = None
         """the text that was used to add the network to prompt - can be either name or an alias"""
@@ -87,6 +88,8 @@ class NetworkModule:
         self.bias = weights.w.get("bias")
         self.alpha = weights.w["alpha"].item() if "alpha" in weights.w else None
         self.scale = weights.w["scale"].item() if "scale" in weights.w else None
+        self.dora_scale = weights.w.get("dora_scale", None)
+        self.dora_norm_dims = len(self.shape) - 1
 
     def multiplier(self):
         unet_multiplier = 3 * [self.network.unet_multiplier] if not isinstance(self.network.unet_multiplier, list) else self.network.unet_multiplier
@@ -108,6 +111,27 @@ class NetworkModule:
             return self.alpha / self.dim
         return 1.0
 
+    def apply_weight_decompose(self, updown, orig_weight):
+        # DoRA step: rescale (orig_weight + updown) by dora_scale / norm and return the result as a delta against orig_weight
+        orig_weight = orig_weight.to(updown.dtype)  # compute in the dtype of the update
+        dora_scale = self.dora_scale.to(device=orig_weight.device, dtype=updown.dtype)
+        updown = updown.to(orig_weight.device)  # keep all operands on the device of the original weight
+        # merged weight as it would look with the plain update simply added
+        merged_scale1 = updown + orig_weight
+        merged_scale1_norm = (
+            merged_scale1.transpose(0, 1)
+            .reshape(merged_scale1.shape[1], -1)  # one row per index of dim 1
+            .norm(dim=1, keepdim=True)  # per-row norm
+            .reshape(merged_scale1.shape[1], *[1] * self.dora_norm_dims)
+            .transpose(0, 1)  # swap dims 0 and 1 back to the original axis order
+        )
+        # scale the merged weight by dora_scale / norm (presumably broadcast over dim 1 - confirm dora_scale shape)
+        dora_merged = (
+                merged_scale1 * (dora_scale / merged_scale1_norm)
+        )
+        final_updown = dora_merged - orig_weight  # express the rescaled weight as a difference from orig_weight
+        return final_updown
+
     def finalize_updown(self, updown, orig_weight, output_shape, ex_bias=None):
         if self.bias is not None:
             updown = updown.reshape(self.bias.shape)
@@ -119,6 +143,8 @@ class NetworkModule:
             updown = updown.reshape(orig_weight.shape)
         if ex_bias is not None:
             ex_bias = ex_bias * self.multiplier()
+        if self.dora_scale is not None:
+            updown = self.apply_weight_decompose(updown, orig_weight)
         return updown * self.calc_scale() * self.multiplier(), ex_bias
 
     def calc_updown(self, target):
diff --git a/extensions-builtin/Lora/networks.py b/extensions-builtin/Lora/networks.py
index 71b5b29dc..4c6162677 100644
--- a/extensions-builtin/Lora/networks.py
+++ b/extensions-builtin/Lora/networks.py
@@ -1,504 +1,513 @@
-from typing import Union, List
-import os
-import re
-import time
-import concurrent
-import lora_patches
-import network
-import network_lora
-import network_hada
-import network_ia3
-import network_oft
-import network_lokr
-import network_full
-import network_norm
-import network_glora
-import network_overrides
-import lora_convert
-import torch
-import diffusers.models.lora
-from modules import shared, devices, sd_models, sd_models_compile, errors, scripts, files_cache
-
-
-debug = os.environ.get('SD_LORA_DEBUG', None) is not None
-originals: lora_patches.LoraPatches = None
-extra_network_lora = None
-available_networks = {}
-available_network_aliases = {}
-loaded_networks: List[network.Network] = []
-timer = { 'load': 0, 'apply': 0, 'restore': 0 }
-# networks_in_memory = {}
-lora_cache = {}
-available_network_hash_lookup = {}
-forbidden_network_aliases = {}
-re_network_name = re.compile(r"(.*)\s*\([0-9a-fA-F]+\)")
-module_types = [
-    network_lora.ModuleTypeLora(),
-    network_hada.ModuleTypeHada(),
-    network_ia3.ModuleTypeIa3(),
-    network_oft.ModuleTypeOFT(),
-    network_lokr.ModuleTypeLokr(),
-    network_full.ModuleTypeFull(),
-    network_norm.ModuleTypeNorm(),
-    network_glora.ModuleTypeGLora(),
-]
-convert_diffusers_name_to_compvis = lora_convert.convert_diffusers_name_to_compvis # supermerger compatibility item
-
-
-def assign_network_names_to_compvis_modules(sd_model):
-    network_layer_mapping = {}
-    if shared.native:
-        if not hasattr(shared.sd_model, 'text_encoder') or not hasattr(shared.sd_model, 'unet'):
-            sd_model.network_layer_mapping = {}
-            return
-        for name, module in shared.sd_model.text_encoder.named_modules():
-            prefix = "lora_te1_" if shared.sd_model_type == "sdxl" else "lora_te_"
-            network_name = prefix + name.replace(".", "_")
-            network_layer_mapping[network_name] = module
-            module.network_layer_name = network_name
-        if shared.sd_model_type == "sdxl":
-            for name, module in shared.sd_model.text_encoder_2.named_modules():
-                network_name = "lora_te2_" + name.replace(".", "_")
-                network_layer_mapping[network_name] = module
-                module.network_layer_name = network_name
-        for name, module in shared.sd_model.unet.named_modules():
-            network_name = "lora_unet_" + name.replace(".", "_")
-            network_layer_mapping[network_name] = module
-            module.network_layer_name = network_name
-    else:
-        if not hasattr(shared.sd_model, 'cond_stage_model'):
-            sd_model.network_layer_mapping = {}
-            return
-        for name, module in shared.sd_model.cond_stage_model.wrapped.named_modules():
-            network_name = name.replace(".", "_")
-            network_layer_mapping[network_name] = module
-            module.network_layer_name = network_name
-        for name, module in shared.sd_model.model.named_modules():
-            network_name = name.replace(".", "_")
-            network_layer_mapping[network_name] = module
-            module.network_layer_name = network_name
-    sd_model.network_layer_mapping = network_layer_mapping
-
-
-def load_diffusers(name, network_on_disk, lora_scale=1.0) -> network.Network:
-    t0 = time.time()
-    cached = lora_cache.get(name, None)
-    # if debug:
-    shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" type=diffusers {"cached" if cached else ""} fuse={shared.opts.lora_fuse_diffusers}')
-    if cached is not None:
-        return cached
-    if not shared.native:
-        return None
-    if not hasattr(shared.sd_model, 'load_lora_weights'):
-        shared.log.error(f"LoRA load failed: class={shared.sd_model.__class__} does not implement load lora")
-        return None
-    try:
-        shared.sd_model.load_lora_weights(network_on_disk.filename)
-    except Exception as e:
-        errors.display(e, "LoRA")
-        return None
-    if shared.opts.lora_fuse_diffusers:
-        shared.sd_model.fuse_lora(lora_scale=lora_scale)
-    net = network.Network(name, network_on_disk)
-    net.mtime = os.path.getmtime(network_on_disk.filename)
-    lora_cache[name] = net
-    t1 = time.time()
-    timer['load'] += t1 - t0
-    return net
-
-
-def load_network(name, network_on_disk) -> network.Network:
-    t0 = time.time()
-    cached = lora_cache.get(name, None)
-    if debug:
-        shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" type=lora {"cached" if cached else ""}')
-    if cached is not None:
-        return cached
-    net = network.Network(name, network_on_disk)
-    net.mtime = os.path.getmtime(network_on_disk.filename)
-    sd = sd_models.read_state_dict(network_on_disk.filename)
-    assign_network_names_to_compvis_modules(shared.sd_model) # this should not be needed but is here as an emergency fix for an unknown error people are experiencing in 1.2.0
-    keys_failed_to_match = {}
-    matched_networks = {}
-    convert = lora_convert.KeyConvert()
-    for key_network, weight in sd.items():
-        parts = key_network.split('.')
-        if len(parts) > 5: # messy handler for diffusers peft lora
-            key_network_without_network_parts = '_'.join(parts[:-2])
-            if not key_network_without_network_parts.startswith('lora_'):
-                key_network_without_network_parts = 'lora_' + key_network_without_network_parts
-            network_part = '.'.join(parts[-2:]).replace('lora_A', 'lora_down').replace('lora_B', 'lora_up')
-        else:
-            key_network_without_network_parts, network_part = key_network.split(".", 1)
-        # if debug:
-        #     shared.log.debug(f'LoRA load: name="{name}" full={key_network} network={network_part} key={key_network_without_network_parts}')
-        key, sd_module = convert(key_network_without_network_parts)  # Now returns lists
-        if sd_module[0] is None:
-            keys_failed_to_match[key_network] = key
-            continue
-        for k, module in zip(key, sd_module):
-            if k not in matched_networks:
-                matched_networks[k] = network.NetworkWeights(network_key=key_network, sd_key=k, w={}, sd_module=module)
-            matched_networks[k].w[network_part] = weight
-    for key, weights in matched_networks.items():
-        net_module = None
-        for nettype in module_types:
-            net_module = nettype.create_module(net, weights)
-            if net_module is not None:
-                break
-        if net_module is None:
-            shared.log.error(f'LoRA unhandled: name={name} key={key} weights={weights.w.keys()}')
-        else:
-            net.modules[key] = net_module
-    if len(keys_failed_to_match) > 0:
-        shared.log.warning(f"LoRA file={network_on_disk.filename} unmatched={len(keys_failed_to_match)} matched={len(matched_networks)}")
-        if debug:
-            shared.log.debug(f"LoRA file={network_on_disk.filename} unmatched={keys_failed_to_match}")
-    elif debug:
-        shared.log.debug(f"LoRA file={network_on_disk.filename} unmatched={len(keys_failed_to_match)} matched={len(matched_networks)}")
-    lora_cache[name] = net
-    t1 = time.time()
-    timer['load'] += t1 - t0
-    return net
-
-
-def load_networks(names, te_multipliers=None, unet_multipliers=None, dyn_dims=None):
-    networks_on_disk = [available_network_aliases.get(name, None) for name in names]
-    if any(x is None for x in networks_on_disk):
-        list_available_networks()
-        networks_on_disk = [available_network_aliases.get(name, None) for name in names]
-    failed_to_load_networks = []
-    recompile_model = False
-    if shared.compiled_model_state is not None and shared.compiled_model_state.is_compiled:
-        if len(names) == len(shared.compiled_model_state.lora_model):
-            for i, name in enumerate(names):
-                if shared.compiled_model_state.lora_model[i] != f"{name}:{te_multipliers[i] if te_multipliers else 1.0}":
-                    recompile_model = True
-                    shared.compiled_model_state.lora_model = []
-                    break
-            if not recompile_model:
-                if len(loaded_networks) > 0 and debug:
-                    shared.log.debug('Model Compile: Skipping LoRa loading')
-                return
-        else:
-            recompile_model = True
-            shared.compiled_model_state.lora_model = []
-    if recompile_model:
-        backup_cuda_compile = shared.opts.cuda_compile
-        sd_models.unload_model_weights(op='model')
-        shared.opts.cuda_compile = False
-        sd_models.reload_model_weights(op='model')
-        shared.opts.cuda_compile = backup_cuda_compile
-
-    loaded_networks.clear()
-    for i, (network_on_disk, name) in enumerate(zip(networks_on_disk, names)):
-        net = None
-        if network_on_disk is not None:
-            shorthash = getattr(network_on_disk, 'shorthash', '').lower()
-            if debug:
-                shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" hash="{shorthash}"')
-            try:
-                if recompile_model:
-                    shared.compiled_model_state.lora_model.append(f"{name}:{te_multipliers[i] if te_multipliers else 1.0}")
-                if shared.native and shared.opts.lora_force_diffusers: # OpenVINO only works with Diffusers LoRa loading
-                    net = load_diffusers(name, network_on_disk, lora_scale=te_multipliers[i] if te_multipliers else 1.0)
-                elif shared.native and network_overrides.check_override(shorthash):
-                    net = load_diffusers(name, network_on_disk, lora_scale=te_multipliers[i] if te_multipliers else 1.0)
-                else:
-                    net = load_network(name, network_on_disk)
-            except Exception as e:
-                shared.log.error(f"LoRA load failed: file={network_on_disk.filename} {e}")
-                if debug:
-                    errors.display(e, f"LoRA load failed file={network_on_disk.filename}")
-                continue
-            net.mentioned_name = name
-            network_on_disk.read_hash()
-        if net is None:
-            failed_to_load_networks.append(name)
-            shared.log.error(f"LoRA unknown type: network={name}")
-            continue
-        net.te_multiplier = te_multipliers[i] if te_multipliers else 1.0
-        net.unet_multiplier = unet_multipliers[i] if unet_multipliers else 1.0
-        net.dyn_dim = dyn_dims[i] if dyn_dims else 1.0
-        loaded_networks.append(net)
-
-    while len(lora_cache) > shared.opts.lora_in_memory_limit:
-        name = next(iter(lora_cache))
-        lora_cache.pop(name, None)
-    if len(loaded_networks) > 0 and debug:
-        shared.log.debug(f'LoRA loaded={len(loaded_networks)} cache={list(lora_cache)}')
-    devices.torch_gc()
-
-    if recompile_model:
-        shared.log.info("LoRA recompiling model")
-        backup_lora_model = shared.compiled_model_state.lora_model
-        if shared.opts.cuda_compile:
-            shared.sd_model = sd_models_compile.compile_diffusers(shared.sd_model)
-
-        shared.compiled_model_state.lora_model = backup_lora_model
-
-
-def network_restore_weights_from_backup(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn.GroupNorm, torch.nn.LayerNorm, torch.nn.MultiheadAttention, diffusers.models.lora.LoRACompatibleLinear, diffusers.models.lora.LoRACompatibleConv]):
-    t0 = time.time()
-    weights_backup = getattr(self, "network_weights_backup", None)
-    bias_backup = getattr(self, "network_bias_backup", None)
-    if weights_backup is None and bias_backup is None:
-        return
-    # if debug:
-    #     shared.log.debug('LoRA restore weights')
-    if weights_backup is not None:
-        if isinstance(self, torch.nn.MultiheadAttention):
-            self.in_proj_weight.copy_(weights_backup[0])
-            self.out_proj.weight.copy_(weights_backup[1])
-        else:
-            self.weight.copy_(weights_backup)
-    if bias_backup is not None:
-        if isinstance(self, torch.nn.MultiheadAttention):
-            self.out_proj.bias.copy_(bias_backup)
-        else:
-            self.bias.copy_(bias_backup)
-    else:
-        if isinstance(self, torch.nn.MultiheadAttention):
-            self.out_proj.bias = None
-        else:
-            self.bias = None
-    t1 = time.time()
-    timer['restore'] += t1 - t0
-
-
-def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn.GroupNorm, torch.nn.LayerNorm, torch.nn.MultiheadAttention, diffusers.models.lora.LoRACompatibleLinear, diffusers.models.lora.LoRACompatibleConv]):
-    """
-    Applies the currently selected set of networks to the weights of torch layer self.
-    If weights already have this particular set of networks applied, does nothing.
-    If not, restores orginal weights from backup and alters weights according to networks.
-    """
-    network_layer_name = getattr(self, 'network_layer_name', None)
-    if network_layer_name is None:
-        return
-    t0 = time.time()
-    current_names = getattr(self, "network_current_names", ())
-    wanted_names = tuple((x.name, x.te_multiplier, x.unet_multiplier, x.dyn_dim) for x in loaded_networks)
-    weights_backup = getattr(self, "network_weights_backup", None)
-    if weights_backup is None and wanted_names != (): # pylint: disable=C1803
-        if current_names != ():
-            raise RuntimeError("no backup weights found and current weights are not unchanged")
-        if isinstance(self, torch.nn.MultiheadAttention):
-            weights_backup = (self.in_proj_weight.to(devices.cpu, copy=True), self.out_proj.weight.to(devices.cpu, copy=True))
-        else:
-            weights_backup = self.weight.to(devices.cpu, copy=True)
-        self.network_weights_backup = weights_backup
-    bias_backup = getattr(self, "network_bias_backup", None)
-    if bias_backup is None:
-        if isinstance(self, torch.nn.MultiheadAttention) and self.out_proj.bias is not None:
-            bias_backup = self.out_proj.bias.to(devices.cpu, copy=True)
-        elif getattr(self, 'bias', None) is not None:
-            bias_backup = self.bias.to(devices.cpu, copy=True)
-        else:
-            bias_backup = None
-        self.network_bias_backup = bias_backup
-
-    if current_names != wanted_names:
-        network_restore_weights_from_backup(self)
-        for net in loaded_networks:
-            # default workflow where module is known and has weights
-            module = net.modules.get(network_layer_name, None)
-            if module is not None and hasattr(self, 'weight'):
-                try:
-                    with devices.inference_context():
-                        updown, ex_bias = module.calc_updown(self.weight)
-                        if len(self.weight.shape) == 4 and self.weight.shape[1] == 9:
-                            # inpainting model. zero pad updown to make channel[1]  4 to 9
-                            updown = torch.nn.functional.pad(updown, (0, 0, 0, 0, 0, 5)) # pylint: disable=not-callable
-                        self.weight = torch.nn.Parameter(self.weight + updown)
-                        if ex_bias is not None and hasattr(self, 'bias'):
-                            if self.bias is None:
-                                self.bias = torch.nn.Parameter(ex_bias)
-                            else:
-                                self.bias += ex_bias
-                except RuntimeError as e:
-                    extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
-                    if debug:
-                        module_name = net.modules.get(network_layer_name, None)
-                        shared.log.error(f"LoRA apply weight name={net.name} module={module_name} layer={network_layer_name} {e}")
-                        errors.display(e, 'LoRA apply weight')
-                        raise RuntimeError('LoRA apply weight') from e
-                continue
-            # alternative workflow looking at _*_proj layers
-            module_q = net.modules.get(network_layer_name + "_q_proj", None)
-            module_k = net.modules.get(network_layer_name + "_k_proj", None)
-            module_v = net.modules.get(network_layer_name + "_v_proj", None)
-            module_out = net.modules.get(network_layer_name + "_out_proj", None)
-            if isinstance(self, torch.nn.MultiheadAttention) and module_q and module_k and module_v and module_out:
-                try:
-                    with devices.inference_context():
-                        updown_q, _ = module_q.calc_updown(self.in_proj_weight)
-                        updown_k, _ = module_k.calc_updown(self.in_proj_weight)
-                        updown_v, _ = module_v.calc_updown(self.in_proj_weight)
-                        updown_qkv = torch.vstack([updown_q, updown_k, updown_v])
-                        updown_out, ex_bias = module_out.calc_updown(self.out_proj.weight)
-                        self.in_proj_weight += updown_qkv
-                        self.out_proj.weight += updown_out
-                    if ex_bias is not None:
-                        if self.out_proj.bias is None:
-                            self.out_proj.bias = torch.nn.Parameter(ex_bias)
-                        else:
-                            self.out_proj.bias += ex_bias
-                except RuntimeError as e:
-                    if debug:
-                        shared.log.debug(f"LoRA network={net.name} layer={network_layer_name} {e}")
-                    extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
-                continue
-            if module is None:
-                continue
-            shared.log.warning(f"LoRA network={net.name} layer={network_layer_name} unsupported operation")
-            extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
-        self.network_current_names = wanted_names
-    t1 = time.time()
-    timer['apply'] += t1 - t0
-
-
-def network_forward(module, input, original_forward): # pylint: disable=W0622
-    """
-    Old way of applying Lora by executing operations during layer's forward.
-    Stacking many loras this way results in big performance degradation.
-    """
-    if len(loaded_networks) == 0:
-        return original_forward(module, input)
-    input = devices.cond_cast_unet(input)
-    network_restore_weights_from_backup(module)
-    network_reset_cached_weight(module)
-    y = original_forward(module, input)
-    network_layer_name = getattr(module, 'network_layer_name', None)
-    for lora in loaded_networks:
-        module = lora.modules.get(network_layer_name, None)
-        if module is None:
-            continue
-        y = module.forward(input, y)
-    return y
-
-
-def network_reset_cached_weight(self: Union[torch.nn.Conv2d, torch.nn.Linear]):
-    self.network_current_names = ()
-    self.network_weights_backup = None
-
-
-def network_Linear_forward(self, input): # pylint: disable=W0622
-    if shared.opts.lora_functional:
-        return network_forward(self, input, originals.Linear_forward)
-    network_apply_weights(self)
-    return originals.Linear_forward(self, input)
-
-
-def network_Linear_load_state_dict(self, *args, **kwargs):
-    network_reset_cached_weight(self)
-    return originals.Linear_load_state_dict(self, *args, **kwargs)
-
-
-def network_Conv2d_forward(self, input): # pylint: disable=W0622
-    if shared.opts.lora_functional:
-        return network_forward(self, input, originals.Conv2d_forward)
-    network_apply_weights(self)
-    return originals.Conv2d_forward(self, input)
-
-
-def network_Conv2d_load_state_dict(self, *args, **kwargs):
-    network_reset_cached_weight(self)
-    return originals.Conv2d_load_state_dict(self, *args, **kwargs)
-
-
-def network_GroupNorm_forward(self, input): # pylint: disable=W0622
-    if shared.opts.lora_functional:
-        return network_forward(self, input, originals.GroupNorm_forward)
-    network_apply_weights(self)
-    return originals.GroupNorm_forward(self, input)
-
-
-def network_GroupNorm_load_state_dict(self, *args, **kwargs):
-    network_reset_cached_weight(self)
-    return originals.GroupNorm_load_state_dict(self, *args, **kwargs)
-
-
-def network_LayerNorm_forward(self, input): # pylint: disable=W0622
-    if shared.opts.lora_functional:
-        return network_forward(self, input, originals.LayerNorm_forward)
-    network_apply_weights(self)
-    return originals.LayerNorm_forward(self, input)
-
-
-def network_LayerNorm_load_state_dict(self, *args, **kwargs):
-    network_reset_cached_weight(self)
-    return originals.LayerNorm_load_state_dict(self, *args, **kwargs)
-
-
-def network_MultiheadAttention_forward(self, *args, **kwargs):
-    network_apply_weights(self)
-    return originals.MultiheadAttention_forward(self, *args, **kwargs)
-
-
-def network_MultiheadAttention_load_state_dict(self, *args, **kwargs):
-    network_reset_cached_weight(self)
-    return originals.MultiheadAttention_load_state_dict(self, *args, **kwargs)
-
-
-def list_available_networks():
-    available_networks.clear()
-    available_network_aliases.clear()
-    forbidden_network_aliases.clear()
-    available_network_hash_lookup.clear()
-    forbidden_network_aliases.update({"none": 1, "Addams": 1})
-    directories = []
-    if os.path.exists(shared.cmd_opts.lora_dir):
-        directories.append(shared.cmd_opts.lora_dir)
-    else:
-        shared.log.warning(f'LoRA directory not found: path="{shared.cmd_opts.lora_dir}"')
-    if os.path.exists(shared.cmd_opts.lyco_dir) and shared.cmd_opts.lyco_dir != shared.cmd_opts.lora_dir:
-        directories.append(shared.cmd_opts.lyco_dir)
-
-    def add_network(filename):
-        if not os.path.isfile(filename):
-            return
-        name = os.path.splitext(os.path.basename(filename))[0]
-        try:
-            entry = network.NetworkOnDisk(name, filename)
-            available_networks[entry.name] = entry
-            if entry.alias in available_network_aliases:
-                forbidden_network_aliases[entry.alias.lower()] = 1
-            if shared.opts.lora_preferred_name == 'filename':
-                available_network_aliases[entry.name] = entry
-            else:
-                available_network_aliases[entry.alias] = entry
-            if entry.shorthash:
-                available_network_hash_lookup[entry.shorthash] = entry
-        except OSError as e:  # should catch FileNotFoundError and PermissionError etc.
-            shared.log.error(f"Failed to load network {name} from {filename} {e}")
-
-    candidates = list(files_cache.list_files(*directories, ext_filter=[".pt", ".ckpt", ".safetensors"]))
-    with concurrent.futures.ThreadPoolExecutor(max_workers=shared.max_workers) as executor:
-        for fn in candidates:
-            executor.submit(add_network, fn)
-    shared.log.info(f'LoRA networks: available={len(available_networks)} folders={len(forbidden_network_aliases)}')
-
-
-def infotext_pasted(infotext, params): # pylint: disable=W0613
-    if "AddNet Module 1" in [x[1] for x in scripts.scripts_txt2img.infotext_fields]:
-        return  # if the other extension is active, it will handle those fields, no need to do anything
-    added = []
-    for k in params:
-        if not k.startswith("AddNet Model "):
-            continue
-        num = k[13:]
-        if params.get("AddNet Module " + num) != "LoRA":
-            continue
-        name = params.get("AddNet Model " + num)
-        if name is None:
-            continue
-        m = re_network_name.match(name)
-        if m:
-            name = m.group(1)
-        multiplier = params.get("AddNet Weight A " + num, "1.0")
-        added.append(f"<lora:{name}:{multiplier}>")
-    if added:
-        params["Prompt"] += "\n" + "".join(added)
-
-
-list_available_networks()
+from typing import Union, List
+import os
+import re
+import time
+import concurrent
+import lora_patches
+import network
+import network_lora
+import network_hada
+import network_ia3
+import network_oft
+import network_lokr
+import network_full
+import network_norm
+import network_glora
+import network_overrides
+import lora_convert
+import torch
+import diffusers.models.lora
+from modules import shared, devices, sd_models, sd_models_compile, errors, scripts, files_cache
+
+
+debug = os.environ.get('SD_LORA_DEBUG', None) is not None
+originals: lora_patches.LoraPatches = None
+extra_network_lora = None
+available_networks = {}
+available_network_aliases = {}
+loaded_networks: List[network.Network] = []
+timer = { 'load': 0, 'apply': 0, 'restore': 0 }
+# networks_in_memory = {}
+lora_cache = {}
+available_network_hash_lookup = {}
+forbidden_network_aliases = {}
+re_network_name = re.compile(r"(.*)\s*\([0-9a-fA-F]+\)")
+module_types = [
+    network_lora.ModuleTypeLora(),
+    network_hada.ModuleTypeHada(),
+    network_ia3.ModuleTypeIa3(),
+    network_oft.ModuleTypeOFT(),
+    network_lokr.ModuleTypeLokr(),
+    network_full.ModuleTypeFull(),
+    network_norm.ModuleTypeNorm(),
+    network_glora.ModuleTypeGLora(),
+]
+convert_diffusers_name_to_compvis = lora_convert.convert_diffusers_name_to_compvis # supermerger compatibility item
+
+
+def assign_network_names_to_compvis_modules(sd_model):
+    network_layer_mapping = {}
+    if shared.native:
+        if not hasattr(shared.sd_model, 'text_encoder') or not hasattr(shared.sd_model, 'unet'):
+            sd_model.network_layer_mapping = {}
+            return
+        for name, module in shared.sd_model.text_encoder.named_modules():
+            prefix = "lora_te1_" if shared.sd_model_type == "sdxl" else "lora_te_"
+            network_name = prefix + name.replace(".", "_")
+            network_layer_mapping[network_name] = module
+            module.network_layer_name = network_name
+        if shared.sd_model_type == "sdxl":
+            for name, module in shared.sd_model.text_encoder_2.named_modules():
+                network_name = "lora_te2_" + name.replace(".", "_")
+                network_layer_mapping[network_name] = module
+                module.network_layer_name = network_name
+        for name, module in shared.sd_model.unet.named_modules():
+            network_name = "lora_unet_" + name.replace(".", "_")
+            network_layer_mapping[network_name] = module
+            module.network_layer_name = network_name
+    else:
+        if not hasattr(shared.sd_model, 'cond_stage_model'):
+            sd_model.network_layer_mapping = {}
+            return
+        for name, module in shared.sd_model.cond_stage_model.wrapped.named_modules():
+            network_name = name.replace(".", "_")
+            network_layer_mapping[network_name] = module
+            module.network_layer_name = network_name
+        for name, module in shared.sd_model.model.named_modules():
+            network_name = name.replace(".", "_")
+            network_layer_mapping[network_name] = module
+            module.network_layer_name = network_name
+    sd_model.network_layer_mapping = network_layer_mapping
+
+
+def load_diffusers(name, network_on_disk, lora_scale=1.0) -> network.Network:
+    t0 = time.time()
+    cached = lora_cache.get(name, None)
+    # if debug:
+    shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" type=diffusers {"cached" if cached else ""} fuse={shared.opts.lora_fuse_diffusers}')
+    if cached is not None:
+        return cached
+    if not shared.native:
+        return None
+    if not hasattr(shared.sd_model, 'load_lora_weights'):
+        shared.log.error(f"LoRA load failed: class={shared.sd_model.__class__} does not implement load lora")
+        return None
+    try:
+        shared.sd_model.load_lora_weights(network_on_disk.filename)
+    except Exception as e:
+        errors.display(e, "LoRA")
+        return None
+    if shared.opts.lora_fuse_diffusers:
+        shared.sd_model.fuse_lora(lora_scale=lora_scale)
+    net = network.Network(name, network_on_disk)
+    net.mtime = os.path.getmtime(network_on_disk.filename)
+    lora_cache[name] = net
+    t1 = time.time()
+    timer['load'] += t1 - t0
+    return net
+
+
+def load_network(name, network_on_disk) -> network.Network:
+    t0 = time.time()
+    cached = lora_cache.get(name, None)
+    if debug:
+        shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" type=lora {"cached" if cached else ""}')
+    if cached is not None:
+        return cached
+    net = network.Network(name, network_on_disk)
+    net.mtime = os.path.getmtime(network_on_disk.filename)
+    sd = sd_models.read_state_dict(network_on_disk.filename)
+    assign_network_names_to_compvis_modules(shared.sd_model) # this should not be needed but is here as an emergency fix for an unknown error people are experiencing in 1.2.0
+    keys_failed_to_match = {}
+    matched_networks = {}
+    bundle_embeddings = {}
+    convert = lora_convert.KeyConvert()
+    for key_network, weight in sd.items():
+        parts = key_network.split('.')
+        if parts[0] == "bundle_emb":
+            emb_name, vec_name = parts[1], key_network.split(".", 2)[-1]
+            emb_dict = bundle_embeddings.get(emb_name, {})
+            emb_dict[vec_name] = weight
+            bundle_embeddings[emb_name] = emb_dict
+        if len(parts) > 5: # messy handler for diffusers peft lora
+            key_network_without_network_parts = '_'.join(parts[:-2])
+            if not key_network_without_network_parts.startswith('lora_'):
+                key_network_without_network_parts = 'lora_' + key_network_without_network_parts
+            network_part = '.'.join(parts[-2:]).replace('lora_A', 'lora_down').replace('lora_B', 'lora_up')
+        else:
+            key_network_without_network_parts, network_part = key_network.split(".", 1)
+        # if debug:
+        #     shared.log.debug(f'LoRA load: name="{name}" full={key_network} network={network_part} key={key_network_without_network_parts}')
+        key, sd_module = convert(key_network_without_network_parts)  # Now returns lists
+        if sd_module[0] is None:
+            if "bundle_emb" not in key_network:
+                keys_failed_to_match[key_network] = key
+            continue
+        for k, module in zip(key, sd_module):
+            if k not in matched_networks:
+                matched_networks[k] = network.NetworkWeights(network_key=key_network, sd_key=k, w={}, sd_module=module)
+            matched_networks[k].w[network_part] = weight
+    for key, weights in matched_networks.items():
+        net_module = None
+        for nettype in module_types:
+            net_module = nettype.create_module(net, weights)
+            if net_module is not None:
+                break
+        if net_module is None:
+            shared.log.error(f'LoRA unhandled: name={name} key={key} weights={weights.w.keys()}')
+        else:
+            net.modules[key] = net_module
+    if len(keys_failed_to_match) > 0:
+        shared.log.warning(f"LoRA file={network_on_disk.filename} unmatched={len(keys_failed_to_match)} matched={len(matched_networks)}")
+        if debug:
+            shared.log.debug(f"LoRA file={network_on_disk.filename} unmatched={keys_failed_to_match}")
+    elif debug:
+        shared.log.debug(f"LoRA file={network_on_disk.filename} unmatched={len(keys_failed_to_match)} matched={len(matched_networks)}")
+    lora_cache[name] = net
+    t1 = time.time()
+    net.bundle_embeddings = bundle_embeddings
+    timer['load'] += t1 - t0
+    return net
+
+
+def load_networks(names, te_multipliers=None, unet_multipliers=None, dyn_dims=None):
+    networks_on_disk = [available_network_aliases.get(name, None) for name in names]
+    if any(x is None for x in networks_on_disk):
+        list_available_networks()
+        networks_on_disk = [available_network_aliases.get(name, None) for name in names]
+    failed_to_load_networks = []
+    recompile_model = False
+    if shared.compiled_model_state is not None and shared.compiled_model_state.is_compiled:
+        if len(names) == len(shared.compiled_model_state.lora_model):
+            for i, name in enumerate(names):
+                if shared.compiled_model_state.lora_model[i] != f"{name}:{te_multipliers[i] if te_multipliers else 1.0}":
+                    recompile_model = True
+                    shared.compiled_model_state.lora_model = []
+                    break
+            if not recompile_model:
+                if len(loaded_networks) > 0 and debug:
+                    shared.log.debug('Model Compile: Skipping LoRa loading')
+                return
+        else:
+            recompile_model = True
+            shared.compiled_model_state.lora_model = []
+    if recompile_model:
+        backup_cuda_compile = shared.opts.cuda_compile
+        sd_models.unload_model_weights(op='model')
+        shared.opts.cuda_compile = False
+        sd_models.reload_model_weights(op='model')
+        shared.opts.cuda_compile = backup_cuda_compile
+
+    loaded_networks.clear()
+    for i, (network_on_disk, name) in enumerate(zip(networks_on_disk, names)):
+        net = None
+        if network_on_disk is not None:
+            shorthash = getattr(network_on_disk, 'shorthash', '').lower()
+            if debug:
+                shared.log.debug(f'LoRA load: name="{name}" file="{network_on_disk.filename}" hash="{shorthash}"')
+            try:
+                if recompile_model:
+                    shared.compiled_model_state.lora_model.append(f"{name}:{te_multipliers[i] if te_multipliers else 1.0}")
+                if shared.native and shared.opts.lora_force_diffusers: # OpenVINO only works with Diffusers LoRa loading
+                    net = load_diffusers(name, network_on_disk, lora_scale=te_multipliers[i] if te_multipliers else 1.0)
+                elif shared.native and network_overrides.check_override(shorthash):
+                    net = load_diffusers(name, network_on_disk, lora_scale=te_multipliers[i] if te_multipliers else 1.0)
+                else:
+                    net = load_network(name, network_on_disk)
+            except Exception as e:
+                shared.log.error(f"LoRA load failed: file={network_on_disk.filename} {e}")
+                if debug:
+                    errors.display(e, f"LoRA load failed file={network_on_disk.filename}")
+                continue
+            net.mentioned_name = name
+            network_on_disk.read_hash()
+        if net is None:
+            failed_to_load_networks.append(name)
+            shared.log.error(f"LoRA unknown type: network={name}")
+            continue
+        shared.sd_model.embedding_db.load_diffusers_embedding(None, net.bundle_embeddings)
+        net.te_multiplier = te_multipliers[i] if te_multipliers else 1.0
+        net.unet_multiplier = unet_multipliers[i] if unet_multipliers else 1.0
+        net.dyn_dim = dyn_dims[i] if dyn_dims else 1.0
+        loaded_networks.append(net)
+
+    while len(lora_cache) > shared.opts.lora_in_memory_limit:
+        name = next(iter(lora_cache))
+        lora_cache.pop(name, None)
+    if len(loaded_networks) > 0 and debug:
+        shared.log.debug(f'LoRA loaded={len(loaded_networks)} cache={list(lora_cache)}')
+    devices.torch_gc()
+
+    if recompile_model:
+        shared.log.info("LoRA recompiling model")
+        backup_lora_model = shared.compiled_model_state.lora_model
+        if shared.opts.cuda_compile:
+            shared.sd_model = sd_models_compile.compile_diffusers(shared.sd_model)
+
+        shared.compiled_model_state.lora_model = backup_lora_model
+
+
+def network_restore_weights_from_backup(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn.GroupNorm, torch.nn.LayerNorm, torch.nn.MultiheadAttention, diffusers.models.lora.LoRACompatibleLinear, diffusers.models.lora.LoRACompatibleConv]):
+    t0 = time.time()
+    weights_backup = getattr(self, "network_weights_backup", None)
+    bias_backup = getattr(self, "network_bias_backup", None)
+    if weights_backup is None and bias_backup is None:
+        return
+    # if debug:
+    #     shared.log.debug('LoRA restore weights')
+    if weights_backup is not None:
+        if isinstance(self, torch.nn.MultiheadAttention):
+            self.in_proj_weight.copy_(weights_backup[0])
+            self.out_proj.weight.copy_(weights_backup[1])
+        else:
+            self.weight.copy_(weights_backup)
+    if bias_backup is not None:
+        if isinstance(self, torch.nn.MultiheadAttention):
+            self.out_proj.bias.copy_(bias_backup)
+        else:
+            self.bias.copy_(bias_backup)
+    else:
+        if isinstance(self, torch.nn.MultiheadAttention):
+            self.out_proj.bias = None
+        else:
+            self.bias = None
+    t1 = time.time()
+    timer['restore'] += t1 - t0
+
+
+def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn.GroupNorm, torch.nn.LayerNorm, torch.nn.MultiheadAttention, diffusers.models.lora.LoRACompatibleLinear, diffusers.models.lora.LoRACompatibleConv]):
+    """
+    Applies the currently selected set of networks to the weights of torch layer self.
+    If weights already have this particular set of networks applied, does nothing.
+    If not, restores original weights from backup and alters weights according to networks.
+    """
+    network_layer_name = getattr(self, 'network_layer_name', None)
+    if network_layer_name is None:
+        return
+    t0 = time.time()
+    current_names = getattr(self, "network_current_names", ())
+    wanted_names = tuple((x.name, x.te_multiplier, x.unet_multiplier, x.dyn_dim) for x in loaded_networks)
+    weights_backup = getattr(self, "network_weights_backup", None)
+    if weights_backup is None and wanted_names != (): # pylint: disable=C1803
+        if current_names != ():
+            raise RuntimeError("no backup weights found and current weights are not unchanged")
+        if isinstance(self, torch.nn.MultiheadAttention):
+            weights_backup = (self.in_proj_weight.to(devices.cpu, copy=True), self.out_proj.weight.to(devices.cpu, copy=True))
+        else:
+            weights_backup = self.weight.to(devices.cpu, copy=True)
+        self.network_weights_backup = weights_backup
+    bias_backup = getattr(self, "network_bias_backup", None)
+    if bias_backup is None:
+        if isinstance(self, torch.nn.MultiheadAttention) and self.out_proj.bias is not None:
+            bias_backup = self.out_proj.bias.to(devices.cpu, copy=True)
+        elif getattr(self, 'bias', None) is not None:
+            bias_backup = self.bias.to(devices.cpu, copy=True)
+        else:
+            bias_backup = None
+        self.network_bias_backup = bias_backup
+
+    if current_names != wanted_names:
+        network_restore_weights_from_backup(self)
+        for net in loaded_networks:
+            # default workflow where module is known and has weights
+            module = net.modules.get(network_layer_name, None)
+            if module is not None and hasattr(self, 'weight'):
+                try:
+                    with devices.inference_context():
+                        updown, ex_bias = module.calc_updown(self.weight)
+                        if len(self.weight.shape) == 4 and self.weight.shape[1] == 9:
+                            # inpainting model. zero pad updown to make channel[1]  4 to 9
+                            updown = torch.nn.functional.pad(updown, (0, 0, 0, 0, 0, 5)) # pylint: disable=not-callable
+                        self.weight = torch.nn.Parameter(self.weight + updown)
+                        if ex_bias is not None and hasattr(self, 'bias'):
+                            if self.bias is None:
+                                self.bias = torch.nn.Parameter(ex_bias)
+                            else:
+                                self.bias += ex_bias
+                except RuntimeError as e:
+                    extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
+                    if debug:
+                        module_name = net.modules.get(network_layer_name, None)
+                        shared.log.error(f"LoRA apply weight name={net.name} module={module_name} layer={network_layer_name} {e}")
+                        errors.display(e, 'LoRA apply weight')
+                        raise RuntimeError('LoRA apply weight') from e
+                continue
+            # alternative workflow looking at _*_proj layers
+            module_q = net.modules.get(network_layer_name + "_q_proj", None)
+            module_k = net.modules.get(network_layer_name + "_k_proj", None)
+            module_v = net.modules.get(network_layer_name + "_v_proj", None)
+            module_out = net.modules.get(network_layer_name + "_out_proj", None)
+            if isinstance(self, torch.nn.MultiheadAttention) and module_q and module_k and module_v and module_out:
+                try:
+                    with devices.inference_context():
+                        updown_q, _ = module_q.calc_updown(self.in_proj_weight)
+                        updown_k, _ = module_k.calc_updown(self.in_proj_weight)
+                        updown_v, _ = module_v.calc_updown(self.in_proj_weight)
+                        updown_qkv = torch.vstack([updown_q, updown_k, updown_v])
+                        updown_out, ex_bias = module_out.calc_updown(self.out_proj.weight)
+                        self.in_proj_weight += updown_qkv
+                        self.out_proj.weight += updown_out
+                    if ex_bias is not None:
+                        if self.out_proj.bias is None:
+                            self.out_proj.bias = torch.nn.Parameter(ex_bias)
+                        else:
+                            self.out_proj.bias += ex_bias
+                except RuntimeError as e:
+                    if debug:
+                        shared.log.debug(f"LoRA network={net.name} layer={network_layer_name} {e}")
+                    extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
+                continue
+            if module is None:
+                continue
+            shared.log.warning(f"LoRA network={net.name} layer={network_layer_name} unsupported operation")
+            extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
+        self.network_current_names = wanted_names
+    t1 = time.time()
+    timer['apply'] += t1 - t0
+
+
+def network_forward(module, input, original_forward): # pylint: disable=W0622
+    """
+    Old way of applying Lora by executing operations during layer's forward.
+    Stacking many loras this way results in big performance degradation.
+    """
+    if len(loaded_networks) == 0:
+        return original_forward(module, input)
+    input = devices.cond_cast_unet(input)
+    network_restore_weights_from_backup(module)
+    network_reset_cached_weight(module)
+    y = original_forward(module, input)
+    network_layer_name = getattr(module, 'network_layer_name', None)
+    for lora in loaded_networks:
+        module = lora.modules.get(network_layer_name, None)
+        if module is None:
+            continue
+        y = module.forward(input, y)
+    return y
+
+
+def network_reset_cached_weight(self: Union[torch.nn.Conv2d, torch.nn.Linear]):
+    self.network_current_names = ()
+    self.network_weights_backup = None
+
+
+def network_Linear_forward(self, input): # pylint: disable=W0622
+    if shared.opts.lora_functional:
+        return network_forward(self, input, originals.Linear_forward)
+    network_apply_weights(self)
+    return originals.Linear_forward(self, input)
+
+
+def network_Linear_load_state_dict(self, *args, **kwargs):
+    network_reset_cached_weight(self)
+    return originals.Linear_load_state_dict(self, *args, **kwargs)
+
+
+def network_Conv2d_forward(self, input): # pylint: disable=W0622
+    if shared.opts.lora_functional:
+        return network_forward(self, input, originals.Conv2d_forward)
+    network_apply_weights(self)
+    return originals.Conv2d_forward(self, input)
+
+
+def network_Conv2d_load_state_dict(self, *args, **kwargs):
+    network_reset_cached_weight(self)
+    return originals.Conv2d_load_state_dict(self, *args, **kwargs)
+
+
+def network_GroupNorm_forward(self, input): # pylint: disable=W0622
+    if shared.opts.lora_functional:
+        return network_forward(self, input, originals.GroupNorm_forward)
+    network_apply_weights(self)
+    return originals.GroupNorm_forward(self, input)
+
+
+def network_GroupNorm_load_state_dict(self, *args, **kwargs):
+    network_reset_cached_weight(self)
+    return originals.GroupNorm_load_state_dict(self, *args, **kwargs)
+
+
+def network_LayerNorm_forward(self, input): # pylint: disable=W0622
+    if shared.opts.lora_functional:
+        return network_forward(self, input, originals.LayerNorm_forward)
+    network_apply_weights(self)
+    return originals.LayerNorm_forward(self, input)
+
+
+def network_LayerNorm_load_state_dict(self, *args, **kwargs):
+    network_reset_cached_weight(self)
+    return originals.LayerNorm_load_state_dict(self, *args, **kwargs)
+
+
+def network_MultiheadAttention_forward(self, *args, **kwargs):
+    network_apply_weights(self)
+    return originals.MultiheadAttention_forward(self, *args, **kwargs)
+
+
+def network_MultiheadAttention_load_state_dict(self, *args, **kwargs):
+    network_reset_cached_weight(self)
+    return originals.MultiheadAttention_load_state_dict(self, *args, **kwargs)
+
+
+def list_available_networks():
+    available_networks.clear()
+    available_network_aliases.clear()
+    forbidden_network_aliases.clear()
+    available_network_hash_lookup.clear()
+    forbidden_network_aliases.update({"none": 1, "Addams": 1})
+    directories = []
+    if os.path.exists(shared.cmd_opts.lora_dir):
+        directories.append(shared.cmd_opts.lora_dir)
+    else:
+        shared.log.warning(f'LoRA directory not found: path="{shared.cmd_opts.lora_dir}"')
+    if os.path.exists(shared.cmd_opts.lyco_dir) and shared.cmd_opts.lyco_dir != shared.cmd_opts.lora_dir:
+        directories.append(shared.cmd_opts.lyco_dir)
+
+    def add_network(filename):
+        if not os.path.isfile(filename):
+            return
+        name = os.path.splitext(os.path.basename(filename))[0]
+        try:
+            entry = network.NetworkOnDisk(name, filename)
+            available_networks[entry.name] = entry
+            if entry.alias in available_network_aliases:
+                forbidden_network_aliases[entry.alias.lower()] = 1
+            if shared.opts.lora_preferred_name == 'filename':
+                available_network_aliases[entry.name] = entry
+            else:
+                available_network_aliases[entry.alias] = entry
+            if entry.shorthash:
+                available_network_hash_lookup[entry.shorthash] = entry
+        except OSError as e:  # should catch FileNotFoundError and PermissionError etc.
+            shared.log.error(f"Failed to load network {name} from {filename} {e}")
+
+    candidates = list(files_cache.list_files(*directories, ext_filter=[".pt", ".ckpt", ".safetensors"]))
+    with concurrent.futures.ThreadPoolExecutor(max_workers=shared.max_workers) as executor:
+        for fn in candidates:
+            executor.submit(add_network, fn)
+    shared.log.info(f'LoRA networks: available={len(available_networks)} folders={len(forbidden_network_aliases)}')
+
+
+def infotext_pasted(infotext, params): # pylint: disable=W0613
+    if "AddNet Module 1" in [x[1] for x in scripts.scripts_txt2img.infotext_fields]:
+        return  # if the other extension is active, it will handle those fields, no need to do anything
+    added = []
+    for k in params:
+        if not k.startswith("AddNet Model "):
+            continue
+        num = k[13:]
+        if params.get("AddNet Module " + num) != "LoRA":
+            continue
+        name = params.get("AddNet Model " + num)
+        if name is None:
+            continue
+        m = re_network_name.match(name)
+        if m:
+            name = m.group(1)
+        multiplier = params.get("AddNet Weight A " + num, "1.0")
+        added.append(f"<lora:{name}:{multiplier}>")
+    if added:
+        params["Prompt"] += "\n" + "".join(added)
+
+
+list_available_networks()
diff --git a/extensions-builtin/sdnext-modernui b/extensions-builtin/sdnext-modernui
index dae2c67d8..6a570df7a 160000
--- a/extensions-builtin/sdnext-modernui
+++ b/extensions-builtin/sdnext-modernui
@@ -1 +1 @@
-Subproject commit dae2c67d826b631dcc343c028c60f478b0437877
+Subproject commit 6a570df7ada9a048f3ce273851ade9cede9d5c26
diff --git a/html/logo-bg-6.jpg b/html/logo-bg-6.jpg
index 88c7cb446..5e001eb2b 100644
Binary files a/html/logo-bg-6.jpg and b/html/logo-bg-6.jpg differ
diff --git a/html/reference.json b/html/reference.json
index a3f62e7e1..dfc0a59df 100644
--- a/html/reference.json
+++ b/html/reference.json
@@ -182,13 +182,29 @@
     "extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"
   },
   
-  "Tencent HunyuanDiT 1.1": {
-    "path": "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers",
+  "Tencent HunyuanDiT 1.2": {
+    "path": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
     "desc": "Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding.",
     "preview": "Tencent-Hunyuan-HunyuanDiT.jpg",
     "extras": "width: 1024, height: 1024, sampler: Default, cfg_scale: 2.0"
   },
-  
+
+  "AlphaVLLM Lumina Next SFT": {
+    "path": "Alpha-VLLM/Lumina-Next-SFT-diffusers",
+    "desc": "The Lumina-Next-SFT is a Next-DiT model containing 2B parameters and utilizes Gemma-2B as the text encoder, enhanced through high-quality supervised fine-tuning (SFT).",
+    "preview": "Alpha-VLLM-Lumina-Next-SFT-diffusers.jpg",
+    "skip": true,
+    "extras": "width: 1024, height: 1024, sampler: Default"
+  }, 
+
+  "Kwai Kolors": {
+    "path": "Kwai-Kolors/Kolors",
+    "desc": "Kolors is a large-scale text-to-image generation model based on latent diffusion, developed by the Kuaishou Kolors team. Trained on billions of text-image pairs, Kolors exhibits significant advantages over both open-source and proprietary models in visual quality, complex semantic accuracy, and text rendering for both Chinese and English characters. Furthermore, Kolors supports both Chinese and English inputs.",
+    "preview": "Kwai-Kolors.jpg",
+    "skip": true,
+    "extras": "width: 1024, height: 1024"
+  }, 
+
   "Kandinsky 2.1": {
     "path": "kandinsky-community/kandinsky-2-1",
     "desc": "Kandinsky 2.1 is a text-conditional diffusion model based on unCLIP and latent diffusion, composed of a transformer-based image prior model, a unet diffusion model, and a decoder. Kandinsky 2.1 inherits best practices from Dall-E 2 and Latent diffusion while introducing some new ideas. It uses the CLIP model as a text and image encoder, and diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach increases the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation.",
diff --git a/installer.py b/installer.py
index 2bffe8276..e1e84be76 100644
--- a/installer.py
+++ b/installer.py
@@ -52,6 +52,7 @@ args = Dot({
     'reinstall': False,
     'version': False,
     'ignore': False,
+    'uv': False,
 })
 git_commit = "unknown"
 submodules_commit = {
@@ -235,22 +236,25 @@ def uninstall(package, quiet = False):
 
 
 @lru_cache()
-def pip(arg: str, ignore: bool = False, quiet: bool = False):
+def pip(arg: str, ignore: bool = False, quiet: bool = False, uv = True):
+    uv = uv and args.uv # uv is used only when the caller allows it and it is enabled in args
+    pipCmd = "uv pip" if uv else "pip" # module/subcommand passed to `python -m`
     arg = arg.replace('>=', '==')
     if not quiet and '-r ' not in arg:
-        log.info(f'Install: package="{arg.replace("install", "").replace("--upgrade", "").replace("--no-deps", "").replace("--force", "").replace("  ", " ").strip()}"')
+        log.info(f'Install: package="{arg.replace("install", "").replace("--upgrade", "").replace("--no-deps", "").replace("--force", "").replace("  ", " ").strip()}" mode={"uv" if uv else "pip"}') # collapse double spaces left behind by the removed flags
     env_args = os.environ.get("PIP_EXTRA_ARGS", "")
-    log.debug(f'Running: pip="{pip_log}{arg} {env_args}"')
-    result = subprocess.run(f'"{sys.executable}" -m pip {pip_log}{arg} {env_args}', shell=True, check=False, env=os.environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    all_args = f'{pip_log}{arg} {env_args}'.strip() # strip the trailing space when PIP_EXTRA_ARGS is empty
+    log.debug(f'Running: {pipCmd}="{all_args}"')
+    result = subprocess.run(f'"{sys.executable}" -m {pipCmd} {all_args}', shell=True, check=False, env=os.environ, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     txt = result.stdout.decode(encoding="utf8", errors="ignore")
     if len(result.stderr) > 0:
         txt += ('\n' if len(txt) > 0 else '') + result.stderr.decode(encoding="utf8", errors="ignore")
     txt = txt.strip()
-    debug(f'Install pip: {txt}')
+    debug(f'Install {pipCmd}: {txt}')
     if result.returncode != 0 and not ignore:
         global errors # pylint: disable=global-statement
         errors += 1
-        log.error(f'Error running pip: {arg}')
+        log.error(f'Error running {pipCmd}: {arg}')
         log.debug(f'Pip output: {txt}')
     return txt
 
@@ -264,7 +268,7 @@ def install(package, friendly: str = None, ignore: bool = False, reinstall: bool
         quick_allowed = False
     if args.reinstall or reinstall or not installed(package, friendly, quiet=quiet):
         deps = '' if not no_deps else '--no-deps '
-        res = pip(f"install --upgrade {deps}{package}", ignore=ignore)
+        res = pip(f"install{' --upgrade' if not args.uv else ''} {deps}{package}", ignore=ignore, uv=package != "uv")
         try:
             import imp # pylint: disable=deprecated-module
             imp.reload(pkg_resources)
@@ -454,6 +458,10 @@ def install_rocm_zluda(torch_command):
             command = subprocess.run('hipinfo', shell=True, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             amd_gpus = command.stdout.decode(encoding="utf8", errors="ignore").split('\n')
             amd_gpus = [x.split(' ')[-1].strip() for x in amd_gpus if x.startswith('gcnArchName:')]
+        elif os.environ.get('WSL_DISTRO_NAME', None) is not None: # WSL does not have 'rocm_agent_enumerator'
+            command = subprocess.run('rocminfo', shell=True, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            amd_gpus = command.stdout.decode(encoding="utf8", errors="ignore").split('\n')
+            amd_gpus = [x.strip().split(" ")[-1] for x in amd_gpus if x.startswith('  Name:') and "CPU" not in x]
         else:
             command = subprocess.run('rocm_agent_enumerator', shell=True, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             amd_gpus = command.stdout.decode(encoding="utf8", errors="ignore").split('\n')
@@ -529,7 +537,10 @@ def install_rocm_zluda(torch_command):
         if rocm_ver is None: # assume the latest if version check fails
             torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0')
         elif rocm_ver == "6.1": # need nightlies
-            torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --pre --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
+            if args.experimental:
+                torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --pre --index-url https://download.pytorch.org/whl/nightly/rocm6.1')
+            else:
+                torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0')
         elif float(rocm_ver) < 5.5: # oldest supported version is 5.5
             log.warning(f"Unsupported ROCm version detected: {rocm_ver}")
             log.warning("Minimum supported ROCm version is 5.5")
@@ -540,6 +551,27 @@ def install_rocm_zluda(torch_command):
             ort_version = os.environ.get('ONNXRUNTIME_VERSION', None)
             ort_package = os.environ.get('ONNXRUNTIME_PACKAGE', f"--pre onnxruntime-training{'' if ort_version is None else ('==' + ort_version)} --index-url https://pypi.lsh.sh/{rocm_ver[0]}{rocm_ver[2]} --extra-index-url https://pypi.org/simple")
             install(ort_package, 'onnxruntime-training')
+
+        if bool(int(os.environ.get("TORCH_BLAS_PREFER_HIPBLASLT", "1"))):
+            import ctypes
+            libpath = os.environ.get("HIPBLASLT_TENSILE_LIBPATH", "/opt/rocm/lib/hipblaslt/library")
+            try: # arch name = file name minus the 6-char 'extop_' prefix and a 3-char suffix
+                supported_archs = [file[6:-3] for file in os.listdir(libpath) if file.startswith('extop_')]
+            except OSError: # library folder is missing or unreadable: treat as no supported archs
+                supported_archs = []
+            # hipBLASLt is kept only if every detected GPU arch is in the supported list
+            hipblaslt_available = all(gpu in supported_archs for gpu in amd_gpus)
+            log.info(f'hipBLASLt supported_archs={supported_archs}, available={hipblaslt_available}')
+            if hipblaslt_available:
+                try:
+                    # Preload hipBLASLt.
+                    ctypes.CDLL("/opt/rocm/lib/libhipblaslt.so", mode=ctypes.RTLD_GLOBAL)
+                    os.environ["HIPBLASLT_TENSILE_LIBPATH"] = libpath
+                except OSError as e: # library missing or not loadable: fall back instead of aborting startup
+                    log.warning(f'hipBLASLt preload failed: {e}')
+                    hipblaslt_available = False
+            if not hipblaslt_available:
+                os.environ["TORCH_BLAS_PREFER_HIPBLASLT"] = "0"
     return torch_command
 
 
@@ -680,6 +712,20 @@ def check_torch():
         install('onnxruntime-gpu', 'onnxruntime-gpu', ignore=True, quiet=True)
     elif is_rocm_available(allow_rocm):
         torch_command = install_rocm_zluda(torch_command)
+
+        # WSL ROCm
+        if os.environ.get('WSL_DISTRO_NAME', None) is not None:
+            import ctypes
+            try:
+                # Preload stdc++ library. This will ignore Anaconda stdc++ library.
+                ctypes.CDLL("/lib/x86_64-linux-gnu/libstdc++.so.6", mode=ctypes.RTLD_GLOBAL)
+            except OSError:
+                pass
+            try:
+                # Preload HSA Runtime library.
+                ctypes.CDLL("/opt/rocm/lib/libhsa-runtime64.so", mode=ctypes.RTLD_GLOBAL)
+            except OSError:
+                log.error("Failed to preload HSA Runtime library.")
     elif is_ipex_available(allow_ipex):
         torch_command = install_ipex(torch_command)
     elif allow_openvino and args.use_openvino:
@@ -1052,20 +1098,20 @@ def check_ui(ver):
 
     if not same(ver):
         log.debug(f'Branch mismatch: sdnext={ver["branch"]} ui={ver["ui"]}')
-    cwd = os.getcwd()
-    try:
-        os.chdir('extensions-builtin/sdnext-modernui')
-        target = 'dev' if 'dev' in ver['branch'] else 'main'
-        git('checkout ' + target, ignore=True, optional=True)
+        cwd = os.getcwd()
+        try:
+            os.chdir('extensions-builtin/sdnext-modernui')
+            target = 'dev' if 'dev' in ver['branch'] else 'main'
+            git('checkout ' + target, ignore=True, optional=True)
+            os.chdir(cwd)
+            ver = get_version(force=True)
+            if same(ver): # branches match after the checkout, so the sync worked
+                log.debug(f'Branch synchronized: {ver["branch"]}')
+            else:
+                log.debug(f'Branch sync failed: sdnext={ver["branch"]} ui={ver["ui"]}')
+        except Exception as e:
+            log.debug(f'Branch switch: {e}')
         os.chdir(cwd)
-        ver = get_version(force=True)
-        if not same(ver):
-            log.debug(f'Branch synchronized: {ver["branch"]}')
-        else:
-            log.debug(f'Branch sync failed: sdnext={ver["branch"]} ui={ver["ui"]}')
-    except Exception as e:
-        log.debug(f'Branch switch: {e}')
-    os.chdir(cwd)
 
 
 # check version of the main repo and optionally upgrade it
@@ -1165,7 +1211,7 @@ def check_timestamp():
 def add_args(parser):
     group = parser.add_argument_group('Setup options')
     group.add_argument('--reset', default = os.environ.get("SD_RESET",False), action='store_true', help = "Reset main repository to latest version, default: %(default)s")
-    group.add_argument('--upgrade', default = os.environ.get("SD_UPGRADE",False), action='store_true', help = "Upgrade main repository to latest version, default: %(default)s")
+    group.add_argument('--upgrade', '--update', default = os.environ.get("SD_UPGRADE",False), action='store_true', help = "Upgrade main repository to latest version, default: %(default)s")
     group.add_argument('--requirements', default = os.environ.get("SD_REQUIREMENTS",False), action='store_true', help = "Force re-check of requirements, default: %(default)s")
     group.add_argument('--quick', default = os.environ.get("SD_QUICK",False), action='store_true', help = "Bypass version checks, default: %(default)s")
     group.add_argument('--use-directml', default = os.environ.get("SD_USEDIRECTML",False), action='store_true', help = "Use DirectML if no compatible GPU is detected, default: %(default)s")
@@ -1188,6 +1234,7 @@ def add_args(parser):
     group.add_argument('--version', default = False, action='store_true', help = "Print version information")
     group.add_argument('--ignore', default = os.environ.get("SD_IGNORE",False), action='store_true', help = "Ignore any errors and attempt to continue")
     group.add_argument('--safe', default = os.environ.get("SD_SAFE",False), action='store_true', help = "Run in safe mode with no user extensions")
+    group.add_argument('--uv', default = os.environ.get("SD_UV",False), action='store_true', help = "Use uv instead of pip to install the packages")
 
     group = parser.add_argument_group('Logging options')
     group.add_argument("--log", type=str, default=os.environ.get("SD_LOG", None), help="Set log file, default: %(default)s")
diff --git a/launch.py b/launch.py
index f1d8b7ec5..1db8a81f6 100755
--- a/launch.py
+++ b/launch.py
@@ -204,6 +204,8 @@ def main():
     installer.log.info(f'Platform: {installer.print_dict(installer.get_platform())}')
     if not args.skip_env:
         installer.set_environment()
+    if args.uv:
+        installer.install("uv", "uv")
     installer.check_torch()
     installer.check_onnx()
     installer.check_diffusers()
diff --git a/models/Reference/Alpha-VLLM-Lumina-Next-SFT-diffusers.jpg b/models/Reference/Alpha-VLLM-Lumina-Next-SFT-diffusers.jpg
new file mode 100644
index 000000000..e252bff5b
Binary files /dev/null and b/models/Reference/Alpha-VLLM-Lumina-Next-SFT-diffusers.jpg differ
diff --git a/models/Reference/Kwai-Kolors.jpg b/models/Reference/Kwai-Kolors.jpg
new file mode 100644
index 000000000..3ed14d7ce
Binary files /dev/null and b/models/Reference/Kwai-Kolors.jpg differ
diff --git a/modules/api/script.py b/modules/api/script.py
index 6f9e5a3f6..cae59791e 100644
--- a/modules/api/script.py
+++ b/modules/api/script.py
@@ -39,10 +39,10 @@ def get_script(script_name, script_runner):
     return script_runner.scripts[script_idx]
 
 def init_default_script_args(script_runner):
-    #find max idx from the scripts in runner and generate a none array to init script_args
+    # find max idx from the scripts in runner and generate a none array to init script_args
     last_arg_index = 1
     for script in script_runner.scripts:
-        if last_arg_index < script.args_to:
+        if last_arg_index < script.args_to: # pylint: disable=consider-using-max-builtin
             last_arg_index = script.args_to
     # None everywhere except position 0 to initialize script args
     script_args = [None]*last_arg_index
diff --git a/modules/control/run.py b/modules/control/run.py
index 5e0749d81..ba736a4a8 100644
--- a/modules/control/run.py
+++ b/modules/control/run.py
@@ -282,67 +282,72 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
     else:
         pass
 
-    debug(f'Control: run type={unit_type} models={has_models}')
-    if has_models:
-        p.ops.append('control')
-        p.extra_generation_params["Control mode"] = unit_type # overriden later with pretty-print
-        p.extra_generation_params["Control conditioning"] = control_conditioning if isinstance(control_conditioning, list) else [control_conditioning]
-        p.extra_generation_params['Control start'] = control_guidance_start if isinstance(control_guidance_start, list) else [control_guidance_start]
-        p.extra_generation_params['Control end'] = control_guidance_end if isinstance(control_guidance_end, list) else [control_guidance_end]
-        p.extra_generation_params["Control model"] = ';'.join([(m.model_id or '') for m in active_model if m.model is not None])
-        p.extra_generation_params["Control conditioning"] = ';'.join([str(c) for c in p.extra_generation_params["Control conditioning"]])
-        p.extra_generation_params['Control start'] = ';'.join([str(c) for c in p.extra_generation_params['Control start']])
-        p.extra_generation_params['Control end'] = ';'.join([str(c) for c in p.extra_generation_params['Control end']])
-    if unit_type == 't2i adapter' and has_models:
-        p.extra_generation_params["Control mode"] = 'T2I-Adapter'
-        p.task_args['adapter_conditioning_scale'] = control_conditioning
-        instance = t2iadapter.AdapterPipeline(selected_models, shared.sd_model)
-        pipe = instance.pipeline
-        if inits is not None:
-            shared.log.warning('Control: T2I-Adapter does not support separate init image')
-    elif unit_type == 'controlnet' and has_models:
-        p.extra_generation_params["Control mode"] = 'ControlNet'
-        p.task_args['controlnet_conditioning_scale'] = control_conditioning
-        p.task_args['control_guidance_start'] = control_guidance_start
-        p.task_args['control_guidance_end'] = control_guidance_end
-        p.task_args['guess_mode'] = p.guess_mode
-        instance = controlnet.ControlNetPipeline(selected_models, shared.sd_model)
-        pipe = instance.pipeline
-    elif unit_type == 'xs' and has_models:
-        p.extra_generation_params["Control mode"] = 'ControlNet-XS'
-        p.controlnet_conditioning_scale = control_conditioning
-        p.control_guidance_start = control_guidance_start
-        p.control_guidance_end = control_guidance_end
-        instance = xs.ControlNetXSPipeline(selected_models, shared.sd_model)
-        pipe = instance.pipeline
-        if inits is not None:
-            shared.log.warning('Control: ControlNet-XS does not support separate init image')
-    elif unit_type == 'lite' and has_models:
-        p.extra_generation_params["Control mode"] = 'ControlLLLite'
-        p.controlnet_conditioning_scale = control_conditioning
-        instance = lite.ControlLLitePipeline(shared.sd_model)
-        pipe = instance.pipeline
-        if inits is not None:
-            shared.log.warning('Control: ControlLLLite does not support separate init image')
-    elif unit_type == 'reference' and has_models:
-        p.extra_generation_params["Control mode"] = 'Reference'
-        p.extra_generation_params["Control attention"] = p.attention
-        p.task_args['reference_attn'] = 'Attention' in p.attention
-        p.task_args['reference_adain'] = 'Adain' in p.attention
-        p.task_args['attention_auto_machine_weight'] = p.query_weight
-        p.task_args['gn_auto_machine_weight'] = p.adain_weight
-        p.task_args['style_fidelity'] = p.fidelity
-        instance = reference.ReferencePipeline(shared.sd_model)
-        pipe = instance.pipeline
-        if inits is not None:
-            shared.log.warning('Control: ControlNet-XS does not support separate init image')
-    else: # run in txt2img/img2img mode
-        if len(active_strength) > 0:
-            p.strength = active_strength[0]
-        pipe = shared.sd_model
-        instance = None
+    def set_pipe():
+        global pipe, instance # pylint: disable=global-statement
+        pipe = None
+        if has_models:
+            p.ops.append('control')
+            p.extra_generation_params["Control mode"] = unit_type # overriden later with pretty-print
+            p.extra_generation_params["Control conditioning"] = control_conditioning if isinstance(control_conditioning, list) else [control_conditioning]
+            p.extra_generation_params['Control start'] = control_guidance_start if isinstance(control_guidance_start, list) else [control_guidance_start]
+            p.extra_generation_params['Control end'] = control_guidance_end if isinstance(control_guidance_end, list) else [control_guidance_end]
+            p.extra_generation_params["Control model"] = ';'.join([(m.model_id or '') for m in active_model if m.model is not None])
+            p.extra_generation_params["Control conditioning"] = ';'.join([str(c) for c in p.extra_generation_params["Control conditioning"]])
+            p.extra_generation_params['Control start'] = ';'.join([str(c) for c in p.extra_generation_params['Control start']])
+            p.extra_generation_params['Control end'] = ';'.join([str(c) for c in p.extra_generation_params['Control end']])
+        if unit_type == 't2i adapter' and has_models:
+            p.extra_generation_params["Control mode"] = 'T2I-Adapter'
+            p.task_args['adapter_conditioning_scale'] = control_conditioning
+            instance = t2iadapter.AdapterPipeline(selected_models, shared.sd_model)
+            pipe = instance.pipeline
+            if inits is not None:
+                shared.log.warning('Control: T2I-Adapter does not support separate init image')
+        elif unit_type == 'controlnet' and has_models:
+            p.extra_generation_params["Control mode"] = 'ControlNet'
+            p.task_args['controlnet_conditioning_scale'] = control_conditioning
+            p.task_args['control_guidance_start'] = control_guidance_start
+            p.task_args['control_guidance_end'] = control_guidance_end
+            p.task_args['guess_mode'] = p.guess_mode
+            instance = controlnet.ControlNetPipeline(selected_models, shared.sd_model)
+            pipe = instance.pipeline
+        elif unit_type == 'xs' and has_models:
+            p.extra_generation_params["Control mode"] = 'ControlNet-XS'
+            p.controlnet_conditioning_scale = control_conditioning
+            p.control_guidance_start = control_guidance_start
+            p.control_guidance_end = control_guidance_end
+            instance = xs.ControlNetXSPipeline(selected_models, shared.sd_model)
+            pipe = instance.pipeline
+            if inits is not None:
+                shared.log.warning('Control: ControlNet-XS does not support separate init image')
+        elif unit_type == 'lite' and has_models:
+            p.extra_generation_params["Control mode"] = 'ControlLLLite'
+            p.controlnet_conditioning_scale = control_conditioning
+            instance = lite.ControlLLitePipeline(shared.sd_model)
+            pipe = instance.pipeline
+            if inits is not None:
+                shared.log.warning('Control: ControlLLLite does not support separate init image')
+        elif unit_type == 'reference' and has_models:
+            p.extra_generation_params["Control mode"] = 'Reference'
+            p.extra_generation_params["Control attention"] = p.attention
+            p.task_args['reference_attn'] = 'Attention' in p.attention
+            p.task_args['reference_adain'] = 'Adain' in p.attention
+            p.task_args['attention_auto_machine_weight'] = p.query_weight
+            p.task_args['gn_auto_machine_weight'] = p.adain_weight
+            p.task_args['style_fidelity'] = p.fidelity
+            instance = reference.ReferencePipeline(shared.sd_model)
+            pipe = instance.pipeline
+            if inits is not None:
+                shared.log.warning('Control: Reference does not support separate init image')
+        else: # run in txt2img/img2img mode
+            if len(active_strength) > 0:
+                p.strength = active_strength[0]
+            pipe = shared.sd_model
+            instance = None
+        debug(f'Control: run type={unit_type} models={has_models} pipe={pipe.__class__.__name__ if pipe is not None else None}')
+        return pipe
 
 
+    pipe = set_pipe()
     debug(f'Control pipeline: class={pipe.__class__.__name__} args={vars(p)}')
     t1, t2, t3 = time.time(), 0, 0
     status = True
@@ -351,6 +356,7 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
     output_filename = None
     index = 0
     frames = 0
+    blended_image = None
 
     # set pipeline
     if pipe.__class__.__name__ != shared.sd_model.__class__.__name__:
@@ -382,6 +388,7 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
                     codec = util.decode_fourcc(video.get(cv2.CAP_PROP_FOURCC))
                     status, frame = video.read()
                     if status:
+                        shared.state.frame_count = 1 + frames // (video_skip_frames + 1)
                         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     shared.log.debug(f'Control: input video: path={inputs} frames={frames} fps={fps} size={w}x{h} codec={codec}')
                 except Exception as e:
@@ -389,6 +396,9 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
                     return [], '', '', 'Error: video open failed'
 
             while status:
+                if pipe is None: # pipe may have been reset externally
+                    pipe = set_pipe()
+                    debug(f'Control pipeline reinit: class={pipe.__class__.__name__}')
                 processed_image = None
                 if frame is not None:
                     inputs = [Image.fromarray(frame)] # cv2 to pil
@@ -425,9 +435,10 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
                     else:
                         debug(f'Control Init image: {i % len(inits) + 1} of {len(inits)}')
                         init_image = inits[i % len(inits)]
-                    index += 1
                     if video is not None and index % (video_skip_frames + 1) != 0:
+                        index += 1
                         continue
+                    index += 1
 
                     # resize before
                     if resize_mode_before != 0 and resize_name_before != 'None':
@@ -477,7 +488,6 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
                             process.model = None
 
                     debug(f'Control processed: {len(processed_images)}')
-                    blended_image = None
                     if len(processed_images) > 0:
                         try:
                             if len(p.extra_generation_params["Control process"]) == 0:
@@ -593,10 +603,11 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
                     output = None
                     script_run = False
                     if pipe is not None: # run new pipeline
-                        pipe.restore_pipeline = restore_pipeline
+                        if not hasattr(pipe, 'restore_pipeline') and video is None:
+                            pipe.restore_pipeline = restore_pipeline
                         debug(f'Control exec pipeline: task={sd_models.get_diffusers_task(pipe)} class={pipe.__class__}')
-                        debug(f'Control exec pipeline: p={vars(p)}')
-                        debug(f'Control exec pipeline: args={p.task_args} image={p.task_args.get("image", None)} control={p.task_args.get("control_image", None)} mask={p.task_args.get("mask_image", None) or p.image_mask} ref={p.task_args.get("ref_image", None)}')
+                        # debug(f'Control exec pipeline: p={vars(p)}')
+                        # debug(f'Control exec pipeline: args={p.task_args} image={p.task_args.get("image", None)} control={p.task_args.get("control_image", None)} mask={p.task_args.get("mask_image", None) or p.image_mask} ref={p.task_args.get("ref_image", None)}')
                         if sd_models.get_diffusers_task(pipe) != sd_models.DiffusersTaskType.TEXT_2_IMAGE: # force vae back to gpu if not in txt2img mode
                             sd_models.move_model(pipe.vae, devices.device)
 
@@ -692,5 +703,4 @@ def control_run(units: List[unit.Unit] = [], inputs: List[Image.Image] = [], ini
     if is_generator:
         yield (output_images, blended_image, html_txt, output_filename)
     else:
-        yield (output_images, blended_image, html_txt, output_filename)
-        return
+        return (output_images, blended_image, html_txt, output_filename)
diff --git a/modules/control/units/controlnet.py b/modules/control/units/controlnet.py
index b57005f3c..ec99b7bd6 100644
--- a/modules/control/units/controlnet.py
+++ b/modules/control/units/controlnet.py
@@ -49,9 +49,12 @@ predefined_sdxl = {
     'Canny XL': 'diffusers/controlnet-canny-sdxl-1.0',
     'Depth Zoe XL': 'diffusers/controlnet-zoe-depth-sdxl-1.0',
     'Depth Mid XL': 'diffusers/controlnet-depth-sdxl-1.0-mid',
-    'OpenPose XL': 'thibaud/controlnet-openpose-sdxl-1.0',
+    'OpenPose XL': 'thibaud/controlnet-openpose-sdxl-1.0/bin',
+    # 'OpenPose XL': 'thibaud/controlnet-openpose-sdxl-1.0/OpenPoseXL2.safetensors',
+    'Xinsir Union XL': 'xinsir/controlnet-union-sdxl-1.0',
     'Xinsir OpenPose XL': 'xinsir/controlnet-openpose-sdxl-1.0',
     'Xinsir Canny XL': 'xinsir/controlnet-canny-sdxl-1.0',
+    'Xinsir Depth XL': 'xinsir/controlnet-depth-sdxl-1.0',
     'Xinsir Scribble XL': 'xinsir/controlnet-scribble-sdxl-1.0',
     'Xinsir Anime Painter XL': 'xinsir/anime-painter',
     # 'StabilityAI Canny R128': 'stabilityai/control-lora/control-LoRAs-rank128/control-lora-canny-rank128.safetensors',
@@ -171,6 +174,9 @@ class ControlNet():
             if model_path.endswith('.safetensors'):
                 self.load_safetensors(model_path)
             else:
+                if '/bin' in model_path:
+                    model_path = model_path.replace('/bin', '')
+                    self.load_config['use_safetensors'] = False
                 self.model = ControlNetModel.from_pretrained(model_path, **self.load_config)
             if self.dtype is not None:
                 self.model.to(self.dtype)
diff --git a/modules/control/units/xs_pipe.py b/modules/control/units/xs_pipe.py
index 7e717b542..30cd8cef0 100644
--- a/modules/control/units/xs_pipe.py
+++ b/modules/control/units/xs_pipe.py
@@ -26,8 +26,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, StableDiffus
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
+    FusedAttnProcessor2_0,
     XFormersAttnProcessor,
 )
 from diffusers.models.lora import adjust_lora_scale_text_encoder
@@ -652,8 +651,7 @@ class StableDiffusionXLControlNetXSPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/modules/devices.py b/modules/devices.py
index d72555192..e703a0b50 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -46,7 +46,7 @@ def get_gpu_info():
         try:
             if shared.cmd_opts.use_openvino:
                 return {
-                    'device': get_openvino_device(),
+                    'device': get_openvino_device(), # pylint: disable=used-before-assignment
                     'openvino': get_package_version("openvino"),
                 }
             elif shared.cmd_opts.use_directml:
@@ -166,7 +166,7 @@ def torch_gc(force=False):
     after = { 'gpu': mem.get('gpu', {}).get('used', 0), 'ram': mem.get('ram', {}).get('used', 0), 'retries': mem.get('retries', 0), 'oom': mem.get('oom', 0) }
     utilization = { 'gpu': used_gpu, 'ram': used_ram, 'threshold': threshold }
     results = { 'collected': collected, 'saved': saved }
-    log.debug(f'GC: utilization={utilization} gc={results} beofre={before} after={after} device={torch.device(get_optimal_device_name())} fn={sys._getframe(1).f_code.co_name} time={round(t1 - t0, 2)}') # pylint: disable=protected-access
+    log.debug(f'GC: utilization={utilization} gc={results} before={before} after={after} device={torch.device(get_optimal_device_name())} fn={sys._getframe(1).f_code.co_name} time={round(t1 - t0, 2)}') # pylint: disable=protected-access
 
 
 def set_cuda_sync_mode(mode):
@@ -311,7 +311,7 @@ def set_cuda_params():
         inference_context = contextlib.nullcontext
     else:
         inference_context = torch.no_grad
-    log_device_name = get_raw_openvino_device() if shared.cmd_opts.use_openvino else torch.device(get_optimal_device_name())
+    log_device_name = get_raw_openvino_device() if shared.cmd_opts.use_openvino else torch.device(get_optimal_device_name()) # pylint: disable=used-before-assignment
     log.debug(f'Desired Torch parameters: dtype={shared.opts.cuda_dtype} no-half={shared.opts.no_half} no-half-vae={shared.opts.no_half_vae} upscast={shared.opts.upcast_sampling}')
     log.info(f'Setting Torch parameters: device={log_device_name} dtype={dtype} vae={dtype_vae} unet={dtype_unet} context={inference_context.__name__} fp16={fp16_ok} bf16={bf16_ok} optimization={shared.opts.cross_attention_optimization}')
 
diff --git a/modules/face/faceswap.py b/modules/face/faceswap.py
index 2d03a5232..2868d6c78 100644
--- a/modules/face/faceswap.py
+++ b/modules/face/faceswap.py
@@ -22,6 +22,9 @@ def face_swap(p: processing.StableDiffusionProcessing, app, input_images: List[I
 
     np_image = cv2.cvtColor(np.array(source_image), cv2.COLOR_RGB2BGR)
     faces = app.get(np_image)
+    if faces is None or len(faces) == 0:
+        shared.log.warning('FaceSwap: No faces detected')
+        return
     source_face = faces[0]
     processed_images = []
     for image in input_images:
diff --git a/modules/loader.py b/modules/loader.py
index 1fe3ba81e..e7cb03339 100644
--- a/modules/loader.py
+++ b/modules/loader.py
@@ -35,7 +35,7 @@ timer.startup.record("torch")
 import transformers # pylint: disable=W0611,C0411
 timer.startup.record("transformers")
 
-import onnxruntime
+import onnxruntime # pylint: disable=W0611,C0411
 onnxruntime.set_default_logger_severity(3)
 timer.startup.record("onnx")
 
@@ -50,7 +50,7 @@ timer.startup.record("pydantic")
 import diffusers # pylint: disable=W0611,C0411
 import diffusers.loaders.single_file # pylint: disable=W0611,C0411
 logging.getLogger("diffusers.loaders.single_file").setLevel(logging.ERROR)
-from tqdm.rich import tqdm
+from tqdm.rich import tqdm # pylint: disable=W0611,C0411
 diffusers.loaders.single_file.logging.tqdm = partial(tqdm, unit='C')
 timer.startup.record("diffusers")
 
diff --git a/modules/model_kolors.py b/modules/model_kolors.py
new file mode 100644
index 000000000..dcf7c1f26
--- /dev/null
+++ b/modules/model_kolors.py
@@ -0,0 +1,28 @@
+import torch
+import transformers
+import diffusers
+
+
+repo_id = 'Kwai-Kolors/Kolors'
+encoder_id = 'THUDM/chatglm3-6b'
+
+
+def load_kolors(_checkpoint_info, diffusers_load_config=None):
+    """Load Kolors as an SDXL pipeline with the ChatGLM3 encoder/tokenizer; the caller's config dict is not modified."""
+    from modules import shared, devices, modelloader
+    modelloader.hf_login()
+    # work on a copy: force the fp16 variant and fall back to a real torch dtype (not the string 'torch.float16')
+    load_config = {**(diffusers_load_config or {}), 'variant': "fp16"}
+    load_config.setdefault('torch_dtype', torch.float16)
+    text_encoder = transformers.AutoModel.from_pretrained(encoder_id, torch_dtype=torch.float16, trust_remote_code=True, cache_dir=shared.opts.diffusers_dir)
+    # text_encoder = transformers.AutoModel.from_pretrained("THUDM/chatglm3-6b", torch_dtype=torch.float16, trust_remote_code=True).quantize(4).cuda()
+    tokenizer = transformers.AutoTokenizer.from_pretrained(encoder_id, trust_remote_code=True, cache_dir=shared.opts.diffusers_dir)
+    pipe = diffusers.StableDiffusionXLPipeline.from_pretrained(
+        repo_id,
+        tokenizer=tokenizer,
+        text_encoder=text_encoder,
+        cache_dir=shared.opts.diffusers_dir,
+        **load_config,
+    )
+    devices.torch_gc()
+    return pipe
diff --git a/modules/model_lumina.py b/modules/model_lumina.py
new file mode 100644
index 000000000..ca5f2e7b8
--- /dev/null
+++ b/modules/model_lumina.py
@@ -0,0 +1,24 @@
+import diffusers
+
+
+def load_lumina(_checkpoint_info, diffusers_load_config=None):
+    """
+    Load the Lumina-Next text-to-image pipeline from the Hugging Face hub.
+
+    The caller's config is copied, not modified; four keys are dropped (presumably unsupported by
+    this pipeline - confirm) and `torch_dtype` falls back to the real dtype `torch.float16`.
+    """
+    import torch
+    from modules import shared, devices, modelloader
+    modelloader.hf_login()
+    # {'low_cpu_mem_usage': True, 'torch_dtype': torch.float16, 'load_connected_pipeline': True, 'safety_checker': None, 'requires_safety_checker': False}
+    dropped = ('low_cpu_mem_usage', 'load_connected_pipeline', 'safety_checker', 'requires_safety_checker')
+    load_config = {k: v for k, v in (diffusers_load_config or {}).items() if k not in dropped}
+    load_config.setdefault('torch_dtype', torch.float16)
+    pipe = diffusers.LuminaText2ImgPipeline.from_pretrained(
+        'Alpha-VLLM/Lumina-Next-SFT-diffusers',
+        cache_dir=shared.opts.diffusers_dir,
+        **load_config,
+    )
+    devices.torch_gc()
+    return pipe
diff --git a/modules/model_sd3.py b/modules/model_sd3.py
index 81470a97e..8fc2de233 100644
--- a/modules/model_sd3.py
+++ b/modules/model_sd3.py
@@ -13,9 +13,9 @@ def load_sd3(fn=None, cache_dir=None, config=None):
     if fn is not None and fn.endswith('.safetensors') and os.path.exists(fn):
         model_id = fn
         loader = diffusers.StableDiffusion3Pipeline.from_single_file
-        diffusers_minor = int(diffusers.__version__.split('.')[1])
+        _diffusers_major, diffusers_minor, diffusers_micro = int(diffusers.__version__.split('.')[0]), int(diffusers.__version__.split('.')[1]), int(diffusers.__version__.split('.')[2])
         fn_size = os.path.getsize(fn)
-        if diffusers_minor < 30 or fn_size < 5e9: # te1/te2 do not get loaded correctly in diffusers 0.29.0 or model is without te1/te2
+        if (diffusers_minor <= 29 and diffusers_micro < 1) or fn_size < 5e9: # te1/te2 do not get loaded correctly in diffusers 0.29.0 if model is without te1/te2
             kwargs = {
                 'text_encoder': transformers.CLIPTextModelWithProjection.from_pretrained(
                     repo_id,
diff --git a/modules/model_t5.py b/modules/model_t5.py
index 7b735794c..1a1e40382 100644
--- a/modules/model_t5.py
+++ b/modules/model_t5.py
@@ -75,3 +75,4 @@ def set_t5(pipe, module, t5=None, cache_dir=None):
         else:
             pipe.maybe_free_model_hooks()
     devices.torch_gc()
+    return pipe
diff --git a/modules/onnx_impl/__init__.py b/modules/onnx_impl/__init__.py
index 7e23e72e6..013387d8f 100644
--- a/modules/onnx_impl/__init__.py
+++ b/modules/onnx_impl/__init__.py
@@ -3,7 +3,6 @@ import numpy as np
 import torch
 import diffusers
 import onnxruntime as ort
-import optimum.onnxruntime
 
 initialized = False
 
@@ -208,8 +207,6 @@ def initialize_onnx():
         from .pipelines.onnx_stable_diffusion_img2img_pipeline import OnnxStableDiffusionImg2ImgPipeline
         from .pipelines.onnx_stable_diffusion_inpaint_pipeline import OnnxStableDiffusionInpaintPipeline
         from .pipelines.onnx_stable_diffusion_upscale_pipeline import OnnxStableDiffusionUpscalePipeline
-        from .pipelines.onnx_stable_diffusion_xl_pipeline import OnnxStableDiffusionXLPipeline
-        from .pipelines.onnx_stable_diffusion_xl_img2img_pipeline import OnnxStableDiffusionXLImg2ImgPipeline
 
         OnnxRuntimeModel.__module__ = 'diffusers' # OnnxRuntimeModel Hijack.
         diffusers.OnnxRuntimeModel = OnnxRuntimeModel
@@ -225,6 +222,16 @@ def initialize_onnx():
 
         diffusers.OnnxStableDiffusionUpscalePipeline = OnnxStableDiffusionUpscalePipeline
 
+        log.debug(f'ONNX: version={ort.__version__} provider={opts.onnx_execution_provider}, available={available_execution_providers}')
+    except Exception as e:
+        log.error(f'ONNX failed to initialize: {e}')
+
+    try:
+        # load xl pipelines. may fail if the user has the latest diffusers (0.30.x)
+        import optimum.onnxruntime
+        from .pipelines.onnx_stable_diffusion_xl_pipeline import OnnxStableDiffusionXLPipeline
+        from .pipelines.onnx_stable_diffusion_xl_img2img_pipeline import OnnxStableDiffusionXLImg2ImgPipeline
+
         diffusers.OnnxStableDiffusionXLPipeline = OnnxStableDiffusionXLPipeline
         diffusers.pipelines.auto_pipeline.AUTO_TEXT2IMAGE_PIPELINES_MAPPING["onnx-stable-diffusion-xl"] = diffusers.OnnxStableDiffusionXLPipeline
 
@@ -235,10 +242,9 @@ def initialize_onnx():
         diffusers.ORTStableDiffusionXLImg2ImgPipeline = diffusers.OnnxStableDiffusionXLImg2ImgPipeline
 
         optimum.onnxruntime.modeling_diffusion._ORTDiffusionModelPart.to = ORTDiffusionModelPart_to # pylint: disable=protected-access
+    except Exception:
+        pass
 
-        log.debug(f'ONNX: version={ort.__version__} provider={opts.onnx_execution_provider}, available={available_execution_providers}')
-    except Exception as e:
-        log.error(f'ONNX failed to initialize: {e}')
     initialized = True
 
 
diff --git a/modules/onnx_impl/execution_providers.py b/modules/onnx_impl/execution_providers.py
index aa1e7346c..641bf8720 100644
--- a/modules/onnx_impl/execution_providers.py
+++ b/modules/onnx_impl/execution_providers.py
@@ -52,8 +52,8 @@ def get_execution_provider_options():
     execution_provider_options = { "device_id": int(cmd_opts.device_id or 0) }
     if opts.onnx_execution_provider == ExecutionProvider.ROCm:
         if ExecutionProvider.ROCm in available_execution_providers:
-            execution_provider_options["tunable_op_enable"] = 1
-            execution_provider_options["tunable_op_tuning_enable"] = 1
+            execution_provider_options["tunable_op_enable"] = True
+            execution_provider_options["tunable_op_tuning_enable"] = True
     elif opts.onnx_execution_provider == ExecutionProvider.OpenVINO:
         from modules.intel.openvino import get_device as get_raw_openvino_device
         device = get_raw_openvino_device()
diff --git a/modules/onnx_impl/pipelines/__init__.py b/modules/onnx_impl/pipelines/__init__.py
index bb0d9b010..ca1ddd2f7 100644
--- a/modules/onnx_impl/pipelines/__init__.py
+++ b/modules/onnx_impl/pipelines/__init__.py
@@ -1,14 +1,12 @@
 import os
+import sys
 import json
 import shutil
 import tempfile
 from abc import ABCMeta
 from typing import Type, Tuple, List, Any, Dict
-from packaging import version
 import torch
 import diffusers
-import onnxruntime as ort
-import optimum.onnxruntime
 from installer import log, install
 from modules import shared
 from modules.paths import sd_configs_path, models_path
@@ -23,7 +21,6 @@ from modules.onnx_impl.execution_providers import ExecutionProvider, EP_TO_NAME,
 SUBMODELS_SD = ("text_encoder", "unet", "vae_encoder", "vae_decoder",)
 SUBMODELS_SDXL = ("text_encoder", "text_encoder_2", "unet", "vae_encoder", "vae_decoder",)
 SUBMODELS_SDXL_REFINER = ("text_encoder_2", "unet", "vae_encoder", "vae_decoder",)
-
 SUBMODELS_LARGE = ("text_encoder_2", "unet",)
 
 
@@ -48,11 +45,13 @@ class PipelineBase(TorchCompatibleModule, diffusers.DiffusionPipeline, metaclass
 
             module = getattr(self, name)
 
-            if isinstance(module, optimum.onnxruntime.modeling_diffusion._ORTDiffusionModelPart): # pylint: disable=protected-access
-                device = extract_device(args, kwargs)
-                if device is None:
-                    return self
-                module.session = move_inference_session(module.session, device)
+            if "optimum.onnxruntime" in sys.modules:
+                import optimum.onnxruntime
+                if isinstance(module, optimum.onnxruntime.modeling_diffusion._ORTDiffusionModelPart): # pylint: disable=protected-access
+                    device = extract_device(args, kwargs)
+                    if device is None:
+                        return self
+                    module.session = move_inference_session(module.session, device)
 
             if not isinstance(module, diffusers.OnnxRuntimeModel):
                 continue
diff --git a/modules/pag/pipe_sdxl.py b/modules/pag/pipe_sdxl.py
index 429384ea3..82ae06c07 100644
--- a/modules/pag/pipe_sdxl.py
+++ b/modules/pag/pipe_sdxl.py
@@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
-from packaging import version
 
 from transformers import (
     CLIPImageProcessor,
@@ -26,8 +25,6 @@ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionMode
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from diffusers.models.lora import adjust_lora_scale_text_encoder
@@ -943,8 +940,6 @@ class StableDiffusionXLPAGPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
diff --git a/modules/postprocess/aurasr_arch.py b/modules/postprocess/aurasr_arch.py
new file mode 100644
index 000000000..2e5eb0ea6
--- /dev/null
+++ b/modules/postprocess/aurasr_arch.py
@@ -0,0 +1,833 @@
+# AuraSR: GAN-based Super-Resolution for real-world, a reproduction of the GigaGAN* paper. Implementation is
+# based on the unofficial lucidrains/gigagan-pytorch repository. Heavily modified from there.
+#
+# https://mingukkang.github.io/GigaGAN/
+from math import log2, ceil
+from functools import partial
+from typing import Any, Optional, List, Iterable
+
+import torch
+from torchvision import transforms
+from PIL import Image
+from torch import nn, einsum, Tensor
+import torch.nn.functional as F
+
+from einops import rearrange, repeat, reduce
+from einops.layers.torch import Rearrange
+
+
+def get_same_padding(size, kernel, dilation, stride):
+    return (dilation * (kernel - 1) + (stride - 1) * (size - 1)) // 2  # floor-halved total padding; the caller passes it to conv2d as a symmetric pad
+
+
+class AdaptiveConv2DMod(nn.Module):  # StyleGAN2-style weight-modulated conv; with num_conv_kernels > 1 a softmax over kernel_mod blends a bank of kernels
+    def __init__(
+        self,
+        dim,
+        dim_out,
+        kernel,
+        *,
+        demod=True,
+        stride=1,
+        dilation=1,
+        eps=1e-8,
+        num_conv_kernels=1,  # set this to be greater than 1 for adaptive
+    ):
+        super().__init__()
+        self.eps = eps
+
+        self.dim_out = dim_out
+
+        self.kernel = kernel
+        self.stride = stride
+        self.dilation = dilation
+        self.adaptive = num_conv_kernels > 1
+        # bank of num_conv_kernels candidate kernels, each shaped (dim_out, dim, kernel, kernel)
+        self.weights = nn.Parameter(
+            torch.randn((num_conv_kernels, dim_out, dim, kernel, kernel))
+        )
+
+        self.demod = demod
+        # overwrite the randn values above with Kaiming-normal initialisation
+        nn.init.kaiming_normal_(
+            self.weights, a=0, mode="fan_in", nonlinearity="leaky_relu"
+        )
+
+    def forward(
+        self, fmap, mod: Optional[Tensor] = None, kernel_mod: Optional[Tensor] = None
+    ):
+        """
+        Convolve `fmap` (b c h w) with per-sample weights scaled by `mod` (b i) and,
+        when adaptive, blended across the kernel bank by softmax(`kernel_mod`) (b n).
+        Notation in the einops patterns: b - batch, n - convs, o - output,
+        i - input, k - kernel.
+
+        NOTE(review): `mod` defaults to None but is dereferenced unconditionally, so
+        passing None raises AttributeError - confirm whether None should be supported.
+        """
+
+        b, h = fmap.shape[0], fmap.shape[-2]
+
+        # account for feature map that has been expanded by the scale in the first dimension
+        # due to multiscale inputs and outputs
+
+        if mod.shape[0] != b:
+            mod = repeat(mod, "b ... -> (s b) ...", s=b // mod.shape[0])
+
+        if exists(kernel_mod):
+            kernel_mod_has_el = kernel_mod.numel() > 0
+            # a non-empty kernel_mod is only valid for an adaptive layer
+            assert self.adaptive or not kernel_mod_has_el
+
+            if kernel_mod_has_el and kernel_mod.shape[0] != b:
+                kernel_mod = repeat(
+                    kernel_mod, "b ... -> (s b) ...", s=b // kernel_mod.shape[0]
+                )
+
+        # prepare weights for modulation
+
+        weights = self.weights
+
+        if self.adaptive:
+            weights = repeat(weights, "... -> b ...", b=b)
+
+            # determine an adaptive weight and 'select' the kernel to use with softmax
+
+            assert exists(kernel_mod) and kernel_mod.numel() > 0
+
+            kernel_attn = kernel_mod.softmax(dim=-1)
+            kernel_attn = rearrange(kernel_attn, "b n -> b n 1 1 1 1")
+
+            weights = reduce(weights * kernel_attn, "b n ... -> b ...", "sum")
+
+        # do the modulation, demodulation, as done in stylegan2
+
+        mod = rearrange(mod, "b i -> b 1 i 1 1")
+        # (mod + 1): a zero modulation leaves the weights unchanged
+        weights = weights * (mod + 1)
+
+        if self.demod:
+            inv_norm = (
+                reduce(weights**2, "b o i k1 k2 -> b o 1 1 1", "sum")
+                .clamp(min=self.eps)
+                .rsqrt()
+            )
+            weights = weights * inv_norm
+        # fold batch into channels so one grouped conv (groups=b) applies per-sample weights
+        fmap = rearrange(fmap, "b c h w -> 1 (b c) h w")
+
+        weights = rearrange(weights, "b o ... -> (b o) ...")
+        # padding is derived from the height only; F.conv2d itself runs with its default stride/dilation
+        padding = get_same_padding(h, self.kernel, self.dilation, self.stride)
+        fmap = F.conv2d(fmap, weights, padding=padding, groups=b)
+
+        return rearrange(fmap, "1 (b o) ... -> b o ...", b=b)
+
+
+class Attend(nn.Module):
+    """Scaled dot-product attention with a fused torch SDPA path and an explicit einsum path."""
+
+    def __init__(self, dropout=0.0, flash=False):
+        super().__init__()
+        self.dropout = dropout
+        self.attn_dropout = nn.Dropout(dropout)
+        # registered parameter that neither attention path in this class reads
+        self.scale = nn.Parameter(torch.randn(1))
+        self.flash = flash
+
+    def flash_attn(self, q, k, v):
+        """Fused attention on contiguous q/k/v; dropout is active only in training mode."""
+        drop_p = self.dropout if self.training else 0.0
+        q, k, v = (t.contiguous() for t in (q, k, v))
+        return F.scaled_dot_product_attention(q, k, v, dropout_p=drop_p)
+
+    def forward(self, q, k, v):
+        """Attend with q, k, v shaped (b, h, n, d), as the einsum patterns below require."""
+        if self.flash:
+            return self.flash_attn(q, k, v)
+
+        inv_sqrt_d = q.shape[-1] ** -0.5
+
+        # similarity
+        logits = einsum("b h i d, b h j d -> b h i j", q, k) * inv_sqrt_d
+
+        # attention weights: softmax over keys, then dropout
+        weights = self.attn_dropout(logits.softmax(dim=-1))
+
+        # aggregate values
+        return einsum("b h i j, b h j d -> b h i d", weights, v)
+
+
+def exists(x):  # True for anything other than None (0, "" and empty containers count as present)
+    return x is not None
+
+
+def default(val, d):
+    """Return `val` unless it is None, else the fallback `d`."""
+    # a callable fallback is invoked (only when it is needed) and its result is returned
+    return val if exists(val) else (d() if callable(d) else d)
+
+
+def cast_tuple(t, length=1):
+    """Return `t` unchanged when it is a tuple, else a tuple repeating `t` `length` times."""
+    # an existing tuple is passed through as-is, whatever its length
+    return t if isinstance(t, tuple) else (t,) * length
+
+
+def identity(t, *args, **kwargs):  # pass-through: returns the first argument and ignores everything else
+    return t
+
+
+def is_power_of_two(n):
+    return n > 0 and (((n & (n - 1)) == 0) if isinstance(n, int) else log2(n).is_integer())  # exact bit test for ints; non-positive input yields False instead of a log2 ValueError
+
+
+def null_iterator():
+    # endless generator of None; Block.forward falls back to it when no modulation iterator is given
+    yield from iter(lambda: None, object())
+
+def Downsample(dim, dim_out=None):
+    """Fold each 2x2 spatial patch into channels (halving H and W), then 1x1-conv to `dim_out` (default `dim`)."""
+    space_to_depth = Rearrange("b c (h p1) (w p2) -> b (c p1 p2) h w", p1=2, p2=2)
+    project = nn.Conv2d(dim * 4, default(dim_out, dim), 1)
+    return nn.Sequential(space_to_depth, project)
+
+
+class RMSNorm(nn.Module):  # unit-normalise over dim 1, then scale by the learned gain `g` and sqrt(channels)
+    def __init__(self, dim):
+        super().__init__()
+        self.eps = 1e-4  # stored, but forward() does not read it
+        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
+
+    def forward(self, x):
+        return (F.normalize(x, dim=1) * self.g).mul(x.shape[1] ** 0.5)
+
+
+# building block modules
+
+
+class Block(nn.Module):
+    """Modulated 3x3 convolution (AdaptiveConv2DMod) followed by SiLU."""
+
+    def __init__(self, dim, dim_out, groups=8, num_conv_kernels=0):
+        super().__init__()
+        # `groups` is accepted but not used by this class
+        self.kernel, self.dilation, self.stride = 3, 1, 1
+        self.proj = AdaptiveConv2DMod(dim, dim_out, kernel=3, num_conv_kernels=num_conv_kernels)
+        self.act = nn.SiLU()
+
+    def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
+        """Convolve `x` using the next two items of `conv_mods_iter`, then activate."""
+        # with no iterator supplied, both modulation inputs are drawn as None
+        mods = default(conv_mods_iter, null_iterator())
+        # draw order matters: `mod` first, `kernel_mod` second
+        mod = next(mods)
+        kernel_mod = next(mods)
+
+        return self.act(self.proj(x, mod=mod, kernel_mod=kernel_mod))
+
+
+class ResnetBlock(nn.Module):
+    """Two modulated Blocks plus a skip path (1x1 conv when dim != dim_out, else identity)."""
+
+    def __init__(
+        self, dim, dim_out, *, groups=8, num_conv_kernels=0, style_dims: Optional[List] = None
+    ):
+        super().__init__()
+        # `style_dims` is an out-parameter collecting the widths of the four modulation tensors
+        # this block consumes; defaulting to None avoids mutating one list shared by all calls.
+        if style_dims is not None:
+            style_dims.extend([dim, num_conv_kernels, dim_out, num_conv_kernels])
+        self.block1 = Block(dim, dim_out, groups=groups, num_conv_kernels=num_conv_kernels)
+        self.block2 = Block(dim_out, dim_out, groups=groups, num_conv_kernels=num_conv_kernels)
+        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
+        """Run both blocks off the same modulation iterator and add the skip path."""
+        h = self.block1(x, conv_mods_iter=conv_mods_iter)
+        h = self.block2(h, conv_mods_iter=conv_mods_iter)
+        return h + self.res_conv(x)
+
+
+class LinearAttention(nn.Module):
+    """Attention over spatial positions that contracts keys with values first (no n x n map is formed)."""
+
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        inner_dim = heads * dim_head
+        self.heads = heads
+        self.scale = dim_head**-0.5
+
+        # pre-norm, one bias-free 1x1 conv producing q, k and v, then a normed output projection
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias=False)
+        self.to_out = nn.Sequential(nn.Conv2d(inner_dim, dim, 1), RMSNorm(dim))
+
+    def forward(self, x):
+        """Map a (b, c, h, w) feature map to a tensor with `dim` channels and the same h, w."""
+        _, _, height, width = x.shape
+
+        # split channels into heads and flatten the spatial grid into one axis
+        q, k, v = (
+            rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads)
+            for t in self.to_qkv(self.norm(x)).chunk(3, dim=1)
+        )
+
+        # q is softmaxed over the feature axis and scaled, k over the position axis
+        q = q.softmax(dim=-2) * self.scale
+        k = k.softmax(dim=-1)
+
+        context = torch.einsum("b h d n, b h e n -> b h d e", k, v)
+        attended = torch.einsum("b h d e, b h d n -> b h e n", context, q)
+        merged = rearrange(attended, "b h c (x y) -> b (h c) x y", h=self.heads, x=height, y=width)
+        return self.to_out(merged)
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention over the spatial positions of a 2D feature map."""
+
+    def __init__(self, dim, heads=4, dim_head=32, flash=False):
+        super().__init__()
+        inner_dim = heads * dim_head
+        self.heads = heads
+        self.norm = RMSNorm(dim)
+        self.attend = Attend(flash=flash)
+        # bias-free 1x1 conv emits q, k and v stacked along the channel axis
+        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias=False)
+        self.to_out = nn.Conv2d(inner_dim, dim, 1)
+
+    def forward(self, x):
+        """Normalise, attend across all (x, y) positions per head, and project back to `dim` channels."""
+        _, _, height, width = x.shape
+
+        # one token per spatial position, per head
+        q, k, v = (
+            rearrange(t, "b (h c) x y -> b h (x y) c", h=self.heads)
+            for t in self.to_qkv(self.norm(x)).chunk(3, dim=1)
+        )
+
+        attended = rearrange(self.attend(q, k, v), "b h (x y) d -> b (h d) x y", x=height, y=width)
+        return self.to_out(attended)
+
+
+# feedforward
+def FeedForward(dim, mult=4):
+    """RMSNorm -> 1x1 conv widening channels by `mult` -> GELU -> 1x1 conv back to `dim`."""
+    hidden = dim * mult
+    norm = RMSNorm(dim)
+    expand = nn.Conv2d(dim, hidden, 1)
+    contract = nn.Conv2d(hidden, dim, 1)
+    return nn.Sequential(norm, expand, nn.GELU(), contract)
+
+
+# transformers
+class Transformer(nn.Module):
+    """Stack of `depth` layers, each an Attention followed by a FeedForward, both residual."""
+
+    def __init__(self, dim, dim_head=64, heads=8, depth=1, flash_attn=True, ff_mult=4):
+        super().__init__()
+
+        def make_layer():
+            # attention is constructed before the feed-forward, matching the order they are applied
+            attention = Attention(dim=dim, dim_head=dim_head, heads=heads, flash=flash_attn)
+            feed_forward = FeedForward(dim=dim, mult=ff_mult)
+            return nn.ModuleList([attention, feed_forward])
+
+        self.layers = nn.ModuleList([make_layer() for _ in range(depth)])
+
+    def forward(self, x):
+        """Return `x` after every layer's attention and feed-forward residual updates."""
+        for layer in self.layers:
+            attention, feed_forward = layer
+            # residual connections around both sub-layers
+            x = attention(x) + x
+            x = feed_forward(x) + x
+
+        return x
+
+
+class LinearTransformer(nn.Module):
+    """Stack of `depth` layers, each a LinearAttention followed by a FeedForward, both residual."""
+
+    def __init__(self, dim, dim_head=64, heads=8, depth=1, ff_mult=4):
+        super().__init__()
+
+        def make_layer():
+            attention = LinearAttention(dim=dim, dim_head=dim_head, heads=heads)
+            feed_forward = FeedForward(dim=dim, mult=ff_mult)
+            return nn.ModuleList([attention, feed_forward])
+
+        self.layers = nn.ModuleList([make_layer() for _ in range(depth)])
+
+    def forward(self, x):
+        """Return `x` after every layer's attention and feed-forward residual updates."""
+        for layer in self.layers:
+            attention, feed_forward = layer
+            x = attention(x) + x
+            x = feed_forward(x) + x
+
+        return x
+
+
+class NearestNeighborhoodUpsample(nn.Module):
+    """2x nearest-neighbour upsampling followed by a 3x3, stride-1, padding-1 convolution."""
+
+    def __init__(self, dim, dim_out=None):
+        super().__init__()
+        out_channels = default(dim_out, dim)  # output width falls back to the input width
+        self.conv = nn.Conv2d(dim, out_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        """Double the spatial size of `x`, then convolve."""
+        # batches of 64 or more are made contiguous first (the reason is not stated here)
+        source = x.contiguous() if x.shape[0] >= 64 else x
+        upsampled = F.interpolate(source, scale_factor=2.0, mode="nearest")
+
+        return self.conv(upsampled)
+
+class EqualLinear(nn.Module):
+    """Linear layer whose weight and (optional) bias are multiplied by `lr_mul` at call time."""
+    def __init__(self, dim, dim_out, lr_mul=1, bias=True):
+        super().__init__()
+        self.lr_mul = lr_mul
+        self.weight = nn.Parameter(torch.randn(dim_out, dim))
+        # always define `bias`, so forward() also works with bias=False (previously an AttributeError)
+        self.bias = nn.Parameter(torch.zeros(dim_out)) if bias else None
+
+    def forward(self, input):
+        return F.linear(input, self.weight * self.lr_mul, bias=None if self.bias is None else self.bias * self.lr_mul)
+
+
+class StyleGanNetwork(nn.Module):
+    """MLP of `depth` EqualLinear + LeakyReLU(0.2) layers applied to a normalised latent."""
+
+    def __init__(self, dim_in=128, dim_out=512, depth=8, lr_mul=0.1, dim_text_latent=0):
+        super().__init__()
+        self.dim_in = dim_in
+        self.dim_out = dim_out
+        self.dim_text_latent = dim_text_latent
+
+        # only the first layer takes the latent (plus optional text latent) width;
+        # every later layer maps dim_out -> dim_out
+        first_width = dim_in + dim_text_latent
+        input_widths = [first_width if i == 0 else dim_out for i in range(depth)]
+
+        layers = []
+        for width in input_widths:
+            layers.append(EqualLinear(width, dim_out, lr_mul))
+            layers.append(nn.LeakyReLU(0.2))
+        self.net = nn.Sequential(*layers)
+
+    def forward(self, x, text_latent=None):
+        """Normalise `x` along dim 1, append `text_latent` if configured, and run the MLP."""
+        latent = F.normalize(x, dim=1)
+        if self.dim_text_latent <= 0:
+            return self.net(latent)
+
+        # built with dim_text_latent > 0, so a text latent must be supplied
+        assert exists(text_latent)
+        latent = torch.cat((latent, text_latent), dim=-1)
+        return self.net(latent)
+
+
+class UnetUpsampler(torch.nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        *,
+        image_size: int,
+        input_image_size: int,
+        init_dim: Optional[int] = None,
+        out_dim: Optional[int] = None,
+        style_network: Optional[dict] = None,
+        up_dim_mults: tuple = (1, 2, 4, 8, 16),
+        down_dim_mults: tuple = (4, 8, 16),
+        channels: int = 3,
+        resnet_block_groups: int = 8,
+        full_attn: tuple = (False, False, False, True, True),
+        flash_attn: bool = True,
+        self_attn_dim_head: int = 64,
+        self_attn_heads: int = 8,
+        attn_depths: tuple = (2, 2, 2, 2, 4),
+        mid_attn_depth: int = 4,
+        num_conv_kernels: int = 4,
+        resize_mode: str = "bilinear",
+        unconditional: bool = True,
+        skip_connect_scale: Optional[float] = None,
+    ):
+        super().__init__()
+        self.style_network = style_network = StyleGanNetwork(**style_network)
+        self.unconditional = unconditional
+        assert not (
+            unconditional
+            and exists(style_network)
+            and style_network.dim_text_latent > 0
+        )
+
+        assert is_power_of_two(image_size) and is_power_of_two(
+            input_image_size
+        ), "both output image size and input image size must be power of 2"
+        assert (
+            input_image_size < image_size
+        ), "input image size must be smaller than the output image size, thus upsampling"
+
+        self.image_size = image_size
+        self.input_image_size = input_image_size
+
+        style_embed_split_dims = []
+
+        self.channels = channels
+        input_channels = channels
+
+        init_dim = default(init_dim, dim)
+
+        up_dims = [init_dim, *map(lambda m: dim * m, up_dim_mults)]
+        init_down_dim = up_dims[len(up_dim_mults) - len(down_dim_mults)]
+        down_dims = [init_down_dim, *map(lambda m: dim * m, down_dim_mults)]
+        self.init_conv = nn.Conv2d(input_channels, init_down_dim, 7, padding=3)
+
+        up_in_out = list(zip(up_dims[:-1], up_dims[1:]))
+        down_in_out = list(zip(down_dims[:-1], down_dims[1:]))
+
+        block_klass = partial(
+            ResnetBlock,
+            groups=resnet_block_groups,
+            num_conv_kernels=num_conv_kernels,
+            style_dims=style_embed_split_dims,
+        )
+
+        FullAttention = partial(Transformer, flash_attn=flash_attn)
+        *_, mid_dim = up_dims
+
+        self.skip_connect_scale = default(skip_connect_scale, 2**-0.5)
+
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+
+        block_count = 6
+
+        for ind, (
+            (dim_in, dim_out),
+            layer_full_attn,
+            layer_attn_depth,
+        ) in enumerate(zip(down_in_out, full_attn, attn_depths)):
+            attn_klass = FullAttention if layer_full_attn else LinearTransformer
+
+            blocks = []
+            for i in range(block_count):
+                blocks.append(block_klass(dim_in, dim_in))
+
+            self.downs.append(
+                nn.ModuleList(
+                    [
+                        nn.ModuleList(blocks),
+                        nn.ModuleList(
+                            [
+                                (
+                                    attn_klass(
+                                        dim_in,
+                                        dim_head=self_attn_dim_head,
+                                        heads=self_attn_heads,
+                                        depth=layer_attn_depth,
+                                    )
+                                    if layer_full_attn
+                                    else None
+                                ),
+                                nn.Conv2d(
+                                    dim_in, dim_out, kernel_size=3, stride=2, padding=1
+                                ),
+                            ]
+                        ),
+                    ]
+                )
+            )
+
+        self.mid_block1 = block_klass(mid_dim, mid_dim)
+        self.mid_attn = FullAttention(
+            mid_dim,
+            dim_head=self_attn_dim_head,
+            heads=self_attn_heads,
+            depth=mid_attn_depth,
+        )
+        self.mid_block2 = block_klass(mid_dim, mid_dim)
+
+        *_, last_dim = up_dims
+
+        for ind, (
+            (dim_in, dim_out),
+            layer_full_attn,
+            layer_attn_depth,
+        ) in enumerate(
+            zip(
+                reversed(up_in_out),
+                reversed(full_attn),
+                reversed(attn_depths),
+            )
+        ):
+            attn_klass = FullAttention if layer_full_attn else LinearTransformer
+
+            blocks = []
+            input_dim = dim_in * 2 if ind < len(down_in_out) else dim_in
+            for i in range(block_count):
+                blocks.append(block_klass(input_dim, dim_in))
+
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        nn.ModuleList(blocks),
+                        nn.ModuleList(
+                            [
+                                NearestNeighborhoodUpsample(
+                                    last_dim if ind == 0 else dim_out,
+                                    dim_in,
+                                ),
+                                (
+                                    attn_klass(
+                                        dim_in,
+                                        dim_head=self_attn_dim_head,
+                                        heads=self_attn_heads,
+                                        depth=layer_attn_depth,
+                                    )
+                                    if layer_full_attn
+                                    else None
+                                ),
+                            ]
+                        ),
+                    ]
+                )
+            )
+
+        self.out_dim = default(out_dim, channels)
+        self.final_res_block = block_klass(dim, dim)
+        self.final_to_rgb = nn.Conv2d(dim, channels, 1)
+        self.resize_mode = resize_mode
+        self.style_to_conv_modulations = nn.Linear(
+            style_network.dim_out, sum(style_embed_split_dims)
+        )
+        self.style_embed_split_dims = style_embed_split_dims
+
+    @property
+    def allowable_rgb_resolutions(self):
+        # Every 2**p with int(log2(input_image_size)) <= p < int(log2(image_size)).
+        lowest_exp = int(log2(self.input_image_size))
+        highest_exp = int(log2(self.image_size))
+        return [2**exp for exp in range(lowest_exp, highest_exp)]
+
+    @property
+    def device(self):
+        return next(self.parameters()).device  # device of the first parameter yielded by self.parameters()
+
+    @property
+    def total_params(self):
+        return sum(param.numel() for param in self.parameters())  # summed element count of all parameters
+
+    def resize_image_to(self, x, size):  # resample x to a (size, size) output using self.resize_mode
+        return F.interpolate(x, (size, size), mode=self.resize_mode)
+
+    def forward(
+        self,
+        lowres_image: torch.Tensor,  # last two dims must both equal self.input_image_size (asserted below)
+        styles: Optional[torch.Tensor] = None,  # when None, computed by self.style_network from noise
+        noise: Optional[torch.Tensor] = None,  # style-network input; only consulted when styles is None
+        global_text_tokens: Optional[torch.Tensor] = None,  # passed through to the style network unchanged
+        return_all_rgbs: bool = False,  # when True the result is the tuple (rgb, [])
+    ):
+        x = lowres_image
+        # Perturb the input with small Gaussian noise, then clamp the result back into [0, 1].
+        noise_scale = 0.001  # Adjust the scale of the noise as needed
+        noise_aug = torch.randn_like(x) * noise_scale
+        x = x + noise_aug
+        x = x.clamp(0, 1)
+
+        shape = x.shape
+        batch_size = shape[0]
+
+        assert shape[-2:] == ((self.input_image_size,) * 2)
+
+        # styles: when not supplied, run the style network on the given (or freshly sampled) noise
+        if not exists(styles):
+            assert exists(self.style_network)
+
+            noise = default(
+                noise,
+                torch.randn(
+                    (batch_size, self.style_network.dim_in), device=self.device
+                ),
+            )
+            styles = self.style_network(noise, global_text_tokens)
+
+        # project styles to conv modulations, split per self.style_embed_split_dims;
+        # the resulting chunks are handed to every block below as one shared iterator
+        conv_mods = self.style_to_conv_modulations(styles)
+        conv_mods = conv_mods.split(self.style_embed_split_dims, dim=-1)
+        conv_mods = iter(conv_mods)
+
+        x = self.init_conv(x)
+
+        h = []  # block outputs saved on the way down, popped again as skip inputs on the way up
+        for blocks, (attn, downsample) in self.downs:
+            for block in blocks:
+                x = block(x, conv_mods_iter=conv_mods)
+                h.append(x)
+
+            if attn is not None:
+                x = attn(x)
+
+            x = downsample(x)
+
+        x = self.mid_block1(x, conv_mods_iter=conv_mods)
+        x = self.mid_attn(x)
+        x = self.mid_block2(x, conv_mods_iter=conv_mods)
+
+        for (
+            blocks,
+            (
+                upsample,
+                attn,
+            ),
+        ) in self.ups:
+            x = upsample(x)
+            for block in blocks:
+                if h != []:  # once the saved outputs run out, blocks receive x without a skip input
+                    res = h.pop()
+                    res = res * self.skip_connect_scale
+                    x = torch.cat((x, res), dim=1)
+
+                x = block(x, conv_mods_iter=conv_mods)
+
+            if attn is not None:
+                x = attn(x)
+
+        x = self.final_res_block(x, conv_mods_iter=conv_mods)
+        rgb = self.final_to_rgb(x)
+
+        if not return_all_rgbs:
+            return rgb
+
+        return rgb, []  # the second element is always an empty list here
+
+
+def tile_image(image, chunk_size=64):
+    """Cut a (c, h, w) tensor into row-major tiles of at most chunk_size per side; returns (tiles, rows, cols)."""
+    _, height, width = image.shape
+    rows, cols = ceil(height / chunk_size), ceil(width / chunk_size)
+    tiles = [
+        image[:, top:top + chunk_size, left:left + chunk_size]
+        for top in range(0, rows * chunk_size, chunk_size)
+        for left in range(0, cols * chunk_size, chunk_size)
+    ]
+    return tiles, rows, cols
+
+
+def merge_tiles(tiles, h_chunks, w_chunks, chunk_size=64):
+    """Stitch row-major (c, h, w) tiles into one (c, h_chunks * chunk_size, w_chunks * chunk_size) tensor."""
+    c = tiles[0].shape[0]
+    h = h_chunks * chunk_size
+    w = w_chunks * chunk_size
+
+    # Match the tiles' dtype and device so the result is not silently placed on the default device.
+    merged = tiles[0].new_zeros((c, h, w))
+
+    # Tiles arrive row by row: the flat index maps to (row, column) on the chunk grid.
+    for idx, tile in enumerate(tiles):
+        i = idx // w_chunks
+        j = idx % w_chunks
+        h_start = i * chunk_size
+        w_start = j * chunk_size
+
+        # A tile may be smaller than chunk_size; only its own extent is written.
+        tile_h, tile_w = tile.shape[1:]
+        merged[:, h_start:h_start+tile_h, w_start:w_start+tile_w] = tile
+
+    return merged
+
+
+class AuraSR:
+    """Tiled 4x image upscaler wrapping ``UnetUpsampler``."""
+
+    def __init__(self, config: dict[str, Any], device: Optional[str] = None):
+        # Default to CUDA when present, otherwise CPU, instead of failing on hosts without CUDA.
+        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.upsampler = UnetUpsampler(**config).to(device)
+        self.input_image_size = config["input_image_size"]
+
+    @classmethod
+    def from_pretrained(cls, model_id: str = "fal-ai/AuraSR", use_safetensors: bool = True):
+        """Create a model from a local ``.safetensors``/``.ckpt`` file or a Hugging Face repo id.
+
+        A local file needs ``config.json`` beside it, and its suffix overrides ``use_safetensors``.
+        Raises ValueError, FileNotFoundError or ImportError as described in the messages below.
+        """
+        import json
+        from pathlib import Path
+        from huggingface_hub import snapshot_download
+
+        local_file = Path(model_id)
+        if local_file.is_file():
+            if local_file.suffix == '.safetensors':
+                use_safetensors = True
+            elif local_file.suffix == '.ckpt':
+                use_safetensors = False
+            else:
+                raise ValueError(f"Unsupported file format: {local_file.suffix}. Please use .safetensors or .ckpt files.")
+            config_path = local_file.with_name('config.json')
+            if not config_path.exists():
+                raise FileNotFoundError(
+                    f"Config file not found: {config_path}. "
+                    f"When loading from a local file, ensure that 'config.json' "
+                    f"is present in the same directory as '{local_file.name}'. "
+                    f"If you're trying to load a model from Hugging Face, "
+                    f"please provide the model ID instead of a file path."
+                )
+            config = json.loads(config_path.read_text())
+            weights_path = local_file
+        else:
+            repo_dir = Path(snapshot_download(model_id))
+            config = json.loads((repo_dir / "config.json").read_text())
+            weights_path = repo_dir / ("model.safetensors" if use_safetensors else "model.ckpt")
+
+        model = cls(config)
+        if use_safetensors:
+            try:
+                from safetensors.torch import load_file
+            except ImportError as exc:
+                raise ImportError(
+                    "The safetensors library is not installed. "
+                    "Please install it with `pip install safetensors` "
+                    "or use `use_safetensors=False` to load the model with PyTorch."
+                ) from exc
+            checkpoint = load_file(weights_path)
+        else:
+            # NOTE(review): .ckpt files are pickles - load trusted sources only. map_location keeps CPU-only hosts working.
+            checkpoint = torch.load(weights_path, map_location="cpu")
+
+        model.upsampler.load_state_dict(checkpoint, strict=True)
+        return model
+
+    @torch.no_grad()
+    def upscale_4x(self, image: Image.Image, max_batch_size=8) -> Image.Image:
+        """Upscale ``image`` 4x by running the model over ``input_image_size`` tiles, ``max_batch_size`` per batch."""
+        device = self.upsampler.device
+        size = self.input_image_size
+        image_tensor = transforms.ToTensor()(image).unsqueeze(0)
+        _, _, h, w = image_tensor.shape
+        pad_h = (size - h % size) % size
+        pad_w = (size - w % size) % size
+
+        # Reflect padding requires pad < dimension; images smaller than that fall back to edge replication.
+        pad_mode = 'reflect' if pad_h < h and pad_w < w else 'replicate'
+        image_tensor = torch.nn.functional.pad(image_tensor, (0, pad_w, 0, pad_h), mode=pad_mode).squeeze(0)
+        tiles, h_chunks, w_chunks = tile_image(image_tensor, size)
+
+        batches = [tiles[i:i + max_batch_size] for i in range(0, len(tiles), max_batch_size)]
+        reconstructed_tiles = []
+        for batch in batches:
+            model_input = torch.stack(batch).to(device)
+            # The style-noise width follows the style network instead of a hard-coded 128.
+            noise = torch.randn(model_input.shape[0], self.upsampler.style_network.dim_in, device=device)
+            generator_output = self.upsampler(lowres_image=model_input, noise=noise)
+            reconstructed_tiles.extend(list(generator_output.clamp_(0, 1).detach().cpu()))
+
+        merged_tensor = merge_tiles(reconstructed_tiles, h_chunks, w_chunks, size * 4)
+        unpadded = merged_tensor[:, :h * 4, :w * 4]
+        return transforms.ToPILImage()(unpadded)
diff --git a/modules/postprocess/aurasr_model.py b/modules/postprocess/aurasr_model.py
new file mode 100644
index 000000000..d1e69455e
--- /dev/null
+++ b/modules/postprocess/aurasr_model.py
@@ -0,0 +1,37 @@
+import torch
+import diffusers
+from PIL import Image
+from modules import shared, devices
+from modules.upscaler import Upscaler, UpscalerData
+from installer import install
+
+class UpscalerAuraSR(Upscaler):  # AuraSR 4x upscaler; weights are fetched lazily on the first do_upscale call
+    def __init__(self, dirname): # pylint: disable=super-init-not-called
+        self.name = "AuraSR"
+        self.user_path = dirname
+        self.model = None
+        if not shared.native:
+            super().__init__()
+            return
+        self.scalers = [  # path names the repo that do_upscale actually loads
+            UpscalerData(name="Aura SR 4x", path="vladmandic/aurasr", upscaler=self, model=None, scale=4),
+        ]
+
+    def callback(self, _step: int, _timestep: int, _latents: torch.FloatTensor):
+        pass  # intentionally a no-op
+
+    def do_upscale(self, img: Image.Image, selected_model):  # returns the image produced by AuraSR.upscale_4x
+        from modules.postprocess.aurasr_arch import AuraSR
+        if self.model is None:
+            self.model = AuraSR.from_pretrained("vladmandic/aurasr", use_safetensors=False)
+        devices.torch_gc()
+        try:
+            self.model.upsampler.to(devices.device)
+            image = self.model.upscale_4x(img)
+        finally:  # move the weights back to CPU even when upscaling raises, so a failure does not pin device memory
+            self.model.upsampler.to(devices.cpu)
+        if shared.opts.upscaler_unload and selected_model in self.models:  # NOTE(review): self.models is never set in this class - confirm the base provides it
+            self.model = None
+            shared.log.debug(f"Upscaler unloaded: type={self.name} model={selected_model}")
+            devices.torch_gc(force=True)
+        return image
diff --git a/modules/processing.py b/modules/processing.py
index d453e0b15..173d85564 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -389,6 +389,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
 
         if hasattr(shared.sd_model, 'restore_pipeline') and shared.sd_model.restore_pipeline is not None:
             shared.sd_model.restore_pipeline()
+        if shared.native: # reset pipeline for each iteration
+            shared.sd_model = sd_models.set_diffuser_pipe(shared.sd_model, sd_models.DiffusersTaskType.TEXT_2_IMAGE)
 
         t1 = time.time()
         shared.log.info(f'Processed: images={len(output_images)} time={t1 - t0:.2f} its={(p.steps * len(output_images)) / (t1 - t0):.2f} memory={memstats.memory_stats()}')
diff --git a/modules/processing_callbacks.py b/modules/processing_callbacks.py
index 824a79007..d6b81b21c 100644
--- a/modules/processing_callbacks.py
+++ b/modules/processing_callbacks.py
@@ -38,7 +38,8 @@ def diffusers_callback(pipe, step: int, timestep: int, kwargs: dict):
         return kwargs
     latents = kwargs.get('latents', None)
     debug_callback(f'Callback: step={step} timestep={timestep} latents={latents.shape if latents is not None else None} kwargs={list(kwargs)}')
-    shared.state.sampling_step = step
+    order = getattr(pipe.scheduler, "order", 1) if hasattr(pipe, 'scheduler') else 1
+    shared.state.sampling_step = step // order
     if shared.state.interrupted or shared.state.skipped:
         raise AssertionError('Interrupted...')
     if shared.state.paused:
diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py
index dca8e0a71..ef1eb9d69 100644
--- a/modules/processing_diffusers.py
+++ b/modules/processing_diffusers.py
@@ -85,7 +85,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
 
     shared.sd_model = update_pipeline(shared.sd_model, p)
     shared.log.info(f'Base: class={shared.sd_model.__class__.__name__}')
-    update_sampler(p, shared.sd_model) # TODO SD3
+    update_sampler(p, shared.sd_model)
     base_args = set_pipeline_args(
         p=p,
         model=shared.sd_model,
@@ -104,7 +104,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
         clip_skip=p.clip_skip,
         desc='Base',
     )
-    shared.state.sampling_steps = base_args.get('prior_num_inference_steps', None) or base_args.get('num_inference_steps', None) or p.steps
+    shared.state.sampling_steps = base_args.get('prior_num_inference_steps', None) or p.steps or base_args.get('num_inference_steps', None)
     if shared.opts.scheduler_eta is not None and shared.opts.scheduler_eta > 0 and shared.opts.scheduler_eta < 1:
         p.extra_generation_params["Sampler Eta"] = shared.opts.scheduler_eta
     output = None
@@ -215,7 +215,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
                 desc='Hires',
             )
             shared.state.job = 'HiRes'
-            shared.state.sampling_steps = hires_args.get('prior_num_inference_steps', None) or hires_args.get('num_inference_steps', None) or p.steps
+            shared.state.sampling_steps = hires_args.get('prior_num_inference_steps', None) or p.steps or hires_args.get('num_inference_steps', None)
             try:
                 sd_models_compile.check_deepcache(enable=True)
                 output = shared.sd_model(**hires_args) # pylint: disable=not-callable
@@ -280,7 +280,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
                 clip_skip=p.clip_skip,
                 desc='Refiner',
             )
-            shared.state.sampling_steps = refiner_args.get('prior_num_inference_steps', None) or refiner_args.get('num_inference_steps', None) or p.steps
+            shared.state.sampling_steps = refiner_args.get('prior_num_inference_steps', None) or p.steps or refiner_args.get('num_inference_steps', None)
             try:
                 if 'requires_aesthetics_score' in shared.sd_refiner.config: # sdxl-model needs false and sdxl-refiner needs true
                     shared.sd_refiner.register_to_config(requires_aesthetics_score = getattr(shared.sd_refiner, 'tokenizer', None) is None)
diff --git a/modules/progress.py b/modules/progress.py
index f5573374b..aeef195b4 100644
--- a/modules/progress.py
+++ b/modules/progress.py
@@ -62,7 +62,7 @@ def progressapi(req: ProgressRequest):
     paused = shared.state.paused
     if not active:
         return InternalProgressResponse(job=shared.state.job, active=active, queued=queued, paused=paused, completed=completed, id_live_preview=-1, textinfo="Queued..." if queued else "Waiting...")
-    shared.state.job_count = max(shared.state.job_count, shared.state.job_no)
+    shared.state.job_count = max(shared.state.frame_count, shared.state.job_count, shared.state.job_no)
     batch_x = max(shared.state.job_no, 0)
     batch_y = max(shared.state.job_count, 1)
     step_x = max(shared.state.sampling_step, 0)
diff --git a/modules/prompt_parser_diffusers.py b/modules/prompt_parser_diffusers.py
index b212d08a6..167d33b53 100644
--- a/modules/prompt_parser_diffusers.py
+++ b/modules/prompt_parser_diffusers.py
@@ -389,7 +389,7 @@ def get_weighted_text_embeddings(pipe, prompt: str = "", neg_prompt: str = "", c
             except Exception:
                 pooled_prompt_embeds = None
                 negative_pooled_prompt_embeds = None
-        debug(f'Prompt: pooled shape={pooled_prompt_embeds[0].shape} time={(time.time() - t0):.3f}')
+        debug(f'Prompt: pooled shape={pooled_prompt_embeds[0].shape if pooled_prompt_embeds is not None else None} time={(time.time() - t0):.3f}')
 
     prompt_embeds = torch.cat(prompt_embeds, dim=-1) if len(prompt_embeds) > 1 else prompt_embeds[0]
     negative_prompt_embeds = torch.cat(negative_prompt_embeds, dim=-1) if len(negative_prompt_embeds) > 1 else \
diff --git a/modules/scripts.py b/modules/scripts.py
index 87a25a56b..3dbeffb3f 100644
--- a/modules/scripts.py
+++ b/modules/scripts.py
@@ -489,10 +489,9 @@ class ScriptRunner:
         s = ScriptSummary('before-process')
         for script in self.alwayson_scripts:
             try:
-                args = p.script_args[script.args_from:script.args_to]
-                if len(args) == 0:
-                    continue
-                script.before_process(p, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.before_process(p, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f"Error running before process: {script.filename}")
             s.record(script.title())
@@ -502,10 +501,9 @@ class ScriptRunner:
         s = ScriptSummary('process')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.process(p, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.process(p, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script process: {script.filename}')
             s.record(script.title())
@@ -516,10 +514,9 @@ class ScriptRunner:
         processed = None
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                processed = script.process_images(p, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    processed = script.process_images(p, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script process images: {script.filename}')
             s.record(script.title())
@@ -530,10 +527,9 @@ class ScriptRunner:
         s = ScriptSummary('before-process-batch')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.before_process_batch(p, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.before_process_batch(p, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script before process batch: {script.filename}')
             s.record(script.title())
@@ -543,10 +539,9 @@ class ScriptRunner:
         s = ScriptSummary('process-batch')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.process_batch(p, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.process_batch(p, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script process batch: {script.filename}')
             s.record(script.title())
@@ -556,10 +551,9 @@ class ScriptRunner:
         s = ScriptSummary('postprocess')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.postprocess(p, processed, *args)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.postprocess(p, processed, *args)
             except Exception as e:
                 errors.display(e, f'Running script postprocess: {script.filename}')
             s.record(script.title())
@@ -569,10 +563,9 @@ class ScriptRunner:
         s = ScriptSummary('postprocess-batch')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.postprocess_batch(p, *args, images=images, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.postprocess_batch(p, *args, images=images, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script before postprocess batch: {script.filename}')
             s.record(script.title())
@@ -582,10 +575,9 @@ class ScriptRunner:
         s = ScriptSummary('postprocess-batch-list')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.postprocess_batch_list(p, pp, *args, **kwargs)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.postprocess_batch_list(p, pp, *args, **kwargs)
             except Exception as e:
                 errors.display(e, f'Running script before postprocess batch list: {script.filename}')
             s.record(script.title())
@@ -595,10 +587,9 @@ class ScriptRunner:
         s = ScriptSummary('postprocess-image')
         for script in self.alwayson_scripts:
             try:
-                args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
-                if len(args) == 0:
-                    continue
-                script.postprocess_image(p, pp, *args)
+                if (script.args_to > 0) and (script.args_to >= script.args_from):
+                    args = p.per_script_args.get(script.title(), p.script_args[script.args_from:script.args_to])
+                    script.postprocess_image(p, pp, *args)
             except Exception as e:
                 errors.display(e, f'Running script postprocess image: {script.filename}')
             s.record(script.title())
diff --git a/modules/sd_models.py b/modules/sd_models.py
index f039ea316..47cc8538f 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -547,7 +547,7 @@ def change_backend():
     shared.native = shared.backend == shared.Backend.DIFFUSERS
     checkpoints_loaded.clear()
     from modules.sd_samplers import list_samplers
-    list_samplers(shared.backend)
+    list_samplers()
     list_models()
     from modules.sd_vae import refresh_vae_list
     refresh_vae_list()
@@ -586,7 +586,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
                     guess = 'Stable Diffusion XL Instruct'
                 elif (size > 3138 and size < 3142): #3140
                     guess = 'Stable Diffusion XL'
-                elif (size > 5692 and size < 5698) or (size > 4134 and size < 4138) or (size > 10362 and size < 10366):
+                elif (size > 5692 and size < 5698) or (size > 4134 and size < 4138) or (size > 10362 and size < 10366) or (size > 15028 and size < 15228):
                     guess = 'Stable Diffusion 3'
             # guess by name
             """
@@ -611,6 +611,10 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
                 guess = 'Stable Cascade'
             if 'pixart-sigma' in f.lower():
                 guess = 'PixArt-Sigma'
+            if 'lumina-next' in f.lower():
+                guess = 'Lumina-Next'
+            if 'kolors' in f.lower():
+                guess = 'Kolors'
             # switch for specific variant
             if guess == 'Stable Diffusion' and 'inpaint' in f.lower():
                 guess = 'Stable Diffusion Inpaint'
@@ -992,6 +996,24 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
                     if debug_load:
                         errors.display(e, 'Load')
                     return
+            elif model_type in ['Lumina-Next']: # forced pipeline
+                try:
+                    from modules.model_lumina import load_lumina
+                    sd_model = load_lumina(checkpoint_info, diffusers_load_config)
+                except Exception as e:
+                    shared.log.error(f'Diffusers Failed loading {op}: {checkpoint_info.path} {e}')
+                    if debug_load:
+                        errors.display(e, 'Load')
+                    return
+            elif model_type in ['Kolors']: # forced pipeline
+                try:
+                    from modules.model_kolors import load_kolors
+                    sd_model = load_kolors(checkpoint_info, diffusers_load_config)
+                except Exception as e:
+                    shared.log.error(f'Diffusers Failed loading {op}: {checkpoint_info.path} {e}')
+                    if debug_load:
+                        errors.display(e, 'Load')
+                    return
             elif model_type in ['Stable Diffusion 3']:
                 try:
                     from modules.model_sd3 import load_sd3
@@ -1150,7 +1172,7 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No
         timer.record("options")
 
         set_diffuser_offload(sd_model, op)
-        if op == 'model':
+        if op == 'model' and not (os.path.isdir(checkpoint_info.path) or checkpoint_info.type == 'huggingface'):
             sd_vae.apply_vae_config(shared.sd_model.sd_checkpoint_info.filename, vae_file, sd_model)
         if op == 'refiner' and shared.opts.diffusers_move_refiner:
             shared.log.debug('Moving refiner model to CPU')
diff --git a/modules/sd_models_compile.py b/modules/sd_models_compile.py
index ed3109869..fd7f189f6 100644
--- a/modules/sd_models_compile.py
+++ b/modules/sd_models_compile.py
@@ -309,6 +309,9 @@ def check_deepcache(enable: bool):
 
 def compile_deepcache(sd_model):
     global deepcache_worker # pylint: disable=global-statement
+    if not hasattr(sd_model, 'unet'):
+        shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported')
+        return sd_model
     try:
         from DeepCache import DeepCacheSDHelper
     except Exception as e:
diff --git a/modules/sd_samplers.py b/modules/sd_samplers.py
index 886b49ce3..5be3526ae 100644
--- a/modules/sd_samplers.py
+++ b/modules/sd_samplers.py
@@ -14,7 +14,7 @@ samplers_map = {}
 loaded_config = None
 
 
-def list_samplers(backend_name = shared.backend):
+def list_samplers():
     global all_samplers # pylint: disable=global-statement
     global all_samplers_map # pylint: disable=global-statement
     global samplers # pylint: disable=global-statement
diff --git a/modules/sd_samplers_cfg_denoiser.py b/modules/sd_samplers_cfg_denoiser.py
index 6685a4d38..4b42d4918 100644
--- a/modules/sd_samplers_cfg_denoiser.py
+++ b/modules/sd_samplers_cfg_denoiser.py
@@ -1,5 +1,5 @@
-# TODO a1111 compatibility module
-# TODO cfg_denoiser implementation missing
+# a1111 compatibility module
+# cfg_denoiser implementation missing
 
 import torch
 from modules import prompt_parser, devices, sd_samplers_common
@@ -95,7 +95,7 @@ class CFGDenoiser(torch.nn.Module):
         if state.interrupted or state.skipped:
             raise sd_samplers_common.InterruptedException
 
-        # TODO cfg_scale implementation missing
+        # cfg_scale implementation missing for original backend
         # if sd_samplers_common.apply_refiner(self):
         #     cond = self.sampler.sampler_extra_args['cond']
         #    uncond = self.sampler.sampler_extra_args['uncond']
diff --git a/modules/sd_samplers_diffusers.py b/modules/sd_samplers_diffusers.py
index 997ca5c3a..fa0c7aebb 100644
--- a/modules/sd_samplers_diffusers.py
+++ b/modules/sd_samplers_diffusers.py
@@ -33,6 +33,7 @@ try:
         PNDMScheduler,
         SASolverScheduler,
         FlowMatchEulerDiscreteScheduler,
+        # FlowMatchHeunDiscreteScheduler,
     )
 except Exception as e:
     import diffusers
@@ -67,6 +68,7 @@ config = {
     'DPM++ 2M EDM': { 'solver_order': 2, 'solver_type': 'midpoint', 'final_sigmas_type': 'zero', 'algorithm_type': 'dpmsolver++' },
     'CMSI': { }, #{ 'sigma_min':  0.002, 'sigma_max': 80.0, 'sigma_data': 0.5, 's_noise': 1.0, 'rho': 7.0, 'clip_denoised': True },
     'Euler FlowMatch': { 'shift': 1, },
+    # 'Heun FlowMatch': { 'shift': 1, },
     'IPNDM': { },
 }
 
@@ -99,6 +101,7 @@ samplers_data_diffusers = [
     sd_samplers_common.SamplerData('TCD', lambda model: DiffusionSampler('TCD', TCDScheduler, model), [], {}),
     sd_samplers_common.SamplerData('CMSI', lambda model: DiffusionSampler('CMSI', CMStochasticIterativeScheduler, model), [], {}),
     sd_samplers_common.SamplerData('Euler FlowMatch', lambda model: DiffusionSampler('Euler FlowMatch', FlowMatchEulerDiscreteScheduler, model), [], {}),
+    # sd_samplers_common.SamplerData('Heun FlowMatch', lambda model: DiffusionSampler('Heun FlowMatch', FlowMatchHeunDiscreteScheduler, model), [], {}),
 
     sd_samplers_common.SamplerData('Same as primary', None, [], {}),
 ]
diff --git a/modules/sd_samplers_timesteps.py b/modules/sd_samplers_timesteps.py
index 0e8e01909..aa9e0bbe2 100644
--- a/modules/sd_samplers_timesteps.py
+++ b/modules/sd_samplers_timesteps.py
@@ -1,4 +1,4 @@
-# TODO a1111 compatibility module
+# a1111 compatibility module
 
 import torch
 from modules import sd_samplers_common, sd_samplers_timesteps_impl, sd_samplers_compvis
diff --git a/modules/sd_samplers_timesteps_impl.py b/modules/sd_samplers_timesteps_impl.py
index 03716ee08..5784a760b 100644
--- a/modules/sd_samplers_timesteps_impl.py
+++ b/modules/sd_samplers_timesteps_impl.py
@@ -1,4 +1,4 @@
-# TODO a1111 compatibility module
+# a1111 compatibility module
 
 import torch
 import tqdm
diff --git a/modules/shared_items.py b/modules/shared_items.py
index 40785da85..e30bb16aa 100644
--- a/modules/shared_items.py
+++ b/modules/shared_items.py
@@ -71,15 +71,20 @@ def get_pipelines():
         'Kandinsky 3': getattr(diffusers, 'Kandinsky3Pipeline', None),
         'DeepFloyd IF': getattr(diffusers, 'IFPipeline', None),
         'Custom Diffusers Pipeline': getattr(diffusers, 'DiffusionPipeline', None),
+        'Kolors': getattr(diffusers, 'StableDiffusionXLPipeline', None),
         'InstaFlow': getattr(diffusers, 'StableDiffusionPipeline', None), # dynamically redefined and loaded in sd_models.load_diffuser
         'SegMoE': getattr(diffusers, 'StableDiffusionPipeline', None), # dynamically redefined and loaded in sd_models.load_diffuser
     }
-    if hasattr(diffusers, 'OnnxStableDiffusionXLPipeline'):
+    if hasattr(diffusers, 'OnnxStableDiffusionPipeline'):
         onnx_pipelines = {
             'ONNX Stable Diffusion': getattr(diffusers, 'OnnxStableDiffusionPipeline', None),
             'ONNX Stable Diffusion Img2Img': getattr(diffusers, 'OnnxStableDiffusionImg2ImgPipeline', None),
             'ONNX Stable Diffusion Inpaint': getattr(diffusers, 'OnnxStableDiffusionInpaintPipeline', None),
             'ONNX Stable Diffusion Upscale': getattr(diffusers, 'OnnxStableDiffusionUpscalePipeline', None),
+        }
+        pipelines.update(onnx_pipelines)
+    if hasattr(diffusers, 'OnnxStableDiffusionXLPipeline'):
+        onnx_pipelines = {
             'ONNX Stable Diffusion XL': getattr(diffusers, 'OnnxStableDiffusionXLPipeline', None),
             'ONNX Stable Diffusion XL Img2Img': getattr(diffusers, 'OnnxStableDiffusionXLImg2ImgPipeline', None),
         }
@@ -95,6 +100,8 @@ def get_pipelines():
     if hasattr(diffusers, 'StableDiffusion3Pipeline'):
         pipelines['Stable Diffusion 3'] = getattr(diffusers, 'StableDiffusion3Pipeline', None)
         pipelines['Stable Diffusion 3 Img2Img'] = getattr(diffusers, 'StableDiffusion3Img2ImgPipeline', None)
+    if hasattr(diffusers, 'LuminaText2ImgPipeline'):
+        pipelines['Lumina-Next'] = getattr(diffusers, 'LuminaText2ImgPipeline', None)
 
     for k, v in pipelines.items():
         if k != 'Autodetect' and v is None:
diff --git a/modules/shared_state.py b/modules/shared_state.py
index 79ee20f19..82f8d21dc 100644
--- a/modules/shared_state.py
+++ b/modules/shared_state.py
@@ -12,6 +12,7 @@ class State:
     job = ""
     job_no = 0
     job_count = 0
+    frame_count = 0
     total_jobs = 0
     job_timestamp = '0'
     sampling_step = 0
@@ -71,6 +72,7 @@ class State:
         self.interrupted = False
         self.job = title
         self.job_count = -1
+        self.frame_count = -1
         self.job_no = 0
         self.job_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
         self.paused = False
@@ -93,6 +95,7 @@ class State:
         self.job = ""
         self.job_count = 0
         self.job_no = 0
+        self.frame_count = 0
         self.paused = False
         self.interrupted = False
         self.skipped = False
diff --git a/modules/textual_inversion/textual_inversion.py b/modules/textual_inversion/textual_inversion.py
index b0779716b..efd8575f2 100644
--- a/modules/textual_inversion/textual_inversion.py
+++ b/modules/textual_inversion/textual_inversion.py
@@ -1,7 +1,6 @@
 from typing import List, Union
 import os
 import time
-from collections import namedtuple
 import torch
 import safetensors.torch
 from PIL import Image
@@ -12,7 +11,6 @@ from modules.files_cache import directory_files, directory_mtime, extension_filt
 
 debug = shared.log.trace if os.environ.get('SD_TI_DEBUG', None) is not None else lambda *args, **kwargs: None
 debug('Trace: TEXTUAL INVERSION')
-TokenToAdd = namedtuple("TokenToAdd", ["clip_l", "clip_g"])
 
 
 def list_embeddings(*dirs):
@@ -21,6 +19,134 @@ def list_embeddings(*dirs):
     return list(filter(lambda fp: is_ext(fp) and is_not_preview(fp) and os.stat(fp).st_size > 0, directory_files(*dirs)))
 
 
+def open_embeddings(filename):
+    """
+    Load embedding files from disk (.safetensors, .bin, .pt). Image embeddings not currently supported.
+    Accepts one path or an iterable of paths; returns (embeddings, skipped) lists of Embedding, or None if filename is None.
+    """
+    if filename is None:
+        return None
+    filenames = [filename] if isinstance(filename, str) else list(filename)  # list() on a str would split the path into characters
+    exts = [".SAFETENSORS", '.BIN', '.PT']
+    embeddings = []
+    skipped = []
+    for _filename in filenames:
+        fullname = _filename
+        _filename = os.path.basename(fullname)
+        fn, ext = os.path.splitext(_filename)
+        name = os.path.basename(fn)
+        embedding = Embedding(vec=[], name=name, filename=fullname)
+        try:
+            if ext.upper() not in exts:
+                debug(f'extension `{ext}` is invalid, expected one of: {exts}')
+                skipped.append(embedding)  # keep `skipped` homogeneous: callers read `.name` on every entry
+                continue
+            if ext.upper() in ['.SAFETENSORS']:
+                with safetensors.torch.safe_open(embedding.filename, framework="pt") as f:  # type: ignore
+                    for k in f.keys():
+                        embedding.vec.append(f.get_tensor(k))
+            else:  # fallback for sd1.5 pt embeddings
+                vectors = torch.load(fullname, map_location=devices.device)["string_to_param"]["*"]  # NOTE(review): torch.load unpickles the file; only safe for trusted embeddings
+                embedding.vec.append(vectors)
+            embedding.tokens = [embedding.name if i == 0 else f"{embedding.name}_{i}" for i in range(len(embedding.vec[0]))]
+        except Exception as e:
+            debug(f"Could not load embedding file {fullname} {e}")
+        if embedding.vec and embedding.tokens:  # vectors without tokens cannot be inserted into a tokenizer
+            embeddings.append(embedding)
+        else:
+            skipped.append(embedding)
+    return embeddings, skipped
+
+
+def convert_bundled(data):
+    """
+    Bundled embeddings are passed as a dict (name -> dict of vectors) from lora loading; convert them to Embedding
+    objects and return (embeddings, skipped) where skipped is always an empty list.
+    A missing or empty dict yields no embeddings instead of raising on `None.keys()`.
+    """
+    embeddings = []
+    for key, vectors in (data or {}).items():
+        embedding = Embedding(vec=list(vectors.values()), name=key, filename=None)
+        embedding.tokens = [embedding.name if i == 0 else f"{embedding.name}_{i}" for i in range(len(embedding.vec[0]))]
+        embeddings.append(embedding)
+    return embeddings, []
+
+
+def get_text_encoders():
+    """
+    Collect the text encoder / tokenizer pairs found on shared.sd_model (text_encoder[_2|_3] with tokenizer[_2|_3]).
+    Returns three parallel lists: text encoders, tokenizers, and the last dimension of each encoder's input embedding weight.
+    """
+    pipe = shared.sd_model
+    te_names = ["text_encoder", "text_encoder_2", "text_encoder_3"]
+    tokenizers_names = ["tokenizer", "tokenizer_2", "tokenizer_3"]
+    text_encoders = []
+    tokenizers = []
+    hidden_sizes = []
+    for te, tok in zip(te_names, tokenizers_names):
+        text_encoder = getattr(pipe, te, None)
+        if text_encoder is None:  # pipeline has no encoder in this slot
+            continue
+        tokenizer = getattr(pipe, tok, None)
+        hidden_size = text_encoder.get_input_embeddings().weight.data.shape[-1] or None  # a size of 0 is normalized to None
+        if all([text_encoder, tokenizer, hidden_size]):  # keep only complete encoder/tokenizer/size triples
+            text_encoders.append(text_encoder)
+            tokenizers.append(tokenizer)
+            hidden_sizes.append(hidden_size)
+    return text_encoders, tokenizers, hidden_sizes
+
+
+def deref_tokenizers(tokens, tokenizers):
+    """
+    Bundled embeddings may have the same name as a separately loaded embedding, or there may be multiple LoRA with
+    differing numbers of vectors. By editing the AddedToken objects, and deleting the dict keys pointing to them,
+    we can ensure that a smaller embedding will not get tokenized as itself, plus the remaining vectors of the previous.
+    """
+    for tokenizer in tokenizers:
+        if len(tokens) > 1:
+            prefix, _, suffix = tokens[-1].rpartition("_")  # split off only the trailing index, never digits inside the name
+            newsuffix = int(suffix) + 1
+            while f"{prefix}_{newsuffix}" in tokenizer.get_vocab():
+                stale = f"{prefix}_{newsuffix}"  # leftover token from a previously loaded, longer embedding
+                idx = tokenizer.convert_tokens_to_ids(stale)
+                debug(f"Textual inversion: deref idx={idx}")
+                del tokenizer._added_tokens_encoder[stale] # pylint: disable=protected-access
+                tokenizer._added_tokens_decoder[idx].content = str(time.time()) # pylint: disable=protected-access
+                newsuffix += 1
+
+
+def insert_tokens(embeddings: list, tokenizers: list):
+    """
+    Add the tokens of every embedding to each tokenizer in the list, with one add_tokens call per tokenizer.
+    Embeddings whose tokens are None or empty contribute nothing.
+    """
+    # `tokens` defaults to None on Embedding, so guard before flattening
+    tokens = [token for embedding in embeddings for token in (embedding.tokens or [])]
+    for tokenizer in tokenizers:
+        tokenizer.add_tokens(tokens)
+
+
+def insert_vectors(embedding, tokenizers, text_encoders, hiddensizes):
+    """
+    Write each vector group of an embedding into the input embedding layer of the text encoder whose hidden size
+    equals the vector size; encoders are matched by size, not by name. Groups with no matching size are ignored.
+    Future warning, if another text encoder becomes available with embedding dimensions in [768,1280,4096]
+    this may cause collisions.
+    """
+    for vector, size in zip(embedding.vec, embedding.vector_sizes):
+        if size in hiddensizes:
+            idx = hiddensizes.index(size)
+            tokenizer, encoder = tokenizers[idx], text_encoders[idx]
+            unk_token_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
+            if encoder.get_input_embeddings().weight.data.shape[0] != len(tokenizer):  # grow the table to cover new tokens
+                encoder.resize_token_embeddings(len(tokenizer))
+            for token, row in zip(embedding.tokens, vector.unbind()):
+                token_id = tokenizer.convert_tokens_to_ids(token)
+                if token_id > unk_token_id:  # only write when the id is above the unknown-token id
+                    encoder.get_input_embeddings().weight.data[token_id] = row
+
+
+
 class Embedding:
     def __init__(self, vec, name, filename=None, step=None):
         self.vec = vec
@@ -35,6 +161,7 @@ class Embedding:
         self.sd_checkpoint = None
         self.sd_checkpoint_name = None
         self.optimizer_state_dict = None
+        self.tokens = None
 
     def save(self, filename):
         embedding_data = {
@@ -82,6 +209,10 @@ class DirWithTextualInversionEmbeddings:
 
 
 def convert_embedding(tensor, text_encoder, text_encoder_2):
+    """
+    Given a tensor of shape (b, embed_dim) and two text encoders whose tokenizers match, return a tensor with
+    approximately matching meaning, or padding if the input tensor is dissimilar to any frozen text embed
+    """
     with torch.no_grad():
         vectors = []
         clip_l_embeds = text_encoder.get_input_embeddings().weight.data.clone().to(device=devices.device)
@@ -91,7 +222,7 @@ def convert_embedding(tensor, text_encoder, text_encoder_2):
             if values < 0.707:  # Arbitrary similarity to cutoff, here 45 degrees
                 indices *= 0  # Use SDXL padding vector 0
             vectors.append(indices)
-        vectors = torch.stack(vectors)
+        vectors = torch.stack(vectors).to(text_encoder_2.device)
         output = text_encoder_2.get_input_embeddings().weight.data[vectors]
     return output
 
@@ -135,123 +266,48 @@ class EmbeddingDatabase:
         vec = shared.sd_model.cond_stage_model.encode_embedding_init_text(",", 1)
         return vec.shape[1]
 
-    def load_diffusers_embedding(self, filename: Union[str, List[str]]):
-        _loaded_pre = len(self.word_embeddings)
-        embeddings_to_load = []
-        loaded_embeddings = {}
-        skipped_embeddings = []
+    def load_diffusers_embedding(self, filename: Union[str, List[str]] = None, data: dict = None):
+        """
+        Load embeddings from files (`filename`) or from bundled embeddings passed as a dict (`data`); file names take precedence.
+        Bundled embeddings are automatically set to overwrite previous embeddings. Returns 0 on early exit, otherwise None.
+        """
+        overwrite = bool(data)
         if not shared.sd_loaded:
             return 0
-        tokenizer   = getattr(shared.sd_model, 'tokenizer',   None)
-        tokenizer_2 = getattr(shared.sd_model, 'tokenizer_2', None)
-        clip_l = getattr(shared.sd_model, 'text_encoder',   None)
-        clip_g = getattr(shared.sd_model, 'text_encoder_2', None)
-        if clip_g and tokenizer_2:
-            model_type = 'SDXL'
-        elif clip_l and tokenizer:
-            model_type = 'SD'
-        else:
+        embeddings, skipped = open_embeddings(filename) or convert_bundled(data)
+        for skip in skipped:
+            self.skipped_embeddings[getattr(skip, 'name', skip)] = skip  # record the entry itself, not the whole list; tolerate bare names
+        if not embeddings:
             return 0
-        filenames = list(filename)
-        exts = [".SAFETENSORS", '.BIN', '.PT', '.PNG', '.WEBP', '.JXL', '.AVIF']
-        for _filename in filenames:
-            # debug(f'Embedding check: {filename}')
-            fullname = _filename
-            _filename = os.path.basename(fullname)
-            fn, ext = os.path.splitext(_filename)
-            name = os.path.basename(fn)
-            embedding = Embedding(vec=None, name=name, filename=fullname)
-            tokenizer_vocab = tokenizer.get_vocab()
-            try:
-                if ext.upper() not in exts:
-                    raise ValueError(f'extension `{ext}` is invalid, expected one of: {exts}')
-                if name in tokenizer.get_vocab() or f"{name}_1" in tokenizer.get_vocab():
-                    loaded_embeddings[name] = embedding
-                    debug(f'Embedding already loaded: {name}')
-                embeddings_to_load.append(embedding)
-            except Exception as e:
-                skipped_embeddings.append(embedding)
-                debug(f'Embedding skipped: "{name}" {e}')
-                continue
-            embeddings_to_load = sorted(embeddings_to_load, key=lambda e: exts.index(os.path.splitext(e.filename)[1].upper()))
-
-        tokens_to_add = {}
-        for embedding in embeddings_to_load:
-            try:
-                if embedding.name in tokens_to_add or embedding.name in loaded_embeddings:
-                    raise ValueError('duplicate token')
-                embeddings_dict = {}
-                _, ext = os.path.splitext(embedding.filename)
-                if ext.upper() in ['.SAFETENSORS']:
-                    with safetensors.torch.safe_open(embedding.filename, framework="pt") as f: # type: ignore
-                        for k in f.keys():
-                            embeddings_dict[k] = f.get_tensor(k)
-                else:  # fallback for sd1.5 pt embeddings
-                    embeddings_dict["clip_l"] = self.load_from_file(embedding.filename, embedding.filename)
-                if 'emb_params' in embeddings_dict and 'clip_l' not in embeddings_dict:
-                    embeddings_dict["clip_l"] = embeddings_dict["emb_params"]
-                if 'clip_l' not in embeddings_dict:
-                    raise ValueError('Invalid Embedding, dict missing required key `clip_l`')
-                if 'clip_g' not in embeddings_dict and model_type == "SDXL" and shared.opts.diffusers_convert_embed:
-                    embeddings_dict["clip_g"] = convert_embedding(embeddings_dict["clip_l"], clip_l, clip_g)
-                if 'clip_g' in embeddings_dict:
-                    embedding_type = 'SDXL'
-                else:
-                    embedding_type = 'SD'
-                if embedding_type != model_type:
-                    raise ValueError(f'Unable to load {embedding_type} Embedding "{embedding.name}" into {model_type} Model')
-                _tokens_to_add = {}
-                for i in range(len(embeddings_dict["clip_l"])):
-                    if len(clip_l.get_input_embeddings().weight.data[0]) == len(embeddings_dict["clip_l"][i]):
-                        token = embedding.name if i == 0 else f"{embedding.name}_{i}"
-                        if token in tokenizer_vocab:
-                            raise RuntimeError(f'Multi-Vector Embedding would add pre-existing Token in Vocabulary: {token}')
-                        if token in tokens_to_add:
-                            raise RuntimeError(f'Multi-Vector Embedding would add duplicate Token to Add: {token}')
-                        _tokens_to_add[token] = TokenToAdd(
-                            embeddings_dict["clip_l"][i],
-                            embeddings_dict["clip_g"][i] if 'clip_g' in embeddings_dict else None
-                        )
-                if not _tokens_to_add:
-                    raise ValueError('no valid tokens to add')
-                tokens_to_add.update(_tokens_to_add)
-                loaded_embeddings[embedding.name] = embedding
-            except Exception as e:
-                debug(f"Embedding loading: {embedding.filename} {e}")
-                continue
-        if len(tokens_to_add) > 0:
-            tokenizer.add_tokens(list(tokens_to_add.keys()))
-            clip_l.resize_token_embeddings(len(tokenizer))
-            if model_type == 'SDXL':
-                tokenizer_2.add_tokens(list(tokens_to_add.keys())) # type: ignore
-                clip_g.resize_token_embeddings(len(tokenizer_2)) # type: ignore
-            unk_token_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
-            for token, data in tokens_to_add.items():
-                token_id = tokenizer.convert_tokens_to_ids(token)
-                if token_id > unk_token_id:
-                    clip_l.get_input_embeddings().weight.data[token_id] = data.clip_l
-                    if model_type == 'SDXL':
-                        clip_g.get_input_embeddings().weight.data[token_id] = data.clip_g # type: ignore
-
-        for embedding in loaded_embeddings.values():
-            if not embedding:
-                continue
-            self.register_embedding(embedding, shared.sd_model)
-            if embedding in embeddings_to_load:
-                embeddings_to_load.remove(embedding)
-        skipped_embeddings.extend(embeddings_to_load)
-        for embedding in skipped_embeddings:
-            if loaded_embeddings.get(embedding.name, None) == embedding:
-                continue
-            self.skipped_embeddings[embedding.name] = embedding
-        try:
-            if model_type == 'SD':
-                debug(f"Embeddings loaded: text-encoder={shared.sd_model.text_encoder.get_input_embeddings().weight.data.shape[0]}")
-            if model_type == 'SDXL':
-                debug(f"Embeddings loaded: text-encoder-1={shared.sd_model.text_encoder.get_input_embeddings().weight.data.shape[0]} text-encoder-2={shared.sd_model.text_encoder_2.get_input_embeddings().weight.data.shape[0]}")
-        except Exception:
-            pass
-        return len(self.word_embeddings) - _loaded_pre
+        text_encoders, tokenizers, hiddensizes = get_text_encoders()
+        if not all([text_encoders, tokenizers, hiddensizes]):
+            return 0
+        for embedding in embeddings:
+            embedding.vector_sizes = [v.shape[-1] for v in embedding.vec]
+            if shared.opts.diffusers_convert_embed and 768 in hiddensizes and 1280 in hiddensizes and 1280 not in embedding.vector_sizes and 768 in embedding.vector_sizes:
+                embedding.vec.append(
+                    convert_embedding(embedding.vec[embedding.vector_sizes.index(768)], text_encoders[hiddensizes.index(768)],
+                                      text_encoders[hiddensizes.index(1280)]))
+                embedding.vector_sizes.append(1280)
+            if (not all(vs in hiddensizes for vs in embedding.vector_sizes) or  # Skip SD2.1 in SD1.5/SDXL/SD3 and vice versa
+                    len(embedding.vector_sizes) > len(hiddensizes) or  # Skip SDXL/SD3 in SD1.5
+                    (len(embedding.vector_sizes) < len(hiddensizes) and len(embedding.vector_sizes) != 2)):  # SD3 no T5
+                embedding.tokens = []
+                self.skipped_embeddings[embedding.name] = embedding
+        if overwrite:
+            shared.log.info(f"Loading Bundled embeddings: {list(data.keys())}")
+            for embedding in embeddings:
+                if embedding.name not in self.skipped_embeddings:
+                    deref_tokenizers(embedding.tokens, tokenizers)
+        insert_tokens(embeddings, tokenizers)
+        for embedding in embeddings:
+            if embedding.name not in self.skipped_embeddings:
+                try:
+                    insert_vectors(embedding, tokenizers, text_encoders, hiddensizes)
+                    self.register_embedding(embedding, shared.sd_model)
+                except Exception as e:
+                    shared.log.error(f'Embedding load: name={embedding.name} fn={embedding.filename} {e}')
+        return
 
     def load_from_file(self, path, filename):
         name, ext = os.path.splitext(filename)
@@ -259,14 +315,14 @@ class EmbeddingDatabase:
 
         if ext in ['.PNG', '.WEBP', '.JXL', '.AVIF']:
             if '.preview' in filename.lower():
-                return
+                return None
             embed_image = Image.open(path)
             if hasattr(embed_image, 'text') and 'sd-ti-embedding' in embed_image.text:
                 data = embedding_from_b64(embed_image.text['sd-ti-embedding'])
             else:
                 data = extract_image_data_embed(embed_image)
                 if not data: # if data is None, means this is not an embeding, just a preview image
-                    return
+                    return None
         elif ext in ['.BIN', '.PT']:
             data = torch.load(path, map_location="cpu")
         elif ext in ['.SAFETENSORS']:
@@ -284,7 +340,7 @@ class EmbeddingDatabase:
         elif type(data) == dict and type(next(iter(data.values()))) == torch.Tensor:
             if len(data.keys()) != 1:
                 self.skipped_embeddings[name] = Embedding(None, name=name, filename=path)
-                return
+                return None
             emb = next(iter(data.values()))
             if len(emb.shape) == 1:
                 emb = emb.unsqueeze(0)
diff --git a/modules/ui_common.py b/modules/ui_common.py
index 1b8c5aade..f83e8594f 100644
--- a/modules/ui_common.py
+++ b/modules/ui_common.py
@@ -407,4 +407,6 @@ def update_token_counter(text, steps):
             ids = getattr(ids, 'input_ids', [])
             token_count = len(ids) - int(has_bos_token) - int(has_eos_token)
             max_length = shared.sd_model.tokenizer.model_max_length - int(has_bos_token) - int(has_eos_token)
+            if max_length is None or max_length < 0 or max_length > 10000:
+                max_length = 0
     return f"<span class='gr-box gr-text-input'>{token_count}/{max_length}</span>"
diff --git a/modules/ui_extensions.py b/modules/ui_extensions.py
index 8a08fc322..4e3fd3588 100644
--- a/modules/ui_extensions.py
+++ b/modules/ui_extensions.py
@@ -206,7 +206,7 @@ def uninstall_extension(extension_path, search_text, sort_column):
     if len(found) > 0 and os.path.isdir(extension_path):
         found = found[0]
         try:
-            shutil.rmtree(found.path, ignore_errors=False, onerror=errorRemoveReadonly)
+            shutil.rmtree(found.path, ignore_errors=False, onerror=errorRemoveReadonly) # pylint: disable=deprecated-argument
             # extensions.extensions = [extension for extension in extensions.extensions if os.path.abspath(found.path) != os.path.abspath(extension_path)]
         except Exception as e:
             shared.log.warning(f'Extension uninstall failed: {found.path} {e}')
diff --git a/modules/vqa.py b/modules/vqa.py
index fd994ccd2..87ea8431c 100644
--- a/modules/vqa.py
+++ b/modules/vqa.py
@@ -1,3 +1,4 @@
+import json
 import torch
 import transformers
 import transformers.dynamic_module_utils
@@ -11,6 +12,7 @@ loaded: str = None
 MODELS = {
     "MS Florence 2 Base": "microsoft/Florence-2-base", # 0.5GB
     "MS Florence 2 Large": "microsoft/Florence-2-large", # 1.5GB
+    "CogFlorence 2 Large": "thwri/CogFlorence-2-Large-Freeze", # 1.6GB
     "Moondream 2": "vikhyatk/moondream2", # 3.7GB
     "GIT TextCaps Base": "microsoft/git-base-textcaps", # 0.7GB
     "GIT VQA Base": "microsoft/git-base-vqav2", # 0.7GB
@@ -166,6 +168,11 @@ def florence(question: str, image: Image.Image, repo: str = None):
 
     if 'task' in response:
         response = response['task']
+    if 'answer' in response:
+        response = response['answer']
+    if isinstance(response, dict):
+        response = json.dumps(response)
+    response = response.replace('\n', '').replace('\r', '').replace('\t', '').strip()
     shared.log.debug(f'VQA: task={task} response="{response}"')
     return response
 
diff --git a/modules/xadapter/pipeline_sd_xl_adapter.py b/modules/xadapter/pipeline_sd_xl_adapter.py
index 65e04ecab..757681972 100644
--- a/modules/xadapter/pipeline_sd_xl_adapter.py
+++ b/modules/xadapter/pipeline_sd_xl_adapter.py
@@ -24,8 +24,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
 from diffusers.models import AutoencoderKL
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
+    FusedAttnProcessor2_0,
     XFormersAttnProcessor,
 )
 from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -558,8 +557,7 @@ class StableDiffusionXLAdapterPipeline(DiffusionPipeline, FromSingleFileMixin, L
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/modules/xadapter/pipeline_sd_xl_adapter_controlnet.py b/modules/xadapter/pipeline_sd_xl_adapter_controlnet.py
index ddb334568..f0f27fa69 100644
--- a/modules/xadapter/pipeline_sd_xl_adapter_controlnet.py
+++ b/modules/xadapter/pipeline_sd_xl_adapter_controlnet.py
@@ -30,8 +30,7 @@ from diffusers.models import AutoencoderKL, ControlNetModel
 
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
+    FusedAttnProcessor2_0,
     XFormersAttnProcessor,
 )
 from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -572,8 +571,7 @@ class StableDiffusionXLAdapterControlnetPipeline(DiffusionPipeline, FromSingleFi
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/modules/xadapter/pipeline_sd_xl_adapter_controlnet_img2img.py b/modules/xadapter/pipeline_sd_xl_adapter_controlnet_img2img.py
index d1cf59033..c2dadfbfc 100644
--- a/modules/xadapter/pipeline_sd_xl_adapter_controlnet_img2img.py
+++ b/modules/xadapter/pipeline_sd_xl_adapter_controlnet_img2img.py
@@ -31,8 +31,7 @@ from diffusers.models import AutoencoderKL, ControlNetModel
 
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
+    FusedAttnProcessor2_0,
     XFormersAttnProcessor,
 )
 from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -571,8 +570,7 @@ class StableDiffusionXLAdapterControlnetI2IPipeline(DiffusionPipeline, FromSingl
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/modules/zluda_installer.py b/modules/zluda_installer.py
index 06c019a09..d1e12f011 100644
--- a/modules/zluda_installer.py
+++ b/modules/zluda_installer.py
@@ -33,7 +33,7 @@ def install(zluda_path: os.PathLike) -> None:
     if os.path.exists(zluda_path):
         return
 
-    if platform.system() != 'Windows': # TODO
+    if platform.system() != 'Windows': # Windows-only. (PyTorch should be rebuilt on Linux)
         return
 
     urllib.request.urlretrieve(f'https://github.com/lshqqytiger/ZLUDA/releases/download/{RELEASE}/ZLUDA-windows-amd64.zip', '_zluda')
diff --git a/requirements.txt b/requirements.txt
index 555d99e65..69a74cac8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,7 +53,7 @@ pandas
 protobuf==4.25.3
 pytorch_lightning==1.9.4
 tokenizers==0.19.1
-transformers==4.41.2
+transformers==4.42.3
 urllib3==1.26.19
 Pillow==10.3.0
 timm==0.9.16
diff --git a/scripts/demofusion.py b/scripts/demofusion.py
index 95e58a74e..f7cdfe543 100644
--- a/scripts/demofusion.py
+++ b/scripts/demofusion.py
@@ -8,7 +8,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokeniz
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor
+from diffusers.models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import is_accelerate_available, is_accelerate_version
@@ -484,8 +484,7 @@ class DemoFusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderM
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/scripts/differential_diffusion.py b/scripts/differential_diffusion.py
index b48e0f6e6..36aa17eca 100644
--- a/scripts/differential_diffusion.py
+++ b/scripts/differential_diffusion.py
@@ -22,8 +22,7 @@ from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInver
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
+    FusedAttnProcessor2_0,
     XFormersAttnProcessor,
 )
 from diffusers.configuration_utils import FrozenDict
@@ -631,8 +630,7 @@ class StableDiffusionXLDiffImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixi
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
+                FusedAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
diff --git a/wiki b/wiki
index c5c9e8998..68fa996e9 160000
--- a/wiki
+++ b/wiki
@@ -1 +1 @@
-Subproject commit c5c9e89981c8bd35b51823315418a4a4864bb5e1
+Subproject commit 68fa996e9231572c244548ef2690adbce018d70b