diff --git a/html/reference-distilled.json b/html/reference-distilled.json index 123d8e6e4..e77c3b4b6 100644 --- a/html/reference-distilled.json +++ b/html/reference-distilled.json @@ -165,7 +165,7 @@ "Black Forest Labs FLUX.2 Klein 4B": { "path": "black-forest-labs/FLUX.2-klein-4B", "preview": "black-forest-labs--FLUX.2-klein-4B.jpg", - "desc": "FLUX.2-klein-4B is a 4 billion parameter size-distilled version of FLUX.2-dev optimized for consumer GPUs. Achieves sub-second inference with 4 steps while fitting in ~13GB VRAM. Supports both text-to-image generation and multi-reference image editing. Apache 2.0 licensed.", + "desc": "FLUX.2-klein-4B is a 4 billion parameter size-distilled version of FLUX.2-dev optimized for consumer GPUs. Achieves sub-second inference with 4 steps. Supports both text-to-image generation and multi-reference image editing. Apache 2.0 licensed.", "skip": true, "tags": "distilled", "extras": "sampler: Default, cfg_scale: 4.0, steps: 4", @@ -175,7 +175,7 @@ "Black Forest Labs FLUX.2 Klein 9B": { "path": "black-forest-labs/FLUX.2-klein-9B", "preview": "black-forest-labs--FLUX.2-klein-9B.jpg", - "desc": "FLUX.2-klein-9B is a 9 billion parameter size-distilled version of FLUX.2-dev. Higher quality than 4B variant with sub-second inference using 4 steps. Requires ~29GB VRAM. Supports text-to-image and multi-reference editing. Non-commercial license.", + "desc": "FLUX.2-klein-9B is a 9 billion parameter size-distilled version of FLUX.2-dev. Higher quality than 4B variant with sub-second inference using 4 steps. Supports text-to-image and multi-reference editing. Non-commercial license.", "skip": true, "tags": "distilled", "extras": "sampler: Default, cfg_scale: 4.0, steps: 4", diff --git a/html/reference.json b/html/reference.json index 63cd80124..2f1f6562b 100644 --- a/html/reference.json +++ b/html/reference.json @@ -127,7 +127,7 @@ "Black Forest Labs FLUX.2 Klein Base 4B": { "path": "black-forest-labs/FLUX.2-klein-base-4B", "preview": "black-forest-labs--FLUX.2-klein-base-4B.jpg", - "desc": "FLUX.2-klein-base-4B is the undistilled 4 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Fits in ~13GB VRAM. Supports text-to-image and multi-reference editing. Apache 2.0 licensed.", + "desc": "FLUX.2-klein-base-4B is the undistilled 4 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Supports text-to-image and multi-reference editing. Apache 2.0 licensed.", "skip": true, "extras": "sampler: Default, cfg_scale: 4.0, steps: 50", "size": 8.5, @@ -136,7 +136,7 @@ "Black Forest Labs FLUX.2 Klein Base 9B": { "path": "black-forest-labs/FLUX.2-klein-base-9B", "preview": "black-forest-labs--FLUX.2-klein-base-9B.jpg", - "desc": "FLUX.2-klein-base-9B is the undistilled 9 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Requires ~29GB VRAM. Supports text-to-image and multi-reference editing. Non-commercial license.", + "desc": "FLUX.2-klein-base-9B is the undistilled 9 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Supports text-to-image and multi-reference editing. Non-commercial license.", "skip": true, "extras": "sampler: Default, cfg_scale: 4.0, steps: 50", "size": 18.5, @@ -146,7 +146,7 @@ "Z-Image-Turbo": { "path": "Tongyi-MAI/Z-Image-Turbo", "preview": "Tongyi-MAI--Z-Image-Turbo.jpg", - "desc": "Z-Image-Turbo, a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.", + "desc": "Z-Image-Turbo, a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.", "skip": true, "extras": "sampler: Default, cfg_scale: 1.0, steps: 9", "size": 20.3, diff --git a/modules/sd_vae_taesd.py b/modules/sd_vae_taesd.py index 80cf5de69..1c5dd0dd0 100644 --- a/modules/sd_vae_taesd.py +++ b/modules/sd_vae_taesd.py @@ -12,6 +12,9 @@ import torch from modules import devices, paths, shared +debug = os.environ.get('SD_PREVIEW_DEBUG', None) is not None + + TAESD_MODELS = { 'TAESD 1.3 Mocha Croissant': { 'fn': 'taesd_13_', 'uri': 'https://github.com/madebyollin/taesd/raw/7f572ca629c9b0d3c9f71140e5f501e09f9ea280', 'model': None }, 'TAESD 1.2 Chocolate-Dipped Shortbread': { 'fn': 'taesd_12_', 'uri': 'https://github.com/madebyollin/taesd/raw/8909b44e3befaa0efa79c5791e4fe1c4d4f7884e', 'model': None }, @@ -158,8 +161,9 @@ def decode(latents): dtype = devices.dtype_vae if devices.dtype_vae != torch.bfloat16 else torch.float16 # taesd does not support bf16 tensor = latents.unsqueeze(0) if len(latents.shape) == 3 else latents tensor = tensor.detach().clone().to(devices.device, dtype=dtype) - shared.log.debug(f'Decode: type="taesd" variant="{variant}" input={latents.shape} tensor={tensor.shape}') - # FLUX.2 has 128 latent channels that need reshaping to 32 channels for TAESD + if debug: + shared.log.debug(f'Decode: type="taesd" variant="{variant}" input={latents.shape} tensor={tensor.shape}') + # Fallback: reshape packed 128-channel latents to 32 channels if not already unpacked if variant == 'TAE FLUX.2' and len(tensor.shape) == 4 and tensor.shape[1] == 128: b, _c, h, w = tensor.shape tensor = tensor.reshape(b, 32, h * 2, w * 2) diff --git a/pipelines/model_flux2_klein.py b/pipelines/model_flux2_klein.py index 9b8b05ac8..d810821d9 100644 --- a/pipelines/model_flux2_klein.py +++ b/pipelines/model_flux2_klein.py @@ -16,7 +16,7 @@ def load_flux2_klein(checkpoint_info, diffusers_load_config=None): # Load transformer - Klein uses Flux2Transformer2DModel (same class as Flux2, different size) transformer = generic.load_transformer(repo_id, cls_name=diffusers.Flux2Transformer2DModel, load_config=diffusers_load_config) - # Load text encoder - Klein uses Qwen3 (4B for Klein-4B, 8B for Klein-9B) text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3ForCausalLM, load_config=diffusers_load_config) pipe = diffusers.Flux2KleinPipeline.from_pretrained(