fix: correct comments and cleanup model descriptions

- Fix Klein text encoder comment to specify correct sizes per variant
- Lock TAESD decode logging behind SD_PREVIEW_DEBUG env var
- Fix misleading comment about FLUX.2 128-channel reshape (it is a fallback path)
- Remove VRAM requirements from model descriptions in reference files
pull/4553/head
CalamitousFelicitousness 2026-01-16 03:24:39 +00:00
parent 5e2bc01367
commit eaa8dbcd42
4 changed files with 12 additions and 8 deletions

View File

@ -165,7 +165,7 @@
"Black Forest Labs FLUX.2 Klein 4B": {
"path": "black-forest-labs/FLUX.2-klein-4B",
"preview": "black-forest-labs--FLUX.2-klein-4B.jpg",
"desc": "FLUX.2-klein-4B is a 4 billion parameter size-distilled version of FLUX.2-dev optimized for consumer GPUs. Achieves sub-second inference with 4 steps while fitting in ~13GB VRAM. Supports both text-to-image generation and multi-reference image editing. Apache 2.0 licensed.",
"desc": "FLUX.2-klein-4B is a 4 billion parameter size-distilled version of FLUX.2-dev optimized for consumer GPUs. Achieves sub-second inference with 4 steps. Supports both text-to-image generation and multi-reference image editing. Apache 2.0 licensed.",
"skip": true,
"tags": "distilled",
"extras": "sampler: Default, cfg_scale: 4.0, steps: 4",
@ -175,7 +175,7 @@
"Black Forest Labs FLUX.2 Klein 9B": {
"path": "black-forest-labs/FLUX.2-klein-9B",
"preview": "black-forest-labs--FLUX.2-klein-9B.jpg",
"desc": "FLUX.2-klein-9B is a 9 billion parameter size-distilled version of FLUX.2-dev. Higher quality than 4B variant with sub-second inference using 4 steps. Requires ~29GB VRAM. Supports text-to-image and multi-reference editing. Non-commercial license.",
"desc": "FLUX.2-klein-9B is a 9 billion parameter size-distilled version of FLUX.2-dev. Higher quality than 4B variant with sub-second inference using 4 steps. Supports text-to-image and multi-reference editing. Non-commercial license.",
"skip": true,
"tags": "distilled",
"extras": "sampler: Default, cfg_scale: 4.0, steps: 4",

View File

@ -127,7 +127,7 @@
"Black Forest Labs FLUX.2 Klein Base 4B": {
"path": "black-forest-labs/FLUX.2-klein-base-4B",
"preview": "black-forest-labs--FLUX.2-klein-base-4B.jpg",
"desc": "FLUX.2-klein-base-4B is the undistilled 4 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Fits in ~13GB VRAM. Supports text-to-image and multi-reference editing. Apache 2.0 licensed.",
"desc": "FLUX.2-klein-base-4B is the undistilled 4 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Supports text-to-image and multi-reference editing. Apache 2.0 licensed.",
"skip": true,
"extras": "sampler: Default, cfg_scale: 4.0, steps: 50",
"size": 8.5,
@ -136,7 +136,7 @@
"Black Forest Labs FLUX.2 Klein Base 9B": {
"path": "black-forest-labs/FLUX.2-klein-base-9B",
"preview": "black-forest-labs--FLUX.2-klein-base-9B.jpg",
"desc": "FLUX.2-klein-base-9B is the undistilled 9 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Requires ~29GB VRAM. Supports text-to-image and multi-reference editing. Non-commercial license.",
"desc": "FLUX.2-klein-base-9B is the undistilled 9 billion parameter base model of FLUX.2-klein. Requires 50 inference steps for full quality but offers flexibility for fine-tuning. Supports text-to-image and multi-reference editing. Non-commercial license.",
"skip": true,
"extras": "sampler: Default, cfg_scale: 4.0, steps: 50",
"size": 18.5,
@ -146,7 +146,7 @@
"Z-Image-Turbo": {
"path": "Tongyi-MAI/Z-Image-Turbo",
"preview": "Tongyi-MAI--Z-Image-Turbo.jpg",
"desc": "Z-Image-Turbo, a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.",
"desc": "Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.",
"skip": true,
"extras": "sampler: Default, cfg_scale: 1.0, steps: 9",
"size": 20.3,

View File

@ -12,6 +12,9 @@ import torch
from modules import devices, paths, shared
debug = os.environ.get('SD_PREVIEW_DEBUG', None) is not None
TAESD_MODELS = {
'TAESD 1.3 Mocha Croissant': { 'fn': 'taesd_13_', 'uri': 'https://github.com/madebyollin/taesd/raw/7f572ca629c9b0d3c9f71140e5f501e09f9ea280', 'model': None },
'TAESD 1.2 Chocolate-Dipped Shortbread': { 'fn': 'taesd_12_', 'uri': 'https://github.com/madebyollin/taesd/raw/8909b44e3befaa0efa79c5791e4fe1c4d4f7884e', 'model': None },
@ -158,8 +161,9 @@ def decode(latents):
dtype = devices.dtype_vae if devices.dtype_vae != torch.bfloat16 else torch.float16 # taesd does not support bf16
tensor = latents.unsqueeze(0) if len(latents.shape) == 3 else latents
tensor = tensor.detach().clone().to(devices.device, dtype=dtype)
shared.log.debug(f'Decode: type="taesd" variant="{variant}" input={latents.shape} tensor={tensor.shape}')
# FLUX.2 has 128 latent channels that need reshaping to 32 channels for TAESD
if debug:
shared.log.debug(f'Decode: type="taesd" variant="{variant}" input={latents.shape} tensor={tensor.shape}')
# Fallback: reshape packed 128-channel latents to 32 channels if not already unpacked
if variant == 'TAE FLUX.2' and len(tensor.shape) == 4 and tensor.shape[1] == 128:
b, _c, h, w = tensor.shape
tensor = tensor.reshape(b, 32, h * 2, w * 2)

View File

@ -16,7 +16,7 @@ def load_flux2_klein(checkpoint_info, diffusers_load_config=None):
# Load transformer - Klein uses Flux2Transformer2DModel (same class as Flux2, different size)
transformer = generic.load_transformer(repo_id, cls_name=diffusers.Flux2Transformer2DModel, load_config=diffusers_load_config)
# Load text encoder - Klein uses Qwen3ForCausalLM (8B), shared across all Klein variants
# Load text encoder - Klein uses Qwen3 (4B for Klein-4B, 8B for Klein-9B)
text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3ForCausalLM, load_config=diffusers_load_config)
pipe = diffusers.Flux2KleinPipeline.from_pretrained(