add ernie-image

Signed-off-by: Vladimir Mandic <mandic00@live.com>
pull/4770/head
Vladimir Mandic 2026-04-15 11:06:32 +00:00
parent d29c133891
commit db8c6e9243
13 changed files with 93 additions and 6 deletions

1
.gitignore vendored
View File

@ -37,6 +37,7 @@ package-lock.json
*.rar
*.7z
*.pyc
*.out
/*.bat
/*.sh
/*.txt

View File

@ -3,6 +3,9 @@
## Update for 2026-04-15
- **Models**
- [Baidu ERNIE-Image](https://huggingface.co/baidu/ERNIE-Image) text-to-image FlowMatch diffusion transformer model with Mistral3 text encoding
includes *ERNIE-Image* (base) and *ERNIE-Image-Turbo* (distilled) variants
uses *ErnieImageTransformer2DModel* with *AutoencoderKLFlux2* latent decoding at 1024px
- [Zeta-Chroma](https://huggingface.co/lodestones/Zeta-Chroma) pixel-space diffusion transformer image model
generates images directly in RGB space using NextDiT-style architecture
*note*: requires a large number of steps to achieve sane results
@ -72,10 +75,10 @@
- UI CSS fixes, thanks @awsr
- detect/warn if space present in system path
- add `ftfy` to requirements
- fix upscaler init error should not block server
- upscaler init error should not block server
- improve torch nvidia arch detection
- add torch amd arch detection
- fix prompt weighted lists and internal wildcards
- prompt weighted lists and internal wildcards
- improve `path_to_repo` handling for custom paths
- eliminate `api` auth security bypass
- multiple `schedulers` signature corrections
@ -89,6 +92,7 @@
- patch `unipc` for timesteps device placement, thanks @resonantsky
- `civitai` search and base-model discovery improvements
- validate all `reference` jsons
- ui log formatting
## Update for 2026-04-01

View File

@ -56,6 +56,16 @@
"tags": "distilled",
"date": "2025 August"
},
"Baidu ERNIE-Image-Turbo": {
"path": "baidu/ERNIE-Image-Turbo",
"preview": "baidu--ERNIE-Image-Turbo.jpg",
"desc": "ERNIE-Image-Turbo is a distilled ERNIE-Image variant optimized for fast generation with fewer denoising steps.",
"skip": true,
"extras": "sampler: Default, cfg_scale: 1.0, steps: 8",
"size": 0,
"tags": "distilled",
"date": "2026 April"
},
"Qwen-Image-Lightning-Edit": {
"path": "vladmandic/Qwen-Lightning-Edit",
"preview": "vladmandic--Qwen-Lightning-Edit.jpg",

View File

@ -162,6 +162,16 @@
"date": "2025 November"
},
"Baidu ERNIE-Image": {
"path": "baidu/ERNIE-Image",
"preview": "baidu--ERNIE-Image.jpg",
"desc": "ERNIE-Image is a text-to-image diffusion transformer model that combines a Mistral3 text encoder with a FlowMatch transformer and Flux2-style VAE for 1024px image generation.",
"skip": true,
"extras": "sampler: Default, cfg_scale: 4.0, steps: 50",
"size": 0,
"date": "2026 April"
},
"Qwen-Image": {
"path": "Qwen/Qwen-Image",
"preview": "Qwen--Qwen-Image.jpg",

View File

@ -30,8 +30,9 @@ async function logMonitor() {
const level = `<td style="color: var(--color-${l.level.toLowerCase()})">${l.level}</td>`;
if (l.level === 'WARNING') logWarnings++;
if (l.level === 'ERROR') logErrors++;
const module = `<td style="color: var(--var(--neutral-400))">${l.module}</td>`;
row.innerHTML = `<td>${dateToStr(l.created)}</td>${level}<td>${l.facility}</td>${module}<td>${htmlEscape(l.msg)}</td>`;
const module = `<td style="color: var(--neutral-400)">${l.module}</td>`;
const facility = l.facility !== 'sd' ? `<td>${l.facility}</td>` : '<td></td>';
row.innerHTML = `<td>${dateToStr(l.created)}</td>${level}${facility}${module}<td>${htmlEscape(l.msg)}</td>`;
logMonitorEl.appendChild(row);
} catch (e) {
error(`logMonitor: ${e}\n${line}`);

View File

View File

@ -88,6 +88,8 @@ def get_model_type(pipe):
model_type = 'meissonic'
elif 'Qwen' in name:
model_type = 'qwen'
elif 'ErnieImage' in name or 'ERNIE-Image' in name:
model_type = 'ernieimage'
elif 'NextStep' in name:
model_type = 'nextstep'
elif 'XOmni' in name or 'X-Omni' in name:

View File

@ -142,6 +142,8 @@ def guess_by_name(fn, current_guess):
new_guess = 'PRX'
elif 'gemini-' in fn.lower() and 'image' in fn.lower():
new_guess = 'NanoBanana'
elif 'ernie-image' in fn.lower():
new_guess = 'ERNIE-Image'
elif 'z-image' in fn.lower() or 'z_image' in fn.lower():
new_guess = 'Z-Image'
elif 'longcat-image' in fn.lower():

View File

@ -4,6 +4,10 @@ from modules import shared, errors, timer, sd_models
from modules.logger import log
debug_output = os.environ.get('SD_PROMPT_DEBUG', None)
debug = log.trace if debug_output is not None else lambda *args, **kwargs: None
def hijack_encode_prompt(*args, **kwargs):
jobid = shared.state.begin('TE Encode')
t0 = time.time()
@ -18,8 +22,7 @@ def hijack_encode_prompt(*args, **kwargs):
prompt = args_copy[0]
patch_prompt = True
res = prompt
if prompt is not None:
log.debug(f'Encode: prompt="{prompt}" hijack=True')
debug(f'Encode: prompt="{prompt}" hijack=True')
if hasattr(shared.sd_model, 'before_prompt_encode'):
log.debug(f'Encode: prompt="{prompt}" op=before')

View File

@ -509,6 +509,10 @@ def load_diffuser_force(detected_model_type, checkpoint_info, diffusers_load_con
from pipelines.model_prx import load_prx
sd_model = load_prx(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['ERNIE-Image']:
from pipelines.model_ernie import load_ernie_image
sd_model = load_ernie_image(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Z-Image']:
from pipelines.model_z_image import load_z_image
sd_model = load_z_image(checkpoint_info, diffusers_load_config)
@ -1293,10 +1297,14 @@ def set_diffuser_pipe(pipe, new_pipe_type):
def add_noise_pred_to_diffusers_callback(pipe):
print('HERE1', hasattr(pipe, "_callback_tensor_inputs"))
if not hasattr(pipe, "_callback_tensor_inputs"):
return pipe
if pipe.__class__.__name__.startswith("Anima"):
return pipe
if pipe.__class__.__name__.startswith("ErnieImage"):
print('HERE2')
return pipe
if pipe.__class__.__name__.startswith("StableCascade") and ("predicted_image_embedding" not in pipe._callback_tensor_inputs): # pylint: disable=protected-access
pipe.prior_pipe._callback_tensor_inputs.append("predicted_image_embedding") # pylint: disable=protected-access
elif "noise_pred" not in pipe._callback_tensor_inputs: # pylint: disable=protected-access

View File

@ -48,6 +48,7 @@ pipelines = {
'WanAI': getattr(diffusers, 'WanPipeline', None),
'Qwen': getattr(diffusers, 'QwenImagePipeline', None),
'HunyuanImage': getattr(diffusers, 'HunyuanImagePipeline', None),
'ERNIE-Image': getattr(diffusers, 'ErnieImagePipeline', None),
'Z-Image': getattr(diffusers, 'ZImagePipeline', None),
'FLUX2': getattr(diffusers, 'Flux2Pipeline', None),
'FLUX2 Klein': getattr(diffusers, 'Flux2KleinPipeline', None),

45
pipelines/model_ernie.py Normal file
View File

@ -0,0 +1,45 @@
import diffusers
import transformers
from modules import shared, devices, sd_models, model_quant, sd_hijack_te, sd_hijack_vae
from modules.logger import log
from pipelines import generic
def load_ernie_image(checkpoint_info, diffusers_load_config=None):
    """Build and return an ERNIE-Image diffusers pipeline for the given checkpoint.

    Loads the DiT transformer and Mistral3 text encoder separately via the
    generic component loaders, then assembles `ErnieImagePipeline` around them.
    Applies the project's text-encoder and VAE hijacks before returning.

    Args:
        checkpoint_info: project checkpoint descriptor resolved to a HF repo.
        diffusers_load_config: optional dict of extra `from_pretrained` kwargs;
            defaults to an empty dict when not supplied.

    Returns:
        The fully assembled `diffusers.ErnieImagePipeline` instance.
    """
    config = {} if diffusers_load_config is None else diffusers_load_config
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)
    # quantization is disabled for this model type; only the load args are used
    load_args, _quant_args = model_quant.get_dit_args(config, allow_quant=False)
    log.debug(f'Load model: type=ERNIE-Image repo="{repo_id}" offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    transformer = generic.load_transformer(repo_id, cls_name=diffusers.ErnieImageTransformer2DModel, load_config=config)
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Mistral3Model, load_config=config)
    pipe = diffusers.ErnieImagePipeline.from_pretrained(
        repo_id,
        cache_dir=shared.opts.diffusers_dir,
        transformer=transformer,
        text_encoder=text_encoder,
        **load_args,
    )
    # pipeline outputs numpy arrays; downstream postprocessing expects 'np'
    pipe.task_args = {
        'output_type': 'np',
    }
    # drop local refs so the components are owned solely by the pipeline
    del transformer
    del text_encoder
    sd_hijack_te.init_hijack(pipe)
    sd_hijack_vae.init_hijack(pipe)
    devices.torch_gc(force=True, reason='load')
    return pipe