add ovis-image

Signed-off-by: vladmandic <mandic00@live.com>
pull/4497/head
vladmandic 2025-12-26 08:05:25 +01:00
parent f37fa88824
commit bb13aabe17
11 changed files with 77 additions and 10 deletions

View File

@ -1,13 +1,16 @@
# Change Log for SD.Next
## Known issues
## Update for 2025-12-26
- z-image-turbo controlnet device mismatch: <https://github.com/huggingface/diffusers/pull/12886>
- z-image-turbo safetensors loader: <https://github.com/huggingface/diffusers/issues/12887>
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
### Highlights for 2025-12-26
## Update for 2025-12-25
End of year release with:
- Several new models including highly anticipated **Qwen-Image-Edit 2511** as well as **Qwen-Image-Layered**, **LongCat Image** and **Ovis Image**
- New features including support for **Z-Image** *ControlNets* and *fine-tunes* and **Detailer** segmentation support
[ReadMe](https://github.com/vladmandic/automatic/blob/master/README.md) | [ChangeLog](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [Docs](https://vladmandic.github.io/sdnext-docs/) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867) | [Sponsor](https://github.com/sponsors/vladmandic)
### Details for 2025-12-26
- **Models**
- [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants
@ -17,6 +20,8 @@
- [Qwen-Image-Layered](https://huggingface.co/Qwen/Qwen-Image-Layered) in *base* and *pre-quantized* variants
Qwen-Image-Layered, a model capable of decomposing an image into multiple RGBA layers
*note*: set number of desired output layers in *settings -> model options*
- [Ovis Image 7B](https://huggingface.co/AIDC-AI/Ovis-Image-7B)
Ovis Image is a new text-to-image base model based on Qwen3 text-encoder and optimized for text-rendering
- **Features**
- Google **Gemini** and **Veo** models support for both *Dev* and *Vertex* access methods
see [docs](https://vladmandic.github.io/sdnext-docs/Google-GenAI/) for details
@ -45,6 +50,7 @@
- control input media with non-english locales
- handle embeds when on meta device
- improve offloading when model has manual modules
- fix ui section collapsible state, thanks @awsr
## Update for 2025-12-11

View File

@ -1,5 +1,12 @@
# TODO
## Known issues
- z-image-turbo controlnet device mismatch: <https://github.com/huggingface/diffusers/pull/12886>
- z-image-turbo safetensors loader: <https://github.com/huggingface/diffusers/issues/12887>
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
## Project Board
- <https://github.com/users/vladmandic/projects>

@ -1 +1 @@
Subproject commit c6dc85eb28a02bc7af268497b7a5a596770c5d7b
Subproject commit 2a7005fbcf8985644b66121365fa7228a65f34b0

@ -1 +1 @@
Subproject commit af99fbab29e9a424c4e79fa8e4ae194481cb5f75
Subproject commit 50f7613276ecb37015cc4dd270a033d32d038415

View File

@ -938,6 +938,16 @@
"date": "2024 January"
},
"AIDC Ovis-Image 7B": {
"path": "AIDC-AI/Ovis-Image-7B",
"skip": true,
"desc": "Built upon Ovis-U1, Ovis-Image is a 7B text-to-image model specifically optimized for high-quality text rendering, designed to operate efficiently under stringent computational constraints.",
"preview": "AIDC-AI--Ovis-Image-7B.jpg",
"size": 23.38,
"date": "2025 December",
"extras": ""
},
"HDM-XUT 340M Anime": {
"path": "KBlueLeaf/HDM-xut-340M-anime",
"skip": true,

View File

@ -648,7 +648,7 @@ def check_diffusers():
t_start = time.time()
if args.skip_all:
return
sha = '52766e6a6939ac6e74375bde5e19c5e0b90d24c1' # diffusers commit hash
sha = 'f6b6a7181eb44f0120b29cd897c129275f366c2a' # diffusers commit hash
# if args.use_rocm or args.use_zluda or args.use_directml:
# sha = '043ab2520f6a19fce78e6e060a68dbc947edb9f9' # lock diffusers versions for now
pkg = pkg_resources.working_set.by_key.get('diffusers', None)

View File

@ -78,6 +78,8 @@ def get_model_type(pipe):
model_type = 'prx'
elif 'LongCat' in name:
model_type = 'longcat'
elif 'Ovis-Image' in name:
model_type = 'ovis'
# video models
elif "CogVideo" in name:
model_type = 'cogvideo'

View File

@ -141,6 +141,8 @@ def guess_by_name(fn, current_guess):
new_guess = 'Z-Image'
elif 'longcat-image' in fn.lower():
new_guess = 'LongCat'
elif 'ovis-image' in fn.lower():
new_guess = 'Ovis-Image'
if debug_load:
shared.log.trace(f'Autodetect: method=name file="{fn}" previous="{current_guess}" current="{new_guess}"')
return new_guess or current_guess

View File

@ -479,6 +479,10 @@ def load_diffuser_force(detected_model_type, checkpoint_info, diffusers_load_con
from pipelines.model_longcat import load_longcat
sd_model = load_longcat(checkpoint_info, diffusers_load_config)
allow_post_quant = False
elif model_type in ['Ovis-Image']:
from pipelines.model_ovis import load_ovis
sd_model = load_ovis(checkpoint_info, diffusers_load_config)
allow_post_quant = False
except Exception as e:
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
if debug_load:

36
pipelines/model_ovis.py Normal file
View File

@ -0,0 +1,36 @@
import transformers
import diffusers
from modules import shared, devices, sd_models, model_quant, sd_hijack_te
from pipelines import generic
def load_ovis(checkpoint_info, diffusers_load_config=None):
    """Load the AIDC Ovis-Image text-to-image pipeline.

    Loads the transformer and the Qwen3 text-encoder separately (so each can
    go through the generic loader), then assembles the full pipeline from the
    checkpoint repo.

    Args:
        checkpoint_info: checkpoint descriptor resolvable to a HF repo id.
        diffusers_load_config: optional dict of diffusers loading options;
            defaults to an empty dict.

    Returns:
        A configured `diffusers.OvisImagePipeline` instance.
    """
    if diffusers_load_config is None:
        diffusers_load_config = {}
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)
    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, allow_quant=False)
    # log the derived from_pretrained args (load_args), not the raw config twice
    shared.log.debug(f'Load model: type=OvisImage repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')
    transformer = generic.load_transformer(repo_id, cls_name=diffusers.OvisImageTransformer2DModel, load_config=diffusers_load_config)
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3Model, load_config=diffusers_load_config)
    pipe = diffusers.OvisImagePipeline.from_pretrained(
        repo_id,
        cache_dir=shared.opts.diffusers_dir,
        transformer=transformer,
        text_encoder=text_encoder,
        **load_args,
    )
    # numpy output is post-processed by the shared pipeline machinery
    pipe.task_args = {
        'output_type': 'np',
    }
    # components are now owned by the pipeline; drop the local references
    del transformer
    del text_encoder
    sd_hijack_te.init_hijack(pipe)
    devices.torch_gc(force=True, reason='load')
    return pipe

View File

@ -11,7 +11,7 @@ def load_z_image(checkpoint_info, diffusers_load_config=None):
sd_models.hf_auth_check(checkpoint_info)
load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, allow_quant=False)
shared.log.debug(f'Load model: type=Z-Image repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={diffusers_load_config}')
shared.log.debug(f'Load model: type=ZImage repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={diffusers_load_config}')
transformer = generic.load_transformer(repo_id, cls_name=diffusers.ZImageTransformer2DModel, load_config=diffusers_load_config)
text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3ForCausalLM, load_config=diffusers_load_config)