mirror of https://github.com/vladmandic/automatic
parent
f37fa88824
commit
bb13aabe17
18
CHANGELOG.md
18
CHANGELOG.md
|
|
@ -1,13 +1,16 @@
|
|||
# Change Log for SD.Next
|
||||
|
||||
## Known issues
|
||||
## Update for 2025-12-26
|
||||
|
||||
- z-image-turbo controlnet device mismatch: <https://github.com/huggingface/diffusers/pull/12886>
|
||||
- z-image-turbo safetensors loader: <https://github.com/huggingface/diffusers/issues/12887>
|
||||
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
|
||||
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
|
||||
### Highlights for 2025-12-26
|
||||
|
||||
## Update for 2025-12-25
|
||||
End of year release with:
|
||||
- Several new models including highly anticipated **Qwen-Image-Edit 2511** as well as **Qwen-Image-Layered**, **LongCat Image** and **Ovis Image**
|
||||
- New features including support for **Z-Image** *ControlNets* and *fine-tunes* and **Detailer** segmentation support
|
||||
|
||||
[ReadMe](https://github.com/vladmandic/automatic/blob/master/README.md) | [ChangeLog](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [Docs](https://vladmandic.github.io/sdnext-docs/) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867) | [Sponsor](https://github.com/sponsors/vladmandic)
|
||||
|
||||
### Details for 2025-12-26
|
||||
|
||||
- **Models**
|
||||
- [LongCat Image](https://github.com/meituan-longcat/LongCat-Image) in *Image* and *Image Edit* variants
|
||||
|
|
@ -17,6 +20,8 @@
|
|||
- [Qwen-Image-Layered](https://huggingface.co/Qwen/Qwen-Image-Layered) in *base* and *pre-quantized* variants
|
||||
Qwen-Image-Layered, a model capable of decomposing an image into multiple RGBA layers
|
||||
*note*: set number of desired output layers in *settings -> model options*
|
||||
- [Ovis Image 7B](https://huggingface.co/AIDC-AI/Ovis-Image-7B)
|
||||
Ovis Image is a new text-to-image base model based on Qwen3 text-encoder and optimized for text-rendering
|
||||
- **Features**
|
||||
- Google **Gemini** and **Veo** models support for both *Dev* and *Vertex* access methods
|
||||
see [docs](https://vladmandic.github.io/sdnext-docs/Google-GenAI/) for details
|
||||
|
|
@ -45,6 +50,7 @@
|
|||
- control input media with non-english locales
|
||||
- handle embeds when on meta device
|
||||
- improve offloading when model has manual modules
|
||||
- fix ui section collapsible state, thanks @awsr
|
||||
|
||||
## Update for 2025-12-11
|
||||
|
||||
|
|
|
|||
7
TODO.md
7
TODO.md
|
|
@ -1,5 +1,12 @@
|
|||
# TODO
|
||||
|
||||
## Known issues
|
||||
|
||||
- z-image-turbo controlnet device mismatch: <https://github.com/huggingface/diffusers/pull/12886>
|
||||
- z-image-turbo safetensors loader: <https://github.com/huggingface/diffusers/issues/12887>
|
||||
- kandinsky-image-5 hardcoded cuda: <https://github.com/huggingface/diffusers/pull/12814>
|
||||
- peft lora with torch-rocm-windows: <https://github.com/huggingface/peft/pull/2963>
|
||||
|
||||
## Project Board
|
||||
|
||||
- <https://github.com/users/vladmandic/projects>
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Subproject commit c6dc85eb28a02bc7af268497b7a5a596770c5d7b
|
||||
Subproject commit 2a7005fbcf8985644b66121365fa7228a65f34b0
|
||||
|
|
@ -1 +1 @@
|
|||
Subproject commit af99fbab29e9a424c4e79fa8e4ae194481cb5f75
|
||||
Subproject commit 50f7613276ecb37015cc4dd270a033d32d038415
|
||||
|
|
@ -938,6 +938,16 @@
|
|||
"date": "2024 January"
|
||||
},
|
||||
|
||||
"AIDC Ovis-Image 7B": {
|
||||
"path": "AIDC-AI/Ovis-Image-7B",
|
||||
"skip": true,
|
||||
"desc": "Built upon Ovis-U1, Ovis-Image is a 7B text-to-image model specifically optimized for high-quality text rendering, designed to operate efficiently under stringent computational constraints.",
|
||||
"preview": "AIDC-AI--Ovis-Image-7B.jpg",
|
||||
"size": 23.38,
|
||||
"date": "2025 December",
|
||||
"extras": ""
|
||||
},
|
||||
|
||||
"HDM-XUT 340M Anime": {
|
||||
"path": "KBlueLeaf/HDM-xut-340M-anime",
|
||||
"skip": true,
|
||||
|
|
|
|||
|
|
@ -648,7 +648,7 @@ def check_diffusers():
|
|||
t_start = time.time()
|
||||
if args.skip_all:
|
||||
return
|
||||
sha = '52766e6a6939ac6e74375bde5e19c5e0b90d24c1' # diffusers commit hash
|
||||
sha = 'f6b6a7181eb44f0120b29cd897c129275f366c2a' # diffusers commit hash
|
||||
# if args.use_rocm or args.use_zluda or args.use_directml:
|
||||
# sha = '043ab2520f6a19fce78e6e060a68dbc947edb9f9' # lock diffusers versions for now
|
||||
pkg = pkg_resources.working_set.by_key.get('diffusers', None)
|
||||
|
|
|
|||
|
|
@ -78,6 +78,8 @@ def get_model_type(pipe):
|
|||
model_type = 'prx'
|
||||
elif 'LongCat' in name:
|
||||
model_type = 'longcat'
|
||||
elif 'Ovis-Image' in name:
|
||||
model_type = 'ovis'
|
||||
# video models
|
||||
elif "CogVideo" in name:
|
||||
model_type = 'cogvideo'
|
||||
|
|
|
|||
|
|
@ -141,6 +141,8 @@ def guess_by_name(fn, current_guess):
|
|||
new_guess = 'Z-Image'
|
||||
elif 'longcat-image' in fn.lower():
|
||||
new_guess = 'LongCat'
|
||||
elif 'ovis-image' in fn.lower():
|
||||
new_guess = 'Ovis-Image'
|
||||
if debug_load:
|
||||
shared.log.trace(f'Autodetect: method=name file="{fn}" previous="{current_guess}" current="{new_guess}"')
|
||||
return new_guess or current_guess
|
||||
|
|
|
|||
|
|
@ -479,6 +479,10 @@ def load_diffuser_force(detected_model_type, checkpoint_info, diffusers_load_con
|
|||
from pipelines.model_longcat import load_longcat
|
||||
sd_model = load_longcat(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = False
|
||||
elif model_type in ['Overfit']:
|
||||
from pipelines.model_ovis import load_ovis
|
||||
sd_model = load_ovis(checkpoint_info, diffusers_load_config)
|
||||
allow_post_quant = False
|
||||
except Exception as e:
|
||||
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
|
||||
if debug_load:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,36 @@
|
|||
import transformers
|
||||
import diffusers
|
||||
from modules import shared, devices, sd_models, model_quant, sd_hijack_te
|
||||
from pipelines import generic
|
||||
|
||||
|
||||
def load_ovis(checkpoint_info, diffusers_load_config=None):
    """Load the AIDC Ovis-Image text-to-image pipeline.

    Args:
        checkpoint_info: checkpoint descriptor; resolved to a HuggingFace
            repo id via ``sd_models.path_to_repo``.
        diffusers_load_config: optional dict of diffusers loading options;
            defaults to an empty dict.

    Returns:
        A ``diffusers.OvisImagePipeline`` with its transformer and Qwen3
        text-encoder loaded individually (via the generic component loaders)
        and the text-encoder hijack installed.
    """
    if diffusers_load_config is None:
        diffusers_load_config = {}
    repo_id = sd_models.path_to_repo(checkpoint_info)
    sd_models.hf_auth_check(checkpoint_info)

    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, allow_quant=False)
    # fix: log the processed load_args instead of repeating diffusers_load_config
    shared.log.debug(f'Load model: type=OvisImage repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')

    # load heavyweight components individually so the generic loaders control placement
    transformer = generic.load_transformer(repo_id, cls_name=diffusers.OvisImageTransformer2DModel, load_config=diffusers_load_config)
    text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3Model, load_config=diffusers_load_config)

    pipe = diffusers.OvisImagePipeline.from_pretrained(
        repo_id,
        cache_dir=shared.opts.diffusers_dir,
        transformer=transformer,
        text_encoder=text_encoder,
        **load_args,
    )

    # request numpy output from the pipeline
    pipe.task_args = {
        'output_type': 'np',
    }

    # drop local references so the pipeline is the sole owner of the components
    del transformer
    del text_encoder
    sd_hijack_te.init_hijack(pipe)

    devices.torch_gc(force=True, reason='load')
    return pipe
|
||||
|
|
@ -11,7 +11,7 @@ def load_z_image(checkpoint_info, diffusers_load_config=None):
|
|||
sd_models.hf_auth_check(checkpoint_info)
|
||||
|
||||
load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, allow_quant=False)
|
||||
shared.log.debug(f'Load model: type=Z-Image repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={diffusers_load_config}')
|
||||
shared.log.debug(f'Load model: type=ZImage repo="{repo_id}" config={diffusers_load_config} offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={diffusers_load_config}')
|
||||
|
||||
transformer = generic.load_transformer(repo_id, cls_name=diffusers.ZImageTransformer2DModel, load_config=diffusers_load_config)
|
||||
text_encoder = generic.load_text_encoder(repo_id, cls_name=transformers.Qwen3ForCausalLM, load_config=diffusers_load_config)
|
||||
|
|
|
|||
Loading…
Reference in New Issue