mirror of https://github.com/vladmandic/automatic
add nvidia-cosmos-predict-2
Signed-off-by: Vladimir Mandic <mandic00@live.com>pull/3992/head^2
parent
61e949c1cc
commit
ada38d57b4
|
|
@ -2,8 +2,14 @@
|
|||
|
||||
## Update for 2025-06-26
|
||||
|
||||
- **Models**
|
||||
- [nVidia Cosmos-Predict2 T2I](https://research.nvidia.com/labs/dir/cosmos-predict2/) *2B and 14B*
|
||||
- available via *networks -> models -> reference*
|
||||
- *note*: this is a gated model, you need to [accept terms](https://huggingface.co/nvidia/Cosmos-Predict2-2B-Text2Image) and set your [huggingface token](https://vladmandic.github.io/sdnext-docs/Gated/)
|
||||
- [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha)
|
||||
- available via *caption -> vlm caption*
|
||||
|
||||
- **Changes**
|
||||
- Add [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha)
|
||||
- Support Remote VAE with *Omnigen, Lumina 2 and PixArt*
|
||||
- Use Diffusers version of *OmniGen*
|
||||
- Control: moved global settings into the control elements -> control settings tab
|
||||
|
|
|
|||
|
|
@ -236,7 +236,20 @@
|
|||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||
"skip": true
|
||||
},
|
||||
},
|
||||
|
||||
"nVidia Cosmos-Predict2 T2I 2B": {
|
||||
"path": "nvidia/Cosmos-Predict2-2B-Text2Image",
|
||||
"desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.",
|
||||
"preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg",
|
||||
"skip": true
|
||||
},
|
||||
"nVidia Cosmos-Predict2 T2I 14B": {
|
||||
"path": "nvidia/Cosmos-Predict2-14B-Text2Image",
|
||||
"desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.",
|
||||
"preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg",
|
||||
"skip": true
|
||||
},
|
||||
|
||||
"VectorSpaceLab OmniGen v1": {
|
||||
"path": "Shitao/OmniGen-v1-diffusers",
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 42 KiB |
|
|
@ -35,9 +35,13 @@ warnings.filterwarnings(action="ignore", category=DeprecationWarning)
|
|||
warnings.filterwarnings(action="ignore", category=FutureWarning)
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="torchvision")
|
||||
try:
|
||||
import torch._logging # pylint: disable=ungrouped-imports
|
||||
torch._logging._internal.DEFAULT_LOG_LEVEL = logging.ERROR # pylint: disable=protected-access
|
||||
torch._logging.set_logs(all=logging.ERROR, bytecode=False, aot_graphs=False, aot_joint_graph=False, ddp_graphs=False, graph=False, graph_code=False, graph_breaks=False, graph_sizes=False, guards=False, recompiles=False, recompiles_verbose=False, trace_source=False, trace_call=False, trace_bytecode=False, output_code=False, kernel_code=False, schedule=False, perf_hints=False, post_grad_graphs=False, onnx_diagnostics=False, fusion=False, overlap=False, export=None, modules=None, cudagraphs=False, sym_node=False, compiled_autograd_verbose=False) # pylint: disable=protected-access
|
||||
except Exception:
|
||||
pass
|
||||
torch._dynamo.config.verbose = False # pylint: disable=protected-access
|
||||
torch._dynamo.config.suppress_errors = True # pylint: disable=protected-access
|
||||
except Exception as e:
|
||||
errors.log.warning(f'Torch logging: {e}')
|
||||
if ".dev" in torch.__version__ or "+git" in torch.__version__:
|
||||
torch.__long_version__ = torch.__version__
|
||||
torch.__version__ = re.search(r'[\d.]+[\d]', torch.__version__).group(0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
import os
|
||||
import transformers
|
||||
import diffusers
|
||||
from huggingface_hub import auth_check
|
||||
from modules import shared, devices, sd_models, model_quant, modelloader, sd_hijack_te
|
||||
|
||||
|
||||
def load_transformer(repo_id, diffusers_load_config={}):
    """Load the Cosmos transformer, either from a user-selected local file or from the HF repo.

    Returns the transformer module moved to CPU when offload is enabled, or None
    when a selected local override is missing or uses an unsupported format.
    """
    load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='Transformer', device_map=True)

    # resolve an explicit transformer override chosen in settings, if any
    local_fn = None
    if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default':
        from modules import sd_unet
        if shared.opts.sd_unet not in list(sd_unet.unet_dict):
            shared.log.error(f'Load module: type=Transformer not found: {shared.opts.sd_unet}')
            return None
        candidate = sd_unet.unet_dict[shared.opts.sd_unet]
        local_fn = candidate if os.path.exists(candidate) else None

    if local_fn is not None and 'gguf' in local_fn.lower():
        # gguf-quantized transformers are not supported for this model family
        shared.log.error('Load model: type=Cosmos format="gguf" unsupported')
        transformer = None
    elif local_fn is not None and 'safetensors' in local_fn.lower():
        # single-file load from the local safetensors override
        shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant(repo_id)}" args={load_args}')
        transformer = diffusers.CosmosTransformer3DModel.from_single_file(local_fn, cache_dir=shared.opts.hfcache_dir, **load_args)
    else:
        # default path: fetch the transformer subfolder from the hub repo
        shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}')
        transformer = diffusers.CosmosTransformer3DModel.from_pretrained(
            repo_id,
            subfolder="transformer",
            cache_dir=shared.opts.hfcache_dir,
            **load_args,
            **quant_args,
        )

    if transformer is not None and shared.opts.diffusers_offload_mode != 'none':
        sd_models.move_model(transformer, devices.cpu)
    return transformer
|
||||
|
||||
|
||||
def load_text_encoder(repo_id, diffusers_load_config={}):
    """Load the T5 text encoder for a Cosmos checkpoint from the given HF repo.

    Applies the configured text-encoder quantization and, when offload is
    enabled, parks the encoder on CPU until it is needed.
    """
    load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='TE', device_map=True)
    shared.log.debug(f'Load model: type=Cosmos te="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}')
    text_encoder = transformers.T5EncoderModel.from_pretrained(
        repo_id,
        subfolder="text_encoder",
        cache_dir=shared.opts.hfcache_dir,
        **load_args,
        **quant_args,
    )
    if shared.opts.diffusers_offload_mode != 'none' and text_encoder is not None:
        sd_models.move_model(text_encoder, devices.cpu)
    # NOTE: removed dead copy-paste from the HiDream loader which recomputed
    # load/quant args for module='LLM', resolved a llama repo, and logged a
    # misleading 'type=HiDream te4=...' message — none of it was used here
    return text_encoder
|
||||
|
||||
|
||||
def load_cosmos_t2i(checkpoint_info, diffusers_load_config={}):
    """Build a Cosmos2 text-to-image pipeline for the given checkpoint.

    Performs HF login and gated-repo access verification first; returns False
    when access to the repo cannot be confirmed, otherwise the loaded pipeline.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info.name)

    # cosmos repos are gated: verify the token actually grants access
    login = modelloader.hf_login()
    try:
        auth_check(repo_id)
    except Exception as e:
        shared.log.error(f'Load model: repo="{repo_id}" login={login} {e}')
        return False

    # load heavyweight components individually so per-module quantization applies
    transformer = load_transformer(repo_id, diffusers_load_config)
    text_encoder = load_text_encoder(repo_id, diffusers_load_config)
    safety_checker = Fake_safety_checker()

    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, module='Model')
    shared.log.debug(f'Load model: type=Cosmos model="{checkpoint_info.name}" repo="{repo_id}" offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')

    pipe = diffusers.Cosmos2TextToImagePipeline.from_pretrained(
        repo_id,
        transformer=transformer,
        text_encoder=text_encoder,
        safety_checker=safety_checker,
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )

    sd_hijack_te.init_hijack(pipe)

    # pipeline now owns the components; drop our references before gc
    del text_encoder
    del transformer
    devices.torch_gc()
    return pipe
|
||||
|
||||
|
||||
class Fake_safety_checker:
    """Permissive stand-in for the Cosmos guardrail safety checker.

    Marks the guardrail package as available so the pipeline accepts this stub
    instead of loading the real model, and approves all inputs unchanged.
    """

    def __init__(self):
        from diffusers.utils import import_utils
        # pretend the guardrail dependency is installed so pipeline checks pass
        import_utils._cosmos_guardrail_available = True # pylint: disable=protected-access

    def __call__(self, *args, **kwargs): # pylint: disable=unused-argument
        """No-op invocation; the real checker would screen inputs here."""
        return None

    def to(self, _device):
        """No-op device move to satisfy pipeline offload plumbing."""
        return None

    def check_text_safety(self, _prompt):
        """Approve every prompt."""
        return True

    def check_video_safety(self, vid):
        """Pass frames through unmodified."""
        return vid
|
||||
|
|
@ -43,6 +43,8 @@ def get_model_type(pipe):
|
|||
model_type = 'sana'
|
||||
elif "HiDream" in name:
|
||||
model_type = 'h1'
|
||||
elif "Cosmos2TextToImage" in name:
|
||||
model_type = 'cosmos'
|
||||
# video models
|
||||
elif "CogVideo" in name:
|
||||
model_type = 'cogvideo'
|
||||
|
|
|
|||
|
|
@ -298,6 +298,8 @@ def set_pipeline_args(p, model, prompts:list, negative_prompts:list, prompts_2:t
|
|||
args['control_strength'] = p.denoising_strength
|
||||
args['width'] = p.width
|
||||
args['height'] = p.height
|
||||
if 'Cosmos2TextToImagePipeline':
|
||||
kwargs['output_type'] = 'np' # cosmos uses wan-vae which is weird
|
||||
# set callbacks
|
||||
if 'prior_callback_steps' in possible: # Wuerstchen / Cascade
|
||||
args['prior_callback_steps'] = 1
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ def full_vae_decode(latents, model):
|
|||
# normalize latents
|
||||
latents_mean = model.vae.config.get("latents_mean", None)
|
||||
latents_std = model.vae.config.get("latents_std", None)
|
||||
scaling_factor = model.vae.config.get("scaling_factor", None)
|
||||
scaling_factor = model.vae.config.get("scaling_factor", 1.0)
|
||||
shift_factor = model.vae.config.get("shift_factor", None)
|
||||
if latents_mean and latents_std:
|
||||
latents_mean = (torch.tensor(latents_mean).view(1, -1, 1, 1).to(latents.device, latents.dtype))
|
||||
|
|
|
|||
|
|
@ -98,6 +98,8 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
warn(f'Model detected as FLUX UNET model, but attempting to load a base model: {op}={f} size={size} MB')
|
||||
if 'flex.2' in f.lower():
|
||||
guess = 'FLEX'
|
||||
if 'cosmos-predict2' in f.lower():
|
||||
guess = 'Cosmos'
|
||||
# guess for diffusers
|
||||
index = os.path.join(f, 'model_index.json')
|
||||
if os.path.exists(index) and os.path.isfile(index):
|
||||
|
|
@ -113,6 +115,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
guess = 'Stable Diffusion 3'
|
||||
if callable(pipeline) and 'Lumina2' in pipeline.__name__:
|
||||
guess = 'Lumina 2'
|
||||
|
||||
# switch for specific variant
|
||||
if guess == 'Stable Diffusion' and 'inpaint' in f.lower():
|
||||
guess = 'Stable Diffusion Inpaint'
|
||||
|
|
@ -122,6 +125,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
guess = 'Stable Diffusion XL Inpaint'
|
||||
elif guess == 'Stable Diffusion XL' and 'instruct' in f.lower():
|
||||
guess = 'Stable Diffusion XL Instruct'
|
||||
|
||||
# get actual pipeline
|
||||
pipeline = shared_items.get_pipelines().get(guess, None) if pipeline is None else pipeline
|
||||
if debug_load is not None:
|
||||
|
|
|
|||
|
|
@ -350,6 +350,9 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op='
|
|||
elif model_type in ['HiDream']:
|
||||
from modules.model_hidream import load_hidream
|
||||
sd_model = load_hidream(checkpoint_info, diffusers_load_config)
|
||||
elif model_type in ['Cosmos']:
|
||||
from modules.model_cosmos import load_cosmos_t2i
|
||||
sd_model = load_cosmos_t2i(checkpoint_info, diffusers_load_config)
|
||||
except Exception as e:
|
||||
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
|
||||
if debug_load:
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from modules.timer import process as process_timer
|
|||
|
||||
debug = os.environ.get('SD_MOVE_DEBUG', None) is not None
|
||||
debug_move = log.trace if debug else lambda *args, **kwargs: None
|
||||
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4']
|
||||
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4', 'cosmos']
|
||||
offload_post = ['h1']
|
||||
offload_hook_instance = None
|
||||
balanced_offload_exclude = ['CogView4Pipeline']
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from modules import shared, devices, processing, images, sd_vae_approx, sd_vae_t
|
|||
|
||||
SamplerData = namedtuple('SamplerData', ['name', 'constructor', 'aliases', 'options'])
|
||||
approximation_indexes = { "Simple": 0, "Approximate": 1, "TAESD": 2, "Full VAE": 3 }
|
||||
flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 'lumina2', 'cogview4', 'h1']
|
||||
flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 'lumina2', 'cogview4', 'h1', 'cosmos']
|
||||
warned = False
|
||||
queue_lock = threading.Lock()
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ pipelines = {
|
|||
'Amused': getattr(diffusers, 'AmusedPipeline', None),
|
||||
'HiDream': getattr(diffusers, 'HiDreamImagePipeline', None),
|
||||
'OmniGenPipeline': getattr(diffusers, 'OmniGenPipeline', None),
|
||||
'Cosmos': getattr(diffusers, 'Cosmos2TextToImagePipeline', None),
|
||||
|
||||
# dynamically imported and redefined later
|
||||
'Meissonic': getattr(diffusers, 'DiffusionPipeline', None), # dynamically redefined and loaded in sd_models.load_diffuser
|
||||
|
|
|
|||
Loading…
Reference in New Issue