diff --git a/CHANGELOG.md b/CHANGELOG.md index bdd09b0e8..0b4cc2a0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,14 @@ ## Update for 2025-06-26 +- **Models** + - [nVidia Cosmos-Predict2 T2I](https://research.nvidia.com/labs/dir/cosmos-predict2/) *2B and 14B* + - available via *networks -> models -> reference* + - *note*: this is a gated model, you need to [accept terms](https://huggingface.co/nvidia/Cosmos-Predict2-2B-Text2Image) and set your [huggingface token](https://vladmandic.github.io/sdnext-docs/Gated/) + - [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha) + - available via *caption -> vlm caption* + - **Changes** - - Add [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha) - Support Remote VAE with *Omnigen, Lumina 2 and PixArt* - Use Diffusers version of *OmniGen* - Control move global settings to control elements -> control settings tab diff --git a/html/reference.json b/html/reference.json index fe89f47ac..07d0c97dc 100644 --- a/html/reference.json +++ b/html/reference.json @@ -236,7 +236,20 @@ "desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. 
Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.", "preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg", "skip": true - }, + }, + + "nVidia Cosmos-Predict2 T2I 2B": { + "path": "nvidia/Cosmos-Predict2-2B-Text2Image", + "desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.", + "preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg", + "skip": true + }, + "nVidia Cosmos-Predict2 T2I 14B": { + "path": "nvidia/Cosmos-Predict2-14B-Text2Image", + "desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.", + "preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg", + "skip": true + }, "VectorSpaceLab OmniGen v1": { "path": "Shitao/OmniGen-v1-diffusers", diff --git a/models/Reference/nvidia--Cosmos-Predict2-2B-Text2Image.jpg b/models/Reference/nvidia--Cosmos-Predict2-2B-Text2Image.jpg new file mode 100644 index 000000000..54b9af8c1 Binary files /dev/null and b/models/Reference/nvidia--Cosmos-Predict2-2B-Text2Image.jpg differ diff --git a/modules/loader.py b/modules/loader.py index 2574c90b4..cb554c38e 100644 --- a/modules/loader.py +++ b/modules/loader.py @@ -35,9 +35,13 @@ warnings.filterwarnings(action="ignore", category=DeprecationWarning) warnings.filterwarnings(action="ignore", category=FutureWarning) warnings.filterwarnings(action="ignore", category=UserWarning, module="torchvision") try: + import torch._logging # pylint: disable=ungrouped-imports + torch._logging._internal.DEFAULT_LOG_LEVEL = logging.ERROR # pylint: disable=protected-access torch._logging.set_logs(all=logging.ERROR, bytecode=False, aot_graphs=False, aot_joint_graph=False, ddp_graphs=False, graph=False, graph_code=False, 
graph_breaks=False, graph_sizes=False, guards=False, recompiles=False, recompiles_verbose=False, trace_source=False, trace_call=False, trace_bytecode=False, output_code=False, kernel_code=False, schedule=False, perf_hints=False, post_grad_graphs=False, onnx_diagnostics=False, fusion=False, overlap=False, export=None, modules=None, cudagraphs=False, sym_node=False, compiled_autograd_verbose=False) # pylint: disable=protected-access -except Exception: - pass + torch._dynamo.config.verbose = False # pylint: disable=protected-access + torch._dynamo.config.suppress_errors = True # pylint: disable=protected-access +except Exception as e: + errors.log.warning(f'Torch logging: {e}') if ".dev" in torch.__version__ or "+git" in torch.__version__: torch.__long_version__ = torch.__version__ torch.__version__ = re.search(r'[\d.]+[\d]', torch.__version__).group(0) diff --git a/modules/model_cosmos.py b/modules/model_cosmos.py new file mode 100644 index 000000000..c482d033b --- /dev/null +++ b/modules/model_cosmos.py @@ -0,0 +1,108 @@ +import os +import transformers +import diffusers +from huggingface_hub import auth_check +from modules import shared, devices, sd_models, model_quant, modelloader, sd_hijack_te + + +def load_transformer(repo_id, diffusers_load_config={}): + load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='Transformer', device_map=True) + fn = None + + if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default': + from modules import sd_unet + if shared.opts.sd_unet not in list(sd_unet.unet_dict): + shared.log.error(f'Load module: type=Transformer not found: {shared.opts.sd_unet}') + return None + fn = sd_unet.unet_dict[shared.opts.sd_unet] if os.path.exists(sd_unet.unet_dict[shared.opts.sd_unet]) else None + + if fn is not None and 'gguf' in fn.lower(): + shared.log.error('Load model: type=Cosmos format="gguf" unsupported') + transformer = None + elif fn is not None and 'safetensors' in fn.lower(): + 
shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant(repo_id)}" args={load_args}') + transformer = diffusers.CosmosTransformer3DModel.from_single_file(fn, cache_dir=shared.opts.hfcache_dir, **load_args) + else: + shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}') + transformer = diffusers.CosmosTransformer3DModel.from_pretrained( + repo_id, + subfolder="transformer", + cache_dir=shared.opts.hfcache_dir, + **load_args, + **quant_args, + ) + if shared.opts.diffusers_offload_mode != 'none' and transformer is not None: + sd_models.move_model(transformer, devices.cpu) + return transformer + + + def load_text_encoder(repo_id, diffusers_load_config={}): + load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='TE', device_map=True) + shared.log.debug(f'Load model: type=Cosmos te="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}') + text_encoder = transformers.T5EncoderModel.from_pretrained( + repo_id, + subfolder="text_encoder", + cache_dir=shared.opts.hfcache_dir, + **load_args, + **quant_args, + ) + if shared.opts.diffusers_offload_mode != 'none' and text_encoder is not None: + sd_models.move_model(text_encoder, devices.cpu) + + # NOTE(review): removed dead code copied from the HiDream loader — it computed + # module='LLM' dit-args and a llama_repo, then emitted a misleading 'type=HiDream te4' + # debug log without ever using the results (Cosmos only needs the T5 text encoder) + + return text_encoder + + + def load_cosmos_t2i(checkpoint_info, diffusers_load_config={}): + repo_id = sd_models.path_to_repo(checkpoint_info.name) + login = modelloader.hf_login() + try: + auth_check(repo_id) + except Exception as e: + shared.log.error(f'Load model: repo="{repo_id}" login={login} 
{e}') + return False + + transformer = load_transformer(repo_id, diffusers_load_config) + text_encoder = load_text_encoder(repo_id, diffusers_load_config) + safety_checker = Fake_safety_checker() + + load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, module='Model') + shared.log.debug(f'Load model: type=Cosmos model="{checkpoint_info.name}" repo="{repo_id}" offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}') + + cls = diffusers.Cosmos2TextToImagePipeline + pipe = cls.from_pretrained( + repo_id, + transformer=transformer, + text_encoder=text_encoder, + safety_checker=safety_checker, + cache_dir=shared.opts.diffusers_dir, + **load_args, + ) + + sd_hijack_te.init_hijack(pipe) + del text_encoder + del transformer + + devices.torch_gc() + return pipe + + + class Fake_safety_checker: + def __init__(self): + from diffusers.utils import import_utils + import_utils._cosmos_guardrail_available = True # pylint: disable=protected-access + + def __call__(self, *args, **kwargs): # pylint: disable=unused-argument + return + + def to(self, _device): + pass + + def check_text_safety(self, _prompt): + return True + + def check_video_safety(self, vid): + return vid diff --git a/modules/modeldata.py b/modules/modeldata.py index 688cff23d..1de6a74a4 100644 --- a/modules/modeldata.py +++ b/modules/modeldata.py @@ -43,6 +43,8 @@ def get_model_type(pipe): model_type = 'sana' elif "HiDream" in name: model_type = 'h1' + elif "Cosmos2TextToImage" in name: + model_type = 'cosmos' # video models elif "CogVideo" in name: model_type = 'cogvideo' diff --git a/modules/processing_args.py b/modules/processing_args.py index d9ad9869f..20e8becec 100644 --- a/modules/processing_args.py +++ b/modules/processing_args.py @@ -298,6 +298,8 @@ def set_pipeline_args(p, model, prompts:list, negative_prompts:list, prompts_2:t args['control_strength'] = p.denoising_strength args['width'] = p.width args['height'] = p.height + if 'Cosmos2TextToImagePipeline' in model.__class__.__name__: 
+ args['output_type'] = 'np' # cosmos uses wan-vae which is weird # set callbacks if 'prior_callback_steps' in possible: # Wuerstchen / Cascade args['prior_callback_steps'] = 1 diff --git a/modules/processing_vae.py b/modules/processing_vae.py index 36d5d0dda..6feaec0cd 100644 --- a/modules/processing_vae.py +++ b/modules/processing_vae.py @@ -130,7 +130,7 @@ def full_vae_decode(latents, model): # normalize latents latents_mean = model.vae.config.get("latents_mean", None) latents_std = model.vae.config.get("latents_std", None) - scaling_factor = model.vae.config.get("scaling_factor", None) + scaling_factor = model.vae.config.get("scaling_factor", 1.0) shift_factor = model.vae.config.get("shift_factor", None) if latents_mean and latents_std: latents_mean = (torch.tensor(latents_mean).view(1, -1, 1, 1).to(latents.device, latents.dtype)) diff --git a/modules/sd_detect.py b/modules/sd_detect.py index 3c2c1be72..fb8c9d6b5 100644 --- a/modules/sd_detect.py +++ b/modules/sd_detect.py @@ -98,6 +98,8 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False): warn(f'Model detected as FLUX UNET model, but attempting to load a base model: {op}={f} size={size} MB') if 'flex.2' in f.lower(): guess = 'FLEX' + if 'cosmos-predict2' in f.lower(): + guess = 'Cosmos' # guess for diffusers index = os.path.join(f, 'model_index.json') if os.path.exists(index) and os.path.isfile(index): @@ -113,6 +115,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False): guess = 'Stable Diffusion 3' if callable(pipeline) and 'Lumina2' in pipeline.__name__: guess = 'Lumina 2' + # switch for specific variant if guess == 'Stable Diffusion' and 'inpaint' in f.lower(): guess = 'Stable Diffusion Inpaint' @@ -122,6 +125,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False): guess = 'Stable Diffusion XL Inpaint' elif guess == 'Stable Diffusion XL' and 'instruct' in f.lower(): guess = 'Stable Diffusion XL Instruct' + # get actual pipeline 
pipeline = shared_items.get_pipelines().get(guess, None) if pipeline is None else pipeline if debug_load is not None: diff --git a/modules/sd_models.py b/modules/sd_models.py index 0530f5a5b..1d85ed47b 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -350,6 +350,9 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op=' elif model_type in ['HiDream']: from modules.model_hidream import load_hidream sd_model = load_hidream(checkpoint_info, diffusers_load_config) + elif model_type in ['Cosmos']: + from modules.model_cosmos import load_cosmos_t2i + sd_model = load_cosmos_t2i(checkpoint_info, diffusers_load_config) except Exception as e: shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}') if debug_load: diff --git a/modules/sd_offload.py b/modules/sd_offload.py index 70777a30a..0db3a2ba0 100644 --- a/modules/sd_offload.py +++ b/modules/sd_offload.py @@ -12,7 +12,7 @@ from modules.timer import process as process_timer debug = os.environ.get('SD_MOVE_DEBUG', None) is not None debug_move = log.trace if debug else lambda *args, **kwargs: None -offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4'] +offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4', 'cosmos'] offload_post = ['h1'] offload_hook_instance = None balanced_offload_exclude = ['CogView4Pipeline'] diff --git a/modules/sd_samplers_common.py b/modules/sd_samplers_common.py index 52fd6313e..7f3e802ec 100644 --- a/modules/sd_samplers_common.py +++ b/modules/sd_samplers_common.py @@ -9,7 +9,7 @@ from modules import shared, devices, processing, images, sd_vae_approx, sd_vae_t SamplerData = namedtuple('SamplerData', ['name', 'constructor', 'aliases', 'options']) approximation_indexes = { "Simple": 0, "Approximate": 1, "TAESD": 2, "Full VAE": 3 } -flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 'lumina2', 'cogview4', 'h1'] +flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 
'lumina2', 'cogview4', 'h1', 'cosmos'] warned = False queue_lock = threading.Lock() diff --git a/modules/shared_items.py b/modules/shared_items.py index baf247a35..967f17b30 100644 --- a/modules/shared_items.py +++ b/modules/shared_items.py @@ -42,6 +42,7 @@ pipelines = { 'Amused': getattr(diffusers, 'AmusedPipeline', None), 'HiDream': getattr(diffusers, 'HiDreamImagePipeline', None), 'OmniGenPipeline': getattr(diffusers, 'OmniGenPipeline', None), + 'Cosmos': getattr(diffusers, 'Cosmos2TextToImagePipeline', None), # dynamically imported and redefined later 'Meissonic': getattr(diffusers, 'DiffusionPipeline', None), # dynamically redefined and loaded in sd_models.load_diffuser