mirror of https://github.com/vladmandic/automatic
add nvidia-cosmos-predict-2
Signed-off-by: Vladimir Mandic <mandic00@live.com>pull/3992/head^2
parent
61e949c1cc
commit
ada38d57b4
|
|
@ -2,8 +2,14 @@
|
|||
|
||||
## Update for 2025-06-26
|
||||
|
||||
- **Models**
|
||||
- [nVidia Cosmos-Predict2 T2I](https://research.nvidia.com/labs/dir/cosmos-predict2/) *2B and 14B*
|
||||
- available via *networks -> models -> reference*
|
||||
- *note*: this is a gated model, you need to [accept terms](https://huggingface.co/nvidia/Cosmos-Predict2-2B-Text2Image) and set your [huggingface token](https://vladmandic.github.io/sdnext-docs/Gated/)
|
||||
- [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha)
|
||||
- available via *caption -> vlm caption*
|
||||
|
||||
- **Changes**
|
||||
- Add [JoyCaption Beta](https://huggingface.co/fancyfeast/llama-joycaption-beta-one-hf-llava) support (in addition to existing JoyCaption Alpha)
|
||||
- Support Remote VAE with *Omnigen, Lumina 2 and PixArt*
|
||||
- Use Diffusers version of *OmniGen*
|
||||
- Control: moved global settings into the control elements -> control settings tab
|
||||
|
|
|
|||
|
|
@ -236,7 +236,20 @@
|
|||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||
"skip": true
|
||||
},
|
||||
},
|
||||
|
||||
"nVidia Cosmos-Predict2 T2I 2B": {
|
||||
"path": "nvidia/Cosmos-Predict2-2B-Text2Image",
|
||||
"desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.",
|
||||
"preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg",
|
||||
"skip": true
|
||||
},
|
||||
"nVidia Cosmos-Predict2 T2I 14B": {
|
||||
"path": "nvidia/Cosmos-Predict2-14B-Text2Image",
|
||||
"desc": "Cosmos-Predict2: A family of highly performant pre-trained world foundation models purpose-built for generating physics-aware images, videos and world states for physical AI development.",
|
||||
"preview": "nvidia--Cosmos-Predict2-2B-Text2Image.jpg",
|
||||
"skip": true
|
||||
},
|
||||
|
||||
"VectorSpaceLab OmniGen v1": {
|
||||
"path": "Shitao/OmniGen-v1-diffusers",
|
||||
|
|
|
|||
Binary file not shown.
|
After Width: | Height: | Size: 42 KiB |
|
|
@ -35,9 +35,13 @@ warnings.filterwarnings(action="ignore", category=DeprecationWarning)
|
|||
warnings.filterwarnings(action="ignore", category=FutureWarning)
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="torchvision")
|
||||
try:
|
||||
import torch._logging # pylint: disable=ungrouped-imports
|
||||
torch._logging._internal.DEFAULT_LOG_LEVEL = logging.ERROR # pylint: disable=protected-access
|
||||
torch._logging.set_logs(all=logging.ERROR, bytecode=False, aot_graphs=False, aot_joint_graph=False, ddp_graphs=False, graph=False, graph_code=False, graph_breaks=False, graph_sizes=False, guards=False, recompiles=False, recompiles_verbose=False, trace_source=False, trace_call=False, trace_bytecode=False, output_code=False, kernel_code=False, schedule=False, perf_hints=False, post_grad_graphs=False, onnx_diagnostics=False, fusion=False, overlap=False, export=None, modules=None, cudagraphs=False, sym_node=False, compiled_autograd_verbose=False) # pylint: disable=protected-access
|
||||
except Exception:
|
||||
pass
|
||||
torch._dynamo.config.verbose = False # pylint: disable=protected-access
|
||||
torch._dynamo.config.suppress_errors = True # pylint: disable=protected-access
|
||||
except Exception as e:
|
||||
errors.log.warning(f'Torch logging: {e}')
|
||||
if ".dev" in torch.__version__ or "+git" in torch.__version__:
|
||||
torch.__long_version__ = torch.__version__
|
||||
torch.__version__ = re.search(r'[\d.]+[\d]', torch.__version__).group(0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
import os
|
||||
import transformers
|
||||
import diffusers
|
||||
from huggingface_hub import auth_check
|
||||
from modules import shared, devices, sd_models, model_quant, modelloader, sd_hijack_te
|
||||
|
||||
|
||||
def load_transformer(repo_id, diffusers_load_config={}):
    """Load the Cosmos transformer, either from a user-selected local file or from the HF repo.

    Returns the transformer module moved to CPU when offload is enabled, or None
    when a selected local override is missing or uses an unsupported format.
    """
    load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='Transformer', device_map=True)

    # resolve an explicit transformer override chosen in settings, if any
    local_fn = None
    if shared.opts.sd_unet is not None and shared.opts.sd_unet != 'Default':
        from modules import sd_unet
        if shared.opts.sd_unet not in list(sd_unet.unet_dict):
            shared.log.error(f'Load module: type=Transformer not found: {shared.opts.sd_unet}')
            return None
        candidate = sd_unet.unet_dict[shared.opts.sd_unet]
        local_fn = candidate if os.path.exists(candidate) else None

    if local_fn is not None and 'gguf' in local_fn.lower():
        # gguf-quantized transformers are not supported for this model family
        shared.log.error('Load model: type=Cosmos format="gguf" unsupported')
        transformer = None
    elif local_fn is not None and 'safetensors' in local_fn.lower():
        # single-file load from the local safetensors override
        shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant(repo_id)}" args={load_args}')
        transformer = diffusers.CosmosTransformer3DModel.from_single_file(local_fn, cache_dir=shared.opts.hfcache_dir, **load_args)
    else:
        # default path: fetch the transformer subfolder from the hub repo
        shared.log.debug(f'Load model: type=Cosmos transformer="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}')
        transformer = diffusers.CosmosTransformer3DModel.from_pretrained(
            repo_id,
            subfolder="transformer",
            cache_dir=shared.opts.hfcache_dir,
            **load_args,
            **quant_args,
        )

    if transformer is not None and shared.opts.diffusers_offload_mode != 'none':
        sd_models.move_model(transformer, devices.cpu)
    return transformer
|
||||
|
||||
|
||||
def load_text_encoder(repo_id, diffusers_load_config={}):
    """Load the T5 text encoder for a Cosmos checkpoint from the given HF repo.

    Applies the configured text-encoder quantization and, when offload is
    enabled, parks the encoder on CPU until it is needed.
    """
    load_args, quant_args = model_quant.get_dit_args(diffusers_load_config, module='TE', device_map=True)
    shared.log.debug(f'Load model: type=Cosmos te="{repo_id}" quant="{model_quant.get_quant_type(quant_args)}" args={load_args}')
    text_encoder = transformers.T5EncoderModel.from_pretrained(
        repo_id,
        subfolder="text_encoder",
        cache_dir=shared.opts.hfcache_dir,
        **load_args,
        **quant_args,
    )
    if shared.opts.diffusers_offload_mode != 'none' and text_encoder is not None:
        sd_models.move_model(text_encoder, devices.cpu)
    # NOTE: removed dead copy-paste from the HiDream loader which recomputed
    # load/quant args for module='LLM', resolved a llama repo, and logged a
    # misleading 'type=HiDream te4=...' message — none of it was used here
    return text_encoder
|
||||
|
||||
|
||||
def load_cosmos_t2i(checkpoint_info, diffusers_load_config={}):
    """Build a Cosmos2 text-to-image pipeline for the given checkpoint.

    Performs HF login and gated-repo access verification first; returns False
    when access to the repo cannot be confirmed, otherwise the loaded pipeline.
    """
    repo_id = sd_models.path_to_repo(checkpoint_info.name)

    # cosmos repos are gated: verify the token actually grants access
    login = modelloader.hf_login()
    try:
        auth_check(repo_id)
    except Exception as e:
        shared.log.error(f'Load model: repo="{repo_id}" login={login} {e}')
        return False

    # load heavyweight components individually so per-module quantization applies
    transformer = load_transformer(repo_id, diffusers_load_config)
    text_encoder = load_text_encoder(repo_id, diffusers_load_config)
    safety_checker = Fake_safety_checker()

    load_args, _quant_args = model_quant.get_dit_args(diffusers_load_config, module='Model')
    shared.log.debug(f'Load model: type=Cosmos model="{checkpoint_info.name}" repo="{repo_id}" offload={shared.opts.diffusers_offload_mode} dtype={devices.dtype} args={load_args}')

    pipe = diffusers.Cosmos2TextToImagePipeline.from_pretrained(
        repo_id,
        transformer=transformer,
        text_encoder=text_encoder,
        safety_checker=safety_checker,
        cache_dir=shared.opts.diffusers_dir,
        **load_args,
    )

    sd_hijack_te.init_hijack(pipe)

    # pipeline now owns the components; drop our references before gc
    del text_encoder
    del transformer
    devices.torch_gc()
    return pipe
|
||||
|
||||
|
||||
class Fake_safety_checker:
    """Permissive stand-in for the Cosmos guardrail safety checker.

    Marks the guardrail package as available so the pipeline accepts this stub
    instead of loading the real model, and approves all inputs unchanged.
    """

    def __init__(self):
        from diffusers.utils import import_utils
        # pretend the guardrail dependency is installed so pipeline checks pass
        import_utils._cosmos_guardrail_available = True # pylint: disable=protected-access

    def __call__(self, *args, **kwargs): # pylint: disable=unused-argument
        """No-op invocation; the real checker would screen inputs here."""
        return None

    def to(self, _device):
        """No-op device move to satisfy pipeline offload plumbing."""
        return None

    def check_text_safety(self, _prompt):
        """Approve every prompt."""
        return True

    def check_video_safety(self, vid):
        """Pass frames through unmodified."""
        return vid
|
||||
|
|
@ -43,6 +43,8 @@ def get_model_type(pipe):
|
|||
model_type = 'sana'
|
||||
elif "HiDream" in name:
|
||||
model_type = 'h1'
|
||||
elif "Cosmos2TextToImage" in name:
|
||||
model_type = 'cosmos'
|
||||
# video models
|
||||
elif "CogVideo" in name:
|
||||
model_type = 'cogvideo'
|
||||
|
|
|
|||
|
|
@ -298,6 +298,8 @@ def set_pipeline_args(p, model, prompts:list, negative_prompts:list, prompts_2:t
|
|||
args['control_strength'] = p.denoising_strength
|
||||
args['width'] = p.width
|
||||
args['height'] = p.height
|
||||
if 'Cosmos2TextToImagePipeline':
|
||||
kwargs['output_type'] = 'np' # cosmos uses wan-vae which is weird
|
||||
# set callbacks
|
||||
if 'prior_callback_steps' in possible: # Wuerstchen / Cascade
|
||||
args['prior_callback_steps'] = 1
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ def full_vae_decode(latents, model):
|
|||
# normalize latents
|
||||
latents_mean = model.vae.config.get("latents_mean", None)
|
||||
latents_std = model.vae.config.get("latents_std", None)
|
||||
scaling_factor = model.vae.config.get("scaling_factor", None)
|
||||
scaling_factor = model.vae.config.get("scaling_factor", 1.0)
|
||||
shift_factor = model.vae.config.get("shift_factor", None)
|
||||
if latents_mean and latents_std:
|
||||
latents_mean = (torch.tensor(latents_mean).view(1, -1, 1, 1).to(latents.device, latents.dtype))
|
||||
|
|
|
|||
|
|
@ -98,6 +98,8 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
warn(f'Model detected as FLUX UNET model, but attempting to load a base model: {op}={f} size={size} MB')
|
||||
if 'flex.2' in f.lower():
|
||||
guess = 'FLEX'
|
||||
if 'cosmos-predict2' in f.lower():
|
||||
guess = 'Cosmos'
|
||||
# guess for diffusers
|
||||
index = os.path.join(f, 'model_index.json')
|
||||
if os.path.exists(index) and os.path.isfile(index):
|
||||
|
|
@ -113,6 +115,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
guess = 'Stable Diffusion 3'
|
||||
if callable(pipeline) and 'Lumina2' in pipeline.__name__:
|
||||
guess = 'Lumina 2'
|
||||
|
||||
# switch for specific variant
|
||||
if guess == 'Stable Diffusion' and 'inpaint' in f.lower():
|
||||
guess = 'Stable Diffusion Inpaint'
|
||||
|
|
@ -122,6 +125,7 @@ def detect_pipeline(f: str, op: str = 'model', warning=True, quiet=False):
|
|||
guess = 'Stable Diffusion XL Inpaint'
|
||||
elif guess == 'Stable Diffusion XL' and 'instruct' in f.lower():
|
||||
guess = 'Stable Diffusion XL Instruct'
|
||||
|
||||
# get actual pipeline
|
||||
pipeline = shared_items.get_pipelines().get(guess, None) if pipeline is None else pipeline
|
||||
if debug_load is not None:
|
||||
|
|
|
|||
|
|
@ -350,6 +350,9 @@ def load_diffuser_force(model_type, checkpoint_info, diffusers_load_config, op='
|
|||
elif model_type in ['HiDream']:
|
||||
from modules.model_hidream import load_hidream
|
||||
sd_model = load_hidream(checkpoint_info, diffusers_load_config)
|
||||
elif model_type in ['Cosmos']:
|
||||
from modules.model_cosmos import load_cosmos_t2i
|
||||
sd_model = load_cosmos_t2i(checkpoint_info, diffusers_load_config)
|
||||
except Exception as e:
|
||||
shared.log.error(f'Load {op}: path="{checkpoint_info.path}" {e}')
|
||||
if debug_load:
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from modules.timer import process as process_timer
|
|||
|
||||
debug = os.environ.get('SD_MOVE_DEBUG', None) is not None
|
||||
debug_move = log.trace if debug else lambda *args, **kwargs: None
|
||||
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4']
|
||||
offload_warn = ['sc', 'sd3', 'f1', 'h1', 'hunyuandit', 'auraflow', 'omnigen', 'cogview4', 'cosmos']
|
||||
offload_post = ['h1']
|
||||
offload_hook_instance = None
|
||||
balanced_offload_exclude = ['CogView4Pipeline']
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from modules import shared, devices, processing, images, sd_vae_approx, sd_vae_t
|
|||
|
||||
SamplerData = namedtuple('SamplerData', ['name', 'constructor', 'aliases', 'options'])
|
||||
approximation_indexes = { "Simple": 0, "Approximate": 1, "TAESD": 2, "Full VAE": 3 }
|
||||
flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 'lumina2', 'cogview4', 'h1']
|
||||
flow_models = ['f1', 'sd3', 'lumina', 'auraflow', 'sana', 'lumina2', 'cogview4', 'h1', 'cosmos']
|
||||
warned = False
|
||||
queue_lock = threading.Lock()
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ pipelines = {
|
|||
'Amused': getattr(diffusers, 'AmusedPipeline', None),
|
||||
'HiDream': getattr(diffusers, 'HiDreamImagePipeline', None),
|
||||
'OmniGenPipeline': getattr(diffusers, 'OmniGenPipeline', None),
|
||||
'Cosmos': getattr(diffusers, 'Cosmos2TextToImagePipeline', None),
|
||||
|
||||
# dynamically imported and redefined later
|
||||
'Meissonic': getattr(diffusers, 'DiffusionPipeline', None), # dynamically redefined and loaded in sd_models.load_diffuser
|
||||
|
|
|
|||
Loading…
Reference in New Issue