# automatic/modules/caption/models_def.py

from modules import ui_symbols


# Registry of selectable caption / VLM models.
# Maps a display name to the model identifier returned by `get_vlm_repo`.
# - Keys built with `ui_symbols.reasoning` / `ui_symbols.cloud` carry that symbol
#   as a suffix, so the symbol is part of the dictionary key.
# - Most values have the `owner/name` repository form; the `ui_symbols.cloud`
#   entries use bare model names instead (presumably remote API model IDs -- confirm).
# - Trailing `# N GB` notes are presumably approximate model sizes (not verified here).
# NOTE(review): dict insertion order is preserved; presumably this is the order
# shown in the UI -- confirm before reordering entries.
vlm_models = {
    # Gemma 3 / 3n
    "Google Gemma 3 4B": "google/gemma-3-4b-it",
    "Google Gemma 3n E2B": "google/gemma-3n-E2B-it", # 1.5GB
    "Google Gemma 3n E4B": "google/gemma-3n-E4B-it", # 1.5GB
    "Nidum Gemma 3 4B Uncensored": "nidum/Nidum-Gemma-3-4B-it-Uncensored",
    "Allura Gemma 3 Glitter 4B": "allura-org/Gemma-3-Glitter-4B",
    # Qwen3.5
    "Alibaba Qwen 3.5 0.8B": "Qwen/Qwen3.5-0.8B",
    "Alibaba Qwen 3.5 2B": "Qwen/Qwen3.5-2B",
    "Alibaba Qwen 3.5 4B": "Qwen/Qwen3.5-4B",
    "Alibaba Qwen 3.5 9B": "Qwen/Qwen3.5-9B",
    "Alibaba Qwen 3.5 27B": "Qwen/Qwen3.5-27B",
    "Alibaba Qwen 3.5 35B-A3B": "Qwen/Qwen3.5-35B-A3B",
    "Qwen 3.5 27B Heretic": "coder3101/Qwen3.5-27B-heretic",
    # Qwen2 / Qwen2.5
    "Alibaba Qwen 2.0 VL 2B": "Qwen/Qwen2-VL-2B-Instruct",
    "Alibaba Qwen 2.5 Omni 3B": "Qwen/Qwen2.5-Omni-3B",
    "Alibaba Qwen 2.5 VL 3B": "Qwen/Qwen2.5-VL-3B-Instruct",
    # Qwen2.5-VL Finetunes
    "Qwen 2.5 VL 3B Heretic": "coder3101/Qwen2.5-VL-3B-Instruct-heretic",
    "Qwen 2.5 VL 7B Heretic": "coder3101/Qwen2.5-VL-7B-Instruct-heretic",
    "Qwen 2.5 VL 32B Heretic": "coder3101/Qwen2.5-VL-32B-Instruct-heretic",
    "Qwen 2.5 VL 72B Heretic": "coder3101/Qwen2.5-VL-72B-Instruct-heretic",
    # Qwen3-VL (instruct and "Thinking" variants)
    "Alibaba Qwen 3 VL 2B": "Qwen/Qwen3-VL-2B-Instruct",
    f"Alibaba Qwen 3 VL 2B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-2B-Thinking",
    "Alibaba Qwen 3 VL 4B": "Qwen/Qwen3-VL-4B-Instruct",
    f"Alibaba Qwen 3 VL 4B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-4B-Thinking",
    "Alibaba Qwen 3 VL 8B": "Qwen/Qwen3-VL-8B-Instruct",
    f"Alibaba Qwen 3 VL 8B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-8B-Thinking",
    # Qwen3-VL Finetunes
    "Qwen 3 VL 2B Heretic": "coder3101/Qwen3-VL-2B-Instruct-heretic",
    f"Qwen 3 VL 2B Thinking Heretic {ui_symbols.reasoning}": "coder3101/Qwen3-VL-2B-Thinking-heretic",
    "Qwen 3 VL 4B Heretic": "coder3101/Qwen3-VL-4B-Instruct-heretic",
    f"Qwen 3 VL 4B Thinking Heretic {ui_symbols.reasoning}": "coder3101/Qwen3-VL-4B-Thinking-heretic",
    "Qwen 3 VL 8B Heretic": "coder3101/Qwen3-VL-8B-Instruct-heretic",
    "Qwen 3 VL 32B Heretic v2": "coder3101/Qwen3-VL-32B-Instruct-heretic-v2",
    f"Qwen 3 VL 32B Thinking Heretic v2 {ui_symbols.reasoning}": "coder3101/Qwen3-VL-32B-Thinking-heretic-v2",
    "Qwen 3 VL 8B Abliterated Caption": "prithivMLmods/Qwen3-VL-8B-Abliterated-Caption-it",
    # Other model families
    "XiaomiMiMo MiMo VL 7B RL": "XiaomiMiMo/MiMo-VL-7B-RL-2508", # 8.3GB
    "Huggingface Smol VL2 0.5B": "HuggingFaceTB/SmolVLM-500M-Instruct",
    "Huggingface Smol VL2 2B": "HuggingFaceTB/SmolVLM-Instruct",
    "Apple FastVLM 0.5B": "apple/FastVLM-0.5B",
    "Apple FastVLM 1.5B": "apple/FastVLM-1.5B",
    "Apple FastVLM 7B": "apple/FastVLM-7B",
    # Florence-2 and Florence-2 based fine-tunes (PromptGen, CogFlorence)
    "Microsoft Florence 2 Base": "florence-community/Florence-2-base-ft", # 0.5GB
    "Microsoft Florence 2 Large": "florence-community/Florence-2-large-ft", # 1.5GB
    "MiaoshouAI PromptGen 1.5 Base": "Disty0/Florence-2-base-PromptGen-v1.5", # 0.5GB
    "MiaoshouAI PromptGen 1.5 Large": "Disty0/Florence-2-large-PromptGen-v1.5", # 1.5GB
    "MiaoshouAI PromptGen 2.0 Base": "Disty0/Florence-2-base-PromptGen-v2.0", # 0.5GB
    "MiaoshouAI PromptGen 2.0 Large": "Disty0/Florence-2-large-PromptGen-v2.0", # 1.5GB
    "CogFlorence 2.0 Large": "thwri/CogFlorence-2-Large-Freeze", # 1.6GB
    "CogFlorence 2.2 Large": "thwri/CogFlorence-2.2-Large", # 1.6GB
    # Moondream
    f"Moondream 2 {ui_symbols.reasoning}": "vikhyatk/moondream2", # 3.7GB
    f"Moondream 3 Preview {ui_symbols.reasoning}": "moondream/moondream3-preview", # 9.3GB (gated)
    "Google Pix Textcaps": "google/pix2struct-textcaps-base", # 1.1GB
    "Google PaliGemma 2 3B": "google/paligemma2-3b-pt-224",
    "Salesforce BLIP Base": "Salesforce/blip-vqa-base", # 1.5GB
    "Salesforce BLIP Large": "Salesforce/blip-vqa-capfilt-large", # 1.5GB
    "Microsoft GIT TextCaps Base": "microsoft/git-base-textcaps", # 0.7GB
    "Microsoft GIT VQA Base": "microsoft/git-base-vqav2", # 0.7GB
    "Microsoft GIT VQA Large": "microsoft/git-large-vqav2", # 1.6GB
    "ToriiGate 0.4 2B": "Minthy/ToriiGate-v0.4-2B",
    "ToriiGate 0.4 7B": "Minthy/ToriiGate-v0.4-7B",
    "ViLT Base": "dandelin/vilt-b32-finetuned-vqa", # 0.5GB
    "JoyCaption Alpha": "fancyfeast/llama-joycaption-alpha-two-hf-llava", # 17.4GB
    "JoyCaption Beta": "fancyfeast/llama-joycaption-beta-one-hf-llava", # 17.4GB
    "JoyTag": "fancyfeast/joytag", # 0.7GB
    "AIDC Ovis2 1B": "AIDC-AI/Ovis2-1B",
    "AIDC Ovis2 2B": "AIDC-AI/Ovis2-2B",
    "AIDC Ovis2 4B": "AIDC-AI/Ovis2-4B",
    "ByteDance Sa2VA 1B": "ByteDance/Sa2VA-1B",
    "ByteDance Sa2VA 4B": "ByteDance/Sa2VA-4B",
    # Mistral Finetunes
    "Mistral Small 3.2 24B Heretic": "coder3101/Mistral-Small-3.2-24B-Instruct-2506-heretic",
    # Google Gemini (keys suffixed with ui_symbols.cloud; values are bare model names)
    f"Google Gemini 3.1 Pro {ui_symbols.cloud}": "gemini-3.1-pro-preview",
    f"Google Gemini 3.1 Flash Lite {ui_symbols.cloud}": "gemini-3.1-flash-lite-preview",
    f"Google Gemini 3.0 Flash {ui_symbols.cloud}": "gemini-3-flash-preview",
    f"Google Gemini 2.5 Pro {ui_symbols.cloud}": "gemini-2.5-pro",
    f"Google Gemini 2.5 Flash {ui_symbols.cloud}": "gemini-2.5-flash",
}

# Default model: a display name, i.e. a key of `vlm_models` (not a repository ID).
# Keep it in sync with that dictionary if the entry is ever renamed.
vlm_default = "Alibaba Qwen 2.5 VL 3B"

# Default system prompt
# NOTE(review): presumably sent to the model as its system message -- confirm against caller.
vlm_system = 'You are image captioning expert, creative, unbiased and uncensored.'

# Prompt names are grouped by which model family supports them. Every name listed
# here also appears as a key in `vlm_prompt_mapping` and `vlm_prompt_placeholders`.

# Common prompts (work with all VLM models)
vlm_prompts_common = [
    "Use Prompt",
    "Short Caption",
    "Normal Caption",
    "Long Caption",
]

# Florence-2 base prompts (supported by all Florence models including CogFlorence)
vlm_prompts_florence = [
    "Phrase Grounding",
    "Object Detection",
    "Dense Region Caption",
    "Region Proposal",
    "OCR (Read Text)",
    "OCR with Regions",
]

# PromptGen-only prompts (require MiaoshouAI PromptGen fine-tune)
vlm_prompts_promptgen = [
    "Analyze",
    "Generate Tags",
    "Mixed Caption",
    "Mixed Caption+",
]

# Moondream-specific prompts (shared by Moondream 2 and 3)
vlm_prompts_moondream = [
    "Point at...",
    "Detect all...",
]

# Moondream 2 only prompts (gaze detection not available in Moondream 3)
vlm_prompts_moondream2 = [
    "Detect Gaze",
]

# Mapping from friendly prompt names to internal tokens/commands.
# - "Use Prompt" maps to itself.
# - Caption, Florence and PromptGen prompts map to angle-bracket task tokens
#   (presumably the Florence-2 task-token format -- confirm against the model code).
# - Moondream prompts map to plain upper-case command strings.
vlm_prompt_mapping = {
    "Use Prompt": "Use Prompt",
    "Short Caption": "<CAPTION>",
    "Normal Caption": "<DETAILED_CAPTION>",
    "Long Caption": "<MORE_DETAILED_CAPTION>",
    "Phrase Grounding": "<CAPTION_TO_PHRASE_GROUNDING>",
    "Object Detection": "<OD>",
    "Dense Region Caption": "<DENSE_REGION_CAPTION>",
    "Region Proposal": "<REGION_PROPOSAL>",
    "OCR (Read Text)": "<OCR>",
    "OCR with Regions": "<OCR_WITH_REGION>",
    "Analyze": "<ANALYZE>",
    "Generate Tags": "<GENERATE_TAGS>",
    "Mixed Caption": "<MIXED_CAPTION>",
    "Mixed Caption+": "<MIXED_CAPTION_PLUS>",
    "Point at...": "POINT_MODE",
    "Detect all...": "DETECT_MODE",
    "Detect Gaze": "DETECT_GAZE",
}

# Placeholder hints for the prompt field, keyed by the selected prompt name.
# Uses the same keys as `vlm_prompt_mapping`; values are human-readable hint text.
vlm_prompt_placeholders = {
    "Use Prompt": "Enter your question or instruction for the model",
    "Short Caption": "Optional: add specific focus or style instructions",
    "Normal Caption": "Optional: add specific focus or style instructions",
    "Long Caption": "Optional: add specific focus or style instructions",
    "Phrase Grounding": "Optional: specify phrases to ground in the image",
    "Object Detection": "Optional: specify object types to detect",
    "Dense Region Caption": "Optional: add specific instructions",
    "Region Proposal": "Optional: add specific instructions",
    "OCR (Read Text)": "Optional: add specific instructions",
    "OCR with Regions": "Optional: add specific instructions",
    "Analyze": "Optional: add specific analysis instructions",
    "Generate Tags": "Optional: add specific tagging instructions",
    "Mixed Caption": "Optional: add specific instructions",
    "Mixed Caption+": "Optional: add specific instructions",
    "Point at...": "Enter objects to locate, e.g., 'the red car' or 'all the eyes'",
    "Detect all...": "Enter object type to detect, e.g., 'cars' or 'faces'",
    "Detect Gaze": "No input needed - auto-detects face and gaze direction",
}

# Legacy combined prompt list, kept for backwards compatibility.
# Order: common, Florence-2, PromptGen, Moondream (shared), Moondream 2 only.
vlm_prompts = [
    *vlm_prompts_common,
    *vlm_prompts_florence,
    *vlm_prompts_promptgen,
    *vlm_prompts_moondream,
    *vlm_prompts_moondream2,
]

# NOTE(review): presumably text used to prefill the start of the model's answer -- confirm against caller.
vlm_prefill = 'Answer: the image shows'


def get_vlm_repo(display_name: str) -> str:
    """Look up the repo ID for a model display name.

    Resolution order:

    1. Exact match of the whitespace-stripped name against ``vlm_models``.
    2. Match ignoring trailing non-alphanumeric characters on both sides, so a
       name given without (or with a different) trailing UI symbol still
       resolves, e.g. ``"Moondream 2"`` finds the ``"Moondream 2 <symbol>"`` key.
    3. Otherwise the stripped name is returned unchanged, which lets callers
       pass a repository ID directly.

    Args:
        display_name: Display name as listed in ``vlm_models``, or a repo ID.

    Returns:
        The mapped repo ID, or the stripped input when nothing matches.
    """
    name = display_name.strip()

    # Fast path: the name is exactly one of the registered keys.
    exact = vlm_models.get(name)
    if exact is not None:
        return exact

    def _without_trailing_symbols(label: str) -> str:
        # Drop trailing characters that are neither letters nor digits
        # (whitespace, emoji, punctuation); the rest of the label is untouched.
        end = len(label)
        while end > 0 and not label[end - 1].isalnum():
            end -= 1
        return label[:end]

    wanted = _without_trailing_symbols(name)
    if wanted:  # an empty or symbols-only name must not match anything
        for label, repo in vlm_models.items():
            if _without_trailing_symbols(label) == wanted:
                return repo

    # Unknown name: hand it back as-is, as the original lookup did.
    return name