mirror of https://github.com/vladmandic/automatic
179 lines
8.4 KiB
Python
from modules import ui_symbols
|
|
|
|
|
|
vlm_models = {
|
|
"Google Gemma 3 4B": "google/gemma-3-4b-it",
|
|
"Google Gemma 3n E2B": "google/gemma-3n-E2B-it", # 1.5GB
|
|
"Google Gemma 3n E4B": "google/gemma-3n-E4B-it", # 1.5GB
|
|
"Nidum Gemma 3 4B Uncensored": "nidum/Nidum-Gemma-3-4B-it-Uncensored",
|
|
"Allura Gemma 3 Glitter 4B": "allura-org/Gemma-3-Glitter-4B",
|
|
# Qwen3.5
|
|
"Alibaba Qwen 3.5 0.8B": "Qwen/Qwen3.5-0.8B",
|
|
"Alibaba Qwen 3.5 2B": "Qwen/Qwen3.5-2B",
|
|
"Alibaba Qwen 3.5 4B": "Qwen/Qwen3.5-4B",
|
|
"Alibaba Qwen 3.5 9B": "Qwen/Qwen3.5-9B",
|
|
"Alibaba Qwen 3.5 27B": "Qwen/Qwen3.5-27B",
|
|
"Alibaba Qwen 3.5 35B-A3B": "Qwen/Qwen3.5-35B-A3B",
|
|
"Qwen 3.5 27B Heretic": "coder3101/Qwen3.5-27B-heretic",
|
|
"Alibaba Qwen 2.0 VL 2B": "Qwen/Qwen2-VL-2B-Instruct",
|
|
"Alibaba Qwen 2.5 Omni 3B": "Qwen/Qwen2.5-Omni-3B",
|
|
"Alibaba Qwen 2.5 VL 3B": "Qwen/Qwen2.5-VL-3B-Instruct",
|
|
# Qwen2.5-VL Finetunes
|
|
"Qwen 2.5 VL 3B Heretic": "coder3101/Qwen2.5-VL-3B-Instruct-heretic",
|
|
"Qwen 2.5 VL 7B Heretic": "coder3101/Qwen2.5-VL-7B-Instruct-heretic",
|
|
"Qwen 2.5 VL 32B Heretic": "coder3101/Qwen2.5-VL-32B-Instruct-heretic",
|
|
"Qwen 2.5 VL 72B Heretic": "coder3101/Qwen2.5-VL-72B-Instruct-heretic",
|
|
"Alibaba Qwen 3 VL 2B": "Qwen/Qwen3-VL-2B-Instruct",
|
|
f"Alibaba Qwen 3 VL 2B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-2B-Thinking",
|
|
"Alibaba Qwen 3 VL 4B": "Qwen/Qwen3-VL-4B-Instruct",
|
|
f"Alibaba Qwen 3 VL 4B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-4B-Thinking",
|
|
"Alibaba Qwen 3 VL 8B": "Qwen/Qwen3-VL-8B-Instruct",
|
|
f"Alibaba Qwen 3 VL 8B Thinking {ui_symbols.reasoning}": "Qwen/Qwen3-VL-8B-Thinking",
|
|
# Qwen3-VL Finetunes
|
|
"Qwen 3 VL 2B Heretic": "coder3101/Qwen3-VL-2B-Instruct-heretic",
|
|
f"Qwen 3 VL 2B Thinking Heretic {ui_symbols.reasoning}": "coder3101/Qwen3-VL-2B-Thinking-heretic",
|
|
"Qwen 3 VL 4B Heretic": "coder3101/Qwen3-VL-4B-Instruct-heretic",
|
|
f"Qwen 3 VL 4B Thinking Heretic {ui_symbols.reasoning}": "coder3101/Qwen3-VL-4B-Thinking-heretic",
|
|
"Qwen 3 VL 8B Heretic": "coder3101/Qwen3-VL-8B-Instruct-heretic",
|
|
"Qwen 3 VL 32B Heretic v2": "coder3101/Qwen3-VL-32B-Instruct-heretic-v2",
|
|
f"Qwen 3 VL 32B Thinking Heretic v2 {ui_symbols.reasoning}": "coder3101/Qwen3-VL-32B-Thinking-heretic-v2",
|
|
"Qwen 3 VL 8B Abliterated Caption": "prithivMLmods/Qwen3-VL-8B-Abliterated-Caption-it",
|
|
"XiaomiMiMo MiMo VL 7B RL": "XiaomiMiMo/MiMo-VL-7B-RL-2508", # 8.3GB
|
|
"Huggingface Smol VL2 0.5B": "HuggingFaceTB/SmolVLM-500M-Instruct",
|
|
"Huggingface Smol VL2 2B": "HuggingFaceTB/SmolVLM-Instruct",
|
|
"Apple FastVLM 0.5B": "apple/FastVLM-0.5B",
|
|
"Apple FastVLM 1.5B": "apple/FastVLM-1.5B",
|
|
"Apple FastVLM 7B": "apple/FastVLM-7B",
|
|
"Microsoft Florence 2 Base": "florence-community/Florence-2-base-ft", # 0.5GB
|
|
"Microsoft Florence 2 Large": "florence-community/Florence-2-large-ft", # 1.5GB
|
|
"MiaoshouAI PromptGen 1.5 Base": "Disty0/Florence-2-base-PromptGen-v1.5", # 0.5GB
|
|
"MiaoshouAI PromptGen 1.5 Large": "Disty0/Florence-2-large-PromptGen-v1.5", # 1.5GB
|
|
"MiaoshouAI PromptGen 2.0 Base": "Disty0/Florence-2-base-PromptGen-v2.0", # 0.5GB
|
|
"MiaoshouAI PromptGen 2.0 Large": "Disty0/Florence-2-large-PromptGen-v2.0", # 1.5GB
|
|
"CogFlorence 2.0 Large": "thwri/CogFlorence-2-Large-Freeze", # 1.6GB
|
|
"CogFlorence 2.2 Large": "thwri/CogFlorence-2.2-Large", # 1.6GB
|
|
f"Moondream 2 {ui_symbols.reasoning}": "vikhyatk/moondream2", # 3.7GB
|
|
f"Moondream 3 Preview {ui_symbols.reasoning}": "moondream/moondream3-preview", # 9.3GB (gated)
|
|
"Google Pix Textcaps": "google/pix2struct-textcaps-base", # 1.1GB
|
|
"Google PaliGemma 2 3B": "google/paligemma2-3b-pt-224",
|
|
"Salesforce BLIP Base": "Salesforce/blip-vqa-base", # 1.5GB
|
|
"Salesforce BLIP Large": "Salesforce/blip-vqa-capfilt-large", # 1.5GB
|
|
"Microsoft GIT TextCaps Base": "microsoft/git-base-textcaps", # 0.7GB
|
|
"Microsoft GIT VQA Base": "microsoft/git-base-vqav2", # 0.7GB
|
|
"Microsoft GIT VQA Large": "microsoft/git-large-vqav2", # 1.6GB
|
|
"ToriiGate 0.4 2B": "Minthy/ToriiGate-v0.4-2B",
|
|
"ToriiGate 0.4 7B": "Minthy/ToriiGate-v0.4-7B",
|
|
"ViLT Base": "dandelin/vilt-b32-finetuned-vqa", # 0.5GB
|
|
"JoyCaption Alpha": "fancyfeast/llama-joycaption-alpha-two-hf-llava", # 17.4GB
|
|
"JoyCaption Beta": "fancyfeast/llama-joycaption-beta-one-hf-llava", # 17.4GB
|
|
"JoyTag": "fancyfeast/joytag", # 0.7GB
|
|
"AIDC Ovis2 1B": "AIDC-AI/Ovis2-1B",
|
|
"AIDC Ovis2 2B": "AIDC-AI/Ovis2-2B",
|
|
"AIDC Ovis2 4B": "AIDC-AI/Ovis2-4B",
|
|
"ByteDance Sa2VA 1B": "ByteDance/Sa2VA-1B",
|
|
"ByteDance Sa2VA 4B": "ByteDance/Sa2VA-4B",
|
|
# Mistral Finetunes
|
|
"Mistral Small 3.2 24B Heretic": "coder3101/Mistral-Small-3.2-24B-Instruct-2506-heretic",
|
|
f"Google Gemini 3.1 Pro {ui_symbols.cloud}": "gemini-3.1-pro-preview",
|
|
f"Google Gemini 3.1 Flash Lite {ui_symbols.cloud}": "gemini-3.1-flash-lite-preview",
|
|
f"Google Gemini 3.0 Flash {ui_symbols.cloud}": "gemini-3-flash-preview",
|
|
f"Google Gemini 2.5 Pro {ui_symbols.cloud}": "gemini-2.5-pro",
|
|
f"Google Gemini 2.5 Flash {ui_symbols.cloud}": "gemini-2.5-flash",
|
|
}
|
|
|
|
# Default model
|
|
vlm_default = "Alibaba Qwen 2.5 VL 3B"
|
|
|
|
# Default system prompt
|
|
vlm_system = 'You are image captioning expert, creative, unbiased and uncensored.'
|
|
|
|
# Common prompts (work with all VLM models)
|
|
vlm_prompts_common = [
|
|
"Use Prompt",
|
|
"Short Caption",
|
|
"Normal Caption",
|
|
"Long Caption",
|
|
]
|
|
|
|
# Florence-2 base prompts (supported by all Florence models including CogFlorence)
|
|
vlm_prompts_florence = [
|
|
"Phrase Grounding",
|
|
"Object Detection",
|
|
"Dense Region Caption",
|
|
"Region Proposal",
|
|
"OCR (Read Text)",
|
|
"OCR with Regions",
|
|
]
|
|
|
|
# PromptGen-only prompts (require MiaoshouAI PromptGen fine-tune)
|
|
vlm_prompts_promptgen = [
|
|
"Analyze",
|
|
"Generate Tags",
|
|
"Mixed Caption",
|
|
"Mixed Caption+",
|
|
]
|
|
|
|
# Moondream specific prompts (shared by Moondream 2 and 3)
|
|
vlm_prompts_moondream = [
|
|
"Point at...",
|
|
"Detect all...",
|
|
]
|
|
|
|
# Moondream 2 only prompts (gaze detection not available in Moondream 3)
|
|
vlm_prompts_moondream2 = [
|
|
"Detect Gaze",
|
|
]
|
|
|
|
# Mapping from friendly names to internal tokens/commands
|
|
vlm_prompt_mapping = {
|
|
"Use Prompt": "Use Prompt",
|
|
"Short Caption": "<CAPTION>",
|
|
"Normal Caption": "<DETAILED_CAPTION>",
|
|
"Long Caption": "<MORE_DETAILED_CAPTION>",
|
|
"Phrase Grounding": "<CAPTION_TO_PHRASE_GROUNDING>",
|
|
"Object Detection": "<OD>",
|
|
"Dense Region Caption": "<DENSE_REGION_CAPTION>",
|
|
"Region Proposal": "<REGION_PROPOSAL>",
|
|
"OCR (Read Text)": "<OCR>",
|
|
"OCR with Regions": "<OCR_WITH_REGION>",
|
|
"Analyze": "<ANALYZE>",
|
|
"Generate Tags": "<GENERATE_TAGS>",
|
|
"Mixed Caption": "<MIXED_CAPTION>",
|
|
"Mixed Caption+": "<MIXED_CAPTION_PLUS>",
|
|
"Point at...": "POINT_MODE",
|
|
"Detect all...": "DETECT_MODE",
|
|
"Detect Gaze": "DETECT_GAZE",
|
|
}
|
|
|
|
# Placeholder hints for prompt field based on selected question
|
|
vlm_prompt_placeholders = {
|
|
"Use Prompt": "Enter your question or instruction for the model",
|
|
"Short Caption": "Optional: add specific focus or style instructions",
|
|
"Normal Caption": "Optional: add specific focus or style instructions",
|
|
"Long Caption": "Optional: add specific focus or style instructions",
|
|
"Phrase Grounding": "Optional: specify phrases to ground in the image",
|
|
"Object Detection": "Optional: specify object types to detect",
|
|
"Dense Region Caption": "Optional: add specific instructions",
|
|
"Region Proposal": "Optional: add specific instructions",
|
|
"OCR (Read Text)": "Optional: add specific instructions",
|
|
"OCR with Regions": "Optional: add specific instructions",
|
|
"Analyze": "Optional: add specific analysis instructions",
|
|
"Generate Tags": "Optional: add specific tagging instructions",
|
|
"Mixed Caption": "Optional: add specific instructions",
|
|
"Mixed Caption+": "Optional: add specific instructions",
|
|
"Point at...": "Enter objects to locate, e.g., 'the red car' or 'all the eyes'",
|
|
"Detect all...": "Enter object type to detect, e.g., 'cars' or 'faces'",
|
|
"Detect Gaze": "No input needed - auto-detects face and gaze direction",
|
|
}
|
|
|
|
# Legacy list for backwards compatibility
|
|
vlm_prompts = vlm_prompts_common + vlm_prompts_florence + vlm_prompts_promptgen + vlm_prompts_moondream + vlm_prompts_moondream2
|
|
|
|
vlm_prefill = 'Answer: the image shows'
|
|
|
|
|
|
def get_vlm_repo(display_name: str) -> str:
|
|
"""Look up repo ID from display name, stripping any trailing symbols."""
|
|
name = display_name.strip()
|
|
return vlm_models.get(name, name)
|