From 6b89cc846333de7b8a56dbe7579566abdaef79a2 Mon Sep 17 00:00:00 2001 From: CalamitousFelicitousness Date: Sun, 25 Jan 2026 00:41:07 +0000 Subject: [PATCH] feat(ui): add tooltips/hints to Caption tab Add comprehensive tooltips to Caption tab UI elements in locale_en.json: - Add new "llm" section for shared LLM/VLM parameters: System prompt, Prefill, Top-K, Top-P, Temperature, Num Beams, Use Samplers, Thinking Mode, Keep Thinking Trace, Keep Prefill - Add new "caption" section for caption-specific settings: VLM, OpenCLiP, Tagger tab labels and all their parameters including thresholds, tag formatting, batch options - Consolidate accordion labels in ui_caption.py: "Caption: Advanced Options" and "Caption: Batch" shared across VLM, OpenCLiP, and Tagger tabs (localized to "Advanced Options" and "Batch" in UI) - Remove duplicate entries from missing section --- html/locale_en.json | 96 ++++++++++++++++++++++++------------------- modules/ui_caption.py | 12 +++--- 2 files changed, 59 insertions(+), 49 deletions(-) diff --git a/html/locale_en.json b/html/locale_en.json index 599bf7de5..182edab18 100644 --- a/html/locale_en.json +++ b/html/locale_en.json @@ -1,4 +1,5 @@ -{"icons": [ +{ +"icons": [ {"id":"","label":"🎲️","localized":"","reload":"","hint":"Use random seed"}, {"id":"","label":"🔄","localized":"","reload":"","hint":"Reset values"}, {"id":"","label":"⬆️","localized":"","reload":"","hint":"Upload image"}, @@ -33,17 +34,9 @@ ], "main": [ {"id":"","label":"Prompt","localized":"","reload":"","hint":"Describe image you want to generate"}, - {"id":"","label":"VLM: Prompt","localized":"Prompt","reload":"","hint":"Enter your prompt/question here."}, - {"id":"","label":"VLM: Advanced Options","localized":"Advanced Options","reload":"","hint":"Advanced configuration options for the VLM model."}, - {"id":"","label":"VLM: Batch Caption","localized":"Batch Caption","reload":"","hint":"Process multiple images in a batch using VLM."}, - {"id":"","label":"CLiP: Advanced 
Options","localized":"Advanced Options","reload":"","hint":"Advanced configuration options for CLiP interrogation."}, - {"id":"","label":"CLiP: Batch Interrogate","localized":"Batch Interrogate","reload":"","hint":"Process multiple images in a batch using CLiP."}, - {"id":"","label":"Task","localized":"","reload":"","hint":"Changes which task the model will perform. Regular text prompts can be used when the task is set to Use Prompt.
When other options are selected, see the hint text inside an empty Prompt field for guidance."}, - {"id":"","label":"Prefill text","localized":"","reload":"","hint":"Pre-fills the start of the model's response to guide its output format or content by forcing it to continue the prefill text.
Prefill is filtered out and does not appear in the final response.

Leave empty to let the model generate its own response from scratch."}, {"id":"","label":"Start","localized":"","reload":"","hint":"Start"}, {"id":"","label":"End","localized":"","reload":"","hint":"End"}, {"id":"","label":"Core","localized":"","reload":"","hint":"Core settings"}, - {"id":"","label":"System prompt","localized":"","reload":"","hint":"System prompt controls behavior of the LLM. Processed first and persists throughout conversation. Has highest priority weighting and is always appended at the beginning of the sequence.

Use for: Response formatting rules, role definition, style."}, {"id":"","label":"Negative prompt","localized":"","reload":"","hint":"Describe what you don't want to see in generated image"}, {"id":"","label":"Text","localized":"","reload":"","hint":"Create image from text"}, {"id":"","label":"Image","localized":"","reload":"","hint":"Create image from image"}, @@ -52,15 +45,6 @@ {"id":"","label":"T2I","localized":"","reload":"","hint":"Create image from text
Legacy interface that mimics original text-to-image interface and behavior"}, {"id":"","label":"I2I","localized":"","reload":"","hint":"Create image from image
Legacy interface that mimics original image-to-image interface and behavior"}, {"id":"","label":"Process","localized":"","reload":"","hint":"Process existing image
Can be used to upscale images, remove backgrounds, obfuscate NSFW content, apply various filters and effects"}, - {"id":"","label":"Caption","localized":"","reload":"","hint":"Analyze existing images and create text descriptions"}, - {"id":"","label":"clip: min length","localized":"Min Length","reload":"","hint":"Minimum number of tokens in the generated caption."}, - {"id":"","label":"clip: max length","localized":"Max Length","reload":"","hint":"Maximum number of tokens in the generated caption."}, - {"id":"","label":"clip: chunk size","localized":"Chunk Size","reload":"","hint":"Batch size for processing description candidates (flavors). Higher values speed up interrogation but increase VRAM usage."}, - {"id":"","label":"clip: min flavors","localized":"Min Flavors","reload":"","hint":"Minimum number of descriptive tags (flavors) to keep in the final prompt."}, - {"id":"","label":"clip: max flavors","localized":"Max Flavors","reload":"","hint":"Maximum number of descriptive tags (flavors) to keep in the final prompt."}, - {"id":"","label":"clip: intermediates","localized":"Intermediates","reload":"","hint":"Size of the intermediate candidate pool when matching image features to descriptive tags (flavours). From this pool, the final tags are selected based on Min/Max Flavors. Higher values may improve quality but are slower."}, - {"id":"","label":"clip: num beams","localized":"CLiP Num Beams","reload":"","hint":"Number of beams for beam search during caption generation. Higher values search more possibilities but are slower."}, - {"id":"","label":"Interrogate","localized":"","reload":"","hint":"Run interrogate to get description of your image"}, {"id":"","label":"Models","localized":"","reload":"","hint":"Download, convert or merge your models and manage models metadata"}, {"id":"","label":"Sampler","localized":"","reload":"","hint":"Settings related to sampler and seed selection and configuration. 
Samplers guide the process of turning noise into an image over multiple steps."}, {"id":"","label":"Agent Scheduler","localized":"","reload":"","hint":"Enqueue your generate requests and run them in the background"}, @@ -89,8 +73,6 @@ {"id":"","label":"Lora","localized":"","reload":"","hint":"LoRA: Low-Rank Adaptation. Fine-tuned model that is applied on top of a loaded model"}, {"id":"","label":"Embedding","localized":"","reload":"","hint":"Textual inversion embedding is a trained embedded information about the subject"}, {"id":"","label":"Hypernetwork","localized":"","reload":"","hint":"Small trained neural network that modifies behavior of the loaded model"}, - {"id":"","label":"VLM Caption","localized":"","reload":"","hint":"Analyze image using vision langugage model"}, - {"id":"","label":"OpenCLiP","localized":"","reload":"","hint":"Analyze image using CLiP model via OpenCLiP"}, {"id":"","label":"VAE","localized":"","reload":"","hint":"Variational Auto Encoder: model used to run image decode at the end of generate"}, {"id":"","label":"History","localized":"","reload":"","hint":"List of previous generations that can be further reprocessed"}, {"id":"","label":"UI disable variable aspect ratio","localized":"","reload":"","hint":"When disabled, all thumbnails appear as squared images"}, @@ -149,6 +131,55 @@ {"id":"","label":"➠ Control","localized":"","reload":"","hint":"Transfer image to control interface"}, {"id":"","label":"➠ Caption","localized":"","reload":"","hint":"Transfer image to caption interface"} ], +"llm": [ + {"id":"","label":"System prompt","localized":"","reload":"","hint":"System prompt controls behavior of the LLM. Processed first and persists throughout conversation. Has highest priority weighting and is always appended at the beginning of the sequence.

Use for: Response formatting rules, role definition, style."}, + {"id":"","label":"Prefill text","localized":"","reload":"","hint":"Pre-fills the start of the model's response to guide its output format or content by forcing it to continue the prefill text.
Prefill is filtered out and does not appear in the final response.

Leave empty to let the model generate its own response from scratch."}, + {"id":"","label":"VLM Max tokens","localized":"","reload":"","hint":"Maximum number of tokens the model can generate in its response.
The model is not aware of this limit during generation and it won't make the model try to generate more detailed or more concise responses, it simply sets the hard limit for the length, and will forcefully cut off the response when the limit is reached."}, + {"id":"","label":"VLM Temperature","localized":"","reload":"","hint":"Controls randomness in token selection. Lower values (e.g., 0.1) make outputs more focused and deterministic, always choosing high-probability tokens.
Higher values (e.g., 0.9) increase creativity and diversity by allowing less probable tokens.

Set to 0 for fully deterministic output (always picks the most likely token)."}, + {"id":"","label":"VLM Num Beams","localized":"","reload":"","hint":"Maintains multiple candidate paths simultaneously and selects the overall best sequence.
Like exploring several drafts at once to find the best one. More thorough but much slower and less creative than random sampling.
Generally not recommended; most modern VLMs perform better with sampling methods.
Set to 1 to disable."}, + {"id":"","label":"Top-K","localized":"","reload":"","hint":"Limits token selection to the K most likely candidates at each step.
Lower values (e.g., 40) make outputs more focused and predictable, while higher values allow more diverse choices.
Set to 0 to disable."}, + {"id":"","label":"Top-P","localized":"","reload":"","hint":"Selects tokens from the smallest set whose cumulative probability exceeds P (e.g., 0.9).
Dynamically adapts the number of candidates based on model confidence; fewer options when certain, more when uncertain.
Set to 1 to disable."}, + {"id":"","label":"Use Samplers","localized":"","reload":"","hint":"Enable to use sampling (randomly selecting tokens based on sampling methods like Top-k or Top-p) or disable to use greedy decoding (selecting the most probable token at each step).
Enabling makes outputs more diverse and creative but less deterministic."}, + {"id":"","label":"Thinking Mode","localized":"","reload":"","hint":"Enables thinking/reasoning, allowing the model to take more time to generate responses.
This can lead to more thoughtful and detailed answers, but will increase response time.
This setting affects both hybrid and thinking-only models, and in some models may result in lower overall quality than expected. For thinking-only models like Qwen3-VL this setting might have to be combined with prefill to reliably prevent thinking.

Models supporting this feature are marked with an  icon."}, + {"id":"","label":"Keep Thinking Trace","localized":"","reload":"","hint":"Include the model's reasoning process in the final output.
Useful for understanding how the model arrived at its answer.
Only works with models that support thinking mode."}, + {"id":"","label":"Keep Prefill","localized":"","reload":"","hint":"Include the prefill text at the beginning of the final output.
If disabled, the prefill text used to guide the model is removed from the result."} +], +"caption": [ + {"id":"","label":"Caption","localized":"","reload":"","hint":"Analyze existing images and create text descriptions"}, + {"id":"","label":"VLM Caption","localized":"","reload":"","hint":"Analyze image using vision language model"}, + {"id":"","label":"OpenCLiP","localized":"","reload":"","hint":"Analyze image using CLiP model via OpenCLiP"}, + {"id":"","label":"Tagger","localized":"","reload":"","hint":"Tag images using anime-focused classification models like WaifuDiffusion or DeepBooru."}, + {"id":"","label":"Interrogate","localized":"","reload":"","hint":"Run interrogate to get description of your image"}, + {"id":"","label":"Caption: Advanced Options","localized":"Advanced Options","reload":"","hint":"Advanced configuration options for captioning models including sampling parameters, output formatting, and model-specific settings."}, + {"id":"","label":"Caption: Batch","localized":"Batch","reload":"","hint":"Process multiple images in a batch.
Select files directly or specify a folder path to process all images within."}, + {"id":"","label":"Default Caption Type","localized":"","reload":"","hint":"Default captioning method to use when clicking the main Interrogate button.
VLM: Vision-Language Model for detailed natural language descriptions.
OpenCLiP: CLIP-based analysis with style and flavor terms.
Tagger: Anime-style tags using WaifuDiffusion or DeepBooru models."}, + {"id":"","label":"VLM: Prompt","localized":"Prompt","reload":"","hint":"Enter your prompt/question here."}, + {"id":"","label":"vlm model","localized":"","reload":"","hint":"Select which model to use for Visual Language tasks.

Models which support thinking mode are marked with an  icon."}, + {"id":"","label":"Task","localized":"","reload":"","hint":"Changes which task the model will perform. Regular text prompts can be used when the task is set to Use Prompt.
When other options are selected, see the hint text inside an empty Prompt field for guidance."}, + {"id":"","label":"CLiP Model","localized":"","reload":"","hint":"CLIP model used for image-text similarity matching.
Larger models (ViT-L, ViT-H) are more accurate but slower and use more VRAM."}, + {"id":"","label":"Caption Model","localized":"","reload":"","hint":"BLIP model used to generate the initial image caption.
The caption model describes the image content which CLiP then enriches with style and flavor terms."}, + {"id":"","label":"Mode","localized":"","reload":"","hint":"Interrogation mode.
Fast: Quick caption with minimal flavor terms.
Classic: Standard interrogation with balanced quality and speed.
Best: Most thorough analysis, slowest but highest quality.
Negative: Generate terms to use as negative prompt."}, + {"id":"","label":"clip: min length","localized":"Min Length","reload":"","hint":"Minimum number of tokens in the generated caption."}, + {"id":"","label":"clip: max length","localized":"Max Length","reload":"","hint":"Maximum number of tokens in the generated caption."}, + {"id":"","label":"clip: chunk size","localized":"Chunk Size","reload":"","hint":"Batch size for processing description candidates (flavors). Higher values speed up interrogation but increase VRAM usage."}, + {"id":"","label":"clip: min flavors","localized":"Min Flavors","reload":"","hint":"Minimum number of descriptive tags (flavors) to keep in the final prompt."}, + {"id":"","label":"clip: max flavors","localized":"Max Flavors","reload":"","hint":"Maximum number of descriptive tags (flavors) to keep in the final prompt."}, + {"id":"","label":"clip: intermediates","localized":"Intermediates","reload":"","hint":"Size of the intermediate candidate pool when matching image features to descriptive tags (flavours). From this pool, the final tags are selected based on Min/Max Flavors. Higher values may improve quality but are slower."}, + {"id":"","label":"clip: num beams","localized":"CLiP Num Beams","reload":"","hint":"Number of beams for beam search during caption generation. Higher values search more possibilities but are slower."}, + {"id":"","label":"Tagger Model","localized":"","reload":"","hint":"Model to use for image tagging.
WaifuDiffusion models (wd-*): Modern taggers with separate general and character thresholds.
DeepBooru: Legacy tagger, uses only general threshold."}, + {"id":"","label":"General threshold","localized":"","reload":"","hint":"Confidence threshold for general tags (e.g., objects, actions, settings).
Only tags with confidence above this threshold are included in the output.
Higher values are more selective (fewer tags), lower values include more tags."}, + {"id":"","label":"Character threshold","localized":"","reload":"","hint":"Confidence threshold for character-specific tags (e.g., character names, specific traits).
Only tags with confidence above this threshold are included.
Higher values are more selective, lower values include more potential matches.
Not supported by DeepBooru models."}, + {"id":"","label":"Max tags","localized":"","reload":"","hint":"Maximum number of tags to include in the output.
Limits the result length when an image has many detected features.
Tags are sorted by confidence, so the most relevant ones are kept."}, + {"id":"","label":"Include rating","localized":"","reload":"","hint":"Include content rating tags in the output (e.g., safe, questionable, explicit).
Useful for filtering or categorizing images by their content rating."}, + {"id":"","label":"Sort alphabetically","localized":"","reload":"","hint":"Sort tags alphabetically instead of by confidence score.
When disabled, tags are sorted by confidence (highest first).
Alphabetical sorting makes it easier to find specific tags."}, + {"id":"","label":"Use spaces","localized":"","reload":"","hint":"Replace underscores with spaces in tag output.
Some prompt systems prefer spaces between words (e.g., 'long hair') while others use underscores (e.g., 'long_hair')."}, + {"id":"","label":"Escape brackets","localized":"","reload":"","hint":"Escape parentheses and brackets in tags with backslashes.
Required when tags contain characters that have special meaning in prompt syntax, such as ( ) [ ].
Enable this when using the output directly in prompts."}, + {"id":"","label":"Exclude tags","localized":"","reload":"","hint":"Comma-separated list of tags to exclude from the output.
Useful for filtering out unwanted or redundant tags that appear frequently."}, + {"id":"","label":"Show confidence scores","localized":"","reload":"","hint":"Display confidence scores alongside each tag.
Shows how certain the model is about each tag (0.0 to 1.0).
Useful for understanding which tags are most reliable."}, + {"id":"","label":"Save Caption Files","localized":"","reload":"","hint":"Save generated captions to .txt files alongside the images.
Each image gets a matching caption file with the same base name."}, + {"id":"","label":"Append Caption Files","localized":"","reload":"","hint":"Append to existing caption files instead of overwriting them.
Useful for adding additional descriptions or tags to images that already have captions."}, + {"id":"","label":"Recursive","localized":"","reload":"","hint":"Process images in subfolders recursively.
When enabled, searches all nested subdirectories for images to process."} +], "generate": [ {"id":"","label":"Sampling method","localized":"","reload":"","hint":"Which algorithm to use to produce the image"}, {"id":"","label":"Steps","localized":"","reload":"","hint":"How many times to improve the generated image iteratively; higher values take longer; very low values can produce bad results"}, @@ -247,8 +278,8 @@ {"id":"","label":"Enhance now","localized":"","reload":"","hint":"Run prompt enhancement using the selected LLM model"}, {"id":"","label":"Apply to prompt","localized":"","reload":"","hint":"Automatically copy enhanced result to the prompt input box"}, {"id":"","label":"Auto enhance","localized":"","reload":"","hint":"Automatically enhance prompt before every image generation"}, - {"id":"","label":"Use vision","localized":"","reload":"","hint":"Include input image when enhancing prompt.

Only available for vision-capable models, marked with \uf06e icon."}, - {"id":"","label":"LLM model","localized":"","reload":"","hint":"Select the language model to use for prompt enhancement.

Models supporting vision are marked with \uf06e icon.
Models supporting thinking mode are marked with \uf0eb icon."}, + {"id":"","label":"Use vision","localized":"","reload":"","hint":"Include input image when enhancing prompt.

Only available for vision-capable models, marked with  icon."}, + {"id":"","label":"LLM model","localized":"","reload":"","hint":"Select the language model to use for prompt enhancement.

Models supporting vision are marked with  icon.
Models supporting thinking mode are marked with  icon."}, {"id":"","label":"Model repo","localized":"","reload":"","hint":"HuggingFace repository ID for the model"}, {"id":"","label":"Model gguf","localized":"","reload":"","hint":"Optional GGUF quantized model repository on HuggingFace"}, {"id":"","label":"Model type","localized":"","reload":"","hint":"Optional GGUF model quantization type"}, @@ -486,7 +517,6 @@ {"id":"","label":"answer","localized":"","reload":"","hint":"answer"}, {"id":"","label":"aot_ts_nvfuser","localized":"","reload":"","hint":"aot_ts_nvfuser"}, {"id":"","label":"appearance","localized":"","reload":"","hint":"appearance"}, - {"id":"","label":"append caption files","localized":"","reload":"","hint":"append caption files"}, {"id":"","label":"append image info json file","localized":"","reload":"","hint":"append image info json file"}, {"id":"","label":"append interrogated prompt at each iteration","localized":"","reload":"","hint":"append interrogated prompt at each iteration"}, {"id":"","label":"apply color correction","localized":"","reload":"","hint":"apply color correction"}, @@ -549,7 +579,6 @@ {"id":"","label":"cache text encoder results","localized":"","reload":"","hint":"cache text encoder results"}, {"id":"","label":"canny","localized":"","reload":"","hint":"canny"}, {"id":"","label":"caption","localized":"","reload":"","hint":"caption"}, - {"id":"","label":"caption model","localized":"","reload":"","hint":"caption model"}, {"id":"","label":"center","localized":"","reload":"","hint":"center"}, {"id":"","label":"change log","localized":"","reload":"","hint":"change log"}, {"id":"","label":"change model","localized":"","reload":"","hint":"change model"}, @@ -566,7 +595,6 @@ {"id":"","label":"civitai token","localized":"","reload":"","hint":"civitai token"}, {"id":"","label":"ckpt","localized":"","reload":"","hint":"ckpt"}, {"id":"","label":"cleanup temporary folder on startup","localized":"","reload":"","hint":"cleanup temporary folder 
on startup"}, - {"id":"","label":"clip model","localized":"","reload":"","hint":"clip model"}, {"id":"","label":"clip: chunk size","localized":"","reload":"","hint":"clip: chunk size"}, {"id":"","label":"clip: default captioner","localized":"","reload":"","hint":"clip: default captioner"}, {"id":"","label":"clip: default mode","localized":"","reload":"","hint":"clip: default mode"}, @@ -868,8 +896,6 @@ {"id":"","label":"kdpm2","localized":"","reload":"","hint":"kdpm2"}, {"id":"","label":"kdpm2 a","localized":"","reload":"","hint":"kdpm2 a"}, {"id":"","label":"keep incomplete images","localized":"","reload":"","hint":"keep incomplete images"}, - {"id":"","label":"Keep Thinking Trace","localized":"","reload":"","hint":"Include the model's reasoning process in the final output.
Useful for understanding how the model arrived at its answer.
Only works with models that support thinking mode."}, - {"id":"","label":"Keep Prefill","localized":"","reload":"","hint":"Include the prefill text at the beginning of the final output.
If disabled, the prefill text used to guide the model is removed from the result."}, {"id":"","label":"large","localized":"","reload":"","hint":"large"}, {"id":"","label":"latent history size","localized":"","reload":"","hint":"latent history size"}, {"id":"","label":"latent mode","localized":"","reload":"","hint":"latent mode"}, @@ -945,7 +971,6 @@ {"id":"","label":"mine","localized":"","reload":"","hint":"mine"}, {"id":"","label":"mlsd","localized":"","reload":"","hint":"mlsd"}, {"id":"","label":"mm","localized":"","reload":"","hint":"mm"}, - {"id":"","label":"mode","localized":"","reload":"","hint":"mode"}, {"id":"","label":"mode after","localized":"","reload":"","hint":"mode after"}, {"id":"","label":"mode before","localized":"","reload":"","hint":"mode before"}, {"id":"","label":"mode mask","localized":"","reload":"","hint":"mode mask"}, @@ -990,7 +1015,6 @@ {"id":"","label":"none","localized":"","reload":"","hint":"none"}, {"id":"","label":"note","localized":"","reload":"","hint":"note"}, {"id":"","label":"nothing","localized":"","reload":"","hint":"nothing"}, - {"id":"","label":"num beams","localized":"","reload":"","hint":"Maintains multiple candidate paths simultaneously and selects the overall best sequence.
Like exploring several drafts at once to find the best one. More thorough but much slower and less creative than random sampling.
Generally not recommended, most modern VLMs perform better with sampling methods.
Set to 1 to disable."}, {"id":"","label":"number","localized":"","reload":"","hint":"number"}, {"id":"","label":"numbered filenames","localized":"","reload":"","hint":"numbered filenames"}, {"id":"","label":"offload","localized":"","reload":"","hint":"offload"}, @@ -1072,7 +1096,6 @@ {"id":"","label":"random seeds","localized":"","reload":"","hint":"random seeds"}, {"id":"","label":"range","localized":"","reload":"","hint":"range"}, {"id":"","label":"rebase","localized":"","reload":"","hint":"rebase"}, - {"id":"","label":"recursive","localized":"","reload":"","hint":"recursive"}, {"id":"","label":"reduce-overhead","localized":"","reload":"","hint":"reduce-overhead"}, {"id":"","label":"redux prompt strength","localized":"","reload":"","hint":"redux prompt strength"}, {"id":"","label":"reference adain weight","localized":"","reload":"","hint":"reference adain weight"}, @@ -1123,7 +1146,6 @@ {"id":"","label":"saturation","localized":"","reload":"","hint":"saturation"}, {"id":"","label":"save all generated image grids","localized":"","reload":"","hint":"save all generated image grids"}, {"id":"","label":"save all generated images","localized":"","reload":"","hint":"save all generated images"}, - {"id":"","label":"save caption files","localized":"","reload":"","hint":"save caption files"}, {"id":"","label":"save diffusers","localized":"","reload":"","hint":"save diffusers"}, {"id":"","label":"save hdr image","localized":"","reload":"","hint":"save hdr image"}, {"id":"","label":"save image before color correction","localized":"","reload":"","hint":"save image before color correction"}, @@ -1228,8 +1250,6 @@ {"id":"","label":"tcd","localized":"","reload":"","hint":"tcd"}, {"id":"","label":"tdd","localized":"","reload":"","hint":"tdd"}, {"id":"","label":"te","localized":"","reload":"","hint":"te"}, - {"id":"","label":"temperature","localized":"","reload":"","hint":"Controls randomness in token selection by reshaping the probability distribution.
Like adjusting a dial between cautious predictability (low values ~0.4) and creative exploration (higher values ~1). Higher temperatures increase willingness to choose less obvious options, but makes outputs more unpredictable.

Set to 0 to disable, resulting in silent switch to greedy decoding, disabling sampling."}, - {"id":"","label":"Thinking Mode","localized":"","reload":"","hint":"Enables thinking/reasoning, allowing the model to take more time to generate responses.
This can lead to more thoughtful and detailed answers, but will increase response time.
This setting affects both hybrid and thinking-only models, and in some may result in lower overall quality than expected. For thinking-only models like Qwen3-VL this setting might have to be combined with prefill to guarantee preventing thinking.

Models supporting this feature are marked with an \uf0eb icon."}, {"id":"","label":"Repetition penalty","localized":"","reload":"","hint":"Discourages reusing tokens that already appear in the prompt or output by penalizing their probabilities.
Like adding friction to revisiting previous choices. Helps break repetitive loops but may reduce coherence at aggressive values.

Set to 1 to disable."}, {"id":"","label":"text guidance scale","localized":"","reload":"","hint":"text guidance scale"}, {"id":"","label":"template","localized":"","reload":"","hint":"template"}, @@ -1273,8 +1293,6 @@ {"id":"","label":"todo","localized":"","reload":"","hint":"todo"}, {"id":"","label":"tome","localized":"","reload":"","hint":"tome"}, {"id":"","label":"tool","localized":"","reload":"","hint":"tool"}, - {"id":"","label":"top-k","localized":"","reload":"","hint":"Limits token selection to the K most likely candidates at each step.
Lower values (e.g., 40) make outputs more focused and predictable, while higher values allow more diverse choices.

Set to 0 to disable."}, - {"id":"","label":"top-p","localized":"","reload":"","hint":"Selects tokens from the smallest set whose cumulative probability exceeds P (e.g., 0.9).
Dynamically adapts the number of candidates based on model confidence; fewer options when certain, more when uncertain.

Set to 1 to disable."}, {"id":"","label":"torch","localized":"","reload":"","hint":"torch"}, {"id":"","label":"transformer","localized":"","reload":"","hint":"transformer"}, {"id":"","label":"trigger word","localized":"","reload":"","hint":"trigger word"}, @@ -1318,7 +1336,6 @@ {"id":"","label":"use random seeds","localized":"","reload":"","hint":"use random seeds"}, {"id":"","label":"use reference values when available","localized":"","reload":"","hint":"use reference values when available"}, {"id":"","label":"use same seed","localized":"","reload":"","hint":"use same seed"}, - {"id":"","label":"use samplers","localized":"","reload":"","hint":"Enable to use sampling (randomly selecting tokens based on sampling methods like Top-k or Top-p) or disable to use greedy decoding (selecting the most probable token at each step).
Enabling makes outputs more diverse and more creative but less deterministic."}, {"id":"","label":"use separate base dict","localized":"","reload":"","hint":"use separate base dict"}, {"id":"","label":"use simplified solvers in final steps","localized":"","reload":"","hint":"use simplified solvers in final steps"}, {"id":"","label":"use text inputs","localized":"","reload":"","hint":"use text inputs"}, @@ -1338,16 +1355,9 @@ {"id":"","label":"video file","localized":"","reload":"","hint":"video file"}, {"id":"","label":"video type","localized":"","reload":"","hint":"video type"}, {"id":"","label":"vlm","localized":"","reload":"","hint":"vlm"}, - {"id":"","label":"vlm model","localized":"","reload":"","hint":"Select which model to use for Visual Language tasks.

Models which support thinking mode are marked with an \uf0eb icon."}, {"id":"","label":"vlm: default model","localized":"","reload":"","hint":"vlm: default model"}, {"id":"","label":"vlm: default prompt","localized":"","reload":"","hint":"vlm: default prompt"}, {"id":"","label":"vlm: max length","localized":"","reload":"","hint":"vlm: max length"}, - {"id":"","label":"VLM Num Beams","localized":"","reload":"","hint":"Maintains multiple candidate paths simultaneously and selects the overall best sequence.
Like exploring several drafts at once to find the best one. More thorough but much slower and less creative than random sampling.
Generally not recommended, most modern VLMs perform better with sampling methods.
Set to 1 to disable."}, - {"id":"","label":"vlm: top-k","localized":"","reload":"","hint":"Limits token selection to the K most likely candidates at each step.
Lower values (e.g., 40) make outputs more focused and predictable, while higher values allow more diverse choices.
Set to 0 to disable."}, - {"id":"","label":"vlm: top-p","localized":"","reload":"","hint":"Selects tokens from the smallest set whose cumulative probability exceeds P (e.g., 0.9).
Dynamically adapts the number of candidates based on model confidence; fewer options when certain, more when uncertain.
Set to 1 to disable."}, - {"id":"","label":"vlm: use sample method","localized":"","reload":"","hint":"Enable to use sampling (randomly selecting tokens based on sampling methods like Top-k or Top-p) or disable to use greedy decoding (selecting the most probable token at each step).
Enabling makes outputs more diverse and creative but less deterministic."}, - {"id":"","label":"VLM Max tokens","localized":"","reload":"","hint":"Maximum number of tokens the model can generate in its response.
The model is not aware of this limit during generation and it won't make the model try to generate more detailed or more concise responses, it simply sets the hard limit for the length, and will forcefully cut off the response when the limit is reached."}, - {"id":"","label":"VLM Temperature","localized":"","reload":"","hint":"Controls randomness in token selection. Lower values (e.g., 0.1) make outputs more focused and deterministic, always choosing high-probability tokens.
Higher values (e.g., 0.9) increase creativity and diversity by allowing less probable tokens.

Set to 0 for fully deterministic output (always picks the most likely token)."}, {"id":"","label":"warmth","localized":"","reload":"","hint":"warmth"}, {"id":"","label":"webp lossless compression","localized":"","reload":"","hint":"webp lossless compression"}, {"id":"","label":"weight","localized":"","reload":"","hint":"weight"}, diff --git a/modules/ui_caption.py b/modules/ui_caption.py index 5ab4d74b7..b2eef1d3b 100644 --- a/modules/ui_caption.py +++ b/modules/ui_caption.py @@ -167,7 +167,7 @@ def create_ui(): with gr.Row(): vlm_load_btn = gr.Button(value='Load', elem_id='vlm_load', variant='secondary') vlm_unload_btn = gr.Button(value='Unload', elem_id='vlm_unload', variant='secondary') - with gr.Accordion(label='VLM: Advanced Options', open=False, visible=True): + with gr.Accordion(label='Caption: Advanced Options', open=False, visible=True): with gr.Row(): vlm_max_tokens = gr.Slider(label='VLM Max Tokens', value=shared.opts.interrogate_vlm_max_length, minimum=16, maximum=4096, step=1, elem_id='vlm_max_tokens') vlm_num_beams = gr.Slider(label='VLM Num Beams', value=shared.opts.interrogate_vlm_num_beams, minimum=1, maximum=16, step=1, elem_id='vlm_num_beams') @@ -192,7 +192,7 @@ def create_ui(): vlm_keep_prefill.change(fn=update_vlm_params, inputs=[vlm_max_tokens, vlm_num_beams, vlm_temperature, vlm_do_sample, vlm_top_k, vlm_top_p, vlm_keep_prefill, vlm_keep_thinking, vlm_thinking_mode], outputs=[]) vlm_keep_thinking.change(fn=update_vlm_params, inputs=[vlm_max_tokens, vlm_num_beams, vlm_temperature, vlm_do_sample, vlm_top_k, vlm_top_p, vlm_keep_prefill, vlm_keep_thinking, vlm_thinking_mode], outputs=[]) vlm_thinking_mode.change(fn=update_vlm_params, inputs=[vlm_max_tokens, vlm_num_beams, vlm_temperature, vlm_do_sample, vlm_top_k, vlm_top_p, vlm_keep_prefill, vlm_keep_thinking, vlm_thinking_mode], outputs=[]) - with gr.Accordion(label='VLM: Batch Caption', open=False, visible=True): + with gr.Accordion(label='Caption: Batch', open=False, visible=True): with 
gr.Row(): vlm_batch_files = gr.File(label="Files", show_label=True, file_count='multiple', file_types=['image'], interactive=True, height=100, elem_id='vlm_batch_files') with gr.Row(): @@ -213,7 +213,7 @@ def create_ui(): ui_common.create_refresh_button(clip_model, openclip.refresh_clip_models, lambda: {"choices": openclip.refresh_clip_models()}, 'clip_models_refresh') blip_model = gr.Dropdown(list(openclip.caption_models), value=shared.opts.interrogate_blip_model, label='Caption Model', elem_id='btN_clip_blip_model') clip_mode = gr.Dropdown(openclip.caption_types, label='Mode', value='fast', elem_id='clip_clip_mode') - with gr.Accordion(label='CLiP: Advanced Options', open=False, visible=True): + with gr.Accordion(label='Caption: Advanced Options', open=False, visible=True): with gr.Row(): clip_min_length = gr.Slider(label='clip: min length', value=shared.opts.interrogate_clip_min_length, minimum=8, maximum=75, step=1, elem_id='clip_caption_min_length') clip_max_length = gr.Slider(label='clip: max length', value=shared.opts.interrogate_clip_max_length, minimum=16, maximum=1024, step=1, elem_id='clip_caption_max_length') @@ -231,7 +231,7 @@ def create_ui(): clip_max_flavors.change(fn=update_clip_params, inputs=[clip_min_length, clip_max_length, clip_chunk_size, clip_min_flavors, clip_max_flavors, clip_flavor_count, clip_num_beams], outputs=[]) clip_flavor_count.change(fn=update_clip_params, inputs=[clip_min_length, clip_max_length, clip_chunk_size, clip_min_flavors, clip_max_flavors, clip_flavor_count, clip_num_beams], outputs=[]) clip_num_beams.change(fn=update_clip_params, inputs=[clip_min_length, clip_max_length, clip_chunk_size, clip_min_flavors, clip_max_flavors, clip_flavor_count, clip_num_beams], outputs=[]) - with gr.Accordion(label='CLiP: Batch Interrogate', open=False, visible=True): + with gr.Accordion(label='Caption: Batch', open=False, visible=True): with gr.Row(): clip_batch_files = gr.File(label="Files", show_label=True, file_count='multiple', 
file_types=['image'], interactive=True, height=100, elem_id='clip_batch_files') with gr.Row(): @@ -255,7 +255,7 @@ def create_ui(): with gr.Row(): wd_load_btn = gr.Button(value='Load', elem_id='wd_load', variant='secondary') wd_unload_btn = gr.Button(value='Unload', elem_id='wd_unload', variant='secondary') - with gr.Accordion(label='Tagger: Advanced Options', open=True, visible=True): + with gr.Accordion(label='Caption: Advanced Options', open=True, visible=True): with gr.Row(): wd_general_threshold = gr.Slider(label='General threshold', value=shared.opts.tagger_threshold, minimum=0.0, maximum=1.0, step=0.01, elem_id='wd_general_threshold') wd_character_threshold = gr.Slider(label='Character threshold', value=shared.opts.waifudiffusion_character_threshold, minimum=0.0, maximum=1.0, step=0.01, elem_id='wd_character_threshold') @@ -271,7 +271,7 @@ def create_ui(): with gr.Row(): wd_show_scores = gr.Checkbox(label='Show confidence scores', value=shared.opts.tagger_show_scores, elem_id='wd_show_scores') gr.HTML('') - with gr.Accordion(label='Tagger: Batch', open=False, visible=True): + with gr.Accordion(label='Caption: Batch', open=False, visible=True): with gr.Row(): wd_batch_files = gr.File(label="Files", show_label=True, file_count='multiple', file_types=['image'], interactive=True, height=100, elem_id='wd_batch_files') with gr.Row():