From 5183ebec589ad650a182a1069d2981b3133372a2 Mon Sep 17 00:00:00 2001 From: CalamitousFelicitousness Date: Mon, 26 Jan 2026 01:14:53 +0000 Subject: [PATCH] refactor: rename interrogate module to caption Move all caption-related modules from modules/interrogate/ to modules/caption/ for better naming consistency: - Rename deepbooru, deepseek, joycaption, joytag, moondream3, openclip, tagger, vqa, vqa_detection, waifudiffusion modules - Add new caption.py dispatcher module - Remove old interrogate.py (functionality moved to caption.py) --- modules/caption/caption.py | 48 ++++ modules/{interrogate => caption}/deepbooru.py | 6 +- .../deepbooru_model.py | 0 modules/{interrogate => caption}/deepseek.py | 8 +- .../{interrogate => caption}/joycaption.py | 6 +- modules/{interrogate => caption}/joytag.py | 4 +- .../{interrogate => caption}/moondream3.py | 68 +++--- modules/{interrogate => caption}/openclip.py | 81 ++++--- modules/{interrogate => caption}/tagger.py | 16 +- modules/{interrogate => caption}/vqa.py | 210 +++++++++--------- .../{interrogate => caption}/vqa_detection.py | 0 .../waifudiffusion.py | 6 +- modules/interrogate/interrogate.py | 48 ---- 13 files changed, 257 insertions(+), 244 deletions(-) create mode 100644 modules/caption/caption.py rename modules/{interrogate => caption}/deepbooru.py (98%) rename modules/{interrogate => caption}/deepbooru_model.py (100%) rename modules/{interrogate => caption}/deepseek.py (91%) rename modules/{interrogate => caption}/joycaption.py (96%) rename modules/{interrogate => caption}/joytag.py (99%) rename modules/{interrogate => caption}/moondream3.py (81%) rename modules/{interrogate => caption}/openclip.py (81%) rename modules/{interrogate => caption}/tagger.py (82%) rename modules/{interrogate => caption}/vqa.py (87%) rename modules/{interrogate => caption}/vqa_detection.py (100%) rename modules/{interrogate => caption}/waifudiffusion.py (99%) delete mode 100644 modules/interrogate/interrogate.py diff --git a/modules/caption/caption.py b/modules/caption/caption.py new file mode 100644 index 000000000..300af6479 --- /dev/null +++ b/modules/caption/caption.py @@ -0,0 +1,48 @@ +import time +from PIL import Image +from modules import shared + + +def caption(image): + if isinstance(image, list): + image = image[0] if len(image) > 0 else None + if isinstance(image, dict) and 'name' in image: + image = Image.open(image['name']) + if image is None: + shared.log.error('Caption: no image provided') + return '' + t0 = time.time() + if shared.opts.caption_default_type == 'OpenCLiP': + shared.log.info(f'Caption: type={shared.opts.caption_default_type} clip="{shared.opts.caption_openclip_model}" blip="{shared.opts.caption_openclip_blip_model}" mode="{shared.opts.caption_openclip_mode}"') + from modules.caption import openclip + openclip.load_captioner(clip_model=shared.opts.caption_openclip_model, blip_model=shared.opts.caption_openclip_blip_model) + openclip.update_caption_params() + prompt = openclip.caption(image, mode=shared.opts.caption_openclip_mode) + shared.log.debug(f'Caption: time={time.time()-t0:.2f} answer="{prompt}"') + return prompt + elif shared.opts.caption_default_type == 'Tagger': + shared.log.info(f'Caption: type={shared.opts.caption_default_type} model="{shared.opts.waifudiffusion_model}"') + from modules.caption import tagger + prompt = tagger.tag( + image=image, + model_name=shared.opts.waifudiffusion_model, + general_threshold=shared.opts.tagger_threshold, + character_threshold=shared.opts.waifudiffusion_character_threshold, + include_rating=shared.opts.tagger_include_rating, + exclude_tags=shared.opts.tagger_exclude_tags, + max_tags=shared.opts.tagger_max_tags, + sort_alpha=shared.opts.tagger_sort_alpha, + use_spaces=shared.opts.tagger_use_spaces, + escape_brackets=shared.opts.tagger_escape_brackets, + ) + shared.log.debug(f'Caption: time={time.time()-t0:.2f} answer="{prompt}"') + return prompt + elif shared.opts.caption_default_type == 'VLM': + shared.log.info(f'Caption: type={shared.opts.caption_default_type} vlm="{shared.opts.caption_vlm_model}" prompt="{shared.opts.caption_vlm_prompt}"') + from modules.caption import vqa + prompt = vqa.caption(image=image, model_name=shared.opts.caption_vlm_model, question=shared.opts.caption_vlm_prompt, prompt=None, system_prompt=shared.opts.caption_vlm_system) + shared.log.debug(f'Caption: time={time.time()-t0:.2f} answer="{prompt}"') + return prompt + else: + shared.log.error(f'Caption: type="{shared.opts.caption_default_type}" unknown') + return '' diff --git a/modules/interrogate/deepbooru.py b/modules/caption/deepbooru.py similarity index 98% rename from modules/interrogate/deepbooru.py rename to modules/caption/deepbooru.py index 1e18cc2ce..c43d683f1 100644 --- a/modules/interrogate/deepbooru.py +++ b/modules/caption/deepbooru.py @@ -19,7 +19,7 @@ class DeepDanbooru: if self.model is not None: return model_path = os.path.join(shared.opts.clip_models_path, "DeepDanbooru") - shared.log.debug(f'Interrogate load: module=DeepDanbooru folder="{model_path}"') + shared.log.debug(f'Caption load: module=DeepDanbooru folder="{model_path}"') files = modelloader.load_models( model_path=model_path, model_url='https://github.com/AUTOMATIC1111/TorchDeepDanbooru/releases/download/v1/model-resnet_custom_v3.pt', @@ -27,7 +27,7 @@ class DeepDanbooru: download_name='model-resnet_custom_v3.pt', ) - from modules.interrogate.deepbooru_model import DeepDanbooruModel + from modules.caption.deepbooru_model import DeepDanbooruModel self.model = DeepDanbooruModel() self.model.load_state_dict(torch.load(files[0], map_location="cpu")) self.model.eval() @@ -38,7 +38,7 @@ class DeepDanbooru: self.model.to(devices.device) def stop(self): - if shared.opts.interrogate_offload: + if shared.opts.caption_offload: self.model.to(devices.cpu) devices.torch_gc() diff --git a/modules/interrogate/deepbooru_model.py b/modules/caption/deepbooru_model.py similarity index 100% rename from modules/interrogate/deepbooru_model.py rename to modules/caption/deepbooru_model.py diff --git a/modules/interrogate/deepseek.py b/modules/caption/deepseek.py similarity index 91% rename from modules/interrogate/deepseek.py rename to modules/caption/deepseek.py index 2c967f85f..05f7a7a64 100644 --- a/modules/interrogate/deepseek.py +++ b/modules/caption/deepseek.py @@ -32,11 +32,11 @@ def load(repo: str): """Load DeepSeek VL2 model (experimental).""" global vl_gpt, vl_chat_processor, loaded_repo # pylint: disable=global-statement if not shared.cmd_opts.experimental: - shared.log.error(f'Interrogate: type=vlm model="DeepSeek VL2" repo="{repo}" is experimental-only') + shared.log.error(f'Caption: type=vlm model="DeepSeek VL2" repo="{repo}" is experimental-only') return False folder = os.path.join(paths.script_path, 'repositories', 'deepseek-vl2') if not os.path.exists(folder): - shared.log.error(f'Interrogate: type=vlm model="DeepSeek VL2" repo="{repo}" deepseek-vl2 repo not found') + shared.log.error(f'Caption: type=vlm model="DeepSeek VL2" repo="{repo}" deepseek-vl2 repo not found') return False if vl_gpt is None or loaded_repo != repo: sys.modules['attrdict'] = fake_attrdict @@ -53,7 +53,7 @@ def load(repo: str): vl_gpt.to(dtype=devices.dtype) vl_gpt.eval() loaded_repo = repo - shared.log.info(f'Interrogate: type=vlm model="DeepSeek VL2" repo="{repo}"') + shared.log.info(f'Caption: type=vlm model="DeepSeek VL2" repo="{repo}"') sd_models.move_model(vl_gpt, devices.device) return True @@ -105,7 +105,7 @@ def predict(question, image, repo): pad_token_id=vl_chat_processor.tokenizer.eos_token_id, bos_token_id=vl_chat_processor.tokenizer.bos_token_id, eos_token_id=vl_chat_processor.tokenizer.eos_token_id, - max_new_tokens=shared.opts.interrogate_vlm_max_length, + max_new_tokens=shared.opts.caption_vlm_max_length, do_sample=False, use_cache=True ) diff --git a/modules/interrogate/joycaption.py b/modules/caption/joycaption.py similarity index 96% rename from modules/interrogate/joycaption.py rename to modules/caption/joycaption.py index c8d445d9e..9f3827d47 100644 --- a/modules/interrogate/joycaption.py +++ b/modules/caption/joycaption.py @@ -64,7 +64,7 @@ def load(repo: str = None): if llava_model is None or opts.repo != repo: opts.repo = repo llava_model = None - shared.log.info(f'Interrogate: type=vlm model="JoyCaption" {str(opts)}') + shared.log.info(f'Caption: type=vlm model="JoyCaption" {str(opts)}') processor = AutoProcessor.from_pretrained(repo, max_pixels=1024*1024, cache_dir=shared.opts.hfcache_dir) quant_args = model_quant.create_config(module='LLM') llava_model = LlavaForConditionalGeneration.from_pretrained( @@ -92,7 +92,7 @@ def unload(): @torch.no_grad() def predict(question: str, image, vqa_model: str = None) -> str: - opts.max_new_tokens = shared.opts.interrogate_vlm_max_length + opts.max_new_tokens = shared.opts.caption_vlm_max_length load(vqa_model) if len(question) < 2: @@ -121,7 +121,7 @@ def predict(question: str, image, vqa_model: str = None) -> str: )[0] generate_ids = generate_ids[inputs['input_ids'].shape[1]:] # Trim off the prompt caption = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) # Decode the caption - if shared.opts.interrogate_offload: + if shared.opts.caption_offload: sd_models.move_model(llava_model, devices.cpu, force=True) caption = caption.replace('\n\n', '\n').strip() return caption diff --git a/modules/interrogate/joytag.py b/modules/caption/joytag.py similarity index 99% rename from modules/interrogate/joytag.py rename to modules/caption/joytag.py index f78e49ae5..a47ec2041 100644 --- a/modules/interrogate/joytag.py +++ b/modules/caption/joytag.py @@ -1044,7 +1044,7 @@ def load(): model.eval() with open(os.path.join(folder, 'top_tags.txt'), 'r', encoding='utf8') as f: tags = [line.strip() for line in f.readlines() if line.strip()] - shared.log.info(f'Interrogate: type=vlm model="JoyTag" repo="{MODEL_REPO}" tags={len(tags)}') + shared.log.info(f'Caption: type=vlm model="JoyTag" repo="{MODEL_REPO}" tags={len(tags)}') sd_models.move_model(model, devices.device) @@ -1068,7 +1068,7 @@ def predict(image: Image.Image): preds = model({'image': image_tensor}) tag_preds = preds['tags'].sigmoid().cpu() scores = {tags[i]: tag_preds[0][i] for i in range(len(tags))} - if shared.opts.interrogate_score: + if shared.opts.tagger_show_scores: predicted_tags = [f'{tag}:{score:.2f}' for tag, score in scores.items() if score > THRESHOLD] else: predicted_tags = [tag for tag, score in scores.items() if score > THRESHOLD] diff --git a/modules/interrogate/moondream3.py b/modules/caption/moondream3.py similarity index 81% rename from modules/interrogate/moondream3.py rename to modules/caption/moondream3.py index f760b3233..1a86cdf85 100644 --- a/modules/interrogate/moondream3.py +++ b/modules/caption/moondream3.py @@ -7,11 +7,11 @@ import re import transformers from PIL import Image from modules import shared, devices, sd_models -from modules.interrogate import vqa_detection +from modules.caption import vqa_detection # Debug logging - function-based to avoid circular import -debug_enabled = os.environ.get('SD_INTERROGATE_DEBUG', None) is not None +debug_enabled = os.environ.get('SD_CAPTION_DEBUG', None) is not None def debug(*args, **kwargs): if debug_enabled: @@ -30,12 +30,12 @@ def get_settings(): Moondream 3 accepts: temperature, top_p, max_tokens """ settings = {} - if shared.opts.interrogate_vlm_max_length > 0: - settings['max_tokens'] = shared.opts.interrogate_vlm_max_length - if shared.opts.interrogate_vlm_temperature > 0: - settings['temperature'] = shared.opts.interrogate_vlm_temperature - if shared.opts.interrogate_vlm_top_p > 0: - settings['top_p'] = shared.opts.interrogate_vlm_top_p + if shared.opts.caption_vlm_max_length > 0: + settings['max_tokens'] = shared.opts.caption_vlm_max_length + if shared.opts.caption_vlm_temperature > 0: + settings['temperature'] = shared.opts.caption_vlm_temperature + if shared.opts.caption_vlm_top_p > 0: + settings['top_p'] = shared.opts.caption_vlm_top_p return settings if settings else None @@ -44,7 +44,7 @@ def load_model(repo: str): global moondream3_model, loaded # pylint: disable=global-statement if moondream3_model is None or loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') moondream3_model = None moondream3_model = transformers.AutoModelForCausalLM.from_pretrained( @@ -84,7 +84,7 @@ def encode_image(image: Image.Image, cache_key: str = None): Encoded image tensor """ if cache_key and cache_key in image_cache: - debug(f'VQA interrogate: handler=moondream3 using cached encoding for cache_key="{cache_key}"') + debug(f'VQA caption: handler=moondream3 using cached encoding for cache_key="{cache_key}"') return image_cache[cache_key] model = load_model(loaded) @@ -94,7 +94,7 @@ def encode_image(image: Image.Image, cache_key: str = None): if cache_key: image_cache[cache_key] = encoded - debug(f'VQA interrogate: handler=moondream3 cached encoding cache_key="{cache_key}" cache_size={len(image_cache)}') + debug(f'VQA caption: handler=moondream3 cached encoding cache_key="{cache_key}" cache_size={len(image_cache)}') return encoded @@ -129,7 +129,7 @@ def query(image: Image.Image, question: str, repo: str, stream: bool = False, if max_tokens is not None: settings['max_tokens'] = max_tokens - debug(f'VQA interrogate: handler=moondream3 method=query question="{question}" stream={stream} settings={settings}') + debug(f'VQA caption: handler=moondream3 method=query question="{question}" stream={stream} settings={settings}') # Use cached encoding if requested if use_cache: @@ -150,12 +150,12 @@ def query(image: Image.Image, question: str, repo: str, stream: bool = False, # Log response structure (for non-streaming) if not stream: if isinstance(response, dict): - debug(f'VQA interrogate: handler=moondream3 response_type=dict keys={list(response.keys())}') + debug(f'VQA caption: handler=moondream3 response_type=dict keys={list(response.keys())}') if 'reasoning' in response: reasoning_text = response['reasoning'].get('text', '')[:100] + '...' if len(response['reasoning'].get('text', '')) > 100 else response['reasoning'].get('text', '') - debug(f'VQA interrogate: handler=moondream3 reasoning="{reasoning_text}"') + debug(f'VQA caption: handler=moondream3 reasoning="{reasoning_text}"') if 'answer' in response: - debug(f'VQA interrogate: handler=moondream3 answer="{response["answer"]}"') + debug(f'VQA caption: handler=moondream3 answer="{response["answer"]}"') return response @@ -188,7 +188,7 @@ def caption(image: Image.Image, repo: str, length: str = 'normal', stream: bool if max_tokens is not None: settings['max_tokens'] = max_tokens - debug(f'VQA interrogate: handler=moondream3 method=caption length={length} stream={stream} settings={settings}') + debug(f'VQA caption: handler=moondream3 method=caption length={length} stream={stream} settings={settings}') with devices.inference_context(): response = model.caption( @@ -200,7 +200,7 @@ def caption(image: Image.Image, repo: str, length: str = 'normal', stream: bool # Log response structure (for non-streaming) if not stream and isinstance(response, dict): - debug(f'VQA interrogate: handler=moondream3 response_type=dict keys={list(response.keys())}') + debug(f'VQA caption: handler=moondream3 response_type=dict keys={list(response.keys())}') return response @@ -220,21 +220,21 @@ def point(image: Image.Image, object_name: str, repo: str): """ model = load_model(repo) - debug(f'VQA interrogate: handler=moondream3 method=point object_name="{object_name}"') + debug(f'VQA caption: handler=moondream3 method=point object_name="{object_name}"') with devices.inference_context(): result = model.point(image, object_name) - debug(f'VQA interrogate: handler=moondream3 point_raw_result="{result}" type={type(result)}') + debug(f'VQA caption: handler=moondream3 point_raw_result="{result}" type={type(result)}') if isinstance(result, dict): - debug(f'VQA interrogate: handler=moondream3 point_raw_result_keys={list(result.keys())}') + debug(f'VQA caption: handler=moondream3 point_raw_result_keys={list(result.keys())}') points = vqa_detection.parse_points(result) if points: - debug(f'VQA interrogate: handler=moondream3 point_result={len(points)} points found') + debug(f'VQA caption: handler=moondream3 point_result={len(points)} points found') return points - debug('VQA interrogate: handler=moondream3 point_result=not found') + debug('VQA caption: handler=moondream3 point_result=not found') return None @@ -257,17 +257,17 @@ def detect(image: Image.Image, object_name: str, repo: str, max_objects: int = 1 """ model = load_model(repo) - debug(f'VQA interrogate: handler=moondream3 method=detect object_name="{object_name}" max_objects={max_objects}') + debug(f'VQA caption: handler=moondream3 method=detect object_name="{object_name}" max_objects={max_objects}') with devices.inference_context(): result = model.detect(image, object_name) - debug(f'VQA interrogate: handler=moondream3 detect_raw_result="{result}" type={type(result)}') + debug(f'VQA caption: handler=moondream3 detect_raw_result="{result}" type={type(result)}') if isinstance(result, dict): - debug(f'VQA interrogate: handler=moondream3 detect_raw_result_keys={list(result.keys())}') + debug(f'VQA caption: handler=moondream3 detect_raw_result_keys={list(result.keys())}') detections = vqa_detection.parse_detections(result, object_name, max_objects) - debug(f'VQA interrogate: handler=moondream3 detect_result={len(detections)} objects found') + debug(f'VQA caption: handler=moondream3 detect_result={len(detections)} objects found') return detections @@ -291,7 +291,7 @@ def predict(question: str, image: Image.Image, repo: str, model_name: str = None Response string (detection data stored on VQA singleton instance.last_detection_data) (or generator if stream=True for query/caption modes) """ - debug(f'VQA interrogate: handler=moondream3 model_name="{model_name}" repo="{repo}" question="{question}" image_size={image.size if image else None} mode={mode} stream={stream}') + debug(f'VQA caption: handler=moondream3 model_name="{model_name}" repo="{repo}" question="{question}" image_size={image.size if image else None} mode={mode} stream={stream}') # Clean question question = question.replace('<', '').replace('>', '').replace('_', ' ') if question else '' @@ -331,7 +331,7 @@ def predict(question: str, image: Image.Image, repo: str, model_name: str = None else: mode = 'query' - debug(f'VQA interrogate: handler=moondream3 mode_selected={mode}') + debug(f'VQA caption: handler=moondream3 mode_selected={mode}') # Dispatch to appropriate method try: @@ -348,10 +348,10 @@ def predict(question: str, image: Image.Image, repo: str, model_name: str = None object_name = re.sub(rf'\b{phrase}\b', '', object_name, flags=re.IGNORECASE) object_name = re.sub(r'[?.!,]', '', object_name).strip() object_name = re.sub(r'^\s*the\s+', '', object_name, flags=re.IGNORECASE) - debug(f'VQA interrogate: handler=moondream3 point_extracted_object="{object_name}"') + debug(f'VQA caption: handler=moondream3 point_extracted_object="{object_name}"') result = point(image, object_name, repo) if result: - from modules.interrogate import vqa + from modules.caption import vqa vqa.get_instance().last_detection_data = {'points': result} return vqa_detection.format_points_text(result) return "Object not found" @@ -364,11 +364,11 @@ def predict(question: str, image: Image.Image, repo: str, model_name: str = None object_name = re.sub(r'^\s*the\s+', '', object_name, flags=re.IGNORECASE) if ' and ' in object_name.lower(): object_name = re.split(r'\s+and\s+', object_name, flags=re.IGNORECASE)[0].strip() - debug(f'VQA interrogate: handler=moondream3 detect_extracted_object="{object_name}"') + debug(f'VQA caption: handler=moondream3 detect_extracted_object="{object_name}"') results = detect(image, object_name, repo, max_objects=kwargs.get('max_objects', 10)) if results: - from modules.interrogate import vqa + from modules.caption import vqa vqa.get_instance().last_detection_data = {'detections': results} return vqa_detection.format_detections_text(results) return "No objects detected" @@ -377,7 +377,7 @@ def predict(question: str, image: Image.Image, repo: str, model_name: str = None question = "Describe this image." response = query(image, question, repo, stream=stream, use_cache=use_cache, reasoning=thinking_mode) - debug(f'VQA interrogate: handler=moondream3 response_before_clean="{response}"') + debug(f'VQA caption: handler=moondream3 response_before_clean="{response}"') return response except Exception as e: @@ -390,7 +390,7 @@ def clear_cache(): """Clear image encoding cache.""" cache_size = len(image_cache) image_cache.clear() - debug(f'VQA interrogate: handler=moondream3 cleared image cache cache_size_was={cache_size}') + debug(f'VQA caption: handler=moondream3 cleared image cache cache_size_was={cache_size}') shared.log.debug(f'Moondream3: Cleared image cache ({cache_size} entries)') diff --git a/modules/interrogate/openclip.py b/modules/caption/openclip.py similarity index 81% rename from modules/interrogate/openclip.py rename to modules/caption/openclip.py index 7c1fd607f..660bf7d5d 100644 --- a/modules/interrogate/openclip.py +++ b/modules/caption/openclip.py @@ -8,7 +8,7 @@ from PIL import Image from modules import devices, shared, errors, sd_models -debug_enabled = os.environ.get('SD_INTERROGATE_DEBUG', None) is not None +debug_enabled = os.environ.get('SD_CAPTION_DEBUG', None) is not None debug_log = shared.log.trace if debug_enabled else lambda *args, **kwargs: None # Per-request overrides for API calls @@ -19,7 +19,7 @@ def get_clip_setting(name): """Get CLIP setting with per-request override support. Args: - name: Setting name without 'interrogate_clip_' prefix (e.g., 'min_flavors', 'max_length') + name: Setting name without 'caption_openclip_' prefix (e.g., 'min_flavors', 'max_length') Returns: Override value if set, otherwise the value from shared.opts @@ -28,7 +28,7 @@ def get_clip_setting(name): value = _clip_overrides.get(name) if value is not None: return value - return getattr(shared.opts, f'interrogate_clip_{name}') + return getattr(shared.opts, f'caption_openclip_{name}') def _apply_blip2_fix(model, processor): @@ -87,13 +87,14 @@ class BatchWriter: self.file.close() -def update_interrogate_params(): +def update_caption_params(): if ci is not None: ci.caption_max_length = get_clip_setting('max_length') ci.chunk_size = get_clip_setting('chunk_size') ci.flavor_intermediate_count = get_clip_setting('flavor_count') - ci.clip_offload = shared.opts.interrogate_offload - ci.caption_offload = shared.opts.interrogate_offload + ci.clip_offload = shared.opts.caption_offload + ci.caption_offload = shared.opts.caption_offload + def get_clip_models(): @@ -104,12 +105,12 @@ def refresh_clip_models(): global clip_models # pylint: disable=global-statement import open_clip models = sorted(open_clip.list_pretrained()) - shared.log.debug(f'Interrogate: pkg=openclip version={open_clip.__version__} models={len(models)}') + shared.log.debug(f'Caption: pkg=openclip version={open_clip.__version__} models={len(models)}') clip_models = ['/'.join(x) for x in models] return clip_models -def load_interrogator(clip_model, blip_model): +def load_captioner(clip_model, blip_model): from installer import install install('clip_interrogator==0.6.0') import clip_interrogator @@ -120,20 +121,21 @@ def load_interrogator(clip_model, blip_model): device = devices.get_optimal_device() cache_path = shared.opts.clip_models_path shared.log.info(f'CLIP load: clip="{clip_model}" blip="{blip_model}" device={device}') - debug_log(f'CLIP load: cache_path="{cache_path}" max_length={shared.opts.interrogate_clip_max_length} chunk_size={shared.opts.interrogate_clip_chunk_size} flavor_count={shared.opts.interrogate_clip_flavor_count} offload={shared.opts.interrogate_offload}') - interrogator_config = clip_interrogator.Config( + debug_log(f'CLIP load: cache_path="{cache_path}" max_length={shared.opts.caption_openclip_max_length} chunk_size={shared.opts.caption_openclip_chunk_size} flavor_count={shared.opts.caption_openclip_flavor_count} offload={shared.opts.caption_offload}') + captioner_config = clip_interrogator.Config( device=device, cache_path=cache_path, clip_model_name=clip_model, caption_model_name=blip_model, quiet=True, - caption_max_length=shared.opts.interrogate_clip_max_length, - chunk_size=shared.opts.interrogate_clip_chunk_size, - flavor_intermediate_count=shared.opts.interrogate_clip_flavor_count, - clip_offload=shared.opts.interrogate_offload, - caption_offload=shared.opts.interrogate_offload, + caption_max_length=shared.opts.caption_openclip_max_length, + chunk_size=shared.opts.caption_openclip_chunk_size, + flavor_intermediate_count=shared.opts.caption_openclip_flavor_count, + clip_offload=shared.opts.caption_offload, + caption_offload=shared.opts.caption_offload, ) - ci = clip_interrogator.Interrogator(interrogator_config) + ci = clip_interrogator.Interrogator(captioner_config) + if blip_model.startswith('blip2-'): _apply_blip2_fix(ci.caption_model, ci.caption_processor) shared.log.debug(f'CLIP load: time={time.time()-t0:.2f}') @@ -145,12 +147,14 @@ def load_interrogator(clip_model, blip_model): ci.config.clip_model_name = clip_model ci.config.clip_model = None ci.load_clip_model() + ci.clip_offloaded = True # Reset flag so _prepare_clip() will move model to device if blip_model != ci.config.caption_model_name: shared.log.info(f'CLIP load: blip="{blip_model}" reloading') debug_log(f'CLIP load: previous blip="{ci.config.caption_model_name}"') ci.config.caption_model_name = blip_model ci.config.caption_model = None ci.load_caption_model() + ci.caption_offloaded = True # Reset flag so _prepare_caption() will move model to device if blip_model.startswith('blip2-'): _apply_blip2_fix(ci.caption_model, ci.caption_processor) shared.log.debug(f'CLIP load: time={time.time()-t0:.2f}') @@ -159,7 +163,7 @@ def load_interrogator(clip_model, blip_model): def unload_clip_model(): - if ci is not None and shared.opts.interrogate_offload: + if ci is not None and shared.opts.caption_offload: shared.log.debug('CLIP unload: offloading models to CPU') sd_models.move_model(ci.caption_model, devices.cpu) sd_models.move_model(ci.clip_model, devices.cpu) @@ -169,7 +173,7 @@ def unload_clip_model(): debug_log('CLIP unload: complete') -def interrogate(image, mode, caption=None): +def caption(image, mode, base_caption=None): if isinstance(image, list): image = image[0] if len(image) > 0 else None if isinstance(image, dict) and 'name' in image: @@ -180,15 +184,17 @@ def interrogate(image, mode, caption=None): t0 = time.time() min_flavors = get_clip_setting('min_flavors') max_flavors = get_clip_setting('max_flavors') - debug_log(f'CLIP: mode="{mode}" image_size={image.size} caption={caption is not None} min_flavors={min_flavors} max_flavors={max_flavors}') + debug_log(f'CLIP: mode="{mode}" image_size={image.size} caption={base_caption is not None} min_flavors={min_flavors} max_flavors={max_flavors}') + # NOTE: Method names like .interrogate(), .interrogate_classic(), etc. come from the external + # clip-interrogator library (https://github.com/pharmapsychotic/clip-interrogator) and cannot be renamed. if mode == 'best': - prompt = ci.interrogate(image, caption=caption, min_flavors=min_flavors, max_flavors=max_flavors) + prompt = ci.interrogate(image, caption=base_caption, min_flavors=min_flavors, max_flavors=max_flavors) elif mode == 'caption': - prompt = ci.generate_caption(image) if caption is None else caption + prompt = ci.generate_caption(image) if base_caption is None else base_caption elif mode == 'classic': - prompt = ci.interrogate_classic(image, caption=caption, max_flavors=max_flavors) + prompt = ci.interrogate_classic(image, caption=base_caption, max_flavors=max_flavors) elif mode == 'fast': - prompt = ci.interrogate_fast(image, caption=caption, max_flavors=max_flavors) + prompt = ci.interrogate_fast(image, caption=base_caption, max_flavors=max_flavors) elif mode == 'negative': prompt = ci.interrogate_negative(image, max_flavors=max_flavors) else: @@ -197,9 +203,10 @@ def interrogate(image, mode, caption=None): return prompt -def interrogate_image(image, clip_model, blip_model, mode, overrides=None): + +def caption_image(image, clip_model, blip_model, mode, overrides=None): global _clip_overrides # pylint: disable=global-statement - jobid = shared.state.begin('Interrogate CLiP') + jobid = shared.state.begin('Caption CLiP') t0 = time.time() shared.log.info(f'CLIP: mode="{mode}" clip="{clip_model}" blip="{blip_model}" image_size={image.size if image else None}') if overrides: @@ -211,17 +218,19 @@ def interrogate_image(image, clip_model, blip_model, mode, overrides=None): from modules.sd_models import apply_balanced_offload # prevent circular import apply_balanced_offload(shared.sd_model) debug_log('CLIP: applied balanced offload to sd_model') - load_interrogator(clip_model, blip_model) - # Apply overrides to loaded interrogator - update_interrogate_params() + load_captioner(clip_model, blip_model) + # Apply overrides to loaded captioner + update_caption_params() image = image.convert('RGB') - prompt = interrogate(image, mode) + prompt = caption(image, mode) + if shared.opts.caption_offload: + unload_clip_model() devices.torch_gc() shared.log.debug(f'CLIP: complete time={time.time()-t0:.2f}') except Exception as e: prompt = f"Exception {type(e)}" shared.log.error(f'CLIP: {e}') - errors.display(e, 'Interrogate') + errors.display(e, 'Caption') finally: # Clear per-request overrides _clip_overrides = None @@ -229,7 +238,8 @@ def interrogate_image(image, clip_model, blip_model, mode, overrides=None): return prompt -def interrogate_batch(batch_files, batch_folder, batch_str, clip_model, blip_model, mode, write, append, recursive): + +def caption_batch(batch_files, batch_folder, batch_str, clip_model, blip_model, mode, write, append, recursive): files = [] if batch_files is not None: files += [f.name for f in batch_files] @@ -244,10 +254,10 @@ def interrogate_batch(batch_files, batch_folder, batch_str, clip_model, blip_mod t0 = time.time() shared.log.info(f'CLIP batch: mode="{mode}" images={len(files)} clip="{clip_model}" blip="{blip_model}" write={write} append={append}') debug_log(f'CLIP batch: recursive={recursive} files={files[:5]}{"..." if len(files) > 5 else ""}') - jobid = shared.state.begin('Interrogate batch') + jobid = shared.state.begin('Caption batch') prompts = [] - load_interrogator(clip_model, blip_model) + load_captioner(clip_model, blip_model) if write: file_mode = 'w' if not append else 'a' writer = BatchWriter(os.path.dirname(files[0]), mode=file_mode) @@ -263,7 +273,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, clip_model, blip_mod shared.log.info('CLIP batch: interrupted') break image = Image.open(file).convert('RGB') - prompt = interrogate(image, mode) + prompt = caption(image, mode) prompts.append(prompt) if write: writer.add(file, prompt) @@ -278,10 +288,11 @@ def interrogate_batch(batch_files, batch_folder, batch_str, clip_model, blip_mod return '\n\n'.join(prompts) + def analyze_image(image, clip_model, blip_model): t0 = time.time() shared.log.info(f'CLIP analyze: clip="{clip_model}" blip="{blip_model}" image_size={image.size if image else None}') - load_interrogator(clip_model, blip_model) + load_captioner(clip_model, blip_model) image = image.convert('RGB') image_features = ci.image_to_features(image) debug_log(f'CLIP analyze: features shape={image_features.shape if hasattr(image_features, "shape") else "unknown"}') diff --git a/modules/interrogate/tagger.py b/modules/caption/tagger.py similarity index 82% rename from modules/interrogate/tagger.py rename to modules/caption/tagger.py index 51516adaa..2b18b3937 100644 --- a/modules/interrogate/tagger.py +++ b/modules/caption/tagger.py @@ -8,7 +8,7 @@ DEEPBOORU_MODEL = "DeepBooru" def get_models() -> list: """Return combined list: DeepBooru + WaifuDiffusion models.""" - from modules.interrogate import waifudiffusion + from modules.caption import waifudiffusion return [DEEPBOORU_MODEL] + waifudiffusion.get_models() @@ -25,16 +25,16 @@ def is_deepbooru(model_name: str) -> bool: def load_model(model_name: str) -> bool: """Load appropriate backend.""" if is_deepbooru(model_name): - from modules.interrogate import deepbooru + from modules.caption import deepbooru return deepbooru.load_model() else: - from modules.interrogate import waifudiffusion + from modules.caption import waifudiffusion return waifudiffusion.load_model(model_name) def unload_model(): """Unload both backends to ensure memory is freed.""" - from modules.interrogate import deepbooru, waifudiffusion + from modules.caption import deepbooru, waifudiffusion deepbooru.unload_model() waifudiffusion.unload_model() @@ -54,10 +54,10 @@ def tag(image, model_name: str = None, **kwargs) -> str: model_name = shared.opts.waifudiffusion_model if is_deepbooru(model_name): - from modules.interrogate import deepbooru + from modules.caption import deepbooru return deepbooru.tag(image, **kwargs) else: - from modules.interrogate import waifudiffusion + from modules.caption import waifudiffusion return waifudiffusion.tag(image, model_name=model_name, **kwargs) @@ -72,8 +72,8 @@ def batch(model_name: str, **kwargs) -> str: Combined tag results """ if is_deepbooru(model_name): - from modules.interrogate import deepbooru + from modules.caption import deepbooru return deepbooru.batch(model_name=model_name, **kwargs) else: - from modules.interrogate import waifudiffusion + from modules.caption import waifudiffusion return waifudiffusion.batch(model_name=model_name, **kwargs) diff --git a/modules/interrogate/vqa.py b/modules/caption/vqa.py similarity index 87% rename from modules/interrogate/vqa.py rename to modules/caption/vqa.py index c9659b42f..5cabf65e4 100644 --- a/modules/interrogate/vqa.py +++ b/modules/caption/vqa.py @@ -9,11 +9,11 @@ import transformers import transformers.dynamic_module_utils from PIL import Image from modules import shared, devices, errors, model_quant, sd_models, sd_models_compile, ui_symbols -from modules.interrogate import vqa_detection +from modules.caption import vqa_detection # Debug logging - function-based to avoid circular import -debug_enabled = os.environ.get('SD_INTERROGATE_DEBUG', None) is not None +debug_enabled = os.environ.get('SD_CAPTION_DEBUG', None) is not None def debug(*args, **kwargs): if debug_enabled: @@ -265,7 +265,7 @@ def keep_think_block_open(text_prompt: str) -> str: while end_close < len(text_prompt) and text_prompt[end_close] in ('\r', '\n'): end_close += 1 trimmed_prompt = text_prompt[:close_index] + text_prompt[end_close:] - debug('VQA interrogate: keep_think_block_open applied to prompt segment near assistant reply') + debug('VQA caption: keep_think_block_open applied to prompt segment near assistant reply') return trimmed_prompt @@ -348,7 +348,7 @@ def get_keep_thinking(): overrides = _get_overrides() if overrides.get('keep_thinking') is not None: return overrides['keep_thinking'] - return shared.opts.interrogate_vlm_keep_thinking + return shared.opts.caption_vlm_keep_thinking def get_keep_prefill(): @@ -356,7 +356,7 @@ def get_keep_prefill(): overrides = _get_overrides() if overrides.get('keep_prefill') is not None: return overrides['keep_prefill'] - return shared.opts.interrogate_vlm_keep_prefill + return shared.opts.caption_vlm_keep_prefill def get_kwargs(): @@ -370,12 +370,12 @@ def get_kwargs(): overrides = _get_overrides() # Get base values from settings, apply overrides if provided - max_tokens = overrides.get('max_tokens') if overrides.get('max_tokens') is not None else shared.opts.interrogate_vlm_max_length - do_sample = overrides.get('do_sample') if overrides.get('do_sample') is not None else shared.opts.interrogate_vlm_do_sample - num_beams = overrides.get('num_beams') if overrides.get('num_beams') is not None else shared.opts.interrogate_vlm_num_beams - temperature = overrides.get('temperature') if overrides.get('temperature') is not None else shared.opts.interrogate_vlm_temperature - top_k = overrides.get('top_k') if overrides.get('top_k') is not None else shared.opts.interrogate_vlm_top_k - top_p = overrides.get('top_p') if overrides.get('top_p') is not None else shared.opts.interrogate_vlm_top_p + max_tokens = overrides.get('max_tokens') if overrides.get('max_tokens') is not None else shared.opts.caption_vlm_max_length + do_sample = overrides.get('do_sample') if overrides.get('do_sample') is not None else shared.opts.caption_vlm_do_sample + num_beams = overrides.get('num_beams') if overrides.get('num_beams') is not None else shared.opts.caption_vlm_num_beams + temperature = overrides.get('temperature') if overrides.get('temperature') is not None else shared.opts.caption_vlm_temperature + top_k = overrides.get('top_k') if overrides.get('top_k') is not None else shared.opts.caption_vlm_top_k + top_p = overrides.get('top_p') if overrides.get('top_p') is not None else shared.opts.caption_vlm_top_p kwargs = { 'max_new_tokens': max_tokens, @@ -419,7 +419,7 @@ class VQA: def load(self, model_name: str = None): """Load VLM model into memory for the specified model name.""" - model_name = model_name or shared.opts.interrogate_vlm_model + model_name = model_name or shared.opts.caption_vlm_model if not model_name: shared.log.warning('VQA load: no model specified') return @@ -430,7 +430,7 @@ class VQA: shared.log.debug(f'VQA load: pre-loading model="{model_name}" repo="{repo}"') - # Dispatch to appropriate loader (same logic as interrogate) + # Dispatch to appropriate loader (same logic as caption) repo_lower = repo.lower() if 'qwen' in repo_lower or 'torii' in repo_lower or 'mimo' in repo_lower: self._load_qwen(repo) @@ -459,22 +459,22 @@ class VQA: elif 'fastvlm' in repo_lower: self._load_fastvlm(repo) elif 'moondream3' in repo_lower: - from modules.interrogate import moondream3 + from modules.caption import moondream3 moondream3.load_model(repo) shared.log.info(f'VQA load: model="{model_name}" loaded (external handler)') return elif 'joytag' in repo_lower: - from modules.interrogate import joytag + from modules.caption import joytag joytag.load() shared.log.info(f'VQA load: model="{model_name}" loaded (external handler)') return elif 'joycaption' in repo_lower: - from modules.interrogate import joycaption + from modules.caption import joycaption joycaption.load(repo) shared.log.info(f'VQA load: model="{model_name}" loaded (external handler)') return elif 'deepseek' in repo_lower: - from modules.interrogate import deepseek + from modules.caption import deepseek deepseek.load(repo) shared.log.info(f'VQA load: model="{model_name}" loaded (external handler)') return @@ -488,7 +488,7 @@ class VQA: def _load_fastvlm(self, repo: str): """Load FastVLM model and tokenizer.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') quant_args = model_quant.create_config(module='LLM') self.model = None self.processor = transformers.AutoTokenizer.from_pretrained(repo, trust_remote_code=True, cache_dir=shared.opts.hfcache_dir) @@ -503,7 +503,7 @@ class VQA: devices.torch_gc() def _fastvlm(self, question: str, image: Image.Image, repo: str, model_name: str = None): - debug(f'VQA interrogate: handler=fastvlm model_name="{model_name}" repo="{repo}" question="{question}" image_size={image.size if image else None}') + debug(f'VQA caption: handler=fastvlm model_name="{model_name}" repo="{repo}" question="{question}" image_size={image.size if image else None}') self._load_fastvlm(repo) sd_models.move_model(self.model, devices.device) if len(question) < 2: @@ -534,7 +534,7 @@ class VQA: def _load_qwen(self, repo: str): """Load Qwen VL model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None if 'Qwen3-VL' in repo or 'Qwen3VL' in repo: cls_name = transformers.Qwen3VLForConditionalGeneration @@ -562,10 +562,10 @@ class VQA: sd_models.move_model(self.model, devices.device) # Get model class name for logging cls_name = self.model.__class__.__name__ - debug(f'VQA interrogate: handler=qwen model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') + debug(f'VQA caption: handler=qwen model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') question = question.replace('<', '').replace('>', '').replace('_', ' ') - system_prompt = system_prompt or shared.opts.interrogate_vlm_system + system_prompt = system_prompt or shared.opts.caption_vlm_system conversation = [ { "role": "system", @@ -593,9 +593,9 @@ class VQA: use_prefill = len(prefill_text) > 0 if debug_enabled: - debug(f'VQA interrogate: handler=qwen conversation_roles={[msg["role"] for msg in conversation]}') - debug(f'VQA interrogate: handler=qwen full_conversation={truncate_b64_in_conversation(conversation)}') - debug(f'VQA interrogate: handler=qwen is_thinking={is_thinking} thinking_mode={thinking_mode} prefill="{prefill_text}"') + debug(f'VQA caption: handler=qwen conversation_roles={[msg["role"] for msg in conversation]}') + debug(f'VQA caption: handler=qwen full_conversation={truncate_b64_in_conversation(conversation)}') + debug(f'VQA caption: handler=qwen is_thinking={is_thinking} thinking_mode={thinking_mode} prefill="{prefill_text}"') # Generate base prompt using template # Qwen-Thinking template automatically adds "<|im_start|>assistant\n\n" when add_generation_prompt=True @@ -605,7 +605,7 @@ class VQA: add_generation_prompt=True, ) except (TypeError, ValueError) as e: - debug(f'VQA interrogate: handler=qwen chat_template fallback add_generation_prompt=True: {e}') + debug(f'VQA caption: handler=qwen chat_template fallback add_generation_prompt=True: {e}') text_prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) # Manually handle thinking tags and prefill @@ -627,23 +627,23 @@ class VQA: text_prompt += prefill_text if debug_enabled: - debug(f'VQA interrogate: handler=qwen text_prompt="{text_prompt}"') + debug(f'VQA caption: handler=qwen text_prompt="{text_prompt}"') inputs = self.processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt") inputs = inputs.to(devices.device, devices.dtype) gen_kwargs = get_kwargs() - debug(f'VQA interrogate: handler=qwen generation_kwargs={gen_kwargs} input_ids_shape={inputs.input_ids.shape}') + debug(f'VQA caption: handler=qwen generation_kwargs={gen_kwargs} input_ids_shape={inputs.input_ids.shape}') output_ids = self.model.generate( **inputs, **gen_kwargs, ) - debug(f'VQA interrogate: handler=qwen output_ids_shape={output_ids.shape}') + debug(f'VQA caption: handler=qwen output_ids_shape={output_ids.shape}') generated_ids = [ output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids) ] response = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) if debug_enabled: - debug(f'VQA interrogate: handler=qwen response_before_clean="{response}"') + debug(f'VQA caption: handler=qwen response_before_clean="{response}"') # Clean up thinking tags # Note: is in the prompt, not the response - only appears in generated output if len(response) > 0: @@ -672,7 +672,7 @@ class VQA: def _load_gemma(self, repo: str): """Load Gemma 3 model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None if '3n' in repo: cls = transformers.Gemma3nForConditionalGeneration # pylint: disable=no-member @@ -696,10 +696,10 @@ class VQA: sd_models.move_model(self.model, devices.device) # Get model class name for logging cls_name = self.model.__class__.__name__ - debug(f'VQA interrogate: handler=gemma model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') + debug(f'VQA caption: handler=gemma model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') question = question.replace('<', '').replace('>', '').replace('_', ' ') - system_prompt = system_prompt or shared.opts.interrogate_vlm_system + system_prompt = system_prompt or shared.opts.caption_vlm_system system_content = [] if system_prompt is not None and len(system_prompt) > 4: @@ -726,14 +726,14 @@ class VQA: "role": "assistant", "content": [{"type": "text", "text": prefill_text}], }) - debug(f'VQA interrogate: handler=gemma prefill="{prefill_text}"') + debug(f'VQA caption: handler=gemma prefill="{prefill_text}"') else: - debug('VQA interrogate: handler=gemma prefill disabled (empty), relying on add_generation_prompt') + debug('VQA caption: handler=gemma prefill disabled (empty), relying on add_generation_prompt') if debug_enabled: - debug(f'VQA interrogate: handler=gemma conversation_roles={[msg["role"] for msg in conversation]}') - debug(f'VQA interrogate: handler=gemma full_conversation={truncate_b64_in_conversation(conversation)}') + debug(f'VQA caption: handler=gemma conversation_roles={[msg["role"] for msg in conversation]}') + debug(f'VQA caption: handler=gemma full_conversation={truncate_b64_in_conversation(conversation)}') debug_prefill_mode = 'add_generation_prompt=False continue_final_message=True' if use_prefill else 'add_generation_prompt=True' - debug(f'VQA interrogate: handler=gemma template_mode={debug_prefill_mode}') + debug(f'VQA caption: handler=gemma template_mode={debug_prefill_mode}') try: if use_prefill: text_prompt = self.processor.apply_chat_template( @@ -749,7 +749,7 @@ class VQA: tokenize=False, ) except (TypeError, ValueError) as e: - debug(f'VQA interrogate: handler=gemma chat_template fallback add_generation_prompt=True: {e}') + debug(f'VQA caption: handler=gemma chat_template fallback add_generation_prompt=True: {e}') text_prompt = self.processor.apply_chat_template( conversation, add_generation_prompt=True, @@ -758,7 +758,7 @@ class VQA: if use_prefill and use_thinking: text_prompt = keep_think_block_open(text_prompt) if debug_enabled: - debug(f'VQA interrogate: handler=gemma text_prompt="{text_prompt}"') + debug(f'VQA caption: handler=gemma text_prompt="{text_prompt}"') inputs = self.processor( text=[text_prompt], images=[image], @@ -767,17 +767,17 @@ class VQA: ).to(device=devices.device, dtype=devices.dtype) input_len = inputs["input_ids"].shape[-1] gen_kwargs = get_kwargs() - debug(f'VQA interrogate: handler=gemma generation_kwargs={gen_kwargs} input_len={input_len}') + debug(f'VQA caption: handler=gemma generation_kwargs={gen_kwargs} input_len={input_len}') with devices.inference_context(): generation = self.model.generate( **inputs, **gen_kwargs, ) - debug(f'VQA interrogate: handler=gemma output_ids_shape={generation.shape}') + debug(f'VQA caption: handler=gemma output_ids_shape={generation.shape}') generation = generation[0][input_len:] response = self.processor.decode(generation, skip_special_tokens=True) if debug_enabled: - debug(f'VQA interrogate: handler=gemma response_before_clean="{response}"') + debug(f'VQA caption: handler=gemma response_before_clean="{response}"') # Clean up thinking tags (if any remain) if get_keep_thinking(): @@ -798,7 +798,7 @@ class VQA: def _load_paligemma(self, repo: str): """Load PaliGemma model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.processor = transformers.PaliGemmaProcessor.from_pretrained(repo, cache_dir=shared.opts.hfcache_dir) self.model = None self.model = transformers.PaliGemmaForConditionalGeneration.from_pretrained( @@ -827,7 +827,7 @@ class VQA: def _load_ovis(self, repo: str): """Load Ovis model (requires flash-attn).""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.AutoModelForCausalLM.from_pretrained( repo, @@ -843,7 +843,7 @@ class VQA: try: import flash_attn # pylint: disable=unused-import except Exception: - shared.log.error(f'Interrogate: vlm="{repo}" flash-attn is not available') + shared.log.error(f'Caption: vlm="{repo}" flash-attn is not available') return '' self._load_ovis(repo) sd_models.move_model(self.model, devices.device) @@ -875,7 +875,7 @@ class VQA: def _load_smol(self, repo: str): """Load SmolVLM model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None quant_args = model_quant.create_config(module='LLM') self.model = transformers.AutoModelForVision2Seq.from_pretrained( @@ -895,10 +895,10 @@ class VQA: sd_models.move_model(self.model, devices.device) # Get model class name for logging cls_name = self.model.__class__.__name__ - debug(f'VQA interrogate: handler=smol model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') + debug(f'VQA caption: handler=smol model_name="{model_name}" model_class="{cls_name}" repo="{repo}" question="{question}" system_prompt="{system_prompt}" image_size={image.size if image else None}') question = question.replace('<', '').replace('>', '').replace('_', ' ') - system_prompt = system_prompt or shared.opts.interrogate_vlm_system + system_prompt = system_prompt or shared.opts.caption_vlm_system conversation = [ { "role": "system", @@ -924,14 +924,14 @@ class VQA: "role": "assistant", "content": [{"type": "text", "text": prefill_text}], }) - debug(f'VQA interrogate: handler=smol prefill="{prefill_text}"') + debug(f'VQA caption: handler=smol prefill="{prefill_text}"') else: - debug('VQA interrogate: handler=smol prefill disabled (empty), relying on add_generation_prompt') + debug('VQA caption: handler=smol prefill disabled (empty), relying on add_generation_prompt') if debug_enabled: - debug(f'VQA interrogate: handler=smol conversation_roles={[msg["role"] for msg in conversation]}') - debug(f'VQA interrogate: handler=smol full_conversation={truncate_b64_in_conversation(conversation)}') + debug(f'VQA caption: handler=smol conversation_roles={[msg["role"] for msg in conversation]}') + debug(f'VQA caption: handler=smol full_conversation={truncate_b64_in_conversation(conversation)}') debug_prefill_mode = 'add_generation_prompt=False continue_final_message=True' if use_prefill else 'add_generation_prompt=True' - debug(f'VQA interrogate: handler=smol template_mode={debug_prefill_mode}') + debug(f'VQA caption: handler=smol template_mode={debug_prefill_mode}') try: if use_prefill: text_prompt = self.processor.apply_chat_template( @@ -942,24 +942,24 @@ class VQA: else: text_prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) except (TypeError, ValueError) as e: - debug(f'VQA interrogate: handler=smol chat_template fallback add_generation_prompt=True: {e}') + debug(f'VQA caption: handler=smol chat_template fallback add_generation_prompt=True: {e}') text_prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True) if use_prefill and use_thinking: text_prompt = keep_think_block_open(text_prompt) if debug_enabled: - debug(f'VQA interrogate: handler=smol text_prompt="{text_prompt}"') + debug(f'VQA caption: handler=smol text_prompt="{text_prompt}"') inputs = self.processor(text=text_prompt, images=[image], padding=True, return_tensors="pt") inputs = inputs.to(devices.device, devices.dtype) gen_kwargs = get_kwargs() - debug(f'VQA interrogate: handler=smol generation_kwargs={gen_kwargs}') + debug(f'VQA caption: handler=smol generation_kwargs={gen_kwargs}') output_ids = self.model.generate( **inputs, **gen_kwargs, ) - debug(f'VQA interrogate: handler=smol output_ids_shape={output_ids.shape}') + debug(f'VQA caption: handler=smol output_ids_shape={output_ids.shape}') response = self.processor.batch_decode(output_ids, skip_special_tokens=True) if debug_enabled: - debug(f'VQA interrogate: handler=smol response_before_clean="{response}"') + debug(f'VQA caption: handler=smol response_before_clean="{response}"') # Clean up thinking tags if len(response) > 0: @@ -981,7 +981,7 @@ class VQA: def _load_git(self, repo: str): """Load Microsoft GIT model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.GitForCausalLM.from_pretrained( repo, @@ -1011,7 +1011,7 @@ class VQA: def _load_blip(self, repo: str): """Load Salesforce BLIP model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.BlipForQuestionAnswering.from_pretrained( repo, @@ -1035,7 +1035,7 @@ class VQA: def _load_vilt(self, repo: str): """Load ViLT model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.ViltForQuestionAnswering.from_pretrained( repo, @@ -1061,7 +1061,7 @@ class VQA: def _load_pix(self, repo: str): """Load Pix2Struct model and processor.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.Pix2StructForConditionalGeneration.from_pretrained( repo, @@ -1087,7 +1087,7 @@ class VQA: def _load_moondream(self, repo: str): """Load Moondream 2 model and tokenizer.""" if self.model is None or self.loaded != repo: - shared.log.debug(f'Interrogate load: vlm="{repo}"') + shared.log.debug(f'Caption load: vlm="{repo}"') self.model = None self.model = transformers.AutoModelForCausalLM.from_pretrained( repo, @@ -1102,7 +1102,7 @@ class VQA: devices.torch_gc() def _moondream(self, question: str, image: Image.Image, repo: str, model_name: str = None, thinking_mode: bool = False): - debug(f'VQA interrogate: handler=moondream model_name="{model_name}" repo="{repo}" question="{question}" thinking_mode={thinking_mode}') + debug(f'VQA caption: handler=moondream model_name="{model_name}" repo="{repo}" question="{question}" thinking_mode={thinking_mode}') self._load_moondream(repo) sd_models.move_model(self.model, devices.device) question = question.replace('<', '').replace('>', '').replace('_', ' ') @@ -1117,9 +1117,9 @@ class VQA: target = question[9:].strip() if question.lower().startswith('point at ') else '' if not target: return "Please specify an object to locate" - debug(f'VQA interrogate: handler=moondream method=point target="{target}"') + debug(f'VQA caption: handler=moondream method=point target="{target}"') result = self.model.point(image, target) - debug(f'VQA interrogate: handler=moondream point_raw_result={result}') + debug(f'VQA caption: handler=moondream point_raw_result={result}') points = vqa_detection.parse_points(result) if points: self.last_detection_data = {'points': points} @@ -1129,35 +1129,35 @@ class VQA: target = question[7:].strip() if question.lower().startswith('detect ') else '' if not target: return "Please specify an object to detect" - debug(f'VQA interrogate: handler=moondream method=detect target="{target}"') + debug(f'VQA caption: handler=moondream method=detect target="{target}"') result = self.model.detect(image, target) - debug(f'VQA interrogate: handler=moondream detect_raw_result={result}') + debug(f'VQA caption: handler=moondream detect_raw_result={result}') detections = vqa_detection.parse_detections(result, target) if detections: self.last_detection_data = {'detections': detections} return vqa_detection.format_detections_text(detections, include_confidence=False) return "No objects detected" elif question == 'DETECT_GAZE' or question.lower() == 'detect gaze': - debug('VQA interrogate: handler=moondream method=detect_gaze') + debug('VQA caption: handler=moondream method=detect_gaze') faces = self.model.detect(image, "face") - debug(f'VQA interrogate: handler=moondream detect_gaze faces={faces}') + debug(f'VQA caption: handler=moondream detect_gaze faces={faces}') if faces.get('objects'): eye_x, eye_y = vqa_detection.calculate_eye_position(faces['objects'][0]) result = self.model.detect_gaze(image, eye=(eye_x, eye_y)) - debug(f'VQA interrogate: handler=moondream detect_gaze result={result}') + debug(f'VQA caption: handler=moondream detect_gaze result={result}') if result.get('gaze'): gaze = result['gaze'] self.last_detection_data = {'points': [(gaze['x'], gaze['y'])]} return f"Gaze direction: ({gaze['x']:.3f}, {gaze['y']:.3f})" return "No face/gaze detected" else: - debug(f'VQA interrogate: handler=moondream method=query question="{question}" reasoning={thinking_mode}') + debug(f'VQA caption: handler=moondream method=query question="{question}" reasoning={thinking_mode}') result = self.model.query(image, question, reasoning=thinking_mode) response = result['answer'] - debug(f'VQA interrogate: handler=moondream query_result keys={list(result.keys()) if isinstance(result, dict) else "not dict"}') + debug(f'VQA caption: handler=moondream query_result keys={list(result.keys()) if isinstance(result, dict) else "not dict"}') if thinking_mode and 'reasoning' in result: reasoning_text = result['reasoning'].get('text', '') if isinstance(result['reasoning'], dict) else str(result['reasoning']) - debug(f'VQA interrogate: handler=moondream reasoning_text="{reasoning_text[:100]}..."') + debug(f'VQA caption: handler=moondream reasoning_text="{reasoning_text[:100]}..."') if get_keep_thinking(): response = f"Reasoning:\n{reasoning_text}\n\nAnswer:\n{response}" # When keep_thinking is False, just use the answer (reasoning is discarded) @@ -1183,7 +1183,7 @@ class VQA: effective_revision = revision_from_repo if self.model is None or self.loaded != cache_key: - shared.log.debug(f'Interrogate load: vlm="{repo_name}" revision="{effective_revision}" path="{shared.opts.hfcache_dir}"') + shared.log.debug(f'Caption load: vlm="{repo_name}" revision="{effective_revision}" path="{shared.opts.hfcache_dir}"') transformers.dynamic_module_utils.get_imports = get_imports self.model = None quant_args = model_quant.create_config(module='LLM') @@ -1213,7 +1213,7 @@ class VQA: pixel_values = inputs['pixel_values'].to(devices.device, devices.dtype) # Florence-2 requires beam search, not sampling - sampling causes probability tensor errors overrides = _get_overrides() - max_tokens = overrides.get('max_tokens') if overrides.get('max_tokens') is not None else shared.opts.interrogate_vlm_max_length + max_tokens = overrides.get('max_tokens') if overrides.get('max_tokens') is not None else shared.opts.caption_vlm_max_length with devices.inference_context(): generated_ids = self.model.generate( input_ids=input_ids, @@ -1263,9 +1263,9 @@ class VQA: response = return_dict["prediction"] # the text format answer return response - def interrogate(self, question: str = '', system_prompt: str = None, prompt: str = None, image: Image.Image = None, model_name: str = None, prefill: str = None, thinking_mode: bool = None, quiet: bool = False, generation_kwargs: dict = None) -> str: + def caption(self, question: str = '', system_prompt: str = None, prompt: str = None, image: Image.Image = None, model_name: str = None, prefill: str = None, thinking_mode: bool = None, quiet: bool = False, generation_kwargs: dict = None) -> str: """ - Main entry point for VQA interrogation. Returns string answer. + Main entry point for VQA captioning. Returns string answer. Detection data stored in self.last_detection_data for annotated image creation. Args: @@ -1283,11 +1283,11 @@ class VQA: self.last_annotated_image = None self.last_detection_data = None self._generation_overrides = generation_kwargs # Set per-request overrides - jobid = shared.state.begin('Interrogate LLM') + jobid = shared.state.begin('Caption LLM') t0 = time.time() - model_name = model_name or shared.opts.interrogate_vlm_model + model_name = model_name or shared.opts.caption_vlm_model prefill = vlm_prefill if prefill is None else prefill # Use provided prefill when specified - thinking_mode = shared.opts.interrogate_vlm_thinking_mode if thinking_mode is None else thinking_mode # Resolve from settings if not specified + thinking_mode = shared.opts.caption_vlm_thinking_mode if thinking_mode is None else thinking_mode # Resolve from settings if not specified if isinstance(image, list): image = image[0] if len(image) > 0 else None if isinstance(image, dict) and 'name' in image: @@ -1298,7 +1298,7 @@ class VQA: if image.mode != 'RGB': image = image.convert('RGB') if image is None: - shared.log.error(f'VQA interrogate: model="{model_name}" error="No input image provided"') + shared.log.error(f'VQA caption: model="{model_name}" error="No input image provided"') shared.state.end(jobid) return 'Error: No input image provided. Please upload or select an image.' @@ -1306,7 +1306,7 @@ class VQA: if question == "Use Prompt": # Use content from Prompt field directly - requires user input if not prompt or len(prompt.strip()) < 2: - shared.log.error(f'VQA interrogate: model="{model_name}" error="Please enter a prompt"') + shared.log.error(f'VQA caption: model="{model_name}" error="Please enter a prompt"') shared.state.end(jobid) return 'Error: Please enter a question or instruction in the Prompt field.' question = prompt @@ -1316,7 +1316,7 @@ class VQA: if raw_mapping in ("POINT_MODE", "DETECT_MODE"): # These modes require user input in the prompt field if not prompt or len(prompt.strip()) < 2: - shared.log.error(f'VQA interrogate: model="{model_name}" error="Please specify what to find in the prompt field"') + shared.log.error(f'VQA caption: model="{model_name}" error="Please specify what to find in the prompt field"') shared.state.end(jobid) return 'Error: Please specify what to find in the prompt field (e.g., "the red car" or "faces").' # Convert friendly name to internal token (handles Point/Detect prefix) @@ -1328,12 +1328,12 @@ class VQA: try: if model_name is None: - shared.log.error(f'Interrogate: type=vlm model="{model_name}" no model selected') + shared.log.error(f'Caption: type=vlm model="{model_name}" no model selected') shared.state.end(jobid) return '' vqa_model = vlm_models.get(model_name, None) if vqa_model is None: - shared.log.error(f'Interrogate: type=vlm model="{model_name}" unknown') + shared.log.error(f'Caption: type=vlm model="{model_name}" unknown') shared.state.end(jobid) return '' @@ -1352,7 +1352,7 @@ class VQA: answer = self._pix(question, image, vqa_model, model_name) elif 'moondream3' in vqa_model.lower(): handler = 'moondream3' - from modules.interrogate import moondream3 + from modules.caption import moondream3 answer = moondream3.predict(question, image, vqa_model, model_name, thinking_mode=thinking_mode) elif 'moondream2' in vqa_model.lower(): handler = 'moondream' @@ -1368,15 +1368,15 @@ class VQA: answer = self._smol(question, image, vqa_model, system_prompt, model_name, prefill, thinking_mode) elif 'joytag' in vqa_model.lower(): handler = 'joytag' - from modules.interrogate import joytag + from modules.caption import joytag answer = joytag.predict(image) elif 'joycaption' in vqa_model.lower(): handler = 'joycaption' - from modules.interrogate import joycaption + from modules.caption import joycaption answer = joycaption.predict(question, image, vqa_model) elif 'deepseek' in vqa_model.lower(): handler = 'deepseek' - from modules.interrogate import deepseek + from modules.caption import deepseek answer = deepseek.predict(question, image, vqa_model) elif 'paligemma' in vqa_model.lower(): handler = 'paligemma' @@ -1399,7 +1399,7 @@ class VQA: errors.display(e, 'VQA') answer = 'error' - if shared.opts.interrogate_offload and self.model is not None: + if shared.opts.caption_offload and self.model is not None: sd_models.move_model(self.model, devices.cpu, force=True) devices.torch_gc(force=True, reason='vqa') @@ -1412,16 +1412,17 @@ class VQA: points = self.last_detection_data.get('points', None) if detections or points: self.last_annotated_image = vqa_detection.draw_bounding_boxes(image, detections or [], points) - debug(f'VQA interrogate: handler={handler} created annotated image detections={len(detections) if detections else 0} points={len(points) if points else 0}') + debug(f'VQA caption: handler={handler} created annotated image detections={len(detections) if detections else 0} points={len(points) if points else 0}') - debug(f'VQA interrogate: handler={handler} response_after_clean="{answer}" has_annotation={self.last_annotated_image is not None}') + debug(f'VQA caption: handler={handler} response_after_clean="{answer}" has_annotation={self.last_annotated_image is not None}') t1 = time.time() if not quiet: - shared.log.debug(f'Interrogate: type=vlm model="{model_name}" repo="{vqa_model}" args={get_kwargs()} time={t1-t0:.2f}') + shared.log.debug(f'Caption: type=vlm model="{model_name}" repo="{vqa_model}" args={get_kwargs()} time={t1-t0:.2f}') self._generation_overrides = None # Clear per-request overrides shared.state.end(jobid) return answer + def batch(self, model_name, system_prompt, batch_files, batch_folder, batch_str, question, prompt, write, append, recursive, prefill=None, thinking_mode=False): class BatchWriter: def __init__(self, folder, mode='w'): @@ -1450,15 +1451,15 @@ class VQA: from modules.files_cache import list_files files += list(list_files(batch_str, ext_filter=['.png', '.jpg', '.jpeg', '.webp', '.jxl'], recursive=recursive)) if len(files) == 0: - shared.log.warning('Interrogate batch: type=vlm no images') + shared.log.warning('Caption batch: type=vlm no images') return '' - jobid = shared.state.begin('Interrogate batch') + jobid = shared.state.begin('Caption batch') prompts = [] if write: mode = 'w' if not append else 'a' writer = BatchWriter(os.path.dirname(files[0]), mode=mode) - orig_offload = shared.opts.interrogate_offload - shared.opts.interrogate_offload = False + orig_offload = shared.opts.caption_offload + shared.opts.caption_offload = False import rich.progress as rp pbar = rp.Progress(rp.TextColumn('[cyan]Caption:'), rp.BarColumn(), rp.MofNCompleteColumn(), rp.TaskProgressColumn(), rp.TimeRemainingColumn(), rp.TimeElapsedColumn(), rp.TextColumn('[cyan]{task.description}'), console=shared.console) with pbar: @@ -1469,7 +1470,7 @@ class VQA: if shared.state.interrupted: break img = Image.open(file) - caption = self.interrogate(question, system_prompt, prompt, img, model_name, prefill, thinking_mode, quiet=True) + caption = self.caption(question, system_prompt, prompt, img, model_name, prefill, thinking_mode, quiet=True) # Save annotated image if available if self.last_annotated_image and write: annotated_path = os.path.splitext(file)[0] + "_annotated.png" @@ -1478,10 +1479,10 @@ class VQA: if write: writer.add(file, caption) except Exception as e: - shared.log.error(f'Interrogate batch: {e}') + shared.log.error(f'Caption batch: {e}') if write: writer.close() - shared.opts.interrogate_offload = orig_offload + shared.opts.caption_offload = orig_offload shared.state.end(jobid) return '\n\n'.join(prompts) @@ -1499,8 +1500,9 @@ def get_instance() -> VQA: # Backwards-compatible module-level functions -def interrogate(*args, **kwargs): - return get_instance().interrogate(*args, **kwargs) +def caption(*args, **kwargs): + return get_instance().caption(*args, **kwargs) + def unload_model(): diff --git a/modules/interrogate/vqa_detection.py b/modules/caption/vqa_detection.py similarity index 100% rename from modules/interrogate/vqa_detection.py rename to modules/caption/vqa_detection.py diff --git a/modules/interrogate/waifudiffusion.py b/modules/caption/waifudiffusion.py similarity index 99% rename from modules/interrogate/waifudiffusion.py rename to modules/caption/waifudiffusion.py index 71951a47f..5d252f0cd 100644 --- a/modules/interrogate/waifudiffusion.py +++ b/modules/caption/waifudiffusion.py @@ -10,8 +10,8 @@ from PIL import Image from modules import shared, devices, errors -# Debug logging - enable with SD_INTERROGATE_DEBUG environment variable -debug_enabled = os.environ.get('SD_INTERROGATE_DEBUG', None) is not None +# Debug logging - enable with SD_CAPTION_DEBUG environment variable +debug_enabled = os.environ.get('SD_CAPTION_DEBUG', None) is not None debug_log = shared.log.trace if debug_enabled else lambda *args, **kwargs: None re_special = re.compile(r'([\\()])') @@ -405,7 +405,7 @@ def tag(image: Image.Image, model_name: str = None, **kwargs) -> str: result = tagger.predict(image, **kwargs) shared.log.debug(f'WaifuDiffusion: complete time={time.time()-t0:.2f} tags={len(result.split(", ")) if result else 0}') # Offload model if setting enabled - if shared.opts.interrogate_offload: + if shared.opts.caption_offload: tagger.unload() except Exception as e: result = f"Exception {type(e)}" diff --git a/modules/interrogate/interrogate.py b/modules/interrogate/interrogate.py deleted file mode 100644 index 4e06fb36f..000000000 --- a/modules/interrogate/interrogate.py +++ /dev/null @@ -1,48 +0,0 @@ -import time -from PIL import Image -from modules import shared - - -def interrogate(image): - if isinstance(image, list): - image = image[0] if len(image) > 0 else None - if isinstance(image, dict) and 'name' in image: - image = Image.open(image['name']) - if image is None: - shared.log.error('Interrogate: no image provided') - return '' - t0 = time.time() - if shared.opts.interrogate_default_type == 'OpenCLiP': - shared.log.info(f'Interrogate: type={shared.opts.interrogate_default_type} clip="{shared.opts.interrogate_clip_model}" blip="{shared.opts.interrogate_blip_model}" mode="{shared.opts.interrogate_clip_mode}"') - from modules.interrogate import openclip - openclip.load_interrogator(clip_model=shared.opts.interrogate_clip_model, blip_model=shared.opts.interrogate_blip_model) - openclip.update_interrogate_params() - prompt = openclip.interrogate(image, mode=shared.opts.interrogate_clip_mode) - shared.log.debug(f'Interrogate: time={time.time()-t0:.2f} answer="{prompt}"') - return prompt - elif shared.opts.interrogate_default_type == 'Tagger': - shared.log.info(f'Interrogate: type={shared.opts.interrogate_default_type} model="{shared.opts.waifudiffusion_model}"') - from modules.interrogate import tagger - prompt = tagger.tag( - image=image, - model_name=shared.opts.waifudiffusion_model, - general_threshold=shared.opts.tagger_threshold, - character_threshold=shared.opts.waifudiffusion_character_threshold, - include_rating=shared.opts.tagger_include_rating, - exclude_tags=shared.opts.tagger_exclude_tags, - max_tags=shared.opts.tagger_max_tags, - sort_alpha=shared.opts.tagger_sort_alpha, - use_spaces=shared.opts.tagger_use_spaces, - escape_brackets=shared.opts.tagger_escape_brackets, - ) - shared.log.debug(f'Interrogate: time={time.time()-t0:.2f} answer="{prompt}"') - return prompt - elif shared.opts.interrogate_default_type == 'VLM': - shared.log.info(f'Interrogate: type={shared.opts.interrogate_default_type} vlm="{shared.opts.interrogate_vlm_model}" prompt="{shared.opts.interrogate_vlm_prompt}"') - from modules.interrogate import vqa - prompt = vqa.interrogate(image=image, model_name=shared.opts.interrogate_vlm_model, question=shared.opts.interrogate_vlm_prompt, prompt=None, system_prompt=shared.opts.interrogate_vlm_system) - shared.log.debug(f'Interrogate: time={time.time()-t0:.2f} answer="{prompt}"') - return prompt - else: - shared.log.error(f'Interrogate: type="{shared.opts.interrogate_default_type}" unknown') - return ''