#!/usr/bin/env python3
"""Fetch and convert booru tag databases to SD.Next dict format.

Usage:
    python cli/fetch_dicts.py danbooru [--output PATH] [--min-count N]
    python cli/fetch_dicts.py e621 [--output PATH] [--min-count N]
    python cli/fetch_dicts.py rule34 --key USER_ID:API_KEY [--output PATH] [--min-count N]
    python cli/fetch_dicts.py sankaku [--output PATH] [--min-count N]
    python cli/fetch_dicts.py all --key USER_ID:API_KEY [--output-dir DIR] [--min-count N]

Output format:
    JSON with { name, version, categories, tags: [[name, category_id, post_count], ...] }
    Tags sorted by post_count descending.
"""
import argparse
import json
import os
import sys
import time
from datetime import date

import requests


DANBOORU_CATEGORIES = {
    "0": {"name": "general", "color": "#0075f8"},
    "1": {"name": "artist", "color": "#a800aa"},
    "3": {"name": "copyright", "color": "#dd00dd"},
    "4": {"name": "character", "color": "#00ab2c"},
    "5": {"name": "meta", "color": "#ee8800"},
}

E621_CATEGORIES = {
    "0": {"name": "general", "color": "#0075f8"},
    "1": {"name": "artist", "color": "#a800aa"},
    "3": {"name": "copyright", "color": "#dd00dd"},
    "4": {"name": "character", "color": "#00ab2c"},
    "5": {"name": "species", "color": "#ed5d1f"},
    "6": {"name": "invalid", "color": "#ff3d3d"},
    "7": {"name": "meta", "color": "#ee8800"},
    "8": {"name": "lore", "color": "#228b22"},
}

# Rule34 uses the same category IDs as Danbooru (Gelbooru-compatible)
RULE34_CATEGORIES = {
    "0": {"name": "general", "color": "#0075f8"},
    "1": {"name": "artist", "color": "#a800aa"},
    "3": {"name": "copyright", "color": "#dd00dd"},
    "4": {"name": "character", "color": "#00ab2c"},
    "5": {"name": "meta", "color": "#ee8800"},
}

SANKAKU_CATEGORIES = {
    "0": {"name": "general", "color": "#0075f8"},
    "1": {"name": "artist", "color": "#a800aa"},
    "2": {"name": "studio", "color": "#a800aa"},
    "3": {"name": "copyright", "color": "#dd00dd"},
    "4": {"name": "character", "color": "#00ab2c"},
    "5": {"name": "species", "color": "#ed5d1f"},
    "8": {"name": "medium", "color": "#ee8800"},
    "9": {"name": "meta", "color": "#ee8800"},
}

USER_AGENT = "SDNext-DictFetcher/1.0 (tag autocomplete)"


def _fetch_ordered_json(url_template: str, min_count: int, delay: float) -> list:
    """Paginate a Danbooru-style JSON tag endpoint ordered by post count.

    url_template must contain a ``{page}`` placeholder. Results come ordered
    by count descending, so pagination stops once every tag on a page falls
    below min_count. Shared by fetch_danbooru and fetch_e621.
    """
    tags = []
    page = 1
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    while True:
        try:
            resp = session.get(url_template.format(page=page), timeout=30)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f" Error on page {page}: {e}", file=sys.stderr)
            break
        data = resp.json()
        if not data:
            break
        for tag in data:
            count = tag.get("post_count", 0)
            if count < min_count:
                continue
            tags.append([tag["name"], tag["category"], count])
        below_threshold = all(t.get("post_count", 0) < min_count for t in data)
        print(f" Page {page}: {len(data)} tags (total: {len(tags)})", file=sys.stderr)
        if below_threshold:
            break
        page += 1
        time.sleep(delay)
    tags.sort(key=lambda t: t[2], reverse=True)
    return tags


def fetch_danbooru(min_count: int = 10, **_kwargs) -> list:
    """Fetch tags from Danbooru API, paginated."""
    return _fetch_ordered_json(
        "https://danbooru.donmai.us/tags.json?limit=1000&page={page}&search[order]=count",
        min_count=min_count,
        delay=0.5,
    )


def fetch_e621(min_count: int = 10, **_kwargs) -> list:
    """Fetch tags from e621 API, paginated."""
    # e621 rate limit is stricter: smaller pages (API max 320) and a longer delay
    return _fetch_ordered_json(
        "https://e621.net/tags.json?limit=320&page={page}&search[order]=count",
        min_count=min_count,
        delay=1.0,
    )


def fetch_gelbooru(base_url: str, min_count: int = 10, api_key: str | None = None, rate_limit: float = 0.5) -> list:
    """Fetch tags from a Gelbooru-compatible API (rule34, gelbooru, etc.).

    Unlike Danbooru/e621, the Gelbooru tag endpoint doesn't support ordering
    by count, so we paginate through all tags and filter client-side.
    The tag endpoint returns XML (json=1 is not supported for tags).

    api_key, when given, must be "USER_ID:API_KEY"; rate_limit is the sleep
    between pages in seconds.
    """
    import xml.etree.ElementTree as ET
    tags = []
    page = 0  # Gelbooru pages ("pid") are 0-based
    page_size = 1000
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    params: dict[str, str] = {
        "page": "dapi",
        "s": "tag",
        "q": "index",
        "limit": str(page_size),
    }
    if api_key:
        if ":" not in api_key:
            print(" Error: --key must be USER_ID:API_KEY format", file=sys.stderr)
            return []
        uid, key = api_key.split(":", 1)
        params["user_id"] = uid
        params["api_key"] = key
    while True:
        params["pid"] = str(page)
        try:
            resp = session.get(base_url, params=params, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f" Error on page {page}: {e}", file=sys.stderr)
            break
        text = resp.text.strip()
        # NOTE(review): the original response-validation and XML-parse section of
        # this function was corrupted in the source file; reconstructed below from
        # the surviving fragments (`text.startswith('"')`, `elements`, `el`,
        # `count >= min_count`) — confirm against the live rule34/Gelbooru API.
        # A quoted string or non-'<' prefix means an error payload, not XML.
        if not text or text.startswith('"') or not text.startswith("<"):
            print(f" Unexpected non-XML response on page {page}", file=sys.stderr)
            break
        try:
            root = ET.fromstring(text)
        except ET.ParseError as e:
            print(f" XML parse error on page {page}: {e}", file=sys.stderr)
            break
        elements = root.findall("tag")
        for el in elements:
            count = int(el.get("count", "0"))
            if count >= min_count:
                tags.append([el.get("name", ""), int(el.get("type", "0")), count])
        print(f" Page {page}: {len(elements)} tags (total: {len(tags)})", file=sys.stderr)
        # No server-side ordering, so only a short page signals the end
        if len(elements) < page_size:
            break
        page += 1
        time.sleep(rate_limit)
    tags.sort(key=lambda t: t[2], reverse=True)
    return tags


def fetch_rule34(min_count: int = 10, api_key: str | None = None, **_kwargs) -> list:
    """Fetch tags from rule34.xxx."""
    if not api_key:
        print(" Warning: no --key provided, rule34 may rate-limit aggressively", file=sys.stderr)
    return fetch_gelbooru("https://api.rule34.xxx/index.php", min_count=min_count, api_key=api_key)


def fetch_sankaku(min_count: int = 10, **_kwargs) -> list:
    """Fetch tags from Sankaku Complex (chan.sankakucomplex.com).

    Uses the public JSON API at sankakuapi.com which supports order=count,
    so we can stop early when counts drop below min_count.
    Tag names come as English with spaces — lowercased here; the underscore
    conversion happens later in write_dict.
    """
    tags = []
    page = 1
    page_size = 200  # API max
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    while True:
        try:
            resp = session.get(
                "https://sankakuapi.com/tags",
                params={"limit": page_size, "page": page, "order": "count"},
                timeout=30,
            )
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f" Error on page {page}: {e}", file=sys.stderr)
            break
        data = resp.json()
        if not data:
            break
        for tag in data:
            count = tag.get("post_count", 0)
            if count < min_count:
                continue
            # Prefer the English name; fall back to Japanese, skip unnamed tags
            name = tag.get("name_en") or tag.get("name_ja", "")
            if not name:
                continue
            tags.append([name.strip().lower(), tag.get("type", 0), count])
        # Ordered by count descending: once the last (smallest) tag of a page is
        # below the threshold, every remaining page will be too.
        below_threshold = data[-1].get("post_count", 0) < min_count
        print(f" Page {page}: {len(data)} tags (total: {len(tags)})", file=sys.stderr)
        if below_threshold:
            break
        page += 1
        time.sleep(0.5)
    tags.sort(key=lambda t: t[2], reverse=True)
    return tags


# Registry mapping source name → fetch function and category-id table
SOURCES = {
    "danbooru": {
        "fetch": fetch_danbooru,
        "categories": DANBOORU_CATEGORIES,
    },
    "e621": {
        "fetch": fetch_e621,
        "categories": E621_CATEGORIES,
    },
    "rule34": {
        "fetch": fetch_rule34,
        "categories": RULE34_CATEGORIES,
    },
    "sankaku": {
        "fetch": fetch_sankaku,
        "categories": SANKAKU_CATEGORIES,
    },
}


def write_dict(name: str, tags: list, categories: dict, output_path: str, separator: str = "_"):
    """Write dict JSON file atomically.

    separator controls the word separator in tag names:
        "_" (default) → "high_resolution" (booru convention, anime/illustration models)
        " "           → "high resolution" (natural language, SDXL/Flux-style models)
    """
    if separator == "_":
        tags = [[t[0].replace(" ", "_"), t[1], t[2]] for t in tags]
    else:
        tags = [[t[0].replace("_", " "), t[1], t[2]] for t in tags]
    data = {
        "name": name,
        "version": date.today().isoformat(),
        "categories": categories,
        "tags": tags,
    }
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    # Write to a temp file and rename so a crash never leaves a half-written dict
    tmp_path = output_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, separators=(",", ":"))
    os.replace(tmp_path, output_path)
    size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print(f" Written: {output_path} ({len(tags)} tags, {size_mb:.1f} MB)", file=sys.stderr)


def fetch_source(name: str, output: str, min_count: int, api_key: str | None = None, separator: str = "_"):
    """Fetch and write a single source. Exits with status 1 on unknown name."""
    if name not in SOURCES:
        print(f"Unknown source: {name}. Available: {', '.join(SOURCES.keys())}", file=sys.stderr)
        sys.exit(1)
    source = SOURCES[name]
    print(f"Fetching {name} (min_count={min_count})...", file=sys.stderr)
    tags = source["fetch"](min_count=min_count, api_key=api_key)
    if not tags:
        print(f" No tags fetched for {name}", file=sys.stderr)
        return
    write_dict(name, tags, source["categories"], output, separator=separator)


def main():
    parser = argparse.ArgumentParser(description="Fetch booru tag databases for SD.Next dict autocomplete")
    parser.add_argument("source", choices=list(SOURCES.keys()) + ["all"], help="Tag source to fetch")
    parser.add_argument("--output", "-o", help="Output file path (for single source)")
    parser.add_argument("--output-dir", "-d", help="Output directory (for 'all')")
    parser.add_argument("--min-count", "-m", type=int, default=10, help="Minimum post count to include (default: 10)")
    parser.add_argument("--key", "-k", help="API key as USER_ID:API_KEY (required for rule34)")
    parser.add_argument("--spaces", action="store_true", help="Use spaces instead of underscores in tag names (for natural language models like SDXL/Flux)")
    args = parser.parse_args()
    separator = " " if args.spaces else "_"
    if args.source == "all":
        output_dir = args.output_dir or "."
        for name in SOURCES:
            output = os.path.join(output_dir, f"{name}.json")
            fetch_source(name, output, args.min_count, api_key=args.key, separator=separator)
    else:
        output = args.output or f"{args.source}.json"
        fetch_source(args.source, output, args.min_count, api_key=args.key, separator=separator)
    print("Done.", file=sys.stderr)


if __name__ == "__main__":
    main()