first

2022-11-05 12:42:23 +03:00 · 2022-11-05 12:42:23 +03:00 · e74fc69eb2
commit e74fc69eb2
2 changed files with 94 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
+An extension for stable-diffusion-webui that adds a tab that lets you preview how the CLIP model would tokenize your text.
--- a/scripts/tokenizer.py
+++ b/scripts/tokenizer.py
@ -0,0 +1,93 @@
+import html
+
+from ldm.modules.encoders.modules import FrozenCLIPEmbedder
+from modules import script_callbacks, shared
+
+import gradio as gr
+
+
+# Stylesheet for the tokenized-text preview. Every span emitted by tokenize()
+# carries the class "tokenizer-token" plus one of "tokenizer-token-0" through
+# "tokenizer-token-3", so consecutive spans get different background tints
+# (red, green, blue, orange) that become more opaque on hover.
+css = """
+.tokenizer-token{
+    cursor: pointer;
+}
+.tokenizer-token-0 {background: rgba(255, 0, 0, 0.05);}
+.tokenizer-token-0:hover {background: rgba(255, 0, 0, 0.15);}
+.tokenizer-token-1 {background: rgba(0, 255, 0, 0.05);}
+.tokenizer-token-1:hover {background: rgba(0, 255, 0, 0.15);}
+.tokenizer-token-2 {background: rgba(0, 0, 255, 0.05);}
+.tokenizer-token-2:hover {background: rgba(0, 0, 255, 0.15);}
+.tokenizer-token-3 {background: rgba(255, 156, 0, 0.05);}
+.tokenizer-token-3:hover {background: rgba(255, 156, 0, 0.15);}
+"""
+
+
+def tokenize(text):
+    """Tokenize *text* with the loaded model's CLIP tokenizer and render it.
+
+    Token ids are accumulated until their combined byte sequence decodes as
+    valid UTF-8 (a single character may be split across several tokens), and
+    each decodable group becomes one HTML ``<span>`` whose ``title`` lists the
+    ids in that group.
+
+    Args:
+        text: The prompt to tokenize.
+
+    Returns:
+        A ``(code, ids)`` tuple: ``code`` is the HTML string of coloured
+        spans, ``ids`` is the flat list of token ids in rendering order.
+    """
+    clip: FrozenCLIPEmbedder = shared.sd_model.cond_stage_model.wrapped
+    tokenizer = clip.tokenizer
+    tokens = tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
+
+    # Inverted vocabulary: token id -> token string.
+    vocab = {v: k for k, v in tokenizer.get_vocab().items()}
+    byte_decoder = tokenizer.byte_decoder
+
+    spans = []
+    ids = []
+    current_ids = []
+
+    def dump(last=False):
+        """Flush the pending ids into a span if they decode as UTF-8.
+
+        With ``last=True`` the pending ids are flushed even when they do not
+        decode cleanly, so that no token id is ever lost from the output.
+        """
+        nonlocal current_ids
+
+        # Nothing pending: do not emit an empty span.
+        if not current_ids:
+            return
+
+        words = [vocab.get(x, "") for x in current_ids]
+        raw = bytearray([byte_decoder[x] for x in ''.join(words)])
+
+        try:
+            word = raw.decode("utf-8")
+        except UnicodeDecodeError:
+            if not last:
+                # Incomplete multi-byte character; wait for more tokens.
+                return
+            # Final flush: show undecodable bytes as replacement characters
+            # instead of silently dropping the ids.
+            word = raw.decode("utf-8", errors="replace")
+
+        # The vocabulary marks the end of a word with "</w>".
+        word = word.replace("</w>", " ")
+
+        title = html.escape(", ".join([str(x) for x in current_ids]))
+        # The colour class cycles through the four styles by span position.
+        spans.append(f"""<span class='tokenizer-token tokenizer-token-{len(spans) % 4}' title='{title}'>{html.escape(word)}</span>""")
+        ids.extend(current_ids)
+
+        current_ids = []
+
+    for token in tokens:
+        current_ids.append(int(token))
+        dump()
+
+    dump(last=True)
+
+    return "".join(spans), ids
+
+
+def add_tab():
+    """Build the tokenizer tab and return it as a one-element list.
+
+    The layout is an introductory HTML blurb (with the stylesheet inlined),
+    a prompt textbox, a "Tokenize" button, and two result tabs: "Text" for
+    the HTML rendering and "Tokens" for the id list. Clicking the button
+    runs tokenize() on the prompt and fills both outputs.
+
+    Returns:
+        ``[(blocks, "Tokenizer", "tokenizer")]`` -- presumably the tab title
+        and element id expected by the webui; confirm against the
+        ``on_ui_tabs`` callback contract.
+    """
+    intro_html = f"""
+<style>{css}</style>
+<p>
+Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.
+</p>
+"""
+
+    with gr.Blocks(analytics_enabled=False, css=css) as tokenizer_ui:
+        gr.HTML(intro_html)
+
+        prompt_box = gr.Textbox(
+            label="Prompt",
+            elem_id="tokenizer_prompt",
+            show_label=False,
+            lines=8,
+            placeholder="Prompt for tokenization",
+        )
+        run_button = gr.Button(value="Tokenize", variant="primary")
+
+        with gr.Tabs():
+            with gr.Tab("Text"):
+                html_output = gr.HTML(elem_id="tokenized_text")
+
+            with gr.Tab("Tokens"):
+                ids_output = gr.Text(elem_id="tokenized_tokens", show_label=False)
+
+        # Output order matches tokenize()'s (code, ids) return tuple.
+        run_button.click(fn=tokenize, inputs=[prompt_box], outputs=[html_output, ids_output])
+
+    return [(tokenizer_ui, "Tokenizer", "tokenizer")]
+
+
+# Register add_tab as a UI-tabs callback at import time; presumably the webui
+# calls it while assembling its tab bar (confirm in modules.script_callbacks).
+script_callbacks.on_ui_tabs(add_tab)
				`@ -0,0 +1 @@`
				`An extension for stable-diffusion-webui that adds a tab that lets you preview how the CLIP model would tokenize your text.`