option to use id input

better handling for bad data
pull/4/merge
AUTOMATIC 2022-11-05 23:40:39 +03:00
parent e489e608e7
commit c5e95a7233
1 changed file with 55 additions and 12 deletions


@@ -21,9 +21,13 @@ css = """
 """
 
-def tokenize(text):
+def tokenize(text, input_is_ids=False):
     clip: FrozenCLIPEmbedder = shared.sd_model.cond_stage_model.wrapped
 
-    tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
+    if input_is_ids:
+        tokens = [int(x.strip()) for x in text.split(",")]
+    else:
+        tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
+
     vocab = {v: k for k, v in clip.tokenizer.get_vocab().items()}
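Note that the ID path does no validation beyond int(): whitespace around each comma-separated ID is stripped, but empty fields or non-numeric text raise ValueError. A minimal sketch of the parsing, using the sample IDs from the placeholder text later in this diff:

    # Sketch of the ID-input parsing added above (sample IDs from the UI placeholder).
    text = "9061, 631, 736"
    tokens = [int(x.strip()) for x in text.split(",")]
    print(tokens)  # [9061, 631, 736]
    # "9061,,736" or "9061, abc" would raise ValueError here.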
@@ -33,21 +37,41 @@ def tokenize(text):
     current_ids = []
     class_index = 0
 
-    def dump():
-        nonlocal code, ids, current_ids, class_index
+    def dump(last=False):
+        nonlocal code, ids, current_ids
 
         words = [vocab.get(x, "") for x in current_ids]
 
+        def wordscode(ids, word):
+            nonlocal class_index
+            res = f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in ids]))}'>{html.escape(word)}</span>"""
+            class_index += 1
+            return res
+
         try:
             word = bytearray([clip.tokenizer.byte_decoder[x] for x in ''.join(words)]).decode("utf-8")
         except UnicodeDecodeError:
-            return
+            if last:
+                word = "❌" * len(current_ids)
+            elif len(current_ids) > 4:
+                id = current_ids[0]
+                ids += [id]
+
+                local_ids = current_ids[1:]
+                code += wordscode([id], "❌")
+
+                current_ids = []
+                for id in local_ids:
+                    current_ids.append(id)
+                    dump()
+
+                return
+            else:
+                return
 
         word = word.replace("</w>", " ")
 
-        code += f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in current_ids]))}'>{html.escape(word)}</span>"""
+        code += wordscode(current_ids, word)
 
         ids += current_ids
-        class_index += 1
         current_ids = []
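The decode can fail because CLIP's BPE vocabulary is byte-level: a token boundary can fall inside a multi-byte UTF-8 character, and with arbitrary ID input there may be no continuation that ever completes it. The new dump() copes two ways: on the final flush it renders one placeholder per leftover token, and mid-stream it gives up on the first token of any undecodable run longer than 4 and retries the rest. A simplified, self-contained analogue of that recovery loop, operating on raw byte chunks instead of vocab entries:

    # Standalone sketch of the recovery strategy above: when the accumulated
    # bytes fail to decode, emit the first unit as a placeholder and retry.
    def decode_with_recovery(chunks: list[bytes]) -> list[str]:
        out, pending = [], []
        for chunk in chunks:
            pending.append(chunk)
            try:
                out.append(b"".join(pending).decode("utf-8"))
                pending = []
            except UnicodeDecodeError:
                if len(pending) > 4:       # same cutoff the patch uses
                    out.append("❌")       # placeholder for the dropped unit
                    pending = pending[1:]
        out.extend("❌" for _ in pending)  # final flush, like dump(last=True)
        return out

    print(decode_with_recovery([b"\xe2", b"\x9c", b"\x85"]))  # ['✅'] once complete
    print(decode_with_recovery([b"\xff"]))                    # ['❌'] never decodable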
@@ -57,9 +81,16 @@ def tokenize(text):
             dump()
 
-    dump()
+    dump(last=True)
 
-    return code, ids
+    ids_html = f"""
+<p>
+Token count: {len(ids)}<br>
+{", ".join([str(x) for x in ids])}
+</p>
+"""
+
+    return code, ids_html
 
 
 def add_tab():
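With this change tokenize() returns ready-made HTML for the token list instead of the raw ids, which pairs with the tokens output switching from gr.Text to gr.HTML in the next hunk. For a sample ids list (values taken from the ID-input placeholder below):

    ids = [9061, 631, 736]
    # The f-string above renders as:
    # <p>
    # Token count: 3<br>
    # 9061, 631, 736
    # </p>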
@@ -70,16 +101,22 @@ def add_tab():
 Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.
 </p>
 """)
 
-        prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
-        go = gr.Button(value="Tokenize", variant="primary")
+        with gr.Tabs() as tabs:
+            with gr.Tab("Text input", id="input_text"):
+                prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
+                go = gr.Button(value="Tokenize", variant="primary")
+
+            with gr.Tab("ID input", id="input_ids"):
+                prompt_ids = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Ids for tokenization (example: 9061, 631, 736)")
+                go_ids = gr.Button(value="Tokenize", variant="primary")
 
         with gr.Tabs():
             with gr.Tab("Text"):
                 tokenized_text = gr.HTML(elem_id="tokenized_text")
 
             with gr.Tab("Tokens"):
-                tokens = gr.Text(elem_id="tokenized_tokens", show_label=False)
+                tokens = gr.HTML(elem_id="tokenized_tokens")
 
     go.click(
         fn=tokenize,
@@ -87,6 +124,12 @@ Before your text is sent to the neural network, it gets turned into numbers in a
         outputs=[tokenized_text, tokens],
     )
 
+    go_ids.click(
+        fn=lambda x: tokenize(x, input_is_ids=True),
+        inputs=[prompt_ids],
+        outputs=[tokenized_text, tokens],
+    )
+
     return [(ui, "Tokenizer", "tokenizer")]
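Since both buttons share one function, the ID button binds the flag with a lambda; functools.partial is an equivalent way to express the same wiring (a sketch, not part of the patch):

    from functools import partial

    # Equivalent to fn=lambda x: tokenize(x, input_is_ids=True):
    go_ids.click(
        fn=partial(tokenize, input_is_ids=True),
        inputs=[prompt_ids],
        outputs=[tokenized_text, tokens],
    )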