diff --git a/scripts/tokenizer.py b/scripts/tokenizer.py
index df69c16..02963b3 100644
--- a/scripts/tokenizer.py
+++ b/scripts/tokenizer.py
@@ -21,9 +21,13 @@ css = """
 """
 
 
-def tokenize(text):
+def tokenize(text, input_is_ids=False):
     clip: FrozenCLIPEmbedder = shared.sd_model.cond_stage_model.wrapped
-    tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
+
+    if input_is_ids:
+        tokens = [int(x.strip()) for x in text.split(",")]
+    else:
+        tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
 
     vocab = {v: k for k, v in clip.tokenizer.get_vocab().items()}
 
@@ -33,21 +37,41 @@ def tokenize(text):
     current_ids = []
     class_index = 0
 
-    def dump():
-        nonlocal code, ids, current_ids, class_index
+    def dump(last=False):
+        nonlocal code, ids, current_ids
 
         words = [vocab.get(x, "") for x in current_ids]
 
+        def wordscode(ids, word):
+            nonlocal class_index
+            res = f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{", ".join([str(x) for x in ids])}'>{html.escape(word)}</span>"""
+            class_index += 1
+            return res
+
         try:
             word = bytearray([clip.tokenizer.byte_decoder[x] for x in ''.join(words)]).decode("utf-8")
         except UnicodeDecodeError:
-            return
+            if last:
+                word = "❌" * len(current_ids)
+            elif len(current_ids) > 4:
+                id = current_ids[0]
+                ids += [id]
+                local_ids = current_ids[1:]
+                code += wordscode([id], "❌")
+
+                current_ids = []
+                for id in local_ids:
+                    current_ids.append(id)
+                    dump()
+
+                return
+            else:
+                return
 
         word = word.replace("</w>", " ")
 
-        code += f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{", ".join([str(x) for x in current_ids])}'>{html.escape(word)}</span>"""
+        code += wordscode(current_ids, word)
         ids += current_ids
 
-        class_index += 1
         current_ids = []
 
@@ -57,9 +81,16 @@ def tokenize(text):
         dump()
 
-    dump()
+    dump(last=True)
 
-    return code, ids
+    ids_html = f"""
+<p>
+Token count: {len(ids)}<br>
+{", ".join([str(x) for x in ids])}
+</p>
+"""
+
+    return code, ids_html
 
 
 def add_tab():
@@ -70,16 +101,22 @@ def add_tab():
 <p>
 Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.
 </p>
 """)
-        prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
-        go = gr.Button(value="Tokenize", variant="primary")
+        with gr.Tabs() as tabs:
+            with gr.Tab("Text input", id="input_text"):
+                prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
+                go = gr.Button(value="Tokenize", variant="primary")
+
+            with gr.Tab("ID input", id="input_ids"):
+                prompt_ids = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Ids for tokenization (example: 9061, 631, 736)")
+                go_ids = gr.Button(value="Tokenize", variant="primary")
 
         with gr.Tabs():
             with gr.Tab("Text"):
                 tokenized_text = gr.HTML(elem_id="tokenized_text")
 
             with gr.Tab("Tokens"):
-                tokens = gr.Text(elem_id="tokenized_tokens", show_label=False)
+                tokens = gr.HTML(elem_id="tokenized_tokens")
 
         go.click(
             fn=tokenize,
@@ -87,6 +124,12 @@ Before your text is sent to the neural network, it gets turned into numbers in a
             outputs=[tokenized_text, tokens],
         )
 
+        go_ids.click(
+            fn=lambda x: tokenize(x, input_is_ids=True),
+            inputs=[prompt_ids],
+            outputs=[tokenized_text, tokens],
+        )
+
     return [(ui, "Tokenizer", "tokenizer")]
 
 