parent e489e608e7
commit c5e95a7233
```diff
@@ -21,9 +21,13 @@ css = """
 """
 
 
-def tokenize(text):
+def tokenize(text, input_is_ids=False):
     clip: FrozenCLIPEmbedder = shared.sd_model.cond_stage_model.wrapped
-    tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
+
+    if input_is_ids:
+        tokens = [int(x.strip()) for x in text.split(",")]
+    else:
+        tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
 
     vocab = {v: k for k, v in clip.tokenizer.get_vocab().items()}
 
```
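For context, the new `input_is_ids` branch simply splits the textbox contents on commas and casts each piece to `int`. A minimal standalone sketch of that parsing; `raw` is a hypothetical example string in the same format the new "ID input" tab expects:

```python
# Sketch of the ID-parsing branch added above; the example ids come from the
# placeholder text used later in this commit ("9061, 631, 736").
raw = "9061, 631, 736"
tokens = [int(x.strip()) for x in raw.split(",")]
print(tokens)  # [9061, 631, 736]
```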
```diff
@@ -33,21 +37,41 @@ def tokenize(text):
     current_ids = []
     class_index = 0
 
-    def dump():
-        nonlocal code, ids, current_ids, class_index
+    def dump(last=False):
+        nonlocal code, ids, current_ids
 
         words = [vocab.get(x, "") for x in current_ids]
 
+        def wordscode(ids, word):
+            nonlocal class_index
+            res = f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in ids]))}'>{html.escape(word)}</span>"""
+            class_index += 1
+            return res
+
         try:
             word = bytearray([clip.tokenizer.byte_decoder[x] for x in ''.join(words)]).decode("utf-8")
         except UnicodeDecodeError:
-            return
+            if last:
+                word = "❌" * len(current_ids)
+            elif len(current_ids) > 4:
+                id = current_ids[0]
+                ids += [id]
+                local_ids = current_ids[1:]
+                code += wordscode([id], "❌")
+
+                current_ids = []
+                for id in local_ids:
+                    current_ids.append(id)
+                    dump()
+
+                return
+            else:
+                return
 
         word = word.replace("</w>", " ")
 
-        code += f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in current_ids]))}'>{html.escape(word)}</span>"""
+        code += wordscode(current_ids, word)
         ids += current_ids
 
-        class_index += 1
         current_ids = []
 
```
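The `last`/`wordscode` changes above handle token runs whose bytes do not decode to valid UTF-8 on their own: CLIP's byte-level BPE can split a multi-byte character across several tokens, so `dump()` keeps accumulating ids until the bytes form a complete character, and falls back to ❌ placeholders for ids that never decode. A standalone sketch of the underlying failure mode (plain Python, no tokenizer required):

```python
# A multi-byte UTF-8 character split across byte-level BPE tokens cannot be
# decoded from a prefix of its bytes -- the same situation dump() catches.
b = "愛".encode("utf-8")          # b'\xe6\x84\x9b', three bytes
try:
    b[:1].decode("utf-8")          # incomplete sequence
except UnicodeDecodeError:
    print("partial bytes, keep accumulating")  # dump() returns and waits for more ids
print(b.decode("utf-8"))           # '愛' once all bytes are present
```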
```diff
@@ -57,9 +81,16 @@ def tokenize(text):
 
         dump()
 
-    dump()
+    dump(last=True)
 
-    return code, ids
+    ids_html = f"""
+<p>
+Token count: {len(ids)}<br>
+{", ".join([str(x) for x in ids])}
+</p>
+"""
+
+    return code, ids_html
 
 
 def add_tab():
```
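With this hunk `tokenize` returns HTML for both values, so the second output can feed a `gr.HTML` component directly. A rough sketch of what the new `ids_html` looks like for an illustrative ids list (the list itself is hypothetical):

```python
# Sketch of the second return value for an example ids list.
ids = [9061, 631, 736]
ids_html = f"""
<p>
Token count: {len(ids)}<br>
{", ".join([str(x) for x in ids])}
</p>
"""
print(ids_html)  # renders as: Token count: 3, then "9061, 631, 736"
```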
```diff
@@ -70,16 +101,22 @@ def add_tab():
 Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.
 </p>
 """)
-        prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
 
-        go = gr.Button(value="Tokenize", variant="primary")
+        with gr.Tabs() as tabs:
+            with gr.Tab("Text input", id="input_text"):
+                prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
+                go = gr.Button(value="Tokenize", variant="primary")
+
+            with gr.Tab("ID input", id="input_ids"):
+                prompt_ids = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Ids for tokenization (example: 9061, 631, 736)")
+                go_ids = gr.Button(value="Tokenize", variant="primary")
 
         with gr.Tabs():
             with gr.Tab("Text"):
                 tokenized_text = gr.HTML(elem_id="tokenized_text")
 
             with gr.Tab("Tokens"):
-                tokens = gr.Text(elem_id="tokenized_tokens", show_label=False)
+                tokens = gr.HTML(elem_id="tokenized_tokens")
 
         go.click(
             fn=tokenize,
```
```diff
@@ -87,6 +124,12 @@ Before your text is sent to the neural network, it gets turned into numbers in a
             outputs=[tokenized_text, tokens],
         )
 
+        go_ids.click(
+            fn=lambda x: tokenize(x, input_is_ids=True),
+            inputs=[prompt_ids],
+            outputs=[tokenized_text, tokens],
+        )
+
     return [(ui, "Tokenizer", "tokenizer")]
```
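The UI hunks give the two input tabs their own buttons but share the output components; the ID button just routes its textbox through the same `tokenize` function with `input_is_ids=True`. A hedged sketch of the calls behind the two buttons, assuming the functions above are in scope (the text prompt is an illustrative example, not from the commit):

```python
# "Tokenize" on the Text input tab: text goes through the CLIP tokenizer.
code_html, ids_html = tokenize("a photo of a cat")

# "Tokenize" on the ID input tab: comma-separated ids are parsed directly.
code_html, ids_html = tokenize("9061, 631, 736", input_is_ids=True)
```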