Code cleanup, fixes

pull/12/head
d8ahazard 2022-12-15 13:48:36 -06:00
parent e4c638c7b8
commit 5fa74185f5
7 changed files with 51 additions and 44 deletions

View File

@ -2,40 +2,53 @@
## What is this??
As the name would imply, this is an extension for the [Stable-Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) by @Automatic1111
As the name would imply, this is an extension for
the [Stable-Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) by @Automatic1111
## What does this do?
It does a few things, actually.
For starters, it utilizes a combination of BLIP/CLIP and YOLOv5 to provide "smart cropping" for images. The primary subject of each image is identified, the center of that subject is determined, and then the application tries its best to crop the image so as to keep as much of the subject as possible within the dimensions specified.
For starters, it utilizes a combination of BLIP/CLIP and YOLOv5 to provide "smart cropping" for images. The primary
subject of each image is identified, the center of that subject is determined, and then the application tries its best
to crop the image so as to keep as much of the subject as possible within the dimensions specified.
Second, it allows storing the determined image caption directly to the image filename, versus having to create a txt file alongside every image. You can still create a txt file, use existing captions, or not do any captioning at all.
Second, it allows storing the determined image caption directly to the image filename, versus having to create a txt
file alongside every image. You can still create a txt file, use existing captions, or not do any captioning at all.
Third, I've provided face restoration and upscaling options for input images. You can select from GFPGAN and Codeformer for face restoration, and any of the provided upscalers from the "extras" tab to refine/smooth/add detail to your final output images.
Last, but not least, it offers a rudimentary way to swap the "class" of a captioned image with the specific keyword in the image. So, if you're trying to train a subject called "xyz" and "xyz" is a dog, you can easily swap "dog" (and "a dog") with "xyz" in your captions. Neato!
Third, I've provided face restoration and upscaling options for input images. You can select from GFPGAN and Codeformer
for face restoration, and any of the provided upscalers from the "extras" tab to refine/smooth/add detail to your final
output images.
Last, but not least, it offers a rudimentary way to swap the "class" of a captioned image with the specific keyword in
the image. So, if you're trying to train a subject called "xyz" and "xyz" is a dog, you can easily swap "dog" (and "a
dog") with "xyz" in your captions. Neato!
## Smart Cropping
As I said above, smart cropping utilizes a combination of YOLOV5 object recognition and BLIP/CLIP (and DeepDanBooru) captioning to automatically determine the most prominent subject in a photo, and automatically crop the subject as completely as possible. You can also specify a specific subject (dog/cat/woman/house) for the software to find, and skip the YOLOV5 detection entirely.
As I said above, smart cropping utilizes a combination of YOLOV5 object recognition and BLIP/CLIP (and DeepDanBooru)
captioning to automatically determine the most prominent subject in a photo, and automatically crop the subject as
completely as possible. You can also specify a specific subject (dog/cat/woman/house) for the software to find, and skip
the YOLOV5 detection entirely.
<img src="https://user-images.githubusercontent.com/1633844/198178259-e1ade3d6-386e-41b8-9c93-0eca19c82d3d.png" width="550" height="741" />
If a subject is not found, the image will be downscaled and cropped from the center.
## Smart Captioning
This uses all the same features as set in user preferences, with the additional options to save to txt or append to the image file name.
This uses all the same features as set in user preferences, with the additional options to save to txt or append to the
image file name.
Additionally, you can swap the generic "class" of the image with a specific subject keyword. This feature may not be perfect in all cases, but it should still go a long way in speeding up the captioning process.
You can also specify a maximum caption length, which will split the caption by spaces and append words until the maximum length is reached.
Additionally, you can swap the generic "class" of the image with a specific subject keyword. This feature may not be
perfect in all cases, but it should still go a long way in speeding up the captioning process.
You can also specify a maximum caption length, which will split the caption by spaces and append words until the maximum
length is reached.
## Post Processing
It's basically a simplified version of the "extras" tab. The idea is that you can do facial restoration and/or use a model like swinIR or LDSR to smooth or add details to an image. If an image is "actually" upscaled beyond the target crop size, it will be downscaled again back to the original size.
It's basically a simplified version of the "extras" tab. The idea is that you can do facial restoration and/or use a
model like swinIR or LDSR to smooth or add details to an image. If an image is "actually" upscaled beyond the target
crop size, it will be downscaled again back to the original size.

View File

@ -113,4 +113,4 @@ class CropClip:
# If the method is TM_SQDIFF or TM_SQDIFF_NORMED, take minimum
top_left = min_loc
bottom_right = (top_left[0] + out.width, top_left[1] + out.height)
return [top_left[0], bottom_right[0], top_left[1], bottom_right[1]]
return [top_left[0], bottom_right[0], top_left[1], bottom_right[1]]

View File

@ -1,20 +1,20 @@
import hashlib
import inspect
import math
import numpy as np
import open_clip
import os
import pickle
import time
import torch
from dataclasses import dataclass
from models.blip import blip_decoder, BLIP_Decoder
from typing import List
import numpy as np
import open_clip
import torch
from PIL import Image
from models.blip import blip_decoder, BLIP_Decoder
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from tqdm import tqdm
from typing import List
from extensions.sd_smartprocess.interrogator import Interrogator
@ -50,9 +50,9 @@ class ClipInterrogator(Interrogator):
def __init__(self,
use_v2,
append_artist,
append_medium,
append_movement,
append_flavor,
append_medium,
append_movement,
append_flavor,
append_trending):
if use_v2:
model_name = "ViT-H-14/laion2b_s32b_b79k"
@ -360,4 +360,4 @@ def _truncate_to_fit(text: str, tokenize) -> str:
if _prompt_at_max_len(new_text + part, tokenize):
break
new_text += ', ' + part
return new_text
return new_text

View File

@ -2,8 +2,6 @@
import os
import re
import sys
import traceback
from collections import namedtuple
from pathlib import Path
from typing import Tuple, Dict
@ -11,16 +9,13 @@ from typing import Tuple, Dict
import numpy as np
import pandas as pd
import torch
import open_clip
from PIL import Image
from huggingface_hub import hf_hub_download
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import modules.deepbooru
import modules.shared as shared
from extensions.sd_smartprocess import dbimutils
from modules import devices, paths, lowvram, modelloader
from modules import devices
from modules import images
from modules.deepbooru import re_special as tag_escape_pattern

View File

@ -1,9 +1,9 @@
function start_smart_process(){
function start_smart_process() {
requestProgress('sp');
gradioApp().querySelector('#sp_error').innerHTML='';
gradioApp().querySelector('#sp_error').innerHTML = '';
return args_to_array(arguments);
}
onUiUpdate(function(){
onUiUpdate(function () {
check_progressbar('sp', 'sp_progressbar', 'sp_progress_span', '', 'sp_interrupt', 'sp_preview', 'sp_gallery')
})

View File

@ -2,9 +2,9 @@ import gradio as gr
from extensions.sd_smartprocess import smartprocess
from modules import script_callbacks, shared
from modules.call_queue import wrap_gradio_gpu_call
from modules.shared import cmd_opts
from modules.ui import setup_progressbar
from webui import wrap_gradio_gpu_call
def on_ui_tabs():
@ -36,7 +36,8 @@ def on_ui_tabs():
sp_clip_append_artist = gr.Checkbox(label="Append Artist tags from CLIP")
sp_clip_append_trending = gr.Checkbox(label="Append Trending tags from CLIP")
sp_caption_wd14 = gr.Checkbox(label="Add WD14 Tags to Caption")
sp_wd14_min_score = gr.Slider(label="Minimum Score for WD14 Tags", value=0.75, minimum=0.01, maximum=1,
sp_wd14_min_score = gr.Slider(label="Minimum Score for WD14 Tags", value=0.75, minimum=0.01,
maximum=1,
step=0.01)
sp_caption_deepbooru = gr.Checkbox(label='Add DeepDanbooru Tags to Caption',
visible=True if cmd_opts.deepdanbooru else False)
@ -50,13 +51,13 @@ def on_ui_tabs():
with gr.Tab("Post-Processing"):
sp_restore_faces = gr.Checkbox(label='Restore Faces', value=False)
sp_face_model = gr.Dropdown(label="Face Restore Model",choices=["GFPGAN", "Codeformer"], value="GFPGAN")
sp_face_model = gr.Dropdown(label="Face Restore Model", choices=["GFPGAN", "Codeformer"],
value="GFPGAN")
sp_upscale = gr.Checkbox(label='Upscale and Resize', value=False)
sp_upscale_ratio = gr.Slider(label="Upscale Ratio", value=2, step=1, minimum=2, maximum=4)
sp_scaler = gr.Radio(label='Upscaler', elem_id="sp_scaler",
choices=[x.name for x in shared.sd_upscalers],
value=shared.sd_upscalers[0].name, type="index")
choices=[x.name for x in shared.sd_upscalers],
value=shared.sd_upscalers[0].name, type="index")
# Preview/progress
with gr.Column(variant="panel"):

View File

@ -78,7 +78,6 @@ def preprocess(rename,
upscale_ratio,
scaler
):
try:
shared.state.textinfo = "Initializing smart processing..."
safe.RestrictedUnpickler = reallysafe.RestrictedUnpickler
@ -122,7 +121,6 @@ def preprocess(rename,
pass
crop_clip = CropClip()
src = os.path.abspath(src)
dst = os.path.abspath(dst)
@ -152,7 +150,7 @@ def preprocess(rename,
if caption_clip:
tags = clip_interrogator.interrogate(img, max_flavors=clip_max_flavors)
for tag in tags:
#print(f"CLIPTag: {tag}")
# print(f"CLIPTag: {tag}")
out_tags.append(tag)
if wd_interrogator is not None:
@ -160,7 +158,7 @@ def preprocess(rename,
for tag in sorted(tags, key=tags.get, reverse=True):
if tags[tag] >= wd14_min_score:
#print(f"WDTag {tag} score is {tags[tag]}")
# print(f"WDTag {tag} score is {tags[tag]}")
out_tags.append(tag)
else:
break
@ -169,7 +167,7 @@ def preprocess(rename,
tags = db_interrogator.interrogate(image)
for tag in sorted(tags, key=tags.get, reverse=True):
if tags[tag] >= booru_min_score:
#print(f"DBTag {tag} score is {tags[tag]}")
# print(f"DBTag {tag} score is {tags[tag]}")
out_tags.append(tag)
# Remove duplicates