Add files via upload

2023-03-17 16:36:09 +03:00 · 2023-03-17 16:36:09 +03:00 · cf4e5e1ba3
parent 3207cb20a0
commit cf4e5e1ba3
3 changed files with 168 additions and 0 deletions
--- a/init.png
+++ b/init.png
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,27 @@
+# Reference based SD+CN Animation Script
+This script automates video stylization using Stable Diffusion and ControlNet. It also implements a reference-image trick that makes the animation more stable. Here is an example of a video made with this script:
+
+[![IMAGE_ALT](https://img.youtube.com/vi/YW1JBJ57YBQ/0.jpg)](https://youtu.be/YW1JBJ57YBQ)
+
+Before running the script you need to create a reference image that has the same resolution as the target video. You may generate it with Stable Diffusion by stylizing any frame of the video, or by any other means. It is better to process different scenes separately to achieve better quality. For the video above the following reference image was used:
+![Reference image!](/init.png "Reference image")
+
+## Dependencies
+To install all necessary dependencies run this command
+```
+pip install opencv-python numpy tqdm requests
+```
+
+## Running the script
+This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You also should have [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have t2iadapter_color_sd14v1 and control_hed-fp16 models installed. If you have web-ui with ControlNet working correctly do the following:
+1. Go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings"
+2. Run web-ui with the '--api' flag. It is also better to use the '--xformers' flag, as you will want the highest resolution possible and the xformers memory optimization greatly helps with that.   
+```bash webui.sh --xformers --api```
+3. Go to the script.py file and change the main parameters (INPUT_VIDEO, OUTPUT_VIDEO, REF_IMAGE, PROMPT, N_PROMPT, SEED, w, h, START_FROM_IND) to the ones you need for your project. The script is pretty simple, so you may change other parameters as well, although I would recommend leaving them as is the first time.
+4. Run the script with ```python3 script.py```
+
+## Potential improvements
+There are several ways the overall quality of the animation may be improved:
+* You may use a separate reference frame for each camera position to get a more consistent style of the characters and less flickering.
+* Face animations may be substantially improved if one uses face detection to grab the faces from the frame, align them and then process them separately at a higher resolution. The processed faces could then be placed back into the frame. It may improve lip movements and face coherency in general.
+* You may try to use the previous processed frame as a reference frame for the next image. It could improve short-term temporal consistency but may lead to style drift in a long video.
--- a/script.py
+++ b/script.py
@ -0,0 +1,141 @@
+import requests
+import cv2
+import base64
+import numpy as np
+from tqdm import tqdm
+import os
+
+INPUT_VIDEO = "video_input.mp4"
+OUTPUT_VIDEO = "result.mp4"
+REF_IMAGE = "init.png"
+
+PROMPT = "pixarstyle 3D cartoon version of Pulp Fiction apartment Scene, natural skin texture, 4k textures, hdr, intricate, highly detailed, sharp focus, cinematic look, hyperdetailed. White man, black man, strong man, 90's room, gun, burger and cola."
+N_PROMPT = "cropped head, black and white, slanted eyes, deformed, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, malformed hands, blurry, ((((mutated hands and fingers)))), watermark, watermarked, oversaturated, censored, distorted hands, amputation, missing hands, obese, doubled face, double hands"
+SEED = 2901260158
+w,h = 1216, 512 # Width and height of the processed image. Note that actual image processed would be a W x 2H resolution. You should have enough VRAM to process it.
+
+
+START_FROM_IND = 2235 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
+SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations
+
+def to_b64(img):
+    _, buffer = cv2.imencode('.png', img)
+    b64img = base64.b64encode(buffer).decode("utf-8")
+    return b64img
+
+mask_img = np.zeros((h * 2,w,3), dtype=np.uint8)
+mask_img[:h] = 255
+b64_mask_img = to_b64(mask_img)
+
+# load context image to make generations more stable
+cont_img = cv2.imread(REF_IMAGE)
+cont_img = cv2.resize(cont_img, (w,h))
+
+class controlnetRequest():
+    def __init__(self, b64_full_img):
+        self.url = "http://localhost:7860/controlnet/img2img"
+        self.body = {
+            "init_images": [b64_full_img],
+            "mask": b64_mask_img,
+            "mask_blur": 0,
+            "inpainting_fill": 1,
+            "prompt": PROMPT,
+            "negative_prompt": N_PROMPT,
+            "seed": SEED,
+            "subseed": -1,
+            "subseed_strength": 0,
+            "batch_size": 1,
+            "n_iter": 1,
+            "steps": 20,
+            "cfg_scale": 4.5,
+            "denoising_strength":0.6,
+            "width": w,
+            "height": h * 2,
+            "restore_faces": False,
+            "eta": 0,
+            "sampler_index": "DPM++ 2S a",
+            "controlnet_units": [
+                {
+                    "module": "hed",
+                    "model": "control_hed-fp16 [13fee50b]",
+                    "weight": 0.6,
+                    "resize_mode": "Just Resize",
+                    "lowvram": False,
+                    "processor_res": 64,
+                    "threshold_a": 64,
+                    "threshold_b": 64,
+                    "guidance": 0.6,
+                    "guessmode": False
+                },
+                {
+                    "module": "none",
+                    "model": "t2iadapter_color_sd14v1 [8522029d]",
+                    "weight": 0.6,
+                    "resize_mode": "Just Resize",
+                    "lowvram": False,
+                    "processor_res": 64,
+                    "threshold_a": 64,
+                    "threshold_b": 64,
+                    "guidance": 0.6,
+                    "guessmode": False
+                }
+            ]
+        }
+
+    def sendRequest(self):
+        r = requests.post(self.url, json=self.body)
+        return r.json()
+
+# Open the input video file
+input_video = cv2.VideoCapture(INPUT_VIDEO)
+
+# Get useful info from the souce video
+fps = int(input_video.get(cv2.CAP_PROP_FPS))
+total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+# Create an output video file with the same fps, width, and height as the input video
+output_video = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'MP4V'), fps, (w, h))
+
+for ind in tqdm(range(total_frames)):
+    # Read the next frame from the input video
+    if not input_video.isOpened(): break
+    ret, frame = input_video.read()
+    if not ret: break
+
+    if ind+1 < START_FROM_IND: continue
+
+    # Resize the frame to proper resolution 
+    frame = cv2.resize(frame, (w,h))
+
+    full_img = cv2.vconcat([frame, cont_img])
+    full_img = cv2.resize(full_img, (w,h * 2))
+    b64_full_img = to_b64(full_img)
+
+    # Sending request to the web-ui
+    data_js = controlnetRequest(b64_full_img).sendRequest()
+    
+    # Convert the byte array to a NumPy array
+    image_bytes = base64.b64decode(data_js["images"][0])
+    np_array = np.frombuffer(image_bytes, dtype=np.uint8)
+
+    # Convert the NumPy array to a cv2 image
+    cv2_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
+
+    # Write the frame to the output video
+    cv2_image = cv2_image[:h]
+    output_video.write(cv2_image)
+
+    # show the last written frame - useful to catch any issue with the process
+    cv2.imshow('Out img', cv2_image)
+    
+    if SAVE_FRAMES: 
+        if not os.path.isdir('out'): os.makedirs('out')
+        cv2.imwrite(f'out/{ind+1:05d}.png', cv2_image)
+    if cv2.waitKey(1) & 0xFF == ord('q'): break # press Q to close the script while processing
+
+# Release the input and output video files
+input_video.release()
+output_video.release()
+
+# Close all windows
+cv2.destroyAllWindows()