v0.4

2023-04-13 00:00:28 +03:00 · 2023-04-13 00:00:28 +03:00 · 02936b0127
parent d8f4774706
commit 02936b0127
6 changed files with 414 additions and 314 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
+__pycache__/
 out/
 result.mp4
--- a/compute_flow.py
+++ b/compute_flow.py
@ -0,0 +1,71 @@
+import cv2
+import base64
+import numpy as np
+from tqdm import tqdm
+import os
+
+from flow_utils import RAFT_estimate_flow
+import h5py
+
+import argparse
+
+def main(args):
+  W, H = args.width, args.height
+  # Open the input video file
+  input_video = cv2.VideoCapture(args.input_video)
+
+  # Get useful info from the source video
+  fps = int(input_video.get(cv2.CAP_PROP_FPS))
+  total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+  prev_frame = None
+
+  # create an empty HDF5 file
+  with h5py.File(args.output_file, 'w') as f: pass
+
+  # open the file for writing a flow maps into it
+  with h5py.File(args.output_file, 'a') as f:
+    flow_maps = f.create_dataset('flow_maps', shape=(0, 2, H, W, 2), maxshape=(None, 2, H, W, 2), dtype=np.float16) 
+
+    for ind in tqdm(range(total_frames)):
+      # Read the next frame from the input video
+      if not input_video.isOpened(): break
+      ret, cur_frame = input_video.read()
+      if not ret: break
+
+      cur_frame = cv2.resize(cur_frame, (W, H))
+
+      if prev_frame is not None:
+        next_flow, prev_flow, occlusion_mask = RAFT_estimate_flow(prev_frame, cur_frame)
+
+        # write data into a file
+        flow_maps.resize(ind, axis=0)
+        flow_maps[ind-1, 0] = next_flow
+        flow_maps[ind-1, 1] = prev_flow
+
+        occlusion_mask = np.clip(occlusion_mask * 0.2 * 255, 0, 255).astype(np.uint8)
+
+        if args.visualize:
+          # show the last written frame - useful to catch any issue with the process
+          img_show = cv2.hconcat([cur_frame, occlusion_mask])
+          cv2.imshow('Out img', img_show)
+          if cv2.waitKey(1) & 0xFF == ord('q'): exit() # press Q to close the script while processing
+
+      prev_frame = cur_frame.copy()
+
+  # Release the input and output video files
+  input_video.release()
+
+  # Close all windows
+  if args.visualize: cv2.destroyAllWindows()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input_video', help="Path to input video file", required=True)
+    parser.add_argument('-o', '--output_file', help="Path to output flow file. Stored in *.h5 format", required=True)
+    parser.add_argument('-W', '--width', help='Width of the generated flow maps', default=1024, type=int)
+    parser.add_argument('-H', '--height', help='Height of the generated flow maps', default=576, type=int)
+    parser.add_argument('-v', '--visualize', action='store_true', help='Show proceed images and occlusion maps')
+    args = parser.parse_args()
+
+    main(args)
--- a/flow_utils.py
+++ b/flow_utils.py
@ -0,0 +1,87 @@
+import numpy as np
+import cv2
+
+#RAFT dependencies
+import sys
+sys.path.append('RAFT/core')
+
+from collections import namedtuple
+import torch
+import argparse
+from raft import RAFT
+from utils.utils import InputPadder
+
+RAFT_model = None
+def RAFT_estimate_flow(frame1, frame2, device = 'cuda'):
+  global RAFT_model
+  if RAFT_model is None:
+    args = argparse.Namespace(**{
+      'model': 'RAFT/models/raft-things.pth',
+      'mixed_precision': True,
+      'small': False,
+      'alternate_corr': False,
+      'path': ""
+    })
+
+    RAFT_model = torch.nn.DataParallel(RAFT(args))
+    RAFT_model.load_state_dict(torch.load(args.model))
+
+    RAFT_model = RAFT_model.module
+    RAFT_model.to(device)
+    RAFT_model.eval()
+
+  with torch.no_grad():
+    frame1_torch = torch.from_numpy(frame1).permute(2, 0, 1).float()[None].to(device)
+    frame2_torch = torch.from_numpy(frame2).permute(2, 0, 1).float()[None].to(device)
+
+    padder = InputPadder(frame1_torch.shape)
+    image1, image2 = padder.pad(frame1_torch, frame2_torch)
+
+    # estimate optical flow
+    _, next_flow = RAFT_model(image1, image2, iters=20, test_mode=True)
+    _, prev_flow = RAFT_model(image2, image1, iters=20, test_mode=True)
+
+    next_flow = next_flow[0].permute(1,2,0).cpu().numpy()
+    prev_flow = prev_flow[0].permute(1,2,0).cpu().numpy()
+
+    fb_flow = next_flow + prev_flow
+    fb_norm = np.linalg.norm(fb_flow, axis=2)
+
+    occlusion_mask = fb_norm[..., None].repeat(3, axis = -1)
+
+  return next_flow, prev_flow, occlusion_mask
+
+def compute_diff_map(next_flow, prev_flow, prev_frame, cur_frame, prev_frame_styled):
+  h, w = cur_frame.shape[:2]
+
+  next_flow = cv2.resize(next_flow, (w, h))
+  prev_flow = cv2.resize(prev_flow, (w, h))
+
+  flow_map = -next_flow.copy()
+  flow_map[:,:,0] += np.arange(w)
+  flow_map[:,:,1] += np.arange(h)[:,np.newaxis]
+
+  warped_frame = cv2.remap(prev_frame, flow_map, None, cv2.INTER_NEAREST)
+  warped_frame_styled = cv2.remap(prev_frame_styled, flow_map, None, cv2.INTER_NEAREST)
+
+  # compute occlusion mask
+  fb_flow = next_flow + prev_flow
+  fb_norm = np.linalg.norm(fb_flow, axis=2)
+
+  occlusion_mask = fb_norm[..., None] 
+
+  diff_mask_org = np.abs(warped_frame.astype(np.float32) - cur_frame.astype(np.float32)) / 255
+  diff_mask_org = diff_mask_org.max(axis = -1, keepdims=True)
+
+  diff_mask_stl = np.abs(warped_frame_styled.astype(np.float32) - cur_frame.astype(np.float32)) / 255
+  diff_mask_stl = diff_mask_stl.max(axis = -1, keepdims=True)
+
+  alpha_mask = np.maximum(occlusion_mask * 0.3, diff_mask_org * 4, diff_mask_stl * 2)
+  alpha_mask = alpha_mask.repeat(3, axis = -1)
+
+  #alpha_mask_blured = cv2.dilate(alpha_mask, np.ones((5, 5), np.float32))
+  alpha_mask = cv2.GaussianBlur(alpha_mask, (51,51), 5, cv2.BORDER_DEFAULT)
+
+  alpha_mask = np.clip(alpha_mask, 0, 1)
+
+  return alpha_mask, warped_frame_styled
--- a/readme.md
+++ b/readme.md
@ -1,42 +1,49 @@
-# SD-CN-Animation Script
-This script allows you to automate video stylization task using StableDiffusion and ControlNet. It uses a simple optical flow estimation algorithm to keep the animation stable and create an inpating mask that is used to generate the next frame. Here is an example of a video made with this script:
+# SD-CN-Animation
+This script allows you to automate video stylization task using StableDiffusion and ControlNet. It uses a simple optical flow estimation algorithm to keep the animation stable and create an inpainting mask that is used to generate the next frame. Here is an example of a video made with this script:

 [![IMAGE_ALT](https://img.youtube.com/vi/j-0niEMm6DU/0.jpg)](https://youtu.be/j-0niEMm6DU)

 This script can also be using to swap the person in the video like in this example: https://youtube.com/shorts/be93_dIeZWU

 ## Dependencies
-To install all necessary dependencies run this command
+To install all the necessary dependencies, run this command:
 ```
-pip install opencv-python opencv-contrib-python numpy tqdm
+pip install opencv-python opencv-contrib-python numpy tqdm h5py
 ```
-
-You have to set up the RAFT repository as it described here: https://github.com/princeton-vl/RAFT .
-Basically it just comes down to running "./download_models.sh" in RAFT folder to download the models. 
-
-To run the algorithm alongside Stable Diffusion with ControlNet in 640x640 resolution would require about 8GB of VRAM, as [RAFT](https://github.com/princeton-vl/RAFT) (current optical flow estimation method) takes about 3,7GB of memory.
+You have to set up the RAFT repository as described here: https://github.com/princeton-vl/RAFT . Basically, it just comes down to running "./download_models.sh" in the RAFT folder to download the models.

 ## Running the script
-This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You also should have [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly do the following:
-1. Go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more then 2 -> press "Apply settings"
-2. Run web-ui with '--api' flag. It also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.   
-```bash webui.sh --xformers --api```
-3. Go to the script.py file and change main parameters (INPUT_VIDEO, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time.
-4. Run the script with ```python3 script.py```
+This script works on top of the [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via its API. To run this script you have to set it up first. You should also have the [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you also have to allow the API to work with ControlNet. To do so, go to the web-ui settings -> ControlNet tab -> set the "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings".

-<!---
-## Last version changes 0.4
-* Fixed issue with extreme blur accumulating at the static parts of the video
-->
+### Step 1.
+To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
+```
+python3 compute_flow.py -i {path to your video} -o {path to output *.h5} -v -W {width of the flow map} -H {height of the flow map}
+```
+The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose the W and H parameters as high as your GPU can handle while keeping the proportions of the original video resolution. Do not worry if they are higher or lower than the processing resolution; the flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up several gigabytes on the drive even for a minute-long video. If you want to process a long video, consider splitting it into several parts beforehand.

-## Last version changes 0.3
-* Flow estimation algorithm is updated to [RAFT](https://github.com/princeton-vl/RAFT) method.
-* Difference map now computed as per-pixel maximum of warped first and second frame of the original video and occlusion map that is computed from forward and backward flow estimation.
-* Added keyframe detection that illuminates ghosting artifacts between the scenes.
+### Step 2.
+Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.   
+```
+bash webui.sh --xformers --api
+```
+
+### Step 3.
+Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
+```
+python3 vid2vid.py
+```
+
+## Last version changes: v0.4
+* Fixed issue with extreme blur accumulating at the static parts of the video.
+* The order of processing was changed to achieve the best quality at different domains.
+* Optical flow computation isolated into a separate script for better GPU memory management. Check out the instruction for a new processing pipeline.

 ## Potential improvements
 There are several ways overall quality of animation may be improved:
 * You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
 * Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
-* The quality of flow estimation might be greatly improved with better flow estimation model like this one: https://github.com/autonomousvision/unimatch .
-* It is possible to lower VRAM requirements if precompute flow maps beforehand. 
+* The quality of flow estimation might be greatly improved with proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
+
+## Licence
+This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
--- a/script.py
+++ b/script.py
@ -1,289 +0,0 @@
-import requests
-import cv2
-import base64
-import numpy as np
-from tqdm import tqdm
-import os
-
-#RAFT dependencies
-import sys
-sys.path.append('RAFT/core')
-
-from collections import namedtuple
-import torch
-import argparse
-from raft import RAFT
-from utils.utils import InputPadder
-
-
-INPUT_VIDEO = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Fallout_noir_2/benny.mp4"
-OUTPUT_VIDEO = "result.mp4"
-
-PROMPT = "RAW photo, Matthew Perry wearing suit and pants, cinematic light, dramatic light, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
-N_PROMPT = "blur, blurred, unfocus, obscure, dim, fade, obscure, muddy, black and white image, old, naked, black person, green face, green skin, black and white, slanted eyes, red eyes, blood eyes, deformed, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, malformed hands, blurry, ((((mutated hands and fingers)))), watermark, watermarked, oversaturated, censored, distorted hands, amputation, missing hands, obese, doubled face, double hands"
-SEED = -1
-w,h = 512, 704 # Width and height of the processed image. Note that actual image processed would be a W x 2H resolution. You should have enough VRAM to process it.
-
-START_FROM_IND = 0 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
-SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations
-
-BLUR_SIZE = (11, 11)
-BLUR_SIGMA = 12
-
-def to_b64(img):
-  _, buffer = cv2.imencode('.png', img)
-  b64img = base64.b64encode(buffer).decode("utf-8")
-  return b64img
-
-class controlnetRequest():
-  def __init__(self, b64_cur_img, b64_hed_img, ds = 0.35, w=w, h=h, mask = None):
-    self.url = "http://localhost:7860/sdapi/v1/img2img"
-    self.body = {
-      "init_images": [b64_cur_img],
-      "mask": mask,
-      "mask_blur": 0,
-      "inpainting_fill": 1,
-      "inpainting_mask_invert": 0,
-      "prompt": PROMPT,
-      "negative_prompt": N_PROMPT,
-      "seed": SEED,
-      "subseed": -1,
-      "subseed_strength": 0,
-      "batch_size": 1,
-      "n_iter": 1,
-      "steps": 15,
-      "cfg_scale": 7,
-      "denoising_strength": ds,
-      "width": w,
-      "height": h,
-      "restore_faces": False,
-      "eta": 0,
-      "sampler_index": "DPM++ 2S a",
-      "control_net_enabled": True,
-      "alwayson_scripts": {
-        "ControlNet":{
-          "args": [
-            {
-              "input_image": b64_hed_img,
-              "module": "hed",
-              "model": "control_hed-fp16 [13fee50b]",
-              "weight": 1,
-              "resize_mode": "Just Resize",
-              "lowvram": False,
-              "processor_res": 512,
-              "guidance": 1,
-              "guessmode": False
-            }
-          ]
-        }
-      },
-    }
-
-  def sendRequest(self):
-      r = requests.post(self.url, json=self.body)
-      return r.json()
-    
-DEVICE = 'cuda'
-RAFT_model = None
-def RAFT_estimate_flow_diff(frame1, frame2, frame1_styled):
-  global RAFT_model
-  if RAFT_model is None:
-    args = argparse.Namespace(**{
-      'model': 'RAFT/models/raft-things.pth',
-      'mixed_precision': True,
-      'small': False,
-      'alternate_corr': False,
-      'path': ""
-    })
-
-    RAFT_model = torch.nn.DataParallel(RAFT(args))
-    RAFT_model.load_state_dict(torch.load(args.model))
-
-    RAFT_model = RAFT_model.module
-    RAFT_model.to(DEVICE)
-    RAFT_model.eval()
-
-  with torch.no_grad():
-    frame1_torch = torch.from_numpy(frame1).permute(2, 0, 1).float()[None].to(DEVICE)
-    frame2_torch = torch.from_numpy(frame2).permute(2, 0, 1).float()[None].to(DEVICE)
-
-    padder = InputPadder(frame1_torch.shape)
-    image1, image2 = padder.pad(frame1_torch, frame2_torch)
-
-    # estimate and apply optical flow
-    _, next_flow = RAFT_model(image1, image2, iters=20, test_mode=True)
-    _, prev_flow = RAFT_model(image2, image1, iters=20, test_mode=True)
-
-    next_flow = next_flow[0].permute(1,2,0).cpu().numpy()
-    prev_flow = prev_flow[0].permute(1,2,0).cpu().numpy()
-
-    flow_map = -next_flow.copy()
-    h, w = flow_map.shape[:2]
-    flow_map[:,:,0] += np.arange(w)
-    flow_map[:,:,1] += np.arange(h)[:,np.newaxis]
-
-    warped_frame = cv2.remap(frame1, flow_map, None, cv2.INTER_NEAREST)
-    warped_frame_styled = cv2.remap(frame1_styled, flow_map, None, cv2.INTER_NEAREST)
-
-    # compute occlusion mask
-    fb_flow = next_flow + prev_flow
-    fb_norm = np.linalg.norm(fb_flow, axis=2)
-
-    occlusion_mask = fb_norm[..., None] 
-
-    diff_mask = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32)) / 255
-    diff_mask = diff_mask.max(axis = -1, keepdims=True)
-
-    diff = np.maximum(occlusion_mask * 0.2, diff_mask * 4)
-    #diff = (diff > 0.1).astype(float)
-    #diff = diff * 1.5
-
-    #diff_blured = cv2.GaussianBlur(diff, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
-    diff = diff.repeat(3, axis = -1)
-    #diff_frame = np.clip((diff) * 255, 1, 255).astype(np.uint8)
-    alpha_mask = np.clip(diff, 0, 1)
-
-
-  return warped_frame, alpha_mask, warped_frame_styled
-
-''' # old flow estimation algorithm, might be useful later 
-def estimate_flow_diff(frame1, frame2, frame1_styled):
-  prvs = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
-  next = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
-
-  # estimate and apply optical flow
-  flow = cv2.calcOpticalFlowFarneback(prvs, next, None, 0.5, 3, 15, 3, 5, 1.2, 0)
-  h, w = flow.shape[:2]
-  flow_data = -flow
-  flow_data[:,:,0] += np.arange(w)
-  flow_data[:,:,1] += np.arange(h)[:,np.newaxis]
-  #map_x, map_y = cv2.convertMaps(flow_data, 0, -1, True)
-  warped_frame = cv2.remap(frame1, flow_data, None, cv2.INTER_LINEAR)
-  warped_frame_styled = cv2.remap(frame1_styled, flow_data, None, cv2.INTER_LINEAR)
-
-  # compute occlusion mask
-  flow_back = cv2.calcOpticalFlowFarneback(next, prvs, None, 0.5, 3, 15, 3, 5, 1.2, 0)
-  fb_flow = flow + flow_back
-  fb_norm = np.linalg.norm(fb_flow, axis=2)
-  occlusion_mask = fb_norm[..., None]
-
-  diff_mask = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32)) / 255
-  diff_mask = diff_mask.max(axis = -1, keepdims=True)
-
-  diff = np.maximum(occlusion_mask, diff_mask).repeat(3, axis = -1)
-  #diff = diff * 1.5
-
-  #diff = cv2.GaussianBlur(diff, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
-  diff_frame = np.clip(diff * 255, 0, 255).astype(np.uint8)
-
-  return warped_frame, diff_frame, warped_frame_styled
-'''
-
-cv2.namedWindow('Out img')
-
-# Open the input video file
-input_video = cv2.VideoCapture(INPUT_VIDEO)
-
-# Get useful info from the souce video
-fps = int(input_video.get(cv2.CAP_PROP_FPS))
-total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT))
-
-# Create an output video file with the same fps, width, and height as the input video
-output_video = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'MP4V'), fps, (w, h))
-
-prev_frame = None
-
-for ind in tqdm(range(total_frames)):
-  # Read the next frame from the input video
-  if not input_video.isOpened(): break
-  ret, cur_frame = input_video.read()
-  if not ret: break
-
-  if ind+1 < START_FROM_IND: continue
-
-  is_keyframe = True
-  if prev_frame is not None:
-    # Compute absolute difference between current and previous frame
-    frames_diff = cv2.absdiff(cur_frame, prev_frame)
-    # Compute mean of absolute difference
-    mean_diff = cv2.mean(frames_diff)[0]
-    # Check if mean difference is above threshold
-    is_keyframe = mean_diff > 30
-
-  # Generate course version of a current frame with previous stylized frame as a reference image
-  if is_keyframe:
-    # Resize the frame to proper resolution 
-    frame = cv2.resize(cur_frame, (w, h))
-
-    # Sending request to the web-ui
-    data_js = controlnetRequest(to_b64(frame), to_b64(frame), 0.65, w, h, mask = None).sendRequest()
-    
-    # Convert the byte array to a NumPy array
-    image_bytes = base64.b64decode(data_js["images"][0])
-    np_array = np.frombuffer(image_bytes, dtype=np.uint8)
-
-    # Convert the NumPy array to a cv2 image
-    out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
-    alpha_img = out_image.copy()
-    diff_mask_blured = out_image.copy()
-  else:
-    # Resize the frame to proper resolution 
-    frame = cv2.resize(cur_frame, (w, h))
-    prev_frame = cv2.resize(prev_frame, (w, h))
-
-    _, alpha_mask, warped_styled = RAFT_estimate_flow_diff(prev_frame, frame, prev_frame_styled)
-
-    pr_image = frame * alpha_mask + warped_styled * (1 - alpha_mask)
-    #diff_mask_blured = cv2.GaussianBlur(alpha * 255, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
-    #diff_mask_blured = np.clip(diff_mask_blured + diff_mask, 5, 255).astype(np.uint8)]
-
-    alpha_img = np.clip(alpha_mask * 255, 0, 255).astype(np.uint8)
-    
-    # Sending request to the web-ui
-    data_js = controlnetRequest(to_b64(pr_image), to_b64(frame), 0.65, w, h, mask = to_b64(alpha_img)).sendRequest()
-    
-    # Convert the byte array to a NumPy array
-    image_bytes = base64.b64decode(data_js["images"][0])
-    np_array = np.frombuffer(image_bytes, dtype=np.uint8)
-
-    # Convert the NumPy array to a cv2 image
-    out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
-
-
-
-    # Sending request to the web-ui
-    data_js = controlnetRequest(to_b64(out_image), to_b64(frame), 0.1, w, h, mask = None).sendRequest()
-    
-    # Convert the byte array to a NumPy array
-    image_bytes = base64.b64decode(data_js["images"][0])
-    np_array = np.frombuffer(image_bytes, dtype=np.uint8)
-
-    # Convert the NumPy array to a cv2 image
-    out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
-
-  # Write the frame to the output video
-  frame_out = out_image[:h]
-  output_video.write(frame_out)
-
-  # show the last written frame - useful to catch any issue with the process
-  img_show = cv2.hconcat([out_image, alpha_img])
-  cv2.imshow('Out img', img_show)
-  if cv2.waitKey(1) & 0xFF == ord('q'): exit() # press Q to close the script while processing
-
-
-  # Write the frame to the output video
-  output_video.write(frame_out)
-  prev_frame = cur_frame.copy()
-  prev_frame_styled = frame_out.copy()
-
-  
-  if SAVE_FRAMES: 
-      if not os.path.isdir('out'): os.makedirs('out')
-      cv2.imwrite(f'out/{ind+1:05d}.png', frame_out)
-
-# Release the input and output video files
-input_video.release()
-output_video.release()
-
-# Close all windows
-cv2.destroyAllWindows()
--- a/vid2vid.py
+++ b/vid2vid.py
@ -0,0 +1,223 @@
+import requests
+import cv2
+import base64
+import numpy as np
+from tqdm import tqdm
+import os
+
+import h5py
+from flow_utils import compute_diff_map
+
+INPUT_VIDEO = "input.mp4"
+FLOW_MAPS = "flow.h5"
+OUTPUT_VIDEO = "result.mp4"
+
+PROMPT = "marble statue"
+N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
+w,h = 1152, 640 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
+
+START_FROM_IND = 575 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
+SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations
+
+PROCESSING_STRENGTH = 0.85
+BLUR_FIX_STRENGTH = 0.15
+
+APPLY_HED = False
+APPLY_CANNY = True
+APPLY_DEPTH = False
+GUESSMODE = False
+
+CFG_SCALE = 5.5
+
+VISUALIZE = True
+
+def to_b64(img):
+  img_cliped = np.clip(img, 0, 255).astype(np.uint8)
+  _, buffer = cv2.imencode('.png', img_cliped)
+  b64img = base64.b64encode(buffer).decode("utf-8")
+  return b64img
+
+class controlnetRequest():
+  def __init__(self, b64_cur_img, b64_hed_img, ds = 0.35, w=w, h=h, mask = None, seed=-1):
+    self.url = "http://localhost:7860/sdapi/v1/img2img"
+    self.body = {
+      "init_images": [b64_cur_img],
+      "mask": mask,
+      "mask_blur": 0,
+      "inpainting_fill": 1,
+      "inpainting_mask_invert": 0,
+      "prompt": PROMPT,
+      "negative_prompt": N_PROMPT,
+      "seed": seed,
+      "subseed": -1,
+      "subseed_strength": 0,
+      "batch_size": 1,
+      "n_iter": 1,
+      "steps": 15,
+      "cfg_scale": CFG_SCALE,
+      "denoising_strength": ds,
+      "width": w,
+      "height": h,
+      "restore_faces": False,
+      "eta": 0,
+      "sampler_index": "DPM++ 2S a",
+      "control_net_enabled": True,
+      "alwayson_scripts": {
+        "ControlNet":{"args": []}
+      },
+    }
+
+    if APPLY_HED:
+      self.body["alwayson_scripts"]["ControlNet"]["args"].append({
+        "input_image": b64_hed_img,
+        "module": "hed",
+        "model": "control_hed-fp16 [13fee50b]",
+        "weight": 0.85,
+        "resize_mode": "Just Resize",
+        "lowvram": False,
+        "processor_res": 512,
+        "guidance_start": 0,
+        "guidance_end": 0.85,
+        "guessmode": GUESSMODE
+      })
+
+    if APPLY_CANNY:
+      self.body["alwayson_scripts"]["ControlNet"]["args"].append({
+        "input_image": b64_hed_img,
+        "module": "canny",
+        "model": "control_canny-fp16 [e3fe7712]",
+        "weight": 0.85,
+        "resize_mode": "Just Resize",
+        "lowvram": False,
+        "threshold_a": 35,
+        "threshold_b": 35,
+        "processor_res": 512,
+        "guidance_start": 0,
+        "guidance_end": 0.85,
+        "guessmode": GUESSMODE
+      })
+
+    if APPLY_DEPTH:
+      self.body["alwayson_scripts"]["ControlNet"]["args"].append({
+        "input_image": b64_hed_img,
+        "module": "depth",
+        "model": "control_depth-fp16 [400750f6]",
+        "weight": 0.85,
+        "resize_mode": "Just Resize",
+        "lowvram": False,
+        "processor_res": 512,
+        "guidance_start": 0,
+        "guidance_end": 0.85,
+        "guessmode": GUESSMODE
+      })
+
+
+  def sendRequest(self):
+      # Request to web-ui
+      data_js = requests.post(self.url, json=self.body).json()
+
+      # Convert the byte array to a NumPy array
+      image_bytes = base64.b64decode(data_js["images"][0])
+      np_array = np.frombuffer(image_bytes, dtype=np.uint8)
+
+      # Convert the NumPy array to a cv2 image
+      out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
+      return out_image
+  
+
+  
+if VISUALIZE: cv2.namedWindow('Out img')
+
+# Open the input video file
+input_video = cv2.VideoCapture(INPUT_VIDEO)
+
+# Get useful info from the source video
+fps = int(input_video.get(cv2.CAP_PROP_FPS))
+total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+# Create an output video file with the same fps, width, and height as the input video
+output_video = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'MP4V'), fps, (w, h))
+
+prev_frame = None
+prev_frame_styled = None
+
+# reading flow maps in a stream manner
+with h5py.File(FLOW_MAPS, 'r') as f:
+  flow_maps = f['flow_maps']
+
+  for ind in tqdm(range(total_frames)):
+    # Read the next frame from the input video
+    if not input_video.isOpened(): break
+    ret, cur_frame = input_video.read()
+    if not ret: break
+
+    if ind+1 < START_FROM_IND: continue
+
+    is_keyframe = True
+    if prev_frame is not None:
+      # Compute absolute difference between current and previous frame
+      frames_diff = cv2.absdiff(cur_frame, prev_frame)
+      # Compute mean of absolute difference
+      mean_diff = cv2.mean(frames_diff)[0]
+      # Check if mean difference is above threshold
+      is_keyframe = mean_diff > 30
+
+    # Generate course version of a current frame with previous stylized frame as a reference image
+    if is_keyframe:
+      # Resize the frame to proper resolution 
+      frame = cv2.resize(cur_frame, (w, h))
+
+      # Processing current frame with current frame as a mask without any inpainting
+      out_image = controlnetRequest(to_b64(frame), to_b64(frame), PROCESSING_STRENGTH, w, h, mask = None).sendRequest()
+
+      alpha_img = out_image.copy()
+      out_image_ = out_image.copy()
+    else:
+      # Resize the frame to proper resolution 
+      frame = cv2.resize(cur_frame, (w, h))
+      prev_frame = cv2.resize(prev_frame, (w, h))
+
+      # Processing current frame with current frame as a mask without any inpainting
+      out_image = controlnetRequest(to_b64(frame), to_b64(frame), PROCESSING_STRENGTH, w, h, mask = None).sendRequest()
+
+      next_flow, prev_flow = flow_maps[ind-1].astype(np.float32)
+      alpha_mask, warped_styled = compute_diff_map(next_flow, prev_flow, prev_frame, frame, prev_frame_styled)
+
+      # This clipping at lower side required to fix small trailing issues that for some reason left outside of the bright part of the mask, 
+      # and at the higher part it making parts changed strongly to do it with less flickering. 
+      alpha_mask = np.clip(alpha_mask + 0.05, 0.05, 0.95)
+      alpha_img = np.clip(alpha_mask * 255, 0, 255).astype(np.uint8)
+
+      out_image = out_image.astype(float) * alpha_mask + warped_styled.astype(float) * (1 - alpha_mask)
+
+      out_image_ = (out_image * 0.65 + warped_styled * 0.35) 
+      
+      
+    # Bluring issue fix via additional processing
+    out_image_fixed = controlnetRequest(to_b64(out_image_), to_b64(frame), BLUR_FIX_STRENGTH, w, h, mask = None, seed=8888).sendRequest()
+
+    # Write the frame to the output video
+    frame_out = np.clip(out_image_fixed, 0, 255).astype(np.uint8)
+    output_video.write(frame_out)
+
+    if VISUALIZE:
+      # show the last written frame - useful to catch any issue with the process
+      img_show = cv2.hconcat([frame_out, alpha_img])
+      cv2.imshow('Out img', img_show)
+      cv2.setWindowTitle("Out img", str(ind+1))
+      if cv2.waitKey(1) & 0xFF == ord('q'): exit() # press Q to close the script while processing
+
+    if SAVE_FRAMES: 
+        if not os.path.isdir('out'): os.makedirs('out')
+        cv2.imwrite(f'out/{ind+1:05d}.png', frame_out)
+
+    prev_frame = cur_frame.copy()
+    prev_frame_styled = out_image.copy()
+
+    
+# Release the input and output video files
+input_video.release()
+output_video.release()
+
+# Close all windows
+if VISUALIZE: cv2.destroyAllWindows()