v0.6

2023-05-03 02:46:56 +03:00 · 2023-05-03 02:46:56 +03:00 · bb4353a264
parent c34cfe2976
commit bb4353a264
12 changed files with 940 additions and 232 deletions
--- a/FloweR/model.py
+++ b/FloweR/model.py
@ -1,143 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.functional as F
-
-# Define the model
-class FloweR(nn.Module):
-  def __init__(self, input_size = (384, 384), window_size = 4):
-    super(FloweR, self).__init__()
-
-    self.input_size = input_size
-    self.window_size = window_size
-
-    #INPUT: 384 x 384 x 10 * 3 
-
-    ### DOWNSCALE ###
-    self.conv_block_1 = nn.Sequential(
-      nn.Conv2d(3 * self.window_size, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 384 x 384 x 128
-
-    self.conv_block_2 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 192 x 192 x 128
-
-    self.conv_block_3 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 96 x 96 x 128
-
-    self.conv_block_4 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 48 x 48 x 128
-
-    self.conv_block_5 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 24 x 24 x 128
-
-    self.conv_block_6 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 12 x 12 x 128
-
-    self.conv_block_7 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 6 x 6 x 128
-
-    self.conv_block_8 = nn.Sequential(
-      nn.AvgPool2d(2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 3 x 3 x 128
-
-    ### UPSCALE ###
-    self.conv_block_9 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 6 x 6 x 128
-
-    self.conv_block_10 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 12 x 12 x 128
-
-    self.conv_block_11 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 24 x 24 x 128
-
-    self.conv_block_12 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 48 x 48 x 128
-
-    self.conv_block_13 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 96 x 96 x 128
-
-    self.conv_block_14 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 192 x 192 x 128
-    
-    self.conv_block_15 = nn.Sequential(
-      nn.Upsample(scale_factor=2),
-      nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
-      nn.ReLU(),
-    ) # 384 x 384 x 128
-
-    self.conv_block_16 = nn.Conv2d(128, 3, kernel_size=3, stride=1, padding='same')
-
-  def forward(self, x):
-    if x.size(1) != self.window_size: 
-      raise Exception(f'Shape of the input is not compatable. There should be exactly {self.window_size} frames in an input video.')
-
-    # batch, frames, height, width, colors
-    in_x = x.permute((0, 1, 4, 2, 3))
-    # batch, frames, colors, height, width
-
-    in_x = in_x.reshape(-1, self.window_size * 3, self.input_size[0], self.input_size[1])
-
-    ### DOWNSCALE ###
-    block_1_out = self.conv_block_1(in_x)        # 384 x 384 x 128
-    block_2_out = self.conv_block_2(block_1_out) # 192 x 192 x 128
-    block_3_out = self.conv_block_3(block_2_out) # 96 x 96 x 128
-    block_4_out = self.conv_block_4(block_3_out) # 48 x 48 x 128
-    block_5_out = self.conv_block_5(block_4_out) # 24 x 24 x 128
-    block_6_out = self.conv_block_6(block_5_out) # 12 x 12 x 128
-    block_7_out = self.conv_block_7(block_6_out) # 6 x 6 x 128
-    block_8_out = self.conv_block_8(block_7_out) # 3 x 3 x 128
-
-    ### UPSCALE ###
-    block_9_out = block_7_out + self.conv_block_9(block_8_out)    # 6 x 6 x 128
-    block_10_out = block_6_out + self.conv_block_10(block_9_out)  # 12 x 12 x 128
-    block_11_out = block_5_out + self.conv_block_11(block_10_out) # 24 x 24 x 128
-    block_12_out = block_4_out + self.conv_block_12(block_11_out) # 48 x 48 x 128
-    block_13_out = block_3_out + self.conv_block_13(block_12_out) # 96 x 96 x 128
-    block_14_out = block_2_out + self.conv_block_14(block_13_out) # 192 x 192 x 128
-    block_15_out = block_1_out + self.conv_block_15(block_14_out) # 384 x 384 x 128
-
-    block_16_out = self.conv_block_16(block_15_out) # 384 x 384 x (2 + 1)
-    out = block_16_out.reshape(-1, 3, self.input_size[0], self.input_size[1])
-
-    # batch, colors, height, width
-    out = out.permute((0, 2, 3, 1))
-    # batch, height, width, colors
-    return out
--- a/install.py
+++ b/install.py
@ -0,0 +1,15 @@
+# Extension install script: installs every package listed in the
+# requirements.txt that sits next to this file, using the `launch` helpers
+# (`is_installed`, `run_pip`) supplied by the host application.
+import launch

+import os

+req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")

+with open(req_file, encoding="utf-8") as file:
+    for lib in file:
+        lib = lib.strip()

+        # Blank lines and '#' comment lines are legal in a requirements file
+        # but are not package specifiers; forwarding them would issue a
+        # "pip install" with an empty or bogus argument.
+        if not lib or lib.startswith("#"):
+            continue

+        # NOTE(review): `lib` keeps its version pin (e.g. "pkg==1.2.3") when it
+        # is handed to launch.is_installed - confirm that helper accepts pinned
+        # specifiers, otherwise the package is reinstalled on every start.
+        if not launch.is_installed(lib):
+            launch.run_pip(f"install {lib}", f"text2video requirement: {lib}")
--- a/old_scripts/compute_flow.py
+++ b/old_scripts/compute_flow.py
--- a/old_scripts/flow_utils.py
+++ b/old_scripts/flow_utils.py
--- a/old_scripts/readme.md
+++ b/old_scripts/readme.md
@ -0,0 +1,133 @@
+# SD-CN-Animation
+This project allows you to automate video stylization task using StableDiffusion and ControlNet. It also allows you to generate completely new videos from text at any resolution and length in contrast to other current text2video methods using any Stable Diffusion model as a backbone, including custom ones. It uses '[RAFT](https://github.com/princeton-vl/RAFT)' optical flow estimation algorithm to keep the animation stable and create an inpainting mask that is used to generate the next frame. In text to video mode it relies on 'FloweR' method (work in progress) that predicts optical flow from the previous frames.
+
+
+### Video to Video Examples:
+<!--
+[![IMAGE_ALT](https://img.youtube.com/vi/j-0niEMm6DU/0.jpg)](https://youtu.be/j-0niEMm6DU)
+This script can also be used to swap the person in the video, as in this example: https://youtube.com/shorts/be93_dIeZWU
+-->
+</table>
+<table class="center">
+<tr>
+ <td><img src="examples/girl_org.gif" raw=true></td>
+ <td><img src="examples/girl_to_jc.gif" raw=true></td>
+ <td><img src="examples/girl_to_wc.gif" raw=true></td>
+</tr>
+<tr>
+ <td width=33% align="center">Original video</td>
+ <td width=33% align="center">"Jessica Chastain"</td>
+ <td width=33% align="center">"Watercolor painting"</td>
+</tr>
+</table>
+
+The examples presented were generated at 1024x576 resolution using the 'realisticVisionV13_v13' model as a base. They were cropped, downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. 
+
+### Text to Video Examples:
+</table>
+<table class="center">
+<tr>
+ <td><img src="examples/flower_1.gif" raw=true></td>
+ <td><img src="examples/bonfire_1.gif" raw=true></td>
+ <td><img src="examples/diamond_4.gif" raw=true></td>
+</tr>
+<tr>
+ <td width=33% align="center">"close up of a flower"</td>
+ <td width=33% align="center">"bonfire near the camp in the mountains at night"</td>
+ <td width=33% align="center">"close up of a diamond laying on the table"</td>
+</tr>
+<tr>
+ <td><img src="examples/macaroni_1.gif" raw=true></td>
+ <td><img src="examples/gold_1.gif" raw=true></td>
+ <td><img src="examples/tree_2.gif" raw=true></td>
+</tr>
+<tr>
+ <td width=33% align="center">"close up of macaroni on the plate"</td>
+ <td width=33% align="center">"close up of golden sphere"</td>
+ <td width=33% align="center">"a tree standing in the winter forest"</td>
+</tr>
+</table>
+
+All examples you can see here are originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. Actual prompts used were stated in the following format: "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3", only the 'subject' part is described in the table above.
+
+
+
+## Dependencies
+To install all the necessary dependencies, run this command:
+```
+pip install opencv-python opencv-contrib-python numpy tqdm h5py scikit-image
+```
+You have to set up the RAFT repository as described here: https://github.com/princeton-vl/RAFT . Basically, it just comes down to running "./download_models.sh" in the RAFT folder to download the models.
+
+
+## Running the scripts
+This script works on top of the [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via its API. To run this script you have to set it up first. You should also have the [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you also have to allow the API to work with ControlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings".
+
+
+### Video To Video
+#### Step 1.
+To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
+```
+python3 compute_flow.py -i "path to your video" -o "path to output file with *.h5 format" -v -W width_of_the_flow_map -H height_of_the_flow_map
+```
+The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose the W and H parameters as high as your GPU can handle while keeping the proportions of the original video resolution. Do not worry if the result is higher or lower than the processing resolution; flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up several gigabytes on the drive even for a minute-long video. If you want to process a long video, consider splitting it into several parts beforehand.
+
+
+#### Step 2.
+Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.  
+```
+bash webui.sh --xformers --api
+```
+
+
+#### Step 3.
+Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
+```
+python3 vid2vid.py
+```
+
+
+### Text To Video
+This method is still in development and works on top of 'Stable Diffusion' and 'FloweR' - an optical flow reconstruction method that is also at an early development stage. Do not expect much from it, as it is more of a proof of concept than a complete solution. 
+
+#### Step 1.
+Download 'FloweR_0.1.pth' model from here: [Google drive link](https://drive.google.com/file/d/1WhzoVIw6Kdg4EjfK9LaTLqFm5dF-IJ7F/view?usp=share_link) and place it in the 'FloweR' folder.
+
+#### Step 2.
+Same as with vid2vid case, run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.  
+```
+bash webui.sh --xformers --api
+```
+
+#### Step 3.
+Go to the **txt2vid.py** file and change main parameters (OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. Again, the script is simple so you may change other parameters if you want to. Finally run the script with the command:
+```
+python3 txt2vid.py
+```
+
+## Last version changes: v0.5
+* Fixed an issue with the wrong direction of an optical flow applied to an image.
+* Added text to video mode within txt2vid.py script. Make sure to update new dependencies for this script to work!
+* Added a threshold for an optical flow before processing the frame to remove white noise that might appear, as it was suggested by [@alexfredo](https://github.com/alexfredo).
+* Background removal at flow computation stage implemented by [@CaptnSeraph](https://github.com/CaptnSeraph), it should reduce ghosting effect in most of the videos processed with vid2vid script.
+
+<!--
+## Last version changes: v0.6
+* Added separate flag '-rb' for background removal process at the flow computation stage in the compute_flow.py script.
+* Added flow normalization before rescaling it, so the magnitude of the flow computed correctly at the different resolution.
+* Less ghosting and color change in vid2vid mode
+-->
+
+<!--
+## Potential improvements
+There are several ways overall quality of animation may be improved:
+* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
+* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
+* The quality of flow estimation might be greatly improved with a proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
+-->
+## Licence
+This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
+
+
+
+
--- a/old_scripts/txt2vid.py
+++ b/old_scripts/txt2vid.py
@ -18,11 +18,11 @@ import skimage
 import datetime


-OUTPUT_VIDEO = f'result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'
+OUTPUT_VIDEO = f'videos/result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'

-PROMPT = "RAW photo, bonfire near the camp in the mountains at night, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
+PROMPT = "people looking at flying robots. Future. People looking to the sky. Stars in the background. Dramatic light, Cinematic light. Soft lighting, high quality, film grain."
 N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
-w,h = 512, 512 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
+w,h = 768, 512 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.

 SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations

@ -128,7 +128,7 @@ prev_frame_styled = None

 # Instantiate the model
 model = FloweR(input_size = (h, w))
-model.load_state_dict(torch.load('FloweR/FloweR_0.1.pth'))
+model.load_state_dict(torch.load('FloweR/FloweR_0.1.1.pth'))
 # Move the model to the device
 model = model.to(DEVICE)

@ -142,7 +142,7 @@ clip_frames = np.zeros((4, h, w, 3), dtype=np.uint8)

 color_shift = np.zeros((0, 3))
 color_scale = np.zeros((0, 3))
-for ind in tqdm(range(40)): 
+for ind in tqdm(range(450)): 
  clip_frames = np.roll(clip_frames, -1, axis=0)
  clip_frames[-1] = prev_frame
  
--- a/old_scripts/vid2vid.py
+++ b/old_scripts/vid2vid.py
@ -11,18 +11,18 @@ from flow_utils import compute_diff_map
 import skimage
 import datetime

-INPUT_VIDEO = "input.mp4"
-FLOW_MAPS = "flow.h5"
+INPUT_VIDEO = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Peter_Gabriel/pexels-monstera-5302599-4096x2160-30fps.mp4"
+FLOW_MAPS = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Peter_Gabriel/pexels-monstera-5302599-4096x2160-30fps.h5"
 OUTPUT_VIDEO = f'videos/result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'

-PROMPT = "Watercolor painting"
+PROMPT = "Underwater shot Peter Gabriel with closed eyes in  Peter Gabriel's music video. 80's music video. VHS style. Dramatic light, Cinematic light. RAW photo, 8k uhd, dslr, soft lighting, high quality, film grain."
 N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
-w,h = 1024, 576 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
+w,h = 1088, 576 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.

 START_FROM_IND = 0 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
 SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations

-PROCESSING_STRENGTH = 0.85
+PROCESSING_STRENGTH = 0.95
 BLUR_FIX_STRENGTH = 0.15

 APPLY_HED = True
@ -75,12 +75,12 @@ class controlnetRequest():
        "input_image": b64_hed_img,
        "module": "hed",
        "model": "control_hed-fp16 [13fee50b]",
-        "weight": 0.85,
+        "weight": 0.65,
        "resize_mode": "Just Resize",
        "lowvram": False,
        "processor_res": 512,
        "guidance_start": 0,
-        "guidance_end": 0.85,
+        "guidance_end": 0.65,
        "guessmode": GUESSMODE
      })

--- a/readme.md
+++ b/readme.md
@ -3,10 +3,6 @@ This project allows you to automate video stylization task using StableDiffusion


 ### Video to Video Examples:
-<!--
-[![IMAGE_ALT](https://img.youtube.com/vi/j-0niEMm6DU/0.jpg)](https://youtu.be/j-0niEMm6DU)
-This script can also be using to swap the person in the video like in this example: https://youtube.com/shorts/be93_dIeZWU
-->
 </table>
 <table class="center">
 <tr>
@ -51,83 +47,16 @@ Examples presented are generated at 1024x576 resolution using the 'realisticVisi
 All examples you can see here are originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. Actual prompts used were stated in the following format: "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3", only the 'subject' part is described in the table above.


+## Installing the extension
+~TODO~

-## Dependencies
-To install all the necessary dependencies, run this command:
-```
-pip install opencv-python opencv-contrib-python numpy tqdm h5py scikit-image
-```
-You have to set up the RAFT repository as it described here: https://github.com/princeton-vl/RAFT . Basically it just comes down to running "./download_models.sh" in RAFT folder to download the models.
+Download RAFT 'raft-things.pth' from here: [RAFT link] and place it into 'stable-diffusion-webui/models/RAFT/' folder.
+All generated videos will be saved into the 'outputs/sd-cn-animation' folder.

-
-## Running the scripts
-This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You should also have[sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you have to also allow the API to work with controlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more then 2 -> press "Apply settings".
-
-
-### Video To Video
-#### Step 1.
-To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
-```
-python3 compute_flow.py -i "path to your video" -o "path to output file with *.h5 format" -v -W width_of_the_flow_map -H height_of_the_flow_map
-```
-The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose W and H parameters as high as your GPU can handle with respect to the proportion of original video resolution. Do not worry if it is higher or less then the processing resolution, flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up to a several gigabytes on the drive even for minute long video. If you want to process a long video consider splitting it into several parts beforehand.
-
-
-#### Step 2.
-Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.  
-```
-bash webui.sh --xformers --api
-```
-
-
-#### Step 3.
-Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
-```
-python3 vid2vid.py
-```
-
-
-### Text To Video
-This method is still in development and works on top of ‘Stable Diffusion’ and 'FloweR' - optical flow reconstruction method that is also in a yearly development stage. Do not expect much from it as it is more of a proof of a concept rather than a complete solution. 
-
-#### Step 1.
-Download 'FloweR_0.1.pth' model from here: [Google drive link](https://drive.google.com/file/d/1WhzoVIw6Kdg4EjfK9LaTLqFm5dF-IJ7F/view?usp=share_link) and place it in the 'FloweR' folder.
-
-#### Step 2.
-Same as with vid2vid case, run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.  
-```
-bash webui.sh --xformers --api
-```
-
-#### Step 3.
-Go to the **txt2vid.py** file and change main parameters (OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. Again, the script is simple so you may change other parameters if you want to. Finally run the script with the command:
-```
-python3 txt2vid.py
-```
-
-## Last version changes: v0.5
-* Fixed an issue with the wrong direction of an optical flow applied to an image.
-* Added text to video mode within txt2vid.py script. Make sure to update new dependencies for this script to work!
-* Added a threshold for an optical flow before processing the frame to remove white noise that might appear, as it was suggested by [@alexfredo](https://github.com/alexfredo).
-* Background removal at flow computation stage implemented by [@CaptnSeraph](https://github.com/CaptnSeraph), it should reduce ghosting effect in most of the videos processed with vid2vid script.
-
-<!--
 ## Last version changes: v0.6
+* Complete rewrite of the project to make it possible to install it as an Automatic1111/Web-ui extension.
 * Added separate flag '-rb' for background removal process at the flow computation stage in the compute_flow.py script.
 * Added flow normalization before rescaling it, so the magnitude of the flow computed correctly at the different resolution.
 * Less ghosting and color change in vid2vid mode
-->
-
-<!--
-## Potential improvements
-There are several ways overall quality of animation may be improved:
-* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
-* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
-* The quality of flow estimation might be greatly improved with a proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
-
-## Licence
-This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
-->
-
-
+* Added a "warped styled frame fix" in vid2vid mode that removes duplicated image content from the parts of the image that cannot be relocated using the optical flow.

--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+scikit-image==0.19.2
--- a/scripts/base_ui.py
+++ b/scripts/base_ui.py
@ -0,0 +1,221 @@
+import sys, os
+# Put the current working directory and this extension's scripts folder
+# (resolved relative to that directory) on sys.path, so that the
+# `from vid2vid import *` further down can be resolved.
+# NOTE(review): this assumes the process is started from the web UI root and
+# that the extension folder is named 'sd-cn-animation' - confirm.
+basedirs = [os.getcwd()]

+for basedir in basedirs:
+    paths_to_ensure = [basedir, basedir + '/extensions/sd-cn-animation/scripts']

+    for scripts_path_fix in paths_to_ensure:
+        # Entries that are already present are left alone to avoid duplicates.
+        if scripts_path_fix in sys.path:
+            continue
+        sys.path.append(scripts_path_fix)
+
+import gradio as gr
+from types import SimpleNamespace
+
+from modules import script_callbacks, shared
+from modules.shared import cmd_opts, opts
+from webui import wrap_gradio_gpu_call
+
+from modules.ui_components import ToolButton, FormRow, FormGroup
+from modules.ui import create_override_settings_dropdown
+import modules.scripts as scripts
+
+from modules.sd_samplers import samplers_for_img2img
+from modules.ui import setup_progressbar, create_sampler_and_steps_selection, ordered_ui_categories, create_output_panel
+
+
+from vid2vid import *
+
+def V2VArgs():
+    """Return the default vid2vid settings as a fresh name -> value dict.
+
+    The keys are listed in the order the UI defaults are defined: seed,
+    width, height, cfg_scale, steps, prompt, n_prompt, processing_strength,
+    fix_frame_strength.
+    """
+    return {
+        'seed': -1,
+        'width': 1024,
+        'height': 576,
+        'cfg_scale': 5.5,
+        'steps': 15,
+        'prompt': "",
+        'n_prompt': "text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck",
+        'processing_strength': 0.85,
+        'fix_frame_strength': 0.15,
+    }
+
+def T2VArgs():
+    """Return the default txt2vid settings as a fresh name -> value dict.
+
+    The keys are listed in the order the UI defaults are defined: seed,
+    width, height, cfg_scale, steps, prompt, n_prompt, processing_strength,
+    fix_frame_strength.
+    """
+    return {
+        'seed': -1,
+        'width': 768,
+        'height': 512,
+        'cfg_scale': 5.5,
+        'steps': 15,
+        'prompt': "",
+        'n_prompt': "text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck",
+        'processing_strength': 0.85,
+        'fix_frame_strength': 0.35,
+    }
+
+def setup_common_values(mode, d):
+    """Create the input components shared by the generation tabs.
+
+    Each group of components is created inside its own `gr.Row()`.
+
+    Args:
+        mode: Tab name (e.g. 'vid2vid'); used as the prefix of the `elem_id`
+            of the prompt rows and prompt textboxes.
+        d: Object whose attributes supply the default values. The attributes
+            read here are `width`, `height`, `n_prompt`, `cfg_scale`, `seed`,
+            `processing_strength` and `fix_frame_strength`.
+
+    Returns:
+        Tuple of components in the order (width, height, prompt, n_prompt,
+        cfg_scale, seed, processing_strength, fix_frame_strength).
+    """
+    with gr.Row():
+        width = gr.Slider(label='Width', minimum=64, maximum=2048, step=64, value=d.width, interactive=True)
+        height = gr.Slider(label='Height', minimum=64, maximum=2048, step=64, value=d.height, interactive=True)
+    with gr.Row(elem_id=f'{mode}_prompt_toprow'):
+        prompt = gr.Textbox(label='Prompt', lines=3, interactive=True, elem_id=f"{mode}_prompt", placeholder="Enter your prompt here...")
+    with gr.Row(elem_id=f'{mode}_n_prompt_toprow'):
+        n_prompt = gr.Textbox(label='Negative prompt', lines=3, interactive=True, elem_id=f"{mode}_n_prompt", value=d.n_prompt)
+    with gr.Row():
+        # No 'Steps' slider is created here; `steps` is not part of the
+        # returned tuple.
+        # step=0.5 keeps fractional defaults such as 5.5 on the slider's grid
+        # (with step=1 starting at 1, the value 5.5 cannot be selected again).
+        cfg_scale = gr.Slider(label='CFG scale', minimum=1, maximum=100, step=0.5, value=d.cfg_scale, interactive=True)
+    with gr.Row():
+        # The keyword is lower-case `interactive`, as on every other component
+        # in this function; a capitalised `Interactive` is a different keyword.
+        seed = gr.Number(label='Seed (this parameter controls how the first frame looks like and the color distribution of the consecutive frames as they are dependent on the first one)', value=d.seed, interactive=True, precision=0)
+    with gr.Row():
+        processing_strength = gr.Slider(label="Processing strength", value=d.processing_strength, minimum=0, maximum=1, step=0.05, interactive=True)
+        fix_frame_strength = gr.Slider(label="Fix frame strength", value=d.fix_frame_strength, minimum=0, maximum=1, step=0.05, interactive=True)

+    return width, height, prompt, n_prompt, cfg_scale, seed, processing_strength, fix_frame_strength
+
+def inputs_ui():
+    """Build the 'vid2vid' and 'txt2vid' tabs and return all locals as a dict.
+
+    Returns:
+        The result of `locals()`: a dict keyed by local variable name. It
+        contains the two default-argument namespaces (`v2v_args`, `t2v_args`),
+        the tab objects (`tab_vid2vid`, `tab_txt2vid`), the file input
+        (`vid2vid_file`) and the components returned by
+        `setup_common_values` for the vid2vid tab (`width`, `height`,
+        `prompt`, `n_prompt`, `cfg_scale`, `seed`, `processing_strength`,
+        `fix_frame_strength`).
+    """
+    # Defaults are wrapped in namespaces so they can be read as attributes.
+    v2v_args = SimpleNamespace(**V2VArgs())
+    t2v_args = SimpleNamespace(**T2VArgs())
+    with gr.Tab('vid2vid') as tab_vid2vid:
+        with gr.Row():
+            gr.HTML('Put your video here')
+        with gr.Row():
+            vid2vid_file = gr.File(label="Input video", interactive=True, file_count="single", file_types=["video"], elem_id="vid_to_vid_chosen_file")
+            #init_img = gr.Image(label="Image for img2img", elem_id="img2img_image", show_label=False, source="upload", interactive=True, type="pil", image_mode="RGBA")
+        #with gr.Row():
+        #    gr.HTML('Alternative: enter the relative (to the webui) path to the file')
+        #with gr.Row():
+        #    vid2vid_frames_path = gr.Textbox(label="Input video path", interactive=True, elem_id="vid_to_vid_chosen_path", placeholder='Enter your video path here, or upload in the box above ^')

+        width, height, prompt, n_prompt, cfg_scale, seed, processing_strength, fix_frame_strength = setup_common_values('vid2vid', v2v_args)
+        #with gr.Row():
+        #    strength = gr.Slider(label="denoising strength", value=d.strength, minimum=0, maximum=1, step=0.05, interactive=True)
+        #    vid2vid_startFrame=gr.Number(label='vid2vid start frame',value=d.vid2vid_startFrame)
+        
+    # The txt2vid tab is only a placeholder for now; t2v_args is not used to
+    # build any component yet.
+    with gr.Tab('txt2vid') as tab_txt2vid:
+        gr.Markdown('Work in progress...') 
+    #    width, height, prompt, n_prompt, steps, cfg_scale, seed, processing_strength, fix_frame_strength = setup_common_values('txt2vid', t2v_args)

+    #with gr.Tab('settings') as tab_setts:        
+    #    gr.Markdown('Work in progress...')         
+         
+    # Every local name defined above becomes a key of the returned dict, so
+    # adding or renaming a local changes what callers receive.
+    return locals()
+
+def on_ui_tabs():
+    """Build the "SD-CN-Animation" tab and wire the Generate button to `start_process`.
+
+    While the interface is being built `modules.scripts.scripts_current` points at the
+    img2img script runner; it is set back to None once the layout is finished.
+
+    Returns:
+        A one-element list holding the tuple (gr.Blocks interface, tab title, element id).
+    """
+    modules.scripts.scripts_current = modules.scripts.scripts_img2img
+    modules.scripts.scripts_img2img.initialize_scripts(is_img2img=True)
+
+    with gr.Blocks(analytics_enabled=False) as sdcnanim_interface:
+        #dv = SimpleNamespace(**T2VOutputArgs())
+        with gr.Row(elem_id='v2v-core').style(equal_height=False, variant='compact'):
+            with gr.Column(scale=1, variant='panel'):
+                with gr.Row(variant='compact'):
+                    run_button = gr.Button('Generate', elem_id="sdcn_anim_generate", variant='primary')
+
+                with gr.Tabs():
+                    # inputs_ui() returns its locals(), so the created components are looked up by name below
+                    components = inputs_ui()
+                    #do_vid2vid = gr.State(value=0)
+
+                    for category in ordered_ui_categories():
+                        if category == "sampler":
+                            steps, sampler_index = create_sampler_and_steps_selection(samplers_for_img2img, "vid2vid")
+
+                        elif category == "override_settings":
+                            with FormRow(elem_id="vid2vid_override_settings_row") as row:
+                                override_settings = create_override_settings_dropdown("vid2vid", row)
+
+                        elif category == "scripts":
+                            with FormGroup(elem_id="script_container"):
+                                custom_inputs = scripts.scripts_img2img.setup_ui()
+
+            with gr.Column(scale=1, variant='compact'):
+                with gr.Column(variant="panel"):
+                    sp_progress = gr.HTML(elem_id="sp_progress", value="")
+                    #sp_outcome = gr.HTML(elem_id="sp_error", value="")
+                    #sp_progressbar = gr.HTML(elem_id="sp_progressbar")
+                    #setup_progressbar(sp_progressbar, sp_preview, 'sp', textinfo=sp_progress)
+
+                    with gr.Row(variant='compact'):
+                        img_preview_curr_frame = gr.Image(label='Current frame', elem_id="img_preview_curr_frame", type='pil').style(height=240)
+                        img_preview_curr_occl = gr.Image(label='Current occlusion', elem_id="img_preview_curr_occl", type='pil').style(height=240)
+                    with gr.Row(variant='compact'):
+                        # Each preview needs its own elem_id; this one used to reuse "img_preview_curr_frame"
+                        img_preview_prev_warp = gr.Image(label='Previous frame warped', elem_id="img_preview_prev_warp", type='pil').style(height=240)
+                        img_preview_processed = gr.Image(label='Processed', elem_id="img_preview_processed", type='pil').style(height=240)
+
+                    html_log = gr.HTML(elem_id='html_log_vid2vid')
+
+                with gr.Row(variant='compact'):
+                    # Hidden placeholder wired into every argument slot that has no UI control yet
+                    dummy_component = gr.Label(visible=False)
+
+
+            # Define parameters for the action methods. Not all of them are included yet.
+            # The order must match the name list in args_to_dict().
+            method_inputs = [
+                dummy_component,                    # send None for task_id
+                dummy_component,                    # mode
+                components['prompt'],               # prompt
+                components['n_prompt'],             # negative_prompt
+                dummy_component,                    # prompt_styles
+                components['vid2vid_file'],         # input_video
+                dummy_component,                    # sketch
+                dummy_component,                    # init_img_with_mask
+                dummy_component,                    # inpaint_color_sketch
+                dummy_component,                    # inpaint_color_sketch_orig
+                dummy_component,                    # init_img_inpaint
+                dummy_component,                    # init_mask_inpaint
+                steps,                              # steps
+                sampler_index,                      # sampler_index
+                dummy_component,                    # mask_blur
+                dummy_component,                    # mask_alpha
+                dummy_component,                    # inpainting_fill
+                dummy_component,                    # restore_faces
+                dummy_component,                    # tiling
+                dummy_component,                    # n_iter
+                dummy_component,                    # batch_size
+                components['cfg_scale'],            # cfg_scale
+                dummy_component,                    # image_cfg_scale
+                components['processing_strength'],  # denoising_strength
+                components['seed'],                 # seed
+                dummy_component,                    # subseed
+                dummy_component,                    # subseed_strength
+                dummy_component,                    # seed_resize_from_h
+                dummy_component,                    # seed_resize_from_w
+                dummy_component,                    # seed_enable_extras
+                components['height'],               # height
+                components['width'],                # width
+                dummy_component,                    # resize_mode
+                dummy_component,                    # inpaint_full_res
+                dummy_component,                    # inpaint_full_res_padding
+                dummy_component,                    # inpainting_mask_invert
+                dummy_component,                    # img2img_batch_input_dir
+                dummy_component,                    # img2img_batch_output_dir
+                dummy_component,                    # img2img_batch_inpaint_mask_dir
+                override_settings,                  # override_settings_texts
+            ] + custom_inputs
+
+            # One output per value of the 6-tuples yielded by start_process
+            method_outputs = [
+                sp_progress,
+                img_preview_curr_frame,
+                img_preview_curr_occl,
+                img_preview_prev_warp,
+                img_preview_processed,
+                html_log,
+            ]
+
+            run_button.click(
+                fn=start_process, #wrap_gradio_gpu_call(start_process, extra_outputs=[None, '', '']), 
+                inputs=method_inputs,
+                outputs=method_outputs,
+                show_progress=True,
+            )
+
+        modules.scripts.scripts_current = None
+
+        # define queue - required for generators
+        sdcnanim_interface.queue(concurrency_count=1)
+    return [(sdcnanim_interface, "SD-CN-Animation", "sd_cn_animation_interface")]
+
+
+# Register the tab builder defined above through script_callbacks.on_ui_tabs.
+script_callbacks.on_ui_tabs(on_ui_tabs)
--- a/scripts/flow_utils.py
+++ b/scripts/flow_utils.py
@ -0,0 +1,154 @@
+import sys, os
+# Ensure the current working directory and the extension folders below it are on sys.path
+# before the imports that follow.
+basedirs = [os.getcwd()]
+
+for basedir in basedirs:
+    paths_to_ensure = [
+        basedir,
+        f'{basedir}/extensions/sd-cn-animation/scripts',
+        f'{basedir}/extensions/sd-cn-animation/RAFT',
+    ]
+
+    for scripts_path_fix in paths_to_ensure:
+        # Skip entries that are already present so repeated imports do not add duplicates
+        if scripts_path_fix in sys.path:
+            continue
+        sys.path.append(scripts_path_fix)
+
+import numpy as np
+import cv2
+
+from collections import namedtuple
+import torch
+import argparse
+from RAFT.raft import RAFT
+from RAFT.utils.utils import InputPadder
+
+import modules.paths as ph
+import gc
+
+# Cached RAFT network: None until RAFT_estimate_flow() loads it, reset to None by RAFT_clear_memory().
+RAFT_model = None
+# Module-level MOG2 background subtractor that RAFT_estimate_flow() hands to background_subtractor().
+fgbg = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=16, detectShadows=True)
+
+def background_subtractor(frame, fgbg):
+  """Return `frame` masked by the foreground mask that `fgbg.apply(frame)` produces.
+
+  `fgbg` is updated as a side effect of `apply`.
+  """
+  foreground_mask = fgbg.apply(frame)
+  masked_frame = cv2.bitwise_and(frame, frame, mask=foreground_mask)
+  return masked_frame
+
+def RAFT_clear_memory():
+  """Drop the cached RAFT model and ask Python and CUDA to release its memory.
+
+  The global is rebound to None (instead of being deleted and re-created) so it is never
+  left undefined, even if the garbage collection or the CUDA cache flush raises.
+  """
+  global RAFT_model
+  RAFT_model = None
+  gc.collect()
+  torch.cuda.empty_cache()
+
+def RAFT_estimate_flow(frame1, frame2, device='cuda', subtract_background=True):
+  """Estimate forward and backward optical flow between two frames with RAFT.
+
+  The RAFT model is created and its weights loaded on the first call, then cached in the
+  module-level `RAFT_model` until `RAFT_clear_memory()` is called.
+
+  Args:
+    frame1, frame2: channel-last image arrays (they are permuted with (2, 0, 1) before inference).
+    device: torch device the model and the frames are moved to.
+    subtract_background: when True both frames are first passed through `background_subtractor`.
+
+  Returns:
+    (next_flow, prev_flow, occlusion_mask, frame1, frame2), where next_flow is the frame1 -> frame2
+    flow, prev_flow the frame2 -> frame1 flow (both channel-last numpy arrays), occlusion_mask is
+    the norm of their sum repeated over 3 channels, and the frames are the (possibly
+    background-subtracted) inputs.
+    NOTE: the flows come from the padded inputs; `padder.unpad` is not applied here.
+  """
+  global RAFT_model
+  if RAFT_model is None:
+    args = argparse.Namespace(**{
+        'model': ph.models_path + '/RAFT/raft-things.pth',
+        'mixed_precision': True,
+        'small': False,
+        'alternate_corr': False,
+        'path': ""
+    })
+
+    # Build into a local and publish it only once fully loaded: if loading the checkpoint fails,
+    # the global stays None and the next call retries instead of reusing a model without weights.
+    model = torch.nn.DataParallel(RAFT(args))
+    # map_location keeps the load working on machines without the device the checkpoint was saved from
+    model.load_state_dict(torch.load(args.model, map_location='cpu'))
+
+    model = model.module
+    model.to(device)
+    model.eval()
+    RAFT_model = model
+
+  if subtract_background:
+    frame1 = background_subtractor(frame1, fgbg)
+    frame2 = background_subtractor(frame2, fgbg)
+
+  with torch.no_grad():
+    frame1_torch = torch.from_numpy(frame1).permute(2, 0, 1).float()[None].to(device)
+    frame2_torch = torch.from_numpy(frame2).permute(2, 0, 1).float()[None].to(device)
+
+    padder = InputPadder(frame1_torch.shape)
+    image1, image2 = padder.pad(frame1_torch, frame2_torch)
+
+    # estimate optical flow
+    _, next_flow = RAFT_model(image1, image2, iters=20, test_mode=True)
+    _, prev_flow = RAFT_model(image2, image1, iters=20, test_mode=True)
+
+    next_flow = next_flow[0].permute(1, 2, 0).cpu().numpy()
+    prev_flow = prev_flow[0].permute(1, 2, 0).cpu().numpy()
+
+    # forward + backward flow cancels out where the two estimates agree
+    fb_flow = next_flow + prev_flow
+    fb_norm = np.linalg.norm(fb_flow, axis=2)
+
+    occlusion_mask = fb_norm[..., None].repeat(3, axis=-1)
+
+  return next_flow, prev_flow, occlusion_mask, frame1, frame2
+
+# ... rest of the file ...
+
+
+def compute_diff_map(next_flow, prev_flow, prev_frame, cur_frame, prev_frame_styled):
+  """Warp the previous frames along the flow and build a blending mask for the current frame.
+
+  Args:
+    next_flow, prev_flow: channel-last flow arrays; channel 0 is applied to x, channel 1 to y.
+      They may have a different resolution than the frames and are resized to match.
+    prev_frame: previous source frame (channel-last array).
+    cur_frame: current source frame; its height and width define the output size.
+    prev_frame_styled: previous stylized frame (channel-last array).
+
+  Returns:
+    (alpha_mask, warped_frame_styled): alpha_mask is a 3-channel float mask clipped to [0, 1]
+    that is large where the warped previous frame is unreliable; warped_frame_styled is
+    `prev_frame_styled` sampled along `prev_flow`.
+  """
+  h, w = cur_frame.shape[:2]
+  fl_h, fl_w = next_flow.shape[:2]
+
+  # normalize flow (x by the flow width, y by the flow height)
+  next_flow = next_flow / np.array([fl_w, fl_h])
+  prev_flow = prev_flow / np.array([fl_w, fl_h])
+
+  # remove low value noise (@alexfredo suggestion)
+  next_flow[np.abs(next_flow) < 0.05] = 0
+  prev_flow[np.abs(prev_flow) < 0.05] = 0
+
+  # resize flow; x must be scaled back by the frame width and y by the frame height
+  # (the previous [h, w] order distorted the flow for non-square frames)
+  next_flow = cv2.resize(next_flow, (w, h))
+  next_flow = (next_flow * np.array([w, h])).astype(np.float32)
+  prev_flow = cv2.resize(prev_flow, (w, h))
+  prev_flow = (prev_flow * np.array([w, h])).astype(np.float32)
+
+  # Generate sampling grids
+  grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w))
+  flow_grid = torch.stack((grid_x, grid_y), dim=0).float()
+  flow_grid += torch.from_numpy(prev_flow).permute(2, 0, 1)
+  flow_grid = flow_grid.unsqueeze(0)
+  # map pixel 0 -> -1 and pixel (size - 1) -> 1, i.e. the align_corners=True convention
+  flow_grid[:, 0, :, :] = 2 * flow_grid[:, 0, :, :] / (w - 1) - 1
+  flow_grid[:, 1, :, :] = 2 * flow_grid[:, 1, :, :] / (h - 1) - 1
+  flow_grid = flow_grid.permute(0, 2, 3, 1)
+
+  prev_frame_torch = torch.from_numpy(prev_frame).float().unsqueeze(0).permute(0, 3, 1, 2) #N, C, H, W
+  prev_frame_styled_torch = torch.from_numpy(prev_frame_styled).float().unsqueeze(0).permute(0, 3, 1, 2) #N, C, H, W
+
+  # align_corners=True matches the grid normalization above, so a zero flow reproduces the input exactly
+  warped_frame = torch.nn.functional.grid_sample(prev_frame_torch, flow_grid, padding_mode="reflection", align_corners=True).permute(0, 2, 3, 1)[0].numpy()
+  warped_frame_styled = torch.nn.functional.grid_sample(prev_frame_styled_torch, flow_grid, padding_mode="reflection", align_corners=True).permute(0, 2, 3, 1)[0].numpy()
+
+  # compute occlusion mask
+  fb_flow = next_flow + prev_flow
+  fb_norm = np.linalg.norm(fb_flow, axis=2)
+
+  occlusion_mask = fb_norm[..., None]
+
+  diff_mask_org = np.abs(warped_frame.astype(np.float32) - cur_frame.astype(np.float32)) / 255
+  diff_mask_org = diff_mask_org.max(axis = -1, keepdims=True)
+
+  diff_mask_stl = np.abs(warped_frame_styled.astype(np.float32) - cur_frame.astype(np.float32)) / 255
+  diff_mask_stl = diff_mask_stl.max(axis = -1, keepdims=True)
+
+  # np.maximum only compares two arrays (a third positional argument is its `out` buffer),
+  # so the element-wise maximum of all three masks needs a reduce
+  alpha_mask = np.maximum.reduce([occlusion_mask * 0.3, diff_mask_org * 4, diff_mask_stl * 2])
+  alpha_mask = alpha_mask.repeat(3, axis = -1)
+
+  #alpha_mask_blured = cv2.dilate(alpha_mask, np.ones((5, 5), np.float32))
+  # the border mode has to be passed by keyword: the 4th positional argument of GaussianBlur is `dst`
+  alpha_mask = cv2.GaussianBlur(alpha_mask, (51,51), 5, borderType=cv2.BORDER_REFLECT)
+
+  alpha_mask = np.clip(alpha_mask, 0, 1)
+
+  return alpha_mask, warped_frame_styled
+
+def frames_norm(occl):
+  """Rescale values so that 0 maps to -1 and 255 maps to 1."""
+  scaled = occl / 127.5
+  return scaled - 1
+
+def flow_norm(flow):
+  """Divide flow values by 255."""
+  scaled = flow / 255
+  return scaled
+
+def occl_norm(occl):
+  """Rescale occlusion values so that 0 maps to -1 and 255 maps to 1."""
+  scaled = occl / 127.5
+  return scaled - 1
+
+def flow_renorm(flow):
+  """Multiply flow values by 255 (inverse of `flow / 255`)."""
+  restored = flow * 255
+  return restored
+
+def occl_renorm(occl):
+  """Rescale occlusion values so that -1 maps to 0 and 1 maps to 255."""
+  shifted = occl + 1
+  return shifted * 127.5
--- a/scripts/vid2vid.py
+++ b/scripts/vid2vid.py
@ -0,0 +1,398 @@
+import sys, os
+# Ensure the current working directory and the extension's scripts folder are on sys.path
+# before the imports that follow.
+basedirs = [os.getcwd()]
+
+for basedir in basedirs:
+    paths_to_ensure = [
+        basedir,
+        f'{basedir}/extensions/sd-cn-animation/scripts',
+    ]
+
+    for scripts_path_fix in paths_to_ensure:
+        # Skip entries that are already present so repeated imports do not add duplicates
+        if scripts_path_fix in sys.path:
+            continue
+        sys.path.append(scripts_path_fix)
+
+import math
+import os
+import sys
+import traceback
+
+import numpy as np
+from PIL import Image, ImageOps, ImageFilter, ImageEnhance, ImageChops
+
+from modules import devices, sd_samplers, img2img
+from modules import shared, sd_hijack, lowvram
+from modules.generation_parameters_copypaste import create_override_settings_dict
+from modules.processing import Processed, StableDiffusionProcessingImg2Img, process_images
+from modules.shared import opts, devices
+import modules.shared as shared
+import modules.processing as processing
+from modules.ui import plaintext_to_html
+import modules.images as images
+import modules.scripts
+
+import gc
+import cv2
+import gradio as gr
+
+import time
+import skimage
+import datetime
+
+from flow_utils import RAFT_estimate_flow, RAFT_clear_memory, compute_diff_map
+from types import SimpleNamespace   
+
+class sdcn_anim_tmp:
+    # Namespace of class-level attributes holding the state of the running vid2vid job.
+    # The functions in this file read and write these attributes directly on the class.
+
+    # Counters advanced by start_process: one per prepare step / per processed frame
+    prepear_counter = 0
+    process_counter = 0
+    # cv2.VideoCapture for the input and cv2.VideoWriter for the result (set by start_process)
+    input_video = None
+    output_video = None
+    # Latest source frame, previous source frame and previous stylized frame
+    curr_frame = None
+    prev_frame = None
+    prev_frame_styled = None
+    # Values read from the input video via CAP_PROP_FPS / CAP_PROP_FRAME_COUNT
+    fps = None
+    total_frames = None
+    # Buffers allocated by start_process: 11 frames and the 10 flow pairs between neighbouring frames
+    prepared_frames = None
+    prepared_next_flows = None
+    prepared_prev_flows = None
+    # True once a prepare phase has finished and its frames are waiting to be processed
+    frames_prepared = False
+
+def read_frame_from_video():
+    """Read the next frame of `sdcn_anim_tmp.input_video` and return it in RGB order.
+
+    Returns None when no frame could be read. If the capture is not open it is released
+    and None is returned.
+    """
+    capture = sdcn_anim_tmp.input_video
+
+    if not capture.isOpened():
+        capture.release()
+        return None
+
+    _, frame = capture.read()
+    if frame is None:
+        return None
+
+    # OpenCV delivers BGR frames; the rest of the pipeline works on RGB
+    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+def get_cur_stat():
+    """Return the progress text built from the prepare/process counters and the total frame count."""
+    total = sdcn_anim_tmp.total_frames
+    prepared_part = f'Frames prepared: {sdcn_anim_tmp.prepear_counter + 1} / {total}; '
+    processed_part = f'Frames processed: {sdcn_anim_tmp.process_counter + 1} / {total}; '
+    return prepared_part + processed_part
+
+def clear_memory_from_sd():
+    """Unload the Stable Diffusion model referenced by `shared.sd_model` and free memory.
+
+    If a model is loaded, its hijack is undone, `lowvram.send_everything_to_cpu()` is attempted
+    and `shared.sd_model` is set to None. Garbage collection and `devices.torch_gc()` always run.
+    """
+    if shared.sd_model is not None:
+        sd_hijack.model_hijack.undo_hijack(shared.sd_model)
+        try:
+            lowvram.send_everything_to_cpu()
+        except Exception as e:
+            # Best effort: unloading continues, but the failure is reported instead of being hidden
+            print(f'sd-cn-animation: could not move the SD model to CPU: {e}', file=sys.stderr)
+        # Rebinding to None drops this module's reference to the model
+        shared.sd_model = None
+    gc.collect()
+    devices.torch_gc()
+
+def get_device():
+    """Return the device reported by `devices.get_optimal_device()`."""
+    return devices.get_optimal_device()
+
+def args_to_dict(*args): # converts list of argumets into dictionary for better handling of it
+    """Turn the positional UI values into a name -> value dict with defaults filled in.
+
+    The first 40 positions are matched to the names in `args_list`; everything after them is
+    stored under 'script_inputs'. A value of None is replaced by the default for its name when
+    one exists. Missing trailing positions are treated as None.
+
+    Returns:
+        (args_dict, args): the dict described above and the positional values as a list with
+        the same defaults applied (at least 40 entries long).
+    """
+    args_list = [
+        'id_task', 'mode', 'prompt', 'negative_prompt', 'prompt_styles', 'init_video', 'sketch',
+        'init_img_with_mask', 'inpaint_color_sketch', 'inpaint_color_sketch_orig', 'init_img_inpaint',
+        'init_mask_inpaint', 'steps', 'sampler_index', 'mask_blur', 'mask_alpha', 'inpainting_fill',
+        'restore_faces', 'tiling', 'n_iter', 'batch_size', 'cfg_scale', 'image_cfg_scale',
+        'denoising_strength', 'seed', 'subseed', 'subseed_strength', 'seed_resize_from_h',
+        'seed_resize_from_w', 'seed_enable_extras', 'height', 'width', 'resize_mode', 'inpaint_full_res',
+        'inpaint_full_res_padding', 'inpainting_mask_invert', 'img2img_batch_input_dir',
+        'img2img_batch_output_dir', 'img2img_batch_inpaint_mask_dir', 'override_settings_texts',
+    ]
+
+    # set default values for params that were not specified
+    args_dict = {
+        'mode': 0,
+        'prompt': '',
+        'negative_prompt': '',
+        'prompt_styles': [],
+        'init_video': None, # Always required
+
+        'steps': 15,
+        'sampler_index': 0, # 'Euler a'    
+        'mask_blur': 0,
+
+        'inpainting_fill': 1, # original
+        'restore_faces': False,
+        'tiling': False,
+        'n_iter': 1,
+        'batch_size': 1,
+        'cfg_scale': 5.5,
+        'image_cfg_scale': 1.5,
+        'denoising_strength': 0.75,
+        'seed': -1,
+        'subseed': -1,
+        'subseed_strength': 0,
+        'seed_resize_from_h': 512,
+        'seed_resize_from_w': 512,
+        'seed_enable_extras': False,
+        'height': 512,
+        'width': 512,
+        'resize_mode': 1,
+        'inpaint_full_res': True,
+        'inpaint_full_res_padding': 0,
+    }
+
+    args = list(args)
+
+    # Pad short input so a caller that passes fewer values than there are names gets the
+    # defaults instead of an IndexError
+    missing = len(args_list) - len(args)
+    if missing > 0:
+        args.extend([None] * missing)
+
+    for i, name in enumerate(args_list):
+        if args[i] is None and name in args_dict:
+            args[i] = args_dict[name]
+        else:
+            args_dict[name] = args[i]
+
+    args_dict['script_inputs'] = args[len(args_list):]
+    return args_dict, args
+
+def start_process(*args):
+    """Stylize the uploaded video frame by frame, yielding live previews for the UI.
+
+    This is a generator. `args` are the positional values described by `args_to_dict`.
+    The first frame is sent through `modules.img2img.img2img` on its own. After that the loop
+    alternates between a prepare phase (read frames one per step and estimate the optical flow
+    to the previous frame with RAFT, up to 10 frames) and a process phase (stylize each prepared
+    frame, blend it with the warped previous result, refine it and write it to the output video).
+
+    Yields:
+        A 6-tuple (status text, current frame, occlusion mask, warped previous styled frame,
+        processed frame, '').
+
+    Raises:
+        ValueError: if no input video was supplied or its first frame cannot be read.
+    """
+    args_dict, args_list = args_to_dict(*args)
+
+    if args_dict['init_video'] is None:
+        raise ValueError('No input video was provided')
+
+    sdcn_anim_tmp.process_counter = 0
+    sdcn_anim_tmp.prepear_counter = 0
+    # A previous run that stopped right after a prepare phase leaves this flag set, which would
+    # make the new run start by processing a buffer that has not been filled yet
+    sdcn_anim_tmp.frames_prepared = False
+
+    # Open the input video file
+    sdcn_anim_tmp.input_video = cv2.VideoCapture(args_dict['init_video'].name)
+
+    # Get useful info from the source video
+    sdcn_anim_tmp.fps = int(sdcn_anim_tmp.input_video.get(cv2.CAP_PROP_FPS))
+    sdcn_anim_tmp.total_frames = int(sdcn_anim_tmp.input_video.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    # Create an output video file with the same fps as the input video and the requested size.
+    # The timestamp avoids ':' because that character is not allowed in Windows file names.
+    output_video_name = f'outputs/sd-cn-animation/vid2vid/{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.mp4'
+    os.makedirs(os.path.dirname(output_video_name), exist_ok=True)
+    sdcn_anim_tmp.output_video = cv2.VideoWriter(output_video_name, cv2.VideoWriter_fourcc(*'mp4v'), sdcn_anim_tmp.fps, (args_dict['width'], args_dict['height']))
+
+    try:
+        curr_frame = read_frame_from_video()
+        if curr_frame is None:
+            raise ValueError('Could not read the first frame of the input video')
+        curr_frame = cv2.resize(curr_frame, (args_dict['width'], args_dict['height']))
+        sdcn_anim_tmp.prepared_frames = np.zeros((11, args_dict['height'], args_dict['width'], 3), dtype=np.uint8)
+        sdcn_anim_tmp.prepared_next_flows = np.zeros((10, args_dict['height'], args_dict['width'], 2))
+        sdcn_anim_tmp.prepared_prev_flows = np.zeros((10, args_dict['height'], args_dict['width'], 2))
+        sdcn_anim_tmp.prepared_frames[0] = curr_frame
+
+        # Position 5 is the 'init_video' slot of args_to_dict; img2img receives the frame there
+        args_list[5] = Image.fromarray(curr_frame)
+        processed_frames, _, _, _ = modules.img2img.img2img(*args_list) #img2img(args_dict)
+        processed_frame = np.array(processed_frames[0])
+        processed_frame = skimage.exposure.match_histograms(processed_frame, curr_frame, multichannel=False, channel_axis=-1)
+
+        sdcn_anim_tmp.curr_frame = curr_frame
+        sdcn_anim_tmp.prev_frame = curr_frame.copy()
+        sdcn_anim_tmp.prev_frame_styled = processed_frame.copy()
+        yield get_cur_stat(), sdcn_anim_tmp.curr_frame, None, None, processed_frame, ''
+
+        # TODO: SOLVE PROBLEM with wrong prev frame on the start on new processing iterations
+
+        # Defined up front so the final return also works when the loop body never runs
+        occlusion_mask = None
+        warped_styled_frame = None
+        prepare_steps = 10
+
+        # Every remaining frame needs one prepare step and one process step
+        for step in range((sdcn_anim_tmp.total_frames-1) * 2):
+            # args_list is modified below, so start every step from the original values
+            args_dict, args_list = args_to_dict(*args)
+
+            occlusion_mask = None
+            prev_frame = None
+            curr_frame = sdcn_anim_tmp.curr_frame
+            # gr.Image.update() with no arguments is yielded for previews this step does not produce
+            warped_styled_frame = gr.Image.update()
+            processed_frame = gr.Image.update()
+
+            if sdcn_anim_tmp.process_counter % prepare_steps == 0 and not sdcn_anim_tmp.frames_prepared: # prepare next 10 frames for processing
+                #clear_memory_from_sd()
+                device = get_device()
+
+                curr_frame = read_frame_from_video()
+                if curr_frame is not None:
+                    curr_frame = cv2.resize(curr_frame, (args_dict['width'], args_dict['height']))
+                    prev_frame = sdcn_anim_tmp.prev_frame.copy()
+
+                    next_flow, prev_flow, occlusion_mask, _, _ = RAFT_estimate_flow(prev_frame, curr_frame, subtract_background=False, device=device)
+                    occlusion_mask = np.clip(occlusion_mask * 0.1 * 255, 0, 255).astype(np.uint8)
+
+                    cn = sdcn_anim_tmp.prepear_counter % 10
+                    if sdcn_anim_tmp.prepear_counter % 10 == 0:
+                        sdcn_anim_tmp.prepared_frames[cn] = sdcn_anim_tmp.prev_frame
+                    sdcn_anim_tmp.prepared_frames[cn + 1] = curr_frame.copy()
+                    sdcn_anim_tmp.prepared_next_flows[cn] = next_flow.copy()
+                    sdcn_anim_tmp.prepared_prev_flows[cn] = prev_flow.copy()
+
+                    sdcn_anim_tmp.prev_frame = curr_frame.copy()
+
+                sdcn_anim_tmp.prepear_counter += 1
+                if sdcn_anim_tmp.prepear_counter % prepare_steps == 0 or \
+                   sdcn_anim_tmp.prepear_counter >= sdcn_anim_tmp.total_frames - 1 or \
+                   curr_frame is None:
+                    # Remove RAFT from memory
+                    RAFT_clear_memory()
+                    sdcn_anim_tmp.frames_prepared = True
+            else:
+                # process frame
+                sdcn_anim_tmp.frames_prepared = False
+
+                cn = sdcn_anim_tmp.process_counter % 10
+                curr_frame = sdcn_anim_tmp.prepared_frames[cn+1]
+                prev_frame = sdcn_anim_tmp.prepared_frames[cn]
+                next_flow = sdcn_anim_tmp.prepared_next_flows[cn]
+                prev_flow = sdcn_anim_tmp.prepared_prev_flows[cn]
+
+                # process current frame (position 24 is 'seed': -1 here)
+                args_list[5] = Image.fromarray(curr_frame)
+                args_list[24] = -1
+                processed_frames, _, _, _ = modules.img2img.img2img(*args_list)
+                processed_frame = np.array(processed_frames[0])
+
+                alpha_mask, warped_styled_frame = compute_diff_map(next_flow, prev_flow, prev_frame, curr_frame, sdcn_anim_tmp.prev_frame_styled)
+                # This clipping at lower side required to fix small trailing issues that for some reason left outside of the bright part of the mask,
+                # and at the higher part it making parts changed strongly to do it with less flickering.
+                alpha_mask = np.clip(alpha_mask + 0.05, 0.05, 0.95)
+
+                # shape[:2] is (height, width); x is normalized by the width and y by the height
+                fl_h, fl_w = prev_flow.shape[:2]
+                prev_flow_n = prev_flow / np.array([fl_w, fl_h])
+                flow_mask = np.clip(1 - np.linalg.norm(prev_flow_n, axis=-1)[...,None], 0, 1)
+
+                # fix warped styled frame from duplicated that occures on the places where flow is zero, but only because there is no place to get the color from
+                warped_styled_frame = curr_frame.astype(float) * alpha_mask * flow_mask + warped_styled_frame.astype(float) * (1 - alpha_mask * flow_mask)
+
+                occlusion_mask = np.clip(alpha_mask * 255, 0, 255).astype(np.uint8)
+
+                # normalizing the colors
+                processed_frame = skimage.exposure.match_histograms(processed_frame, curr_frame, multichannel=False, channel_axis=-1)
+                processed_frame = processed_frame.astype(float) * alpha_mask + warped_styled_frame.astype(float) * (1 - alpha_mask)
+
+                processed_frame = processed_frame * 0.9 + curr_frame * 0.1
+                processed_frame = np.clip(processed_frame, 0, 255).astype(np.uint8)
+                sdcn_anim_tmp.prev_frame_styled = processed_frame.copy()
+
+                # second pass over the blended frame: position 23 is 'denoising_strength', 24 is 'seed'
+                args_list[5] = Image.fromarray(processed_frame)
+                args_list[23] = 0.15
+                args_list[24] = 8888
+                processed_frames, _, _, _ = modules.img2img.img2img(*args_list)
+                processed_frame = np.array(processed_frames[0])
+
+                processed_frame = np.clip(processed_frame, 0, 255).astype(np.uint8)
+                warped_styled_frame = np.clip(warped_styled_frame, 0, 255).astype(np.uint8)
+
+                # Write the frame to the output video (OpenCV expects BGR)
+                frame_out = np.clip(processed_frame, 0, 255).astype(np.uint8)
+                frame_out = cv2.cvtColor(frame_out, cv2.COLOR_RGB2BGR)
+                sdcn_anim_tmp.output_video.write(frame_out)
+
+                sdcn_anim_tmp.process_counter += 1
+                if sdcn_anim_tmp.process_counter >= sdcn_anim_tmp.total_frames - 1:
+                    sdcn_anim_tmp.input_video.release()
+                    sdcn_anim_tmp.output_video.release()
+                    sdcn_anim_tmp.prev_frame = None
+
+            yield get_cur_stat(), curr_frame, occlusion_mask, warped_styled_frame, processed_frame, ''
+
+        return get_cur_stat(), curr_frame, occlusion_mask, warped_styled_frame, processed_frame, ''
+    finally:
+        # Also runs on errors, on an abandoned generator and when the reported frame count was
+        # too high, so the capture is closed and the output file is finalized in every case
+        sdcn_anim_tmp.input_video.release()
+        sdcn_anim_tmp.output_video.release()
+
+'''
+# id_task: str, mode: int, prompt: str, negative_prompt: str, prompt_styles: list, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, steps: int, sampler_index: int, mask_blur: int, mask_alpha: float, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, *args
+def img2img(args_dict):  
+    args = SimpleNamespace(**args_dict)
+    print('override_settings:', args.override_settings_texts)
+    override_settings = create_override_settings_dict(args.override_settings_texts)
+
+    is_batch = args.mode == 5
+
+    if args.mode == 0:  # img2img
+        image = args.init_img.convert("RGB")
+        mask = None
+    elif args.mode == 1:  # img2img sketch
+        image = args.sketch.convert("RGB")
+        mask = None
+    elif args.mode == 2:  # inpaint
+        image, mask = args.init_img_with_mask["image"], args.init_img_with_mask["mask"]
+        alpha_mask = ImageOps.invert(image.split()[-1]).convert('L').point(lambda x: 255 if x > 0 else 0, mode='1')
+        mask = ImageChops.lighter(alpha_mask, mask.convert('L')).convert('L')
+        image = image.convert("RGB")
+    elif args.mode == 3:  # inpaint sketch
+        image = args.inpaint_color_sketch
+        orig = args.inpaint_color_sketch_orig or args.inpaint_color_sketch
+        pred = np.any(np.array(image) != np.array(orig), axis=-1)
+        mask = Image.fromarray(pred.astype(np.uint8) * 255, "L")
+        mask = ImageEnhance.Brightness(mask).enhance(1 - args.mask_alpha / 100)
+        blur = ImageFilter.GaussianBlur(args.mask_blur)
+        image = Image.composite(image.filter(blur), orig, mask.filter(blur))
+        image = image.convert("RGB")
+    elif args.mode == 4:  # inpaint upload mask
+        image = args.init_img_inpaint
+        mask = args.init_mask_inpaint
+    else:
+        image = None
+        mask = None
+
+    # Use the EXIF orientation of photos taken by smartphones.
+    if image is not None:
+        image = ImageOps.exif_transpose(image)
+
+    assert 0. <= args.denoising_strength <= 1., 'can only work with strength in [0.0, 1.0]'
+
+    p = StableDiffusionProcessingImg2Img(
+        sd_model=shared.sd_model,
+        outpath_samples=opts.outdir_samples or opts.outdir_img2img_samples,
+        outpath_grids=opts.outdir_grids or opts.outdir_img2img_grids,
+        prompt=args.prompt,
+        negative_prompt=args.negative_prompt,
+        styles=args.prompt_styles,
+        seed=args.seed,
+        subseed=args.subseed,
+        subseed_strength=args.subseed_strength,
+        seed_resize_from_h=args.seed_resize_from_h,
+        seed_resize_from_w=args.seed_resize_from_w,
+        seed_enable_extras=args.seed_enable_extras,
+        sampler_name=sd_samplers.samplers_for_img2img[args.sampler_index].name,
+        batch_size=args.batch_size,
+        n_iter=args.n_iter,
+        steps=args.steps,
+        cfg_scale=args.cfg_scale,
+        width=args.width,
+        height=args.height,
+        restore_faces=args.restore_faces,
+        tiling=args.tiling,
+        init_images=[image],
+        mask=mask,
+        mask_blur=args.mask_blur,
+        inpainting_fill=args.inpainting_fill,
+        resize_mode=args.resize_mode,
+        denoising_strength=args.denoising_strength,
+        image_cfg_scale=args.image_cfg_scale,
+        inpaint_full_res=args.inpaint_full_res,
+        inpaint_full_res_padding=args.inpaint_full_res_padding,
+        inpainting_mask_invert=args.inpainting_mask_invert,
+        override_settings=override_settings,
+    )
+
+    p.scripts = modules.scripts.scripts_txt2img
+    p.script_args = args.script_inputs
+
+    print('script_inputs 1:', args.script_inputs[1].__dict__)
+    #print('script_inputs 2:', args.script_inputs[1])
+
+    if shared.cmd_opts.enable_console_prompts:
+        print(f"\nimg2img: {args.prompt}", file=shared.progress_print_out)
+
+    if mask:
+        p.extra_generation_params["Mask blur"] = args.mask_blur
+
+    if is_batch:
+        ...
+    #    assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
+    #    process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args.script_inputs)
+    #    processed = Processed(p, [], p.seed, "")
+    else:
+        processed = modules.scripts.scripts_img2img.run(p, *args.script_inputs)
+        if processed is None:
+            processed = process_images(p)
+    
+    p.close()
+
+    shared.total_tqdm.clear()
+
+    generation_info_js = processed.js()
+    if opts.samples_log_stdout:
+        print(generation_info_js)
+
+    if opts.do_not_show_images:
+        processed.images = []
+
+    return processed.images[0] #, generation_info_js, plaintext_to_html(processed.info), plaintext_to_html(processed.comments)'''