pull/87/head
Alexey Borsky 2023-05-03 02:46:56 +03:00
parent c34cfe2976
commit bb4353a264
12 changed files with 940 additions and 232 deletions

View File

@ -1,143 +0,0 @@
import torch
import torch.nn as nn
import torch.functional as F
# Define the model
class FloweR(nn.Module):
    """U-Net-style convolutional network over a short window of video frames.

    The ``window_size`` RGB frames of a clip are stacked along the channel
    axis, passed through eight 128-channel convolution stages (seven of them
    preceded by 2x average pooling) and then through seven 2x upsampling
    stages whose outputs are added to the encoder feature map of the same
    resolution. A final convolution produces 3 output channels per pixel
    (annotated as "2 + 1" in the original code - presumably a 2-channel flow
    plus one extra map; confirm against the training code).

    Attribute names ``conv_block_1`` ... ``conv_block_16`` and the layer order
    inside every ``nn.Sequential`` define the ``state_dict`` keys, so they must
    not be changed without re-exporting the saved checkpoints.

    Args:
        input_size: ``(height, width)`` of the frames. Because the encoder
            halves the resolution seven times and the decoder adds feature
            maps of matching resolution, both values have to be multiples of
            ``2 ** 7 = 128`` for the skip additions to line up.
        window_size: number of frames expected in every input clip.
    """

    def __init__(self, input_size = (384, 384), window_size = 4):
        super().__init__()

        self.input_size = input_size
        self.window_size = window_size

        # Sizes in the comments below are for the default 384 x 384 input.
        # INPUT: 384 x 384 x (window_size * 3)

        ### DOWNSCALE ###
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(3 * self.window_size, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        )  # 384 x 384 x 128
        self.conv_block_2 = self._down_block()  # 192 x 192 x 128
        self.conv_block_3 = self._down_block()  # 96 x 96 x 128
        self.conv_block_4 = self._down_block()  # 48 x 48 x 128
        self.conv_block_5 = self._down_block()  # 24 x 24 x 128
        self.conv_block_6 = self._down_block()  # 12 x 12 x 128
        self.conv_block_7 = self._down_block()  # 6 x 6 x 128
        self.conv_block_8 = self._down_block()  # 3 x 3 x 128

        ### UPSCALE ###
        self.conv_block_9 = self._up_block()  # 6 x 6 x 128
        self.conv_block_10 = self._up_block()  # 12 x 12 x 128
        self.conv_block_11 = self._up_block()  # 24 x 24 x 128
        self.conv_block_12 = self._up_block()  # 48 x 48 x 128
        self.conv_block_13 = self._up_block()  # 96 x 96 x 128
        self.conv_block_14 = self._up_block()  # 192 x 192 x 128
        self.conv_block_15 = self._up_block()  # 384 x 384 x 128

        # Output head: no activation, 3 channels per pixel.
        self.conv_block_16 = nn.Conv2d(128, 3, kernel_size=3, stride=1, padding='same')

    @staticmethod
    def _down_block():
        """Build one encoder stage: 2x average pooling, 3x3 conv, ReLU."""
        return nn.Sequential(
            nn.AvgPool2d(2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        )

    @staticmethod
    def _up_block():
        """Build one decoder stage: 2x upsampling, 3x3 conv, ReLU."""
        return nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run the network on a batch of clips.

        Args:
            x: tensor laid out as (batch, frames, height, width, colors) with
                ``frames == window_size``, 3 color channels and the spatial
                size given by ``input_size``.

        Returns:
            Tensor laid out as (batch, height, width, 3).

        Raises:
            ValueError: if the clip does not contain exactly ``window_size``
                frames.
        """
        if x.size(1) != self.window_size:
            raise ValueError(
                f'Shape of the input is not compatible. There should be exactly '
                f'{self.window_size} frames in an input video.'
            )

        # batch, frames, height, width, colors
        in_x = x.permute((0, 1, 4, 2, 3))
        # batch, frames, colors, height, width

        # Fold the frame axis into the channel axis: (batch, frames * 3, H, W).
        in_x = in_x.reshape(-1, self.window_size * 3, self.input_size[0], self.input_size[1])

        ### DOWNSCALE ###
        block_1_out = self.conv_block_1(in_x)  # 384 x 384 x 128
        block_2_out = self.conv_block_2(block_1_out)  # 192 x 192 x 128
        block_3_out = self.conv_block_3(block_2_out)  # 96 x 96 x 128
        block_4_out = self.conv_block_4(block_3_out)  # 48 x 48 x 128
        block_5_out = self.conv_block_5(block_4_out)  # 24 x 24 x 128
        block_6_out = self.conv_block_6(block_5_out)  # 12 x 12 x 128
        block_7_out = self.conv_block_7(block_6_out)  # 6 x 6 x 128
        block_8_out = self.conv_block_8(block_7_out)  # 3 x 3 x 128

        ### UPSCALE ###
        # Every decoder stage is added to the encoder output of equal resolution.
        block_9_out = block_7_out + self.conv_block_9(block_8_out)  # 6 x 6 x 128
        block_10_out = block_6_out + self.conv_block_10(block_9_out)  # 12 x 12 x 128
        block_11_out = block_5_out + self.conv_block_11(block_10_out)  # 24 x 24 x 128
        block_12_out = block_4_out + self.conv_block_12(block_11_out)  # 48 x 48 x 128
        block_13_out = block_3_out + self.conv_block_13(block_12_out)  # 96 x 96 x 128
        block_14_out = block_2_out + self.conv_block_14(block_13_out)  # 192 x 192 x 128
        block_15_out = block_1_out + self.conv_block_15(block_14_out)  # 384 x 384 x 128

        block_16_out = self.conv_block_16(block_15_out)  # 384 x 384 x (2 + 1)
        out = block_16_out.reshape(-1, 3, self.input_size[0], self.input_size[1])

        # batch, colors, height, width
        out = out.permute((0, 2, 3, 1))
        # batch, height, width, colors
        return out

15
install.py Normal file
View File

@ -0,0 +1,15 @@
import launch
import os
# Install every requirement listed in the requirements.txt that sits next to
# this script, using the web-ui `launch` helpers.
req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")

with open(req_file) as file:
    for lib in file:
        lib = lib.strip()
        # Blank lines and comment lines are not requirements; passing them on
        # would produce an empty `pip install ` command.
        if not lib or lib.startswith("#"):
            continue
        # NOTE(review): `launch.is_installed` presumably expects an importable
        # module name; a pinned spec such as "scikit-image==0.19.2" may never
        # match, which would re-run pip on every start - confirm.
        if not launch.is_installed(lib):
            launch.run_pip(f"install {lib}", f"SD-CN-Animation requirement: {lib}")

133
old_scripts/readme.md Normal file
View File

@ -0,0 +1,133 @@
# SD-CN-Animation
This project allows you to automate video stylization task using StableDiffusion and ControlNet. It also allows you to generate completely new videos from text at any resolution and length in contrast to other current text2video methods using any Stable Diffusion model as a backbone, including custom ones. It uses '[RAFT](https://github.com/princeton-vl/RAFT)' optical flow estimation algorithm to keep the animation stable and create an inpainting mask that is used to generate the next frame. In text to video mode it relies on 'FloweR' method (work in progress) that predicts optical flow from the previous frames.
### Video to Video Examples:
<!--
[![IMAGE_ALT](https://img.youtube.com/vi/j-0niEMm6DU/0.jpg)](https://youtu.be/j-0niEMm6DU)
This script can also be used to swap the person in the video like in this example: https://youtube.com/shorts/be93_dIeZWU
-->
</table>
<table class="center">
<tr>
<td><img src="examples/girl_org.gif" raw=true></td>
<td><img src="examples/girl_to_jc.gif" raw=true></td>
<td><img src="examples/girl_to_wc.gif" raw=true></td>
</tr>
<tr>
<td width=33% align="center">Original video</td>
<td width=33% align="center">"Jessica Chastain"</td>
<td width=33% align="center">"Watercolor painting"</td>
</tr>
</table>
Examples presented are generated at 1024x576 resolution using the 'realisticVisionV13_v13' model as a base. They were cropped, downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder.
### Text to Video Examples:
</table>
<table class="center">
<tr>
<td><img src="examples/flower_1.gif" raw=true></td>
<td><img src="examples/bonfire_1.gif" raw=true></td>
<td><img src="examples/diamond_4.gif" raw=true></td>
</tr>
<tr>
<td width=33% align="center">"close up of a flower"</td>
<td width=33% align="center">"bonfire near the camp in the mountains at night"</td>
<td width=33% align="center">"close up of a diamond laying on the table"</td>
</tr>
<tr>
<td><img src="examples/macaroni_1.gif" raw=true></td>
<td><img src="examples/gold_1.gif" raw=true></td>
<td><img src="examples/tree_2.gif" raw=true></td>
</tr>
<tr>
<td width=33% align="center">"close up of macaroni on the plate"</td>
<td width=33% align="center">"close up of golden sphere"</td>
<td width=33% align="center">"a tree standing in the winter forest"</td>
</tr>
</table>
All examples you can see here are originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. Actual prompts used were stated in the following format: "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3", only the 'subject' part is described in the table above.
## Dependencies
To install all the necessary dependencies, run this command:
```
pip install opencv-python opencv-contrib-python numpy tqdm h5py scikit-image
```
You have to set up the RAFT repository as described here: https://github.com/princeton-vl/RAFT . Basically it just comes down to running "./download_models.sh" in the RAFT folder to download the models.
## Running the scripts
This script works on top of the [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You should also have the [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you also have to allow the API to work with ControlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings".
### Video To Video
#### Step 1.
To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
```
python3 compute_flow.py -i "path to your video" -o "path to output file with *.h5 format" -v -W width_of_the_flow_map -H height_of_the_flow_map
```
The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose W and H parameters as high as your GPU can handle with respect to the proportion of original video resolution. Do not worry if it is higher or lower than the processing resolution, flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up to several gigabytes on the drive even for a minute-long video. If you want to process a long video consider splitting it into several parts beforehand.
#### Step 2.
Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
```
bash webui.sh --xformers --api
```
#### Step 3.
Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
```
python3 vid2vid.py
```
### Text To Video
This method is still in development and works on top of Stable Diffusion and 'FloweR' - an optical flow reconstruction method that is also in an early development stage. Do not expect much from it as it is more of a proof of concept rather than a complete solution.
#### Step 1.
Download 'FloweR_0.1.pth' model from here: [Google drive link](https://drive.google.com/file/d/1WhzoVIw6Kdg4EjfK9LaTLqFm5dF-IJ7F/view?usp=share_link) and place it in the 'FloweR' folder.
#### Step 2.
Same as with vid2vid case, run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
```
bash webui.sh --xformers --api
```
#### Step 3.
Go to the **txt2vid.py** file and change main parameters (OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. Again, the script is simple so you may change other parameters if you want to. Finally run the script with the command:
```
python3 txt2vid.py
```
## Last version changes: v0.5
* Fixed an issue with the wrong direction of an optical flow applied to an image.
* Added text to video mode within txt2vid.py script. Make sure to update new dependencies for this script to work!
* Added a threshold for an optical flow before processing the frame to remove white noise that might appear, as it was suggested by [@alexfredo](https://github.com/alexfredo).
* Background removal at flow computation stage implemented by [@CaptnSeraph](https://github.com/CaptnSeraph), it should reduce ghosting effect in most of the videos processed with vid2vid script.
<!--
## Last version changes: v0.6
* Added separate flag '-rb' for background removal process at the flow computation stage in the compute_flow.py script.
* Added flow normalization before rescaling it, so the magnitude of the flow computed correctly at the different resolution.
* Less ghosting and color change in vid2vid mode
-->
<!--
## Potential improvements
There are several ways overall quality of animation may be improved:
* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
* The quality of flow estimation might be greatly improved with a proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
-->
## Licence
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com

View File

@ -18,11 +18,11 @@ import skimage
import datetime
OUTPUT_VIDEO = f'result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'
OUTPUT_VIDEO = f'videos/result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'
PROMPT = "RAW photo, bonfire near the camp in the mountains at night, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
PROMPT = "people looking at flying robots. Future. People looking to the sky. Stars in the background. Dramatic light, Cinematic light. Soft lighting, high quality, film grain."
N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
w,h = 512, 512 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
w,h = 768, 512 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations
@ -128,7 +128,7 @@ prev_frame_styled = None
# Instantiate the model
model = FloweR(input_size = (h, w))
model.load_state_dict(torch.load('FloweR/FloweR_0.1.pth'))
model.load_state_dict(torch.load('FloweR/FloweR_0.1.1.pth'))
# Move the model to the device
model = model.to(DEVICE)
@ -142,7 +142,7 @@ clip_frames = np.zeros((4, h, w, 3), dtype=np.uint8)
color_shift = np.zeros((0, 3))
color_scale = np.zeros((0, 3))
for ind in tqdm(range(40)):
for ind in tqdm(range(450)):
clip_frames = np.roll(clip_frames, -1, axis=0)
clip_frames[-1] = prev_frame

View File

@ -11,18 +11,18 @@ from flow_utils import compute_diff_map
import skimage
import datetime
INPUT_VIDEO = "input.mp4"
FLOW_MAPS = "flow.h5"
INPUT_VIDEO = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Peter_Gabriel/pexels-monstera-5302599-4096x2160-30fps.mp4"
FLOW_MAPS = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Peter_Gabriel/pexels-monstera-5302599-4096x2160-30fps.h5"
OUTPUT_VIDEO = f'videos/result_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.mp4'
PROMPT = "Watercolor painting"
PROMPT = "Underwater shot Peter Gabriel with closed eyes in Peter Gabriel's music video. 80's music video. VHS style. Dramatic light, Cinematic light. RAW photo, 8k uhd, dslr, soft lighting, high quality, film grain."
N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
w,h = 1024, 576 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
w,h = 1088, 576 # Width and height of the processed image. Note that actual image processed would be a W x H resolution.
START_FROM_IND = 0 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations
PROCESSING_STRENGTH = 0.85
PROCESSING_STRENGTH = 0.95
BLUR_FIX_STRENGTH = 0.15
APPLY_HED = True
@ -75,12 +75,12 @@ class controlnetRequest():
"input_image": b64_hed_img,
"module": "hed",
"model": "control_hed-fp16 [13fee50b]",
"weight": 0.85,
"weight": 0.65,
"resize_mode": "Just Resize",
"lowvram": False,
"processor_res": 512,
"guidance_start": 0,
"guidance_end": 0.85,
"guidance_end": 0.65,
"guessmode": GUESSMODE
})

View File

@ -3,10 +3,6 @@ This project allows you to automate video stylization task using StableDiffusion
### Video to Video Examples:
<!--
[![IMAGE_ALT](https://img.youtube.com/vi/j-0niEMm6DU/0.jpg)](https://youtu.be/j-0niEMm6DU)
This script can also be using to swap the person in the video like in this example: https://youtube.com/shorts/be93_dIeZWU
-->
</table>
<table class="center">
<tr>
@ -51,83 +47,16 @@ Examples presented are generated at 1024x576 resolution using the 'realisticVisi
All examples you can see here are originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. Actual prompts used were stated in the following format: "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3", only the 'subject' part is described in the table above.
## Installing the extension
~TODO~
## Dependencies
To install all the necessary dependencies, run this command:
```
pip install opencv-python opencv-contrib-python numpy tqdm h5py scikit-image
```
You have to set up the RAFT repository as it described here: https://github.com/princeton-vl/RAFT . Basically it just comes down to running "./download_models.sh" in RAFT folder to download the models.
Download RAFT 'raft-things.pth' from here: [RAFT link] and place it into 'stable-diffusion-webui/models/RAFT/' folder.
All generated videos will be saved into the 'outputs/sd-cn-animation' folder.
## Running the scripts
This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You should also have[sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you have to also allow the API to work with controlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more then 2 -> press "Apply settings".
### Video To Video
#### Step 1.
To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
```
python3 compute_flow.py -i "path to your video" -o "path to output file with *.h5 format" -v -W width_of_the_flow_map -H height_of_the_flow_map
```
The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose W and H parameters as high as your GPU can handle with respect to the proportion of original video resolution. Do not worry if it is higher or less then the processing resolution, flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up to a several gigabytes on the drive even for minute long video. If you want to process a long video consider splitting it into several parts beforehand.
#### Step 2.
Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
```
bash webui.sh --xformers --api
```
#### Step 3.
Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
```
python3 vid2vid.py
```
### Text To Video
This method is still in development and works on top of Stable Diffusion and 'FloweR' - optical flow reconstruction method that is also in a yearly development stage. Do not expect much from it as it is more of a proof of a concept rather than a complete solution.
#### Step 1.
Download 'FloweR_0.1.pth' model from here: [Google drive link](https://drive.google.com/file/d/1WhzoVIw6Kdg4EjfK9LaTLqFm5dF-IJ7F/view?usp=share_link) and place it in the 'FloweR' folder.
#### Step 2.
Same as with vid2vid case, run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
```
bash webui.sh --xformers --api
```
#### Step 3.
Go to the **txt2vid.py** file and change main parameters (OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. Again, the script is simple so you may change other parameters if you want to. Finally run the script with the command:
```
python3 txt2vid.py
```
## Last version changes: v0.5
* Fixed an issue with the wrong direction of an optical flow applied to an image.
* Added text to video mode within txt2vid.py script. Make sure to update new dependencies for this script to work!
* Added a threshold for an optical flow before processing the frame to remove white noise that might appear, as it was suggested by [@alexfredo](https://github.com/alexfredo).
* Background removal at flow computation stage implemented by [@CaptnSeraph](https://github.com/CaptnSeraph), it should reduce ghosting effect in most of the videos processed with vid2vid script.
<!--
## Last version changes: v0.6
* Complete rewrite of the project to make it possible to install as a Automatic1111/Web-ui extension.
* Added separate flag '-rb' for background removal process at the flow computation stage in the compute_flow.py script.
* Added flow normalization before rescaling it, so the magnitude of the flow computed correctly at the different resolution.
* Less ghosting and color change in vid2vid mode
-->
<!--
## Potential improvements
There are several ways overall quality of animation may be improved:
* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
* The quality of flow estimation might be greatly improved with a proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
## Licence
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
-->
* Added a "warped styled frame fix" to vid2vid mode that removes duplicated image content from the parts of the image that cannot be relocated by the optical flow.

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
scikit-image==0.19.2

221
scripts/base_ui.py Normal file
View File

@ -0,0 +1,221 @@
import sys, os
# Make the web-ui root and this extension's scripts folder importable, so the
# sibling modules of this file (e.g. vid2vid) can be imported by bare name.
basedirs = [os.getcwd()]

# The cwd-based guess below only works when the process was started from the
# web-ui root AND the extension folder is named exactly 'sd-cn-animation'.
# The directory that really contains this file does not depend on either.
try:
    _own_scripts_dir = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # __file__ is missing when the source is exec'd without a module spec.
    _own_scripts_dir = None

for basedir in basedirs:
    paths_to_ensure = [
        basedir,
        basedir + '/extensions/sd-cn-animation/scripts',
    ]
    if _own_scripts_dir is not None:
        paths_to_ensure.append(_own_scripts_dir)

    for scripts_path_fix in paths_to_ensure:
        if scripts_path_fix not in sys.path:
            sys.path.append(scripts_path_fix)
import gradio as gr
from types import SimpleNamespace
from modules import script_callbacks, shared
from modules.shared import cmd_opts, opts
from webui import wrap_gradio_gpu_call
from modules.ui_components import ToolButton, FormRow, FormGroup
from modules.ui import create_override_settings_dropdown
import modules.scripts as scripts
from modules.sd_samplers import samplers_for_img2img
from modules.ui import setup_progressbar, create_sampler_and_steps_selection, ordered_ui_categories, create_output_panel
from vid2vid import *
def V2VArgs():
seed = -1
width = 1024
height = 576
cfg_scale = 5.5
steps = 15
prompt = ""
n_prompt = "text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
processing_strength = 0.85
fix_frame_strength = 0.15
return locals()
def T2VArgs():
seed = -1
width = 768
height = 512
cfg_scale = 5.5
steps = 15
prompt = ""
n_prompt = "text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
processing_strength = 0.85
fix_frame_strength = 0.35
return locals()
def setup_common_values(mode, d):
    """Create the Gradio inputs shared by the generation tabs.

    Must be called inside an active Gradio layout context; the components are
    created in rows in the order they are returned.

    Args:
        mode: tab name, used as prefix for the prompt ``elem_id`` values.
        d: object exposing the defaults as attributes (``width``, ``height``,
            ``n_prompt``, ``cfg_scale``, ``seed``, ``processing_strength``,
            ``fix_frame_strength``).

    Returns:
        Tuple ``(width, height, prompt, n_prompt, cfg_scale, seed,
        processing_strength, fix_frame_strength)`` of Gradio components.
    """
    with gr.Row():
        width = gr.Slider(label='Width', minimum=64, maximum=2048, step=64, value=d.width, interactive=True)
        height = gr.Slider(label='Height', minimum=64, maximum=2048, step=64, value=d.height, interactive=True)

    with gr.Row(elem_id=f'{mode}_prompt_toprow'):
        prompt = gr.Textbox(label='Prompt', lines=3, interactive=True, elem_id=f"{mode}_prompt", placeholder="Enter your prompt here...")

    with gr.Row(elem_id=f'{mode}_n_prompt_toprow'):
        n_prompt = gr.Textbox(label='Negative prompt', lines=3, interactive=True, elem_id=f"{mode}_n_prompt", value=d.n_prompt)

    with gr.Row():
        # No steps slider here: the caller gets it from the web-ui sampler section.
        # step=0.5 keeps the 5.5 defaults on the slider grid (step=1 did not).
        cfg_scale = gr.Slider(label='CFG scale', minimum=1, maximum=100, step=0.5, value=d.cfg_scale, interactive=True)

    with gr.Row():
        # The keyword is lower-case `interactive`; the original `Interactive`
        # spelling never reached the component option.
        seed = gr.Number(label='Seed (this parameter controls how the first frame looks like and the color distribution of the consecutive frames as they are dependent on the first one)', value=d.seed, interactive=True, precision=0)

    with gr.Row():
        processing_strength = gr.Slider(label="Processing strength", value=d.processing_strength, minimum=0, maximum=1, step=0.05, interactive=True)
        fix_frame_strength = gr.Slider(label="Fix frame strength", value=d.fix_frame_strength, minimum=0, maximum=1, step=0.05, interactive=True)

    return width, height, prompt, n_prompt, cfg_scale, seed, processing_strength, fix_frame_strength
def inputs_ui():
    """Build the input tabs and return everything created in them.

    The 'vid2vid' tab holds the video upload box plus the shared controls from
    ``setup_common_values``; the 'txt2vid' tab is only a placeholder for now.

    Returns:
        Dict keyed by name holding the two default namespaces, both tab
        objects, the file input and every shared control.
    """
    v2v_args = SimpleNamespace(**V2VArgs())
    t2v_args = SimpleNamespace(**T2VArgs())

    with gr.Tab('vid2vid') as tab_vid2vid:
        with gr.Row():
            gr.HTML('Put your video here')
        with gr.Row():
            vid2vid_file = gr.File(label="Input video", interactive=True, file_count="single", file_types=["video"], elem_id="vid_to_vid_chosen_file")

        shared_controls = setup_common_values('vid2vid', v2v_args)

    with gr.Tab('txt2vid') as tab_txt2vid:
        gr.Markdown('Work in progress...')

    (width, height, prompt, n_prompt, cfg_scale, seed,
     processing_strength, fix_frame_strength) = shared_controls

    # Same keys, in the same order, as the local-variable dict returned before.
    return {
        'v2v_args': v2v_args,
        't2v_args': t2v_args,
        'tab_vid2vid': tab_vid2vid,
        'vid2vid_file': vid2vid_file,
        'width': width,
        'height': height,
        'prompt': prompt,
        'n_prompt': n_prompt,
        'cfg_scale': cfg_scale,
        'seed': seed,
        'processing_strength': processing_strength,
        'fix_frame_strength': fix_frame_strength,
        'tab_txt2vid': tab_txt2vid,
    }
def on_ui_tabs():
    """Build the "SD-CN-Animation" tab and return it in the webui tab format.

    While the interface is constructed ``modules.scripts.scripts_current`` points
    at the img2img script runner; it is reset to ``None`` once the layout and
    the click handler are defined.

    Returns:
        A one-element list with the tuple
        ``(gradio Blocks, tab title, element id)``.
    """
    modules.scripts.scripts_current = modules.scripts.scripts_img2img
    modules.scripts.scripts_img2img.initialize_scripts(is_img2img=True)

    with gr.Blocks(analytics_enabled=False) as sdcnanim_interface:
        components = {}

        with gr.Row(elem_id='v2v-core').style(equal_height=False, variant='compact'):
            # Left column: generate button, input controls and the
            # sampler / override-settings / scripts sections.
            with gr.Column(scale=1, variant='panel'):
                with gr.Row(variant='compact'):
                    run_button = gr.Button('Generate', elem_id="sdcn_anim_generate", variant='primary')

                with gr.Tabs():
                    components = inputs_ui()

                    for category in ordered_ui_categories():
                        if category == "sampler":
                            steps, sampler_index = create_sampler_and_steps_selection(samplers_for_img2img, "vid2vid")

                        elif category == "override_settings":
                            with FormRow(elem_id="vid2vid_override_settings_row") as row:
                                override_settings = create_override_settings_dropdown("vid2vid", row)

                        elif category == "scripts":
                            with FormGroup(elem_id="script_container"):
                                custom_inputs = modules.scripts.scripts_img2img.setup_ui()

            # Right column: progress text and the four preview images.
            with gr.Column(scale=1, variant='compact'):
                with gr.Column(variant="panel"):
                    sp_progress = gr.HTML(elem_id="sp_progress", value="")

                with gr.Row(variant='compact'):
                    img_preview_curr_frame = gr.Image(label='Current frame', elem_id="img_preview_curr_frame", type='pil').style(height=240)
                    img_preview_curr_occl = gr.Image(label='Current occlusion', elem_id="img_preview_curr_occl", type='pil').style(height=240)
                with gr.Row(variant='compact'):
                    # Each preview needs its own elem_id; this one used to
                    # duplicate "img_preview_curr_frame".
                    img_preview_prev_warp = gr.Image(label='Previous frame warped', elem_id="img_preview_prev_warp", type='pil').style(height=240)
                    img_preview_processed = gr.Image(label='Processed', elem_id="img_preview_processed", type='pil').style(height=240)

                html_log = gr.HTML(elem_id='html_log_vid2vid')

                with gr.Row(variant='compact'):
                    # Invisible placeholder used for every positional input
                    # that has no control in this tab.
                    dummy_component = gr.Label(visible=False)

        # Define parameters for the action methods. Not all of them are included yet.
        # The order must match the name list in args_to_dict().
        method_inputs = [
            dummy_component,                    # send None for task_id
            dummy_component,                    # mode
            components['prompt'],               # prompt
            components['n_prompt'],             # negative_prompt
            dummy_component,                    # prompt_styles
            components['vid2vid_file'],         # input_video
            dummy_component,                    # sketch
            dummy_component,                    # init_img_with_mask
            dummy_component,                    # inpaint_color_sketch
            dummy_component,                    # inpaint_color_sketch_orig
            dummy_component,                    # init_img_inpaint
            dummy_component,                    # init_mask_inpaint
            steps,                              # steps
            sampler_index,                      # sampler_index
            dummy_component,                    # mask_blur
            dummy_component,                    # mask_alpha
            dummy_component,                    # inpainting_fill
            dummy_component,                    # restore_faces
            dummy_component,                    # tiling
            dummy_component,                    # n_iter
            dummy_component,                    # batch_size
            components['cfg_scale'],            # cfg_scale
            dummy_component,                    # image_cfg_scale
            components['processing_strength'],  # denoising_strength
            components['seed'],                 # seed
            dummy_component,                    # subseed
            dummy_component,                    # subseed_strength
            dummy_component,                    # seed_resize_from_h
            dummy_component,                    # seed_resize_from_w
            dummy_component,                    # seed_enable_extras
            components['height'],               # height
            components['width'],                # width
            dummy_component,                    # resize_mode
            dummy_component,                    # inpaint_full_res
            dummy_component,                    # inpaint_full_res_padding
            dummy_component,                    # inpainting_mask_invert
            dummy_component,                    # img2img_batch_input_dir
            dummy_component,                    # img2img_batch_output_dir
            dummy_component,                    # img2img_batch_inpaint_mask_dir
            override_settings,                  # override_settings_texts
        ] + custom_inputs

        # One output per value yielded by start_process().
        method_outputs = [
            sp_progress,
            img_preview_curr_frame,
            img_preview_curr_occl,
            img_preview_prev_warp,
            img_preview_processed,
            html_log,
        ]

        run_button.click(
            fn=start_process,
            inputs=method_inputs,
            outputs=method_outputs,
            show_progress=True,
        )

        modules.scripts.scripts_current = None

        # define queue - required for generators
        sdcnanim_interface.queue(concurrency_count=1)

    return [(sdcnanim_interface, "SD-CN-Animation", "sd_cn_animation_interface")]
# Register the tab builder defined above through script_callbacks.on_ui_tabs.
script_callbacks.on_ui_tabs(on_ui_tabs)

154
scripts/flow_utils.py Normal file
View File

@ -0,0 +1,154 @@
import sys, os
# Make the working directory and the extension's script / RAFT folders
# importable before the project-local imports below are resolved.
basedirs = [os.getcwd()]
for basedir in basedirs:
    paths_to_ensure = [
        basedir,
        f'{basedir}/extensions/sd-cn-animation/scripts',
        f'{basedir}/extensions/sd-cn-animation/RAFT',
    ]
    for scripts_path_fix in paths_to_ensure:
        if scripts_path_fix in sys.path:
            continue
        sys.path.append(scripts_path_fix)
import numpy as np
import cv2
from collections import namedtuple
import torch
import argparse
from RAFT.raft import RAFT
from RAFT.utils.utils import InputPadder
import modules.paths as ph
import gc
# Cached RAFT network: RAFT_estimate_flow() builds it on first use and
# RAFT_clear_memory() resets it to None.
RAFT_model = None
# Module-wide MOG2 background subtractor; RAFT_estimate_flow() hands it to
# background_subtractor() when subtract_background is enabled.
fgbg = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=16, detectShadows=True)
def background_subtractor(frame, fgbg):
    """Return ``frame`` masked by the foreground mask that ``fgbg`` produces for it.

    Args:
        frame: image passed to ``fgbg.apply`` and to ``cv2.bitwise_and``.
        fgbg: object exposing ``apply(frame)`` that returns the mask.
    """
    return cv2.bitwise_and(frame, frame, mask=fgbg.apply(frame))
def RAFT_clear_memory():
    """Drop the cached RAFT model and release the memory it was holding.

    The module-level ``RAFT_model`` is rebound to ``None`` first, so the name
    always stays defined even if the garbage collection or the CUDA cache
    flush raises. The previous ``del`` left it unbound in that case, turning
    the next ``RAFT_model is None`` check into a ``NameError``.
    """
    global RAFT_model
    RAFT_model = None
    gc.collect()
    torch.cuda.empty_cache()
def RAFT_estimate_flow(frame1, frame2, device='cuda', subtract_background=True):
    """Estimate forward and backward optical flow between two frames with RAFT.

    The network is built and its weights are loaded on the first call, then
    kept in the module-level ``RAFT_model`` until ``RAFT_clear_memory()`` resets it.

    Args:
        frame1: first frame as an H x W x C numpy array (permuted to C x H x W
            before it is fed to the network).
        frame2: second frame, same layout as ``frame1``.
        device: torch device the model and the frame tensors are moved to.
        subtract_background: when true, both frames are first masked with the
            module-level ``fgbg`` background subtractor.

    Returns:
        A tuple ``(next_flow, prev_flow, occlusion_mask, frame1, frame2)``:
        ``next_flow`` is the flow predicted for (frame1, frame2), ``prev_flow``
        the flow for (frame2, frame1), both as H x W x 2 numpy arrays;
        ``occlusion_mask`` is the per-pixel norm of their sum repeated over
        three channels; the two frames are returned as used (masked when
        ``subtract_background`` is set).
    """
    global RAFT_model
    if RAFT_model is None:
        args = argparse.Namespace(
            model=ph.models_path + '/RAFT/raft-things.pth',
            mixed_precision=True,
            small=False,
            alternate_corr=False,
            path="",
        )

        # The weights are loaded through a DataParallel wrapper and the inner
        # module is unwrapped afterwards (presumably the checkpoint keys carry
        # the wrapper's prefix - confirm against the checkpoint).
        model = torch.nn.DataParallel(RAFT(args))
        # Load onto the CPU first so a checkpoint saved from a CUDA device also
        # works when `device` is not CUDA; the model is moved to `device` below.
        model.load_state_dict(torch.load(args.model, map_location='cpu'))

        # Publish the model only after the weights loaded successfully; a
        # failed load must not leave an uninitialised network cached.
        RAFT_model = model.module
        RAFT_model.to(device)
        RAFT_model.eval()

    if subtract_background:
        frame1 = background_subtractor(frame1, fgbg)
        frame2 = background_subtractor(frame2, fgbg)

    with torch.no_grad():
        frame1_torch = torch.from_numpy(frame1).permute(2, 0, 1).float()[None].to(device)
        frame2_torch = torch.from_numpy(frame2).permute(2, 0, 1).float()[None].to(device)

        # NOTE(review): the flows are returned at the padded resolution (the
        # padding is never undone here) - confirm callers expect that.
        padder = InputPadder(frame1_torch.shape)
        image1, image2 = padder.pad(frame1_torch, frame2_torch)

        # estimate optical flow
        _, next_flow = RAFT_model(image1, image2, iters=20, test_mode=True)
        _, prev_flow = RAFT_model(image2, image1, iters=20, test_mode=True)

        next_flow = next_flow[0].permute(1, 2, 0).cpu().numpy()
        prev_flow = prev_flow[0].permute(1, 2, 0).cpu().numpy()

        # Forward-backward consistency: the two flows cancel where they agree.
        fb_flow = next_flow + prev_flow
        fb_norm = np.linalg.norm(fb_flow, axis=2)

        occlusion_mask = fb_norm[..., None].repeat(3, axis=-1)

    return next_flow, prev_flow, occlusion_mask, frame1, frame2
# ... rest of the file ...
def compute_diff_map(next_flow, prev_flow, prev_frame, cur_frame, prev_frame_styled):
    """Warp the previous frames along the flow and build the blending mask.

    Args:
        next_flow: H x W x 2 flow array; only used for the forward-backward
            consistency term of the mask.
        prev_flow: H x W x 2 flow array added to the pixel grid to sample the
            previous frames (channel 0 is added to x, channel 1 to y).
        prev_frame: previous source frame, H x W x C numpy array.
        cur_frame: current source frame; its height and width define the
            size the flows are resized to and the size of the outputs.
        prev_frame_styled: previous stylised frame, same layout as ``prev_frame``.

    Returns:
        ``(alpha_mask, warped_frame_styled)``: ``alpha_mask`` is a 3-channel
        array clipped to [0, 1]; ``warped_frame_styled`` is the stylised
        previous frame sampled at the flow-displaced positions (float32).
    """
    h, w = cur_frame.shape[:2]
    fl_h, fl_w = next_flow.shape[:2]

    # Channel 0 of the flow is the x displacement and channel 1 the y
    # displacement, so both scale vectors are ordered (width, height).
    flow_size = np.array([fl_w, fl_h])
    frame_size = np.array([w, h])

    # normalize flow
    next_flow = next_flow / flow_size
    prev_flow = prev_flow / flow_size

    # remove low value noise (@alexfredo suggestion)
    next_flow[np.abs(next_flow) < 0.05] = 0
    prev_flow[np.abs(prev_flow) < 0.05] = 0

    # resize flow; rescaling uses the same (x, y) order as the normalisation
    # (it used to multiply by [h, w], which distorted non-square frames)
    next_flow = (cv2.resize(next_flow, (w, h)) * frame_size).astype(np.float32)
    prev_flow = (cv2.resize(prev_flow, (w, h)) * frame_size).astype(np.float32)

    # Generate sampling grids
    grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w))
    flow_grid = torch.stack((grid_x, grid_y), dim=0).float()
    flow_grid += torch.from_numpy(prev_flow).permute(2, 0, 1)
    flow_grid = flow_grid.unsqueeze(0)
    flow_grid[:, 0, :, :] = 2 * flow_grid[:, 0, :, :] / (w - 1) - 1
    flow_grid[:, 1, :, :] = 2 * flow_grid[:, 1, :, :] / (h - 1) - 1
    flow_grid = flow_grid.permute(0, 2, 3, 1)

    def _warp(frame):
        # Sample an H x W x C frame at the flow-displaced grid positions.
        frame_torch = torch.from_numpy(frame).float().unsqueeze(0).permute(0, 3, 1, 2)  # N, C, H, W
        # The grid above maps pixel 0 to -1 and pixel (size - 1) to +1, which
        # is the align_corners=True convention.
        warped = torch.nn.functional.grid_sample(
            frame_torch, flow_grid, padding_mode="reflection", align_corners=True)
        return warped.permute(0, 2, 3, 1)[0].numpy()

    warped_frame = _warp(prev_frame)
    warped_frame_styled = _warp(prev_frame_styled)

    # compute occlusion mask
    fb_flow = next_flow + prev_flow
    fb_norm = np.linalg.norm(fb_flow, axis=2)
    occlusion_mask = fb_norm[..., None]

    cur_frame_f = cur_frame.astype(np.float32)

    diff_mask_org = np.abs(warped_frame.astype(np.float32) - cur_frame_f) / 255
    diff_mask_org = diff_mask_org.max(axis=-1, keepdims=True)

    diff_mask_stl = np.abs(warped_frame_styled.astype(np.float32) - cur_frame_f) / 255
    diff_mask_stl = diff_mask_stl.max(axis=-1, keepdims=True)

    # np.maximum is a binary ufunc whose third positional argument is `out`,
    # so the three-way maximum has to be nested; passing three arrays silently
    # dropped the styled-difference term.
    alpha_mask = np.maximum(np.maximum(occlusion_mask * 0.3, diff_mask_org * 4), diff_mask_stl * 2)
    alpha_mask = alpha_mask.repeat(3, axis=-1)

    # borderType must be passed by keyword: the fourth positional argument of
    # cv2.GaussianBlur is `dst`.
    alpha_mask = cv2.GaussianBlur(alpha_mask, (51, 51), 5, borderType=cv2.BORDER_REFLECT)

    alpha_mask = np.clip(alpha_mask, 0, 1)

    return alpha_mask, warped_frame_styled
def frames_norm(occl):
    """Map values from the 0..255 range to -1..1."""
    return occl / 127.5 - 1
def flow_norm(flow):
    """Scale flow values down by a factor of 255."""
    return flow / 255
def occl_norm(occl):
    """Map occlusion values from the 0..255 range to -1..1."""
    return occl / 127.5 - 1
def flow_renorm(flow):
    """Inverse of ``flow_norm``: scale flow values up by a factor of 255."""
    return flow * 255
def occl_renorm(occl):
    """Inverse of ``occl_norm``: map values from -1..1 back to 0..255."""
    return (occl + 1) * 127.5

398
scripts/vid2vid.py Normal file
View File

@ -0,0 +1,398 @@
import sys, os
# Make the working directory and the extension's script folder importable
# before the project-local imports below are resolved.
basedirs = [os.getcwd()]
for basedir in basedirs:
    paths_to_ensure = [
        basedir,
        f'{basedir}/extensions/sd-cn-animation/scripts',
    ]
    for scripts_path_fix in paths_to_ensure:
        if scripts_path_fix in sys.path:
            continue
        sys.path.append(scripts_path_fix)
import math
import os
import sys
import traceback
import numpy as np
from PIL import Image, ImageOps, ImageFilter, ImageEnhance, ImageChops
from modules import devices, sd_samplers, img2img
from modules import shared, sd_hijack, lowvram
from modules.generation_parameters_copypaste import create_override_settings_dict
from modules.processing import Processed, StableDiffusionProcessingImg2Img, process_images
from modules.shared import opts, devices
import modules.shared as shared
import modules.processing as processing
from modules.ui import plaintext_to_html
import modules.images as images
import modules.scripts
import gc
import cv2
import gradio as gr
import time
import skimage
import datetime
from flow_utils import RAFT_estimate_flow, RAFT_clear_memory, compute_diff_map
from types import SimpleNamespace
class sdcn_anim_tmp:
    """Class-level attributes that hold the state of the current vid2vid run.

    The class is used as a plain namespace: the functions of this module read
    and assign the attributes directly on the class.
    """

    # Number of frames prepared / processed so far in the current run.
    prepear_counter = process_counter = 0

    # cv2 capture of the source video and writer of the result video.
    input_video = output_video = None

    # Frames carried over from one step to the next.
    curr_frame = prev_frame = prev_frame_styled = None

    # Properties read from the source video when a run starts.
    fps = total_frames = None

    # Buffers for the batch of frames (and their flows) prepared ahead of processing.
    prepared_frames = prepared_next_flows = prepared_prev_flows = None

    # True once the current batch is prepared and ready to be processed.
    frames_prepared = False
def read_frame_from_video():
    """Read the next frame of ``sdcn_anim_tmp.input_video`` converted from BGR to RGB.

    Returns:
        The frame, or ``None`` when the capture is not opened (it is released
        in that case) or when no frame could be read.
    """
    video = sdcn_anim_tmp.input_video

    if not video.isOpened():
        video.release()
        return None

    _, frame = video.read()
    if frame is None:
        return None

    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
def get_cur_stat():
    """Return the progress line built from the counters in ``sdcn_anim_tmp``."""
    total = sdcn_anim_tmp.total_frames
    prepared = sdcn_anim_tmp.prepear_counter + 1
    processed = sdcn_anim_tmp.process_counter + 1
    return f'Frames prepared: {prepared} / {total}; Frames processed: {processed} / {total}; '
def clear_memory_from_sd():
    """Unload the Stable Diffusion model held in ``shared.sd_model`` and collect garbage.

    When a model is loaded, its hijack is undone, ``lowvram.send_everything_to_cpu()``
    is attempted and ``shared.sd_model`` is set to ``None``. Garbage collection and
    ``devices.torch_gc()`` run in every case.
    """
    if shared.sd_model is not None:
        sd_hijack.model_hijack.undo_hijack(shared.sd_model)
        try:
            lowvram.send_everything_to_cpu()
        except Exception as e:
            # Best effort: unloading continues, but the failure is reported
            # instead of being swallowed silently.
            print(f'sd-cn-animation: could not move the SD model to the CPU: {e!r}', file=sys.stderr)
        shared.sd_model = None

    gc.collect()
    devices.torch_gc()
def get_device():
    """Return the device reported by ``devices.get_optimal_device()``."""
    return devices.get_optimal_device()
def args_to_dict(*args):  # converts list of arguments into dictionary for better handling of it
    """Name the positional UI arguments and fill in defaults for missing ones.

    The first 40 positional values are matched, in order, with the parameter
    names listed below; everything after them is treated as script inputs.
    A value of ``None`` is replaced by the default of its parameter when one
    is defined. Fewer than 40 values may be passed: the missing ones are
    handled as ``None``.

    Returns:
        ``(args_dict, args)``: ``args_dict`` maps every parameter name to its
        value and holds the extra values under ``'script_inputs'``; ``args``
        is the positional list with the defaults substituted.
    """
    arg_names = [
        'id_task', 'mode', 'prompt', 'negative_prompt', 'prompt_styles',
        'init_video', 'sketch', 'init_img_with_mask', 'inpaint_color_sketch',
        'inpaint_color_sketch_orig', 'init_img_inpaint', 'init_mask_inpaint',
        'steps', 'sampler_index', 'mask_blur', 'mask_alpha', 'inpainting_fill',
        'restore_faces', 'tiling', 'n_iter', 'batch_size', 'cfg_scale',
        'image_cfg_scale', 'denoising_strength', 'seed', 'subseed',
        'subseed_strength', 'seed_resize_from_h', 'seed_resize_from_w',
        'seed_enable_extras', 'height', 'width', 'resize_mode',
        'inpaint_full_res', 'inpaint_full_res_padding',
        'inpainting_mask_invert', 'img2img_batch_input_dir',
        'img2img_batch_output_dir', 'img2img_batch_inpaint_mask_dir',
        'override_settings_texts',
    ]

    # set default values for params that were not specified
    args_dict = {
        'mode': 0,
        'prompt': '',
        'negative_prompt': '',
        'prompt_styles': [],
        'init_video': None,  # Always required
        'steps': 15,
        'sampler_index': 0,  # 'Euler a'
        'mask_blur': 0,
        'inpainting_fill': 1,  # original
        'restore_faces': False,
        'tiling': False,
        'n_iter': 1,
        'batch_size': 1,
        'cfg_scale': 5.5,
        'image_cfg_scale': 1.5,
        'denoising_strength': 0.75,
        'seed': -1,
        'subseed': -1,
        'subseed_strength': 0,
        'seed_resize_from_h': 512,
        'seed_resize_from_w': 512,
        'seed_enable_extras': False,
        'height': 512,
        'width': 512,
        'resize_mode': 1,
        'inpaint_full_res': True,
        'inpaint_full_res_padding': 0,
    }

    args = list(args)
    # Treat parameters that were not passed at all like explicit None values
    # instead of failing with an IndexError.
    missing = len(arg_names) - len(args)
    if missing > 0:
        args.extend([None] * missing)

    for i, name in enumerate(arg_names):
        if args[i] is None and name in args_dict:
            args[i] = args_dict[name]
        else:
            args_dict[name] = args[i]

    args_dict['script_inputs'] = args[len(arg_names):]
    return args_dict, args
def start_process(*args):
    """Generator behind the "Generate" button: stylise the input video frame by frame.

    The first frame is run through img2img on its own. After that the loop
    alternates between two kinds of steps: "prepare" steps read frames and
    estimate the optical flow for a batch of up to ``prepare_steps`` frames,
    "process" steps stylise the prepared frames, blend them with the warped
    previous result and write them to the output video.

    Args:
        *args: the positional UI values, in the order expected by ``args_to_dict``.

    Yields:
        ``(status text, current frame, occlusion mask, warped previous styled
        frame, processed frame, html log)`` after every step.

    Raises:
        ValueError: no input video was supplied.
        RuntimeError: the first frame of the video could not be read.
    """
    # Positions inside the positional img2img argument list (see args_to_dict).
    init_image_idx = 5
    denoising_idx = 23
    seed_idx = 24
    # Size of the batch of frames prepared before processing starts.
    prepare_steps = 10

    args_dict, args_list = args_to_dict(*args)
    if args_dict['init_video'] is None:
        raise ValueError('An input video is required to start the vid2vid processing.')

    width, height = args_dict['width'], args_dict['height']

    sdcn_anim_tmp.process_counter = 0
    sdcn_anim_tmp.prepear_counter = 0
    # An interrupted run can leave this flag set, which would make the new run
    # start by processing the stale batch of the previous one.
    sdcn_anim_tmp.frames_prepared = False

    # Open the input video file
    input_video = cv2.VideoCapture(args_dict['init_video'].name)
    sdcn_anim_tmp.input_video = input_video

    # Get useful info from the source video
    sdcn_anim_tmp.fps = int(input_video.get(cv2.CAP_PROP_FPS))
    sdcn_anim_tmp.total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT))

    # Create an output video file with the same fps as the input video and the requested size.
    # The time stamp avoids ':' because it is not allowed in Windows file names.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_video_name = f'outputs/sd-cn-animation/vid2vid/{timestamp}.mp4'
    os.makedirs(os.path.dirname(output_video_name), exist_ok=True)
    output_video = cv2.VideoWriter(output_video_name, cv2.VideoWriter_fourcc(*'mp4v'), sdcn_anim_tmp.fps, (width, height))
    sdcn_anim_tmp.output_video = output_video

    # The local handles (not the shared state, which a later run may replace)
    # are released in `finally`, so the files are closed on errors and when
    # the generator is abandoned.
    try:
        curr_frame = read_frame_from_video()
        if curr_frame is None:
            raise RuntimeError('Could not read the first frame of the input video.')
        curr_frame = cv2.resize(curr_frame, (width, height))

        sdcn_anim_tmp.prepared_frames = np.zeros((prepare_steps + 1, height, width, 3), dtype=np.uint8)
        sdcn_anim_tmp.prepared_next_flows = np.zeros((prepare_steps, height, width, 2))
        sdcn_anim_tmp.prepared_prev_flows = np.zeros((prepare_steps, height, width, 2))
        sdcn_anim_tmp.prepared_frames[0] = curr_frame

        args_list[init_image_idx] = Image.fromarray(curr_frame)
        processed_frames, _, _, _ = modules.img2img.img2img(*args_list)
        processed_frame = np.array(processed_frames[0])
        # NOTE(review): both `multichannel` and `channel_axis` are passed here and
        # below; confirm which one the installed scikit-image version honours.
        processed_frame = skimage.exposure.match_histograms(processed_frame, curr_frame, multichannel=False, channel_axis=-1)

        sdcn_anim_tmp.curr_frame = curr_frame
        sdcn_anim_tmp.prev_frame = curr_frame.copy()
        sdcn_anim_tmp.prev_frame_styled = processed_frame.copy()
        yield get_cur_stat(), sdcn_anim_tmp.curr_frame, None, None, processed_frame, ''

        # Defined up front so the final `return` is valid even when the loop
        # below does not run at all (videos reporting fewer than two frames).
        occlusion_mask = None
        warped_styled_frame = None

        # TODO: SOLVE PROBLEM with wrong prev frame on the start on new processing iterations
        for _ in range((sdcn_anim_tmp.total_frames - 1) * 2):
            args_dict, args_list = args_to_dict(*args)

            occlusion_mask = None
            prev_frame = None
            curr_frame = sdcn_anim_tmp.curr_frame
            # Empty updates leave these previews unchanged during prepare steps.
            warped_styled_frame = gr.Image.update()
            processed_frame = gr.Image.update()

            if sdcn_anim_tmp.process_counter % prepare_steps == 0 and not sdcn_anim_tmp.frames_prepared:
                # prepare next batch of frames for processing
                device = get_device()

                curr_frame = read_frame_from_video()
                if curr_frame is not None:
                    curr_frame = cv2.resize(curr_frame, (args_dict['width'], args_dict['height']))
                    prev_frame = sdcn_anim_tmp.prev_frame.copy()

                    next_flow, prev_flow, occlusion_mask, _, _ = RAFT_estimate_flow(
                        prev_frame, curr_frame, subtract_background=False, device=device)
                    occlusion_mask = np.clip(occlusion_mask * 0.1 * 255, 0, 255).astype(np.uint8)

                    cn = sdcn_anim_tmp.prepear_counter % prepare_steps
                    if cn == 0:
                        # First slot of a new batch holds the last frame of the previous one.
                        sdcn_anim_tmp.prepared_frames[cn] = sdcn_anim_tmp.prev_frame
                    sdcn_anim_tmp.prepared_frames[cn + 1] = curr_frame.copy()
                    sdcn_anim_tmp.prepared_next_flows[cn] = next_flow.copy()
                    sdcn_anim_tmp.prepared_prev_flows[cn] = prev_flow.copy()

                    sdcn_anim_tmp.prev_frame = curr_frame.copy()

                sdcn_anim_tmp.prepear_counter += 1
                batch_done = (
                    sdcn_anim_tmp.prepear_counter % prepare_steps == 0
                    or sdcn_anim_tmp.prepear_counter >= sdcn_anim_tmp.total_frames - 1
                    or curr_frame is None
                )
                if batch_done:
                    # Remove RAFT from memory
                    RAFT_clear_memory()
                    sdcn_anim_tmp.frames_prepared = True
            else:
                # process frame
                sdcn_anim_tmp.frames_prepared = False

                cn = sdcn_anim_tmp.process_counter % prepare_steps
                curr_frame = sdcn_anim_tmp.prepared_frames[cn + 1]
                prev_frame = sdcn_anim_tmp.prepared_frames[cn]
                next_flow = sdcn_anim_tmp.prepared_next_flows[cn]
                prev_flow = sdcn_anim_tmp.prepared_prev_flows[cn]

                # process current frame
                args_list[init_image_idx] = Image.fromarray(curr_frame)
                args_list[seed_idx] = -1
                processed_frames, _, _, _ = modules.img2img.img2img(*args_list)
                processed_frame = np.array(processed_frames[0])

                alpha_mask, warped_styled_frame = compute_diff_map(
                    next_flow, prev_flow, prev_frame, curr_frame, sdcn_anim_tmp.prev_frame_styled)
                # This clipping at lower side required to fix small trailing issues that for some reason
                # left outside of the bright part of the mask, and at the higher part it making parts
                # changed strongly to do it with less flickering.
                alpha_mask = np.clip(alpha_mask + 0.05, 0.05, 0.95)

                # Channel 0 of the flow is divided by the width, channel 1 by the height.
                fl_h, fl_w = prev_flow.shape[:2]
                prev_flow_n = prev_flow / np.array([fl_w, fl_h])
                flow_mask = np.clip(1 - np.linalg.norm(prev_flow_n, axis=-1)[..., None], 0, 1)

                # fix warped styled frame from duplicated that occures on the places where flow is zero,
                # but only because there is no place to get the color from
                warped_styled_frame = (
                    curr_frame.astype(float) * alpha_mask * flow_mask
                    + warped_styled_frame.astype(float) * (1 - alpha_mask * flow_mask)
                )

                occlusion_mask = np.clip(alpha_mask * 255, 0, 255).astype(np.uint8)

                # normalizing the colors
                processed_frame = skimage.exposure.match_histograms(processed_frame, curr_frame, multichannel=False, channel_axis=-1)
                processed_frame = processed_frame.astype(float) * alpha_mask + warped_styled_frame.astype(float) * (1 - alpha_mask)

                processed_frame = processed_frame * 0.9 + curr_frame * 0.1
                processed_frame = np.clip(processed_frame, 0, 255).astype(np.uint8)
                sdcn_anim_tmp.prev_frame_styled = processed_frame.copy()

                # Second img2img pass over the blended frame with a low
                # denoising strength and a fixed seed.
                args_list[init_image_idx] = Image.fromarray(processed_frame)
                args_list[denoising_idx] = 0.15
                args_list[seed_idx] = 8888
                processed_frames, _, _, _ = modules.img2img.img2img(*args_list)
                processed_frame = np.array(processed_frames[0])

                processed_frame = np.clip(processed_frame, 0, 255).astype(np.uint8)
                warped_styled_frame = np.clip(warped_styled_frame, 0, 255).astype(np.uint8)

                # Write the frame to the output video
                frame_out = cv2.cvtColor(processed_frame, cv2.COLOR_RGB2BGR)
                output_video.write(frame_out)

                sdcn_anim_tmp.process_counter += 1
                if sdcn_anim_tmp.process_counter >= sdcn_anim_tmp.total_frames - 1:
                    input_video.release()
                    output_video.release()
                    sdcn_anim_tmp.prev_frame = None

            yield get_cur_stat(), curr_frame, occlusion_mask, warped_styled_frame, processed_frame, ''

        return get_cur_stat(), curr_frame, occlusion_mask, warped_styled_frame, processed_frame, ''
    finally:
        input_video.release()
        output_video.release()
'''
# id_task: str, mode: int, prompt: str, negative_prompt: str, prompt_styles: list, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, steps: int, sampler_index: int, mask_blur: int, mask_alpha: float, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, *args
def img2img(args_dict):
args = SimpleNamespace(**args_dict)
print('override_settings:', args.override_settings_texts)
override_settings = create_override_settings_dict(args.override_settings_texts)
is_batch = args.mode == 5
if args.mode == 0: # img2img
image = args.init_img.convert("RGB")
mask = None
elif args.mode == 1: # img2img sketch
image = args.sketch.convert("RGB")
mask = None
elif args.mode == 2: # inpaint
image, mask = args.init_img_with_mask["image"], args.init_img_with_mask["mask"]
alpha_mask = ImageOps.invert(image.split()[-1]).convert('L').point(lambda x: 255 if x > 0 else 0, mode='1')
mask = ImageChops.lighter(alpha_mask, mask.convert('L')).convert('L')
image = image.convert("RGB")
elif args.mode == 3: # inpaint sketch
image = args.inpaint_color_sketch
orig = args.inpaint_color_sketch_orig or args.inpaint_color_sketch
pred = np.any(np.array(image) != np.array(orig), axis=-1)
mask = Image.fromarray(pred.astype(np.uint8) * 255, "L")
mask = ImageEnhance.Brightness(mask).enhance(1 - args.mask_alpha / 100)
blur = ImageFilter.GaussianBlur(args.mask_blur)
image = Image.composite(image.filter(blur), orig, mask.filter(blur))
image = image.convert("RGB")
elif args.mode == 4: # inpaint upload mask
image = args.init_img_inpaint
mask = args.init_mask_inpaint
else:
image = None
mask = None
# Use the EXIF orientation of photos taken by smartphones.
if image is not None:
image = ImageOps.exif_transpose(image)
assert 0. <= args.denoising_strength <= 1., 'can only work with strength in [0.0, 1.0]'
p = StableDiffusionProcessingImg2Img(
sd_model=shared.sd_model,
outpath_samples=opts.outdir_samples or opts.outdir_img2img_samples,
outpath_grids=opts.outdir_grids or opts.outdir_img2img_grids,
prompt=args.prompt,
negative_prompt=args.negative_prompt,
styles=args.prompt_styles,
seed=args.seed,
subseed=args.subseed,
subseed_strength=args.subseed_strength,
seed_resize_from_h=args.seed_resize_from_h,
seed_resize_from_w=args.seed_resize_from_w,
seed_enable_extras=args.seed_enable_extras,
sampler_name=sd_samplers.samplers_for_img2img[args.sampler_index].name,
batch_size=args.batch_size,
n_iter=args.n_iter,
steps=args.steps,
cfg_scale=args.cfg_scale,
width=args.width,
height=args.height,
restore_faces=args.restore_faces,
tiling=args.tiling,
init_images=[image],
mask=mask,
mask_blur=args.mask_blur,
inpainting_fill=args.inpainting_fill,
resize_mode=args.resize_mode,
denoising_strength=args.denoising_strength,
image_cfg_scale=args.image_cfg_scale,
inpaint_full_res=args.inpaint_full_res,
inpaint_full_res_padding=args.inpaint_full_res_padding,
inpainting_mask_invert=args.inpainting_mask_invert,
override_settings=override_settings,
)
p.scripts = modules.scripts.scripts_txt2img
p.script_args = args.script_inputs
print('script_inputs 1:', args.script_inputs[1].__dict__)
#print('script_inputs 2:', args.script_inputs[1])
if shared.cmd_opts.enable_console_prompts:
print(f"\nimg2img: {args.prompt}", file=shared.progress_print_out)
if mask:
p.extra_generation_params["Mask blur"] = args.mask_blur
if is_batch:
...
# assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
# process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args.script_inputs)
# processed = Processed(p, [], p.seed, "")
else:
processed = modules.scripts.scripts_img2img.run(p, *args.script_inputs)
if processed is None:
processed = process_images(p)
p.close()
shared.total_tqdm.clear()
generation_info_js = processed.js()
if opts.samples_log_stdout:
print(generation_info_js)
if opts.do_not_show_images:
processed.images = []
return processed.images[0] #, generation_info_js, plaintext_to_html(processed.info), plaintext_to_html(processed.comments)'''