Text to video script added
parent
b65ad41836
commit
e5fd243585
|
|
@ -1,3 +1,6 @@
|
||||||
__pycache__/
|
__pycache__/
|
||||||
out/
|
out/
|
||||||
|
videos/
|
||||||
|
FP_Res/
|
||||||
result.mp4
|
result.mp4
|
||||||
|
*.pth
|
||||||
|
|
@ -0,0 +1,150 @@
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.functional as F
|
||||||
|
|
||||||
|
class View(nn.Module):
    """Module that applies ``Tensor.view`` with a shape fixed at construction."""

    def __init__(self, *shape):
        super().__init__()
        # Target shape, kept as the tuple of positional arguments.
        self.shape = shape

    def forward(self, input):
        """Return ``input`` viewed with the stored target shape."""
        target_shape = self.shape
        return input.view(*target_shape)
|
||||||
|
|
||||||
|
# Define the model
|
||||||
|
class FloweR(nn.Module):
    """Convolutional encoder-decoder with additive skip connections.

    The network stacks ``window_size`` RGB frames along the channel axis,
    halves the spatial resolution seven times with average pooling, then
    doubles it back seven times, adding the encoder feature map of the same
    resolution after every upscaling step. A final convolution produces three
    output channels per pixel.

    Because of the seven 2x poolings, both values of ``input_size`` have to be
    multiples of 128 for the skip connections to line up.

    Args:
        input_size: ``(height, width)`` of the frames passed to ``forward``.
        window_size: Number of frames expected in every input clip.
    """

    def __init__(self, input_size = (384, 384), window_size = 4):
        super(FloweR, self).__init__()

        self.input_size = input_size
        self.window_size = window_size

        # Network input after frame stacking: H x W x (window_size * 3).
        # The sizes in the comments below are for the default 384 x 384 input.

        ### DOWNSCALE ###
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(3 * self.window_size, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        ) # 384 x 384 x 128

        self.conv_block_2 = self._down_block() # 192 x 192 x 128
        self.conv_block_3 = self._down_block() # 96 x 96 x 128
        self.conv_block_4 = self._down_block() # 48 x 48 x 128
        self.conv_block_5 = self._down_block() # 24 x 24 x 128
        self.conv_block_6 = self._down_block() # 12 x 12 x 128
        self.conv_block_7 = self._down_block() # 6 x 6 x 128
        self.conv_block_8 = self._down_block() # 3 x 3 x 128

        ### UPSCALE ###
        self.conv_block_9 = self._up_block() # 6 x 6 x 128
        self.conv_block_10 = self._up_block() # 12 x 12 x 128
        self.conv_block_11 = self._up_block() # 24 x 24 x 128
        self.conv_block_12 = self._up_block() # 48 x 48 x 128
        self.conv_block_13 = self._up_block() # 96 x 96 x 128
        self.conv_block_14 = self._up_block() # 192 x 192 x 128
        self.conv_block_15 = self._up_block() # 384 x 384 x 128

        self.conv_block_16 = nn.Conv2d(128, 3, kernel_size=3, stride=1, padding='same')

    @staticmethod
    def _down_block():
        """Build one encoder stage: 2x average pooling, 3x3 convolution, ReLU."""
        return nn.Sequential(
            nn.AvgPool2d(2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        )

    @staticmethod
    def _up_block():
        """Build one decoder stage: 2x upsampling, 3x3 convolution, ReLU."""
        return nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run the network on a batch of frame windows.

        Args:
            x: Tensor of shape ``(batch, window_size, height, width, 3)`` where
                ``(height, width)`` equals ``self.input_size``.

        Returns:
            Tensor of shape ``(batch, height, width, 3)``.

        Raises:
            ValueError: If ``x`` does not have the shape described above.
        """
        if x.dim() != 5:
            raise ValueError(f'Expected a 5-D input (batch, frames, height, width, colors), got {x.dim()} dimensions.')

        if x.size(1) != self.window_size:
            raise ValueError(f'Shape of the input is not compatable. There should be exactly {self.window_size} frames in an input video.')

        # Without this check the reshape below would silently fold a wrongly
        # sized input into a different batch size and scramble the pixels.
        height, width = self.input_size
        if tuple(x.shape[2:]) != (height, width, 3):
            raise ValueError(f'Expected frames of shape ({height}, {width}, 3), got {tuple(x.shape[2:])}.')

        # batch, frames, height, width, colors
        in_x = x.permute((0, 1, 4, 2, 3))
        # batch, frames, colors, height, width

        # Merge the frame and color axes into a single channel axis.
        in_x = in_x.reshape(-1, self.window_size * 3, height, width)

        ### DOWNSCALE ###
        # Keep every encoder output: each one is reused as a skip connection.
        encoder_outs = [self.conv_block_1(in_x)]
        for block in (self.conv_block_2, self.conv_block_3, self.conv_block_4,
                      self.conv_block_5, self.conv_block_6, self.conv_block_7,
                      self.conv_block_8):
            encoder_outs.append(block(encoder_outs[-1]))

        ### UPSCALE ###
        # Start from the coarsest map and add the encoder output of matching
        # resolution after every upscaling block.
        out = encoder_outs.pop()
        for block in (self.conv_block_9, self.conv_block_10, self.conv_block_11,
                      self.conv_block_12, self.conv_block_13, self.conv_block_14,
                      self.conv_block_15):
            out = encoder_outs.pop() + block(out)

        out = self.conv_block_16(out) # H x W x (2 + 1)

        # batch, colors, height, width
        out = out.permute((0, 2, 3, 1))
        # batch, height, width, colors
        return out
|
||||||
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 902 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 451 KiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 1.6 MiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 1.3 MiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 1.1 MiB |
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 1.5 MiB |
Binary file not shown.
|
|
@ -57,12 +57,21 @@ def compute_diff_map(next_flow, prev_flow, prev_frame, cur_frame, prev_frame_sty
|
||||||
next_flow = cv2.resize(next_flow, (w, h))
|
next_flow = cv2.resize(next_flow, (w, h))
|
||||||
prev_flow = cv2.resize(prev_flow, (w, h))
|
prev_flow = cv2.resize(prev_flow, (w, h))
|
||||||
|
|
||||||
flow_map = -next_flow.copy()
|
# This is not correct. The flow map should be applied to the next frame to get previous frame
|
||||||
|
# flow_map = -next_flow.copy()
|
||||||
|
|
||||||
|
# remove white noise (@alexfredo suggestion)
|
||||||
|
next_flow[np.abs(next_flow) < 3] = 0
|
||||||
|
prev_flow[np.abs(prev_flow) < 3] = 0
|
||||||
|
|
||||||
|
# Here is the correct version
|
||||||
|
flow_map = prev_flow.copy()
|
||||||
|
|
||||||
flow_map[:,:,0] += np.arange(w)
|
flow_map[:,:,0] += np.arange(w)
|
||||||
flow_map[:,:,1] += np.arange(h)[:,np.newaxis]
|
flow_map[:,:,1] += np.arange(h)[:,np.newaxis]
|
||||||
|
|
||||||
warped_frame = cv2.remap(prev_frame, flow_map, None, cv2.INTER_NEAREST)
|
warped_frame = cv2.remap(prev_frame, flow_map, None, cv2.INTER_NEAREST, borderMode = cv2.BORDER_REFLECT)
|
||||||
warped_frame_styled = cv2.remap(prev_frame_styled, flow_map, None, cv2.INTER_NEAREST)
|
warped_frame_styled = cv2.remap(prev_frame_styled, flow_map, None, cv2.INTER_NEAREST, borderMode = cv2.BORDER_REFLECT)
|
||||||
|
|
||||||
# compute occlusion mask
|
# compute occlusion mask
|
||||||
fb_flow = next_flow + prev_flow
|
fb_flow = next_flow + prev_flow
|
||||||
|
|
@ -80,8 +89,19 @@ def compute_diff_map(next_flow, prev_flow, prev_frame, cur_frame, prev_frame_sty
|
||||||
alpha_mask = alpha_mask.repeat(3, axis = -1)
|
alpha_mask = alpha_mask.repeat(3, axis = -1)
|
||||||
|
|
||||||
#alpha_mask_blured = cv2.dilate(alpha_mask, np.ones((5, 5), np.float32))
|
#alpha_mask_blured = cv2.dilate(alpha_mask, np.ones((5, 5), np.float32))
|
||||||
alpha_mask = cv2.GaussianBlur(alpha_mask, (51,51), 5, cv2.BORDER_DEFAULT)
|
alpha_mask = cv2.GaussianBlur(alpha_mask, (51,51), 5, cv2.BORDER_REFLECT)
|
||||||
|
|
||||||
alpha_mask = np.clip(alpha_mask, 0, 1)
|
alpha_mask = np.clip(alpha_mask, 0, 1)
|
||||||
|
|
||||||
return alpha_mask, warped_frame_styled
|
return alpha_mask, warped_frame_styled
|
||||||
|
|
||||||
|
|
||||||
|
def frames_norm(occl):
    """Rescale values linearly so that 0 maps to -1 and 255 maps to 1."""
    scaled = occl / 127.5
    return scaled - 1
|
||||||
|
|
||||||
|
def flow_norm(flow):
    """Divide flow values by 255."""
    normalized = flow / 255
    return normalized
|
||||||
|
|
||||||
|
def occl_norm(occl):
    """Rescale values linearly so that 0 maps to -1 and 255 maps to 1."""
    scaled = occl / 127.5
    return scaled - 1
|
||||||
|
|
||||||
|
def flow_renorm(flow):
    """Multiply flow values by 255 (inverse of ``flow / 255``)."""
    restored = flow * 255
    return restored
|
||||||
|
|
||||||
|
def occl_renorm(occl):
    """Rescale values linearly so that -1 maps to 0 and 1 maps to 255."""
    shifted = occl + 1
    return shifted * 127.5
|
||||||
|
|
|
||||||
66
readme.md
66
readme.md
|
|
@ -8,42 +8,94 @@ This script can also be using to swap the person in the video like in this examp
|
||||||
## Dependencies
|
## Dependencies
|
||||||
To install all the necessary dependencies, run this command:
|
To install all the necessary dependencies, run this command:
|
||||||
```
|
```
|
||||||
pip install opencv-python opencv-contrib-python numpy tqdm h5py
|
pip install opencv-python opencv-contrib-python numpy tqdm h5py scikit-image
|
||||||
```
|
```
|
||||||
You have to set up the RAFT repository as it described here: https://github.com/princeton-vl/RAFT . Basically it just comes down to running "./download_models.sh" in RAFT folder to download the models.
|
You have to set up the RAFT repository as described here: https://github.com/princeton-vl/RAFT . Basically, it just comes down to running "./download_models.sh" in the RAFT folder to download the models.
|
||||||
|
|
||||||
## Running the script
|
## Running the scripts
|
||||||
This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You should also have[sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you have to also allow API to work with controlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more then 2 -> press "Apply settings".
|
This script works on top of the [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via API. To run this script you have to set it up first. You should also have the [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension installed. You need to have the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, you also have to allow the API to work with ControlNet. To do so, go to the web-ui settings -> ControlNet tab -> Set "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings".
|
||||||
|
|
||||||
### Step 1.
|
### Video To Video
|
||||||
|
#### Step 1.
|
||||||
To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
|
To process the video, first of all you would need to precompute optical flow data before running web-ui with this command:
|
||||||
```
|
```
|
||||||
python3 compute_flow.py -i {path to your video} -o {path to output *.h5} -v -W {width of the flow map} -H {height of the flow map}
|
python3 compute_flow.py -i "path to your video" -o "path to output file with *.h5 format" -v -W width_of_the_flow_map -H height_of_the_flow_map
|
||||||
```
|
```
|
||||||
The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose W and H parameters as high as your GPU can handle with respect to proportion of original video resolution. Do not worry if it higher or less then the processing resolution, flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up to a several gigabytes on the drive even for minute long video. If you want to process a long video consider splitting it into several parts beforehand.
|
The main reason to do this step separately is to save precious GPU memory that will be useful to generate better quality images. Choose W and H parameters as high as your GPU can handle while keeping the proportions of the original video resolution. Do not worry if it is higher or lower than the processing resolution; flow maps will be scaled accordingly at the processing stage. This will generate quite a large file that may take up several gigabytes on the drive even for a minute-long video. If you want to process a long video, consider splitting it into several parts beforehand.
|
||||||
|
|
||||||
### Step 2.
|
#### Step 2.
|
||||||
Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
|
Run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
|
||||||
```
|
```
|
||||||
bash webui.sh --xformers --api
|
bash webui.sh --xformers --api
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 3.
|
#### Step 3.
|
||||||
Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
|
Go to the **vid2vid.py** file and change main parameters (INPUT_VIDEO, FLOW_MAPS, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. FLOW_MAPS parameter should contain a path to the flow file that you generated at the first step. The script is pretty simple so you may change other parameters as well, although I would recommend to leave them as is for the first time. Finally run the script with the command:
|
||||||
```
|
```
|
||||||
python3 vid2vid.py
|
python3 vid2vid.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Text To Video (prototype)
|
||||||
|
This method is still in development and works on top of 'Stable Diffusion' and 'FloweR' - an optical flow reconstruction method that is also at an early development stage. Do not expect much from it, as it is more of a proof of concept than a complete solution. All examples you can see here were originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. The actual prompts used were in the following format: "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"; only the 'subject' part is described in the table below.
|
||||||
|
|
||||||
|
#### Step 1.
|
||||||
|
Download the 'FloweR_0.1.pth' model from here: [Google Drive link] and place it in the 'FloweR' folder.
|
||||||
|
|
||||||
|
#### Step 2.
|
||||||
|
Same as with vid2vid case, run web-ui with '--api' flag. It is also better to use '--xformers' flag, as you would need to have the highest resolution possible and using xformers memory optimization will greatly help.
|
||||||
|
```
|
||||||
|
bash webui.sh --xformers --api
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 3.
|
||||||
|
Go to the **txt2vid.py** file and change main parameters (OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. Again, the script is simple so you may change other parameters if you want to. Finally run the script with the command:
|
||||||
|
```
|
||||||
|
python3 txt2vid.py
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Examples:
|
||||||
|
</table>
|
||||||
|
<table class="center">
|
||||||
|
<tr>
|
||||||
|
<td><img src="examples/flower_1.gif" raw=true></td>
|
||||||
|
<td><img src="examples/bonfire_1.gif" raw=true></td>
|
||||||
|
<td><img src="examples/diamond_4.gif" raw=true></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td width=33% align="center">"close up of a flower"</td>
|
||||||
|
<td width=33% align="center">"bonfire near the camp in the mountains at night"</td>
|
||||||
|
<td width=33% align="center">"close up of a diamond laying on the table"</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><img src="examples/macaroni_1.gif" raw=true></td>
|
||||||
|
<td><img src="examples/gold_1.gif" raw=true></td>
|
||||||
|
<td><img src="examples/tree_2.gif" raw=true></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td width=33% align="center">"close up of macaroni on the plate"</td>
|
||||||
|
<td width=33% align="center">"close up of golden sphere"</td>
|
||||||
|
<td width=33% align="center">"a tree standing in the winter forest"</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<!--
|
||||||
## Last version changes: v0.4
|
## Last version changes: v0.4
|
||||||
* Fixed issue with extreme blur accumulating at the static parts of the video.
|
* Fixed issue with extreme blur accumulating at the static parts of the video.
|
||||||
* The order of processing was changed to achieve the best quality at different domains.
|
* The order of processing was changed to achieve the best quality at different domains.
|
||||||
* Optical flow computation isolated into a separate script for better GPU memory management. Check out the instruction for a new processing pipeline.
|
* Optical flow computation isolated into a separate script for better GPU memory management. Check out the instruction for a new processing pipeline.
|
||||||
|
-->
|
||||||
|
|
||||||
|
## Last version changes: v0.5
|
||||||
|
* Fixed an issue with wrong direction of an optical flow applied to an image.
|
||||||
|
* Added text to video mode within txt2vid.py script. Make sure to update new dependencies for this script to work!
|
||||||
|
* Added a threshold for an optical flow before processing the frame to remove white noise that might appear, as it was suggested by @alexfredo.
|
||||||
|
|
||||||
|
<!--
|
||||||
## Potential improvements
|
## Potential improvements
|
||||||
There are several ways overall quality of animation may be improved:
|
There are several ways overall quality of animation may be improved:
|
||||||
* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
|
* You may use a separate processing for each camera position to get a more consistent style of the characters and less ghosting.
|
||||||
* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
|
* Because the quality of the video depends on how good optical flow was estimated it might be beneficial to use high frame rate video as a source, so it would be easier to guess the flow properly.
|
||||||
* The quality of flow estimation might be greatly improved with proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
|
* The quality of flow estimation might be greatly improved with proper flow estimation model like this one: https://github.com/autonomousvision/unimatch .
|
||||||
|
-->
|
||||||
## Licence
|
## Licence
|
||||||
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
|
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact me directly at borsky.alexey@gmail.com
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,207 @@
|
||||||
|
import requests
|
||||||
|
import cv2
|
||||||
|
import base64
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
import os
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.append('FloweR/')
|
||||||
|
sys.path.append('RAFT/core')
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from model import FloweR
|
||||||
|
from utils import flow_viz
|
||||||
|
|
||||||
|
from flow_utils import *
|
||||||
|
import skimage
|
||||||
|
|
||||||
|
|
||||||
|
# Path of the video file written by this script.
OUTPUT_VIDEO = "result.mp4"

# Positive and negative prompts sent with every generation request.
PROMPT = "RAW photo, bonfire near the camp in the mountains at night, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
N_PROMPT = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, letters, logo, brand, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
w,h = 512, 512 # Width and height of the processed image; also the size of the output video frames and of the FloweR model input.

SAVE_FRAMES = True # saves individual frames into the 'out' folder if set to True. Might be helpful with long animations

# Denoising strength of the first (txt2img / masked img2img) request for each frame.
PROCESSING_STRENGTH = 0.85
# Denoising strength of the second, unmasked img2img request for each frame.
FIX_STRENGTH = 0.35

# Sent as "cfg_scale" in every request.
CFG_SCALE = 5.5

# When True, the corresponding ControlNet unit is appended to every request.
APPLY_TEMPORALNET = False
APPLY_COLOR = False

# When True, the 'Out img' preview window is created at start-up and destroyed at the end.
VISUALIZE = True
# Torch device the FloweR model and its input tensors are moved to.
DEVICE = 'cuda'
|
||||||
|
|
||||||
|
def to_b64(img):
    """Encode an image array as a base64 string of its PNG representation.

    Values are clipped to the 0..255 range and converted to ``uint8`` before
    encoding.

    Args:
        img: Image array accepted by ``cv2.imencode`` after the conversion.

    Returns:
        The PNG bytes of the image, base64-encoded and decoded as UTF-8 text.

    Raises:
        ValueError: If OpenCV reports that the PNG encoding failed.
    """
    img_clipped = np.clip(img, 0, 255).astype(np.uint8)
    # imencode reports failure through its first return value; do not ignore it.
    encoded_ok, buffer = cv2.imencode('.png', img_clipped)
    if not encoded_ok:
        raise ValueError('cv2.imencode failed to encode the image as PNG')
    return base64.b64encode(buffer).decode("utf-8")
|
||||||
|
|
||||||
|
class controlnetRequest():
    """One generation request to the local web-ui HTTP API.

    The constructor assembles the JSON body (prompts, sampler settings and the
    optional ControlNet units selected by ``APPLY_TEMPORALNET`` and
    ``APPLY_COLOR``); ``sendRequest`` posts it and decodes the first returned
    image.

    Args:
        b64_init_img: Base64 image sent as the single entry of "init_images".
        b64_prev_img: Base64 image used as the input of the ControlNet units.
        b64_color_img: Currently unused (see the review note in ``__init__``).
        ds: Value sent as "denoising_strength".
        w: Value sent as "width".
        h: Value sent as "height".
        mask: Value sent as "mask" (base64 image or ``None``).
        seed: Value sent as "seed".
        mode: Endpoint name appended to ``/sdapi/v1/`` (e.g. 'img2img', 'txt2img').
    """

    def __init__(self, b64_init_img = None, b64_prev_img = None, b64_color_img = None, ds = 0.35, w=w, h=h, mask = None, seed=-1, mode='img2img'):
        self.url = f"http://localhost:7860/sdapi/v1/{mode}"
        self.body = {
            "init_images": [b64_init_img],
            "mask": mask,
            "mask_blur": 0,
            "inpainting_fill": 1,
            "inpainting_mask_invert": 0,
            "prompt": PROMPT,
            "negative_prompt": N_PROMPT,
            "seed": seed,
            "subseed": -1,
            "subseed_strength": 0,
            "batch_size": 1,
            "n_iter": 1,
            "steps": 15,
            "cfg_scale": CFG_SCALE,
            "denoising_strength": ds,
            "width": w,
            "height": h,
            "restore_faces": False,
            "eta": 0,
            "sampler_index": "DPM++ 2S a",
            "control_net_enabled": True,
            "alwayson_scripts": {
                "ControlNet":{"args": []}
            },
        }

        controlnet_units = self.body["alwayson_scripts"]["ControlNet"]["args"]

        if APPLY_TEMPORALNET:
            controlnet_units.append(self._controlnet_unit(
                b64_prev_img, "none", "diff_control_sd15_temporalnet_fp16 [adc6bd97]"))

        if APPLY_COLOR:
            # NOTE(review): b64_color_img is accepted by the constructor but never
            # used; this unit receives b64_prev_img instead. That looks like a
            # copy-paste slip -- confirm which image the color adapter should get.
            controlnet_units.append(self._controlnet_unit(
                b64_prev_img, "color", "t2iadapter_color_sd14v1 [8522029d]"))

    @staticmethod
    def _controlnet_unit(input_image, module, model):
        """Return the settings dict of one ControlNet unit with the shared defaults."""
        return {
            "input_image": input_image,
            "module": module,
            "model": model,
            "weight": 0.65,
            "resize_mode": "Just Resize",
            "lowvram": False,
            "processor_res": 512,
            "guidance_start": 0,
            "guidance_end": 0.65,
            "guessmode": False
        }

    def sendRequest(self):
        """Post the request and return the first generated image.

        Returns:
            The image decoded by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            RuntimeError: If the response contains no image or the image
                cannot be decoded.
        """
        # Request to web-ui
        response = requests.post(self.url, json=self.body)
        # Fail with the HTTP status instead of a KeyError on a missing "images" field.
        response.raise_for_status()
        data_js = response.json()

        images = data_js.get("images")
        if not images:
            raise RuntimeError(f'No images in the response from {self.url}')

        # Convert the byte array to a NumPy array
        image_bytes = base64.b64decode(images[0])
        np_array = np.frombuffer(image_bytes, dtype=np.uint8)

        # Convert the NumPy array to a cv2 image
        out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
        if out_image is None:
            raise RuntimeError(f'Could not decode the image returned by {self.url}')
        return out_image
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Number of frames generated after the initial txt2img frame.
N_FRAMES = 40
# Frame rate of the output video.
OUTPUT_FPS = 8

if VISUALIZE: cv2.namedWindow('Out img')

# Create the output video file with a fixed frame rate and the processing resolution.
output_video = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), OUTPUT_FPS, (w, h))

# try/finally guarantees the video file is finalized even if a request fails
# or the user quits early.
try:
    # Instantiate the model and load its weights onto the target device.
    model = FloweR(input_size = (h, w))
    model.load_state_dict(torch.load('FloweR/FloweR_0.1.pth', map_location=DEVICE))
    model = model.to(DEVICE)
    model.eval()

    # The first frame is generated from the prompt alone.
    init_frame = controlnetRequest(mode='txt2img', ds=PROCESSING_STRENGTH, w=w, h=h).sendRequest()

    output_video.write(init_frame)
    prev_frame = init_frame

    # Sliding window of the most recent frames fed to the model; starts all black.
    clip_frames = np.zeros((model.window_size, h, w, 3), dtype=np.uint8)

    for ind in tqdm(range(N_FRAMES)):
        # Drop the oldest frame and append the latest one.
        clip_frames = np.roll(clip_frames, -1, axis=0)
        clip_frames[-1] = prev_frame

        clip_frames_torch = frames_norm(torch.from_numpy(clip_frames).to(DEVICE, dtype=torch.float32))

        with torch.no_grad():
            pred_data = model(clip_frames_torch.unsqueeze(0))[0]

        # The first two output channels are used as flow, the third as the occlusion mask.
        pred_flow = flow_renorm(pred_data[...,:2]).cpu().numpy()
        pred_occl = occl_renorm(pred_data[...,2:3]).cpu().numpy().repeat(3, axis = -1)

        pred_flow = np.clip(pred_flow, -150, 150)
        # borderType must be passed by keyword: the 4th positional argument of
        # cv2.GaussianBlur is the output array, not the border mode.
        pred_flow = cv2.GaussianBlur(pred_flow, (31,31), 2, borderType = cv2.BORDER_REFLECT_101)

        pred_occl = cv2.GaussianBlur(pred_occl, (21,21), 2, borderType = cv2.BORDER_REFLECT_101)
        pred_occl = (np.abs(pred_occl / 255) ** 1.5) * 255
        pred_occl = np.clip(pred_occl * 25, 0, 255).astype(np.uint8)

        # Turn the relative flow into absolute sampling coordinates for cv2.remap.
        flow_map = pred_flow.copy()
        flow_map[:,:,0] += np.arange(w)
        flow_map[:,:,1] += np.arange(h)[:,np.newaxis]

        warped_frame = cv2.remap(prev_frame, flow_map, None, cv2.INTER_CUBIC, borderMode = cv2.BORDER_REFLECT_101)

        out_image = warped_frame.copy()

        # First pass: request with the predicted occlusion mask.
        out_image = controlnetRequest(
            b64_init_img = to_b64(out_image),
            b64_prev_img = to_b64(prev_frame),
            b64_color_img = to_b64(warped_frame),
            mask = to_b64(pred_occl),
            ds=PROCESSING_STRENGTH, w=w, h=h).sendRequest()

        # Second pass: unmasked request over the whole frame at lower strength.
        out_image = controlnetRequest(
            b64_init_img = to_b64(out_image),
            b64_prev_img = to_b64(prev_frame),
            b64_color_img = to_b64(warped_frame),
            mask = None,
            ds=FIX_STRENGTH, w=w, h=h).sendRequest()

        # This step is necessary to reduce color drift of the image that some models may cause.
        # Only channel_axis is passed; it replaces the deprecated multichannel argument.
        out_image = skimage.exposure.match_histograms(out_image, warped_frame, channel_axis=-1)
        # Make sure the frame is uint8 before it is written and concatenated
        # (no-op when match_histograms already returned uint8).
        out_image = np.clip(out_image, 0, 255).astype(np.uint8)

        output_video.write(out_image)
        if SAVE_FRAMES:
            if not os.path.isdir('out'): os.makedirs('out')
            cv2.imwrite(f'out/{ind+1:05d}.png', out_image)

        if VISUALIZE:
            pred_flow_img = flow_viz.flow_to_image(pred_flow)
            frames_img = cv2.hconcat(list(clip_frames))
            data_img = cv2.hconcat([pred_flow_img, pred_occl, warped_frame, out_image])

            cv2.imshow('Out img', cv2.vconcat([frames_img, data_img]))
            # press Q to stop processing; the video written so far is still finalized
            if cv2.waitKey(1) & 0xFF == ord('q'): break

        prev_frame = out_image.copy()
finally:
    # Release the output video file
    output_video.release()

    # Close all windows
    if VISUALIZE: cv2.destroyAllWindows()
|
||||||
Loading…
Reference in New Issue