sd-webui-text2video/scripts/modelscope/process_modelscope.py

260 lines
12 KiB
Python

# Function calls referenced from https://github.com/modelscope/modelscope/tree/master/modelscope/pipelines/multi_modal
# Copyright (C) 2023 by Artem Khrapov (kabachuha)
# Read LICENSE for usage terms.
from base64 import b64encode
from tqdm import tqdm
from PIL import Image
from modelscope.t2v_pipeline import TextToVideoSynthesis, tensor2vid
from t2v_helpers.key_frames import T2VAnimKeys # TODO: move to deforum_tools
from pathlib import Path
import numpy as np
import torch
import cv2
import gc
import modules.paths as ph
from types import SimpleNamespace
from t2v_helpers.general_utils import get_t2v_version, get_model_location
import time, math
from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name
from t2v_helpers.args import get_outdir, process_args
import t2v_helpers.args as t2v_helpers_args
from modules import shared, sd_hijack, lowvram
from modules.shared import opts, devices, state
from stable_lora.scripts.lora_webui import gr_inputs_list, StableLoraScriptInstance
import os
pipe = None
def setup_pipeline(model_name):
return TextToVideoSynthesis(get_model_location(model_name))
def process_modelscope(args_dict, extra_args=None):
args, video_args = process_args(args_dict)
global pipe
print(f"\033[4;33m text2video extension for auto1111 webui\033[0m")
print(f"Git commit: {get_t2v_version()}")
init_timestring = time.strftime('%Y%m%d%H%M%S')
outdir_current = os.path.join(get_outdir(), f"{init_timestring}")
max_vids_to_pack = opts.data.get("modelscope_deforum_show_n_videos") if opts.data is not None and opts.data.get("modelscope_deforum_show_n_videos") is not None else -1
cpu_vae = opts.data.get("modelscope_deforum_vae_settings") if opts.data is not None and opts.data.get("modelscope_deforum_vae_settings") is not None else 'GPU (half precision)'
if shared.sd_model is not None:
sd_hijack.model_hijack.undo_hijack(shared.sd_model)
try:
lowvram.send_everything_to_cpu()
except Exception as e:
pass
# the following command actually frees the GPU vram from the sd.model, no need to do del shared.sd_model 22-05-23
shared.sd_model = None
gc.collect()
devices.torch_gc()
print('Starting text2video')
print('Pipeline setup')
# optionally store pipe in global between runs
# also refresh the model if the user selected a newer one
# if args.model is none (e.g. an API call, the model stays as the previous one)
if pipe is None and args.model is None: # one more API call hack, falling back to <modelscope> if never used TODO: figure out how to permastore the model name the best way
args.model = "<modelscope>"
print(f"WARNING: received an API call with an empty model name, defaulting to {args.model} at {get_model_location(args.model)}")
if pipe is None or pipe is not None and args.model is not None and get_model_location(args.model) != pipe.model_dir:
pipe = setup_pipeline(args.model)
#TODO Wrap this in a list so that we can process this for future extensions.
stable_lora_processor = StableLoraScriptInstance
stable_lora_args = stable_lora_processor.process_extension_args(all_args=extra_args)
stable_lora_processor.process(pipe, *stable_lora_args)
pipe.keep_in_vram = opts.data.get("modelscope_deforum_keep_model_in_vram") if opts.data is not None and opts.data.get("modelscope_deforum_keep_model_in_vram") is not None else 'None'
device = devices.get_optimal_device()
print('device', device)
mask = None
if args.do_vid2vid:
if args.vid2vid_frames is None and args.vid2vid_frames_path == "":
raise FileNotFoundError("Please upload a video :()")
# Overrides
if args.vid2vid_frames is not None:
vid2vid_frames_path = args.vid2vid_frames.name
print("got a request to *vid2vid* an existing video.")
in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path)
folder_name = clean_folder_name(Path(vid2vid_frames_path).stem)
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name)
i = 1
while os.path.exists(outdir_no_tmp):
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i))
i += 1
outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames')
os.makedirs(outdir_v2v, exist_ok=True)
vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=args.vid2vid_startFrame + args.frames,
numeric_files_output=True, out_img_format='png')
temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder")
duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name)
videogen = []
for f in os.listdir(temp_convert_raw_png_path):
# double check for old _depth_ files, not really needed probably but keeping it for now
if '_depth_' not in f:
videogen.append(f)
videogen.sort(key=lambda x: int(x.split('.')[0]))
images = []
for file in tqdm(videogen, desc="Loading frames"):
image = Image.open(os.path.join(temp_convert_raw_png_path, file))
image = image.resize((args.width, args.height), Image.ANTIALIAS)
array = np.array(image)
images += [array]
# print(images)
images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w
print(f"Converted the frames to tensor {bfchw.shape}")
vd_out = torch.from_numpy(bcfhw).to("cuda")
# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1
# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
print("Computing latents")
latents = pipe.compute_latents(vd_out).to(device)
skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1))))
else:
latents = None
args.strength = 1
skip_steps = 0
print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode')
# Start the batch count loop
pbar = tqdm(range(args.batch_count), leave=False)
if args.batch_count == 1:
pbar.disable = True
vids_to_pack = []
state.job_count = args.batch_count
for batch in pbar:
state.job_no = batch
if state.skipped:
state.skipped = False
if state.interrupted:
break
shared.state.job = f"Batch {batch + 1} out of {args.batch_count}"
# TODO: move to a separate function
if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"):
keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames)
images = []
print("Received an image for inpainting", args.inpainting_image.name)
for i in range(args.frames):
image = Image.open(args.inpainting_image.name).convert("RGB")
image = image.resize((args.width, args.height), Image.ANTIALIAS)
array = np.array(image)
images += [array]
images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w
print(f"Converted the frames to tensor {bfchw.shape}")
vd_out = torch.from_numpy(bcfhw).to("cuda")
# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1
# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
# but right now they have shape num_sample=1,4, 1 (only used 1 img), latent_h, latent_w
print("Computing latents")
image_latents = pipe.compute_latents(vd_out).numpy()
# padding_width = [(0, 0), (0, 0), (0, frames-inpainting_frames), (0, 0), (0, 0)]
# padded_latents = np.pad(image_latents, pad_width=padding_width, mode='constant', constant_values=0)
latent_h = args.height // 8
latent_w = args.width // 8
latent_noise = np.random.normal(size=(1, 4, args.frames, latent_h, latent_w))
mask = np.ones(shape=(1, 4, args.frames, latent_h, latent_w))
mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)]
for i in range(args.frames):
v = mask_weights[i]
mask[:, :, i, :, :] = v
masked_latents = image_latents * (1 - mask) + latent_noise * mask
latents = torch.tensor(masked_latents).to(device)
mask = torch.tensor(mask).to(device)
args.strength = 1
samples, _, infotext = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + batch if args.seed != -1 else -1, args.cfg_scale,
args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler)
if batch > 0:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}")
print(f'text2video finished, saving frames to {outdir_current}')
# just deleted the folder so we need to make it again
os.makedirs(outdir_current, exist_ok=True)
for i in range(len(samples)):
cv2.imwrite(outdir_current + os.path.sep +
f"{i:06}.png", samples[i])
# save settings to a file
if opts.data.get("modelscope_save_info_to_file") if opts.data is not None and opts.data.get("modelscope_save_info_to_file") is not None else False:
args_file = os.path.join(outdir_current,'args.txt')
with open(args_file, 'w', encoding='utf-8') as f:
print(f'saving args to {args_file}')
f.write(infotext)
# TODO: add params to the GUI
if not video_args.skip_video_creation:
ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current,
"%06d.png"),
stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack,
audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset)
print(f't2v complete, result saved at {outdir_current}')
mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read()
dataurl = "data:video/mp4;base64," + b64encode(mp4).decode()
if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack:
vids_to_pack.append(dataurl)
t2v_helpers_args.i1_store_t2v = f'<p style=\"font-weight:bold;margin-bottom:0em\">text2video extension for auto1111 — version 1.2b </p>'
for dataurl in vids_to_pack:
t2v_helpers_args.i1_store_t2v += f'<video controls loop><source src="{dataurl}" type="video/mp4"></video><br>'
pbar.close()
return vids_to_pack