293 lines
13 KiB
Python
293 lines
13 KiB
Python
import copy
|
|
import os
|
|
import torch
|
|
from pathlib import Path
|
|
from typing import NamedTuple
|
|
from modules import devices
|
|
|
|
from scripts.adapter import PlugableAdapter, Adapter, StyleAdapter, Adapter_light
|
|
from scripts.controlnet_lllite import PlugableControlLLLite
|
|
from scripts.cldm import PlugableControlModel
|
|
from scripts.controlnet_sparsectrl import PlugableSparseCtrlModel
|
|
from scripts.ipadapter.ipadapter_model import IPAdapterModel
|
|
from scripts.ipadapter.plugable_ipadapter import PlugableIPAdapter
|
|
from scripts.logging import logger
|
|
from scripts.controlnet_diffusers import convert_from_diffuser_state_dict
|
|
from scripts.controlnet_lora import controlnet_lora_hijack, force_load_state_dict
|
|
from scripts.enums import ControlModelType
|
|
|
|
|
|
# Base ControlNet hyper-parameter preset.  build_model_by_guess deep-copies it
# for Control-LoRA checkpoints it logs as 'SD15', for SparseCtrl checkpoints,
# and for ControlNet checkpoints that have no 'label_emb.0.0.bias' entry.
controlnet_default_config = {
    'adm_in_channels': None,
    'in_channels': 4,
    'model_channels': 320,
    'num_res_blocks': 2,
    'attention_resolutions': [1, 2, 4],
    'transformer_depth': [1, 1, 1, 0],
    'channel_mult': [1, 2, 4, 4],
    'transformer_depth_middle': 1,
    'use_linear_in_transformer': False,
    'context_dim': 768,
    'num_heads': 8,
    'global_average_pooling': False,
}
|
|
|
|
# Preset deep-copied by build_model_by_guess for Control-LoRA checkpoints it
# logs as 'SDXL' and for ControlNet checkpoints that contain
# 'input_blocks.8.1.transformer_blocks.9.norm3.weight'.
controlnet_sdxl_config = {
    'num_classes': 'sequential',
    'adm_in_channels': 2816,
    'in_channels': 4,
    'model_channels': 320,
    'num_res_blocks': 2,
    'attention_resolutions': [2, 4],
    'transformer_depth': [0, 2, 10],
    'channel_mult': [1, 2, 4],
    'transformer_depth_middle': 10,
    'use_linear_in_transformer': True,
    'context_dim': 2048,
    'num_head_channels': 64,
    'global_average_pooling': False,
}
|
|
|
|
# Preset deep-copied by build_model_by_guess when the checkpoint contains
# 'input_blocks.8.1.transformer_blocks.0.norm3.weight' but not the
# '...transformer_blocks.9.norm3.weight' key that selects the full preset.
controlnet_sdxl_mid_config = {
    'num_classes': 'sequential',
    'adm_in_channels': 2816,
    'in_channels': 4,
    'model_channels': 320,
    'num_res_blocks': 2,
    'attention_resolutions': [4],
    'transformer_depth': [0, 0, 1],
    'channel_mult': [1, 2, 4],
    'transformer_depth_middle': 1,
    'use_linear_in_transformer': True,
    'context_dim': 2048,
    'num_head_channels': 64,
    'global_average_pooling': False,
}
|
|
|
|
# Preset deep-copied by build_model_by_guess when the checkpoint has a
# 'label_emb.0.0.bias' entry but neither of the
# 'input_blocks.8.1.transformer_blocks.{0,9}.norm3.weight' keys.
controlnet_sdxl_small_config = {
    'num_classes': 'sequential',
    'adm_in_channels': 2816,
    'in_channels': 4,
    'model_channels': 320,
    'num_res_blocks': 2,
    'attention_resolutions': [],
    'transformer_depth': [0, 0, 0],
    'channel_mult': [1, 2, 4],
    'transformer_depth_middle': 0,
    'use_linear_in_transformer': True,
    'num_head_channels': 64,
    'context_dim': 1,
    'global_average_pooling': False,
}
|
|
|
|
# Hyper-parameter preset for the full T2I-Adapter.  Within this file
# build_model_by_guess only logs this preset's name; the values it passes to
# Adapter(...) are derived from the checkpoint's tensor shapes instead.
t2i_adapter_config = {
    'channels': [320, 640, 1280, 1280],
    'nums_rb': 2,
    'ksize': 1,
    'sk': True,
    'cin': 192,
    'use_conv': False,
}
|
|
|
|
# Keyword arguments for Adapter_light.  build_model_by_guess deep-copies this
# dict and overwrites 'cin' from the checkpoint before unpacking it.
t2i_adapter_light_config = {
    'channels': [320, 640, 1280, 1280],
    'nums_rb': 4,
    'cin': 192,
}
|
|
|
|
# Keyword arguments unpacked into StyleAdapter(**config) by
# build_model_by_guess.
t2i_adapter_style_config = {
    'width': 1024,
    'context_dim': 768,
    'num_head': 8,
    # NOTE(review): 'n_layes' is passed as a keyword argument, so the spelling
    # presumably matches the StyleAdapter parameter name -- do not "correct"
    # it here without checking that constructor.
    'n_layes': 3,
    'num_token': 8,
}
|
|
|
|
|
|
# Stolen from https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/utils.py
|
|
def state_dict_key_replace(state_dict, keys_to_replace):
    """Rename exact keys of ``state_dict`` in place.

    Args:
        state_dict: Mapping to modify; it is mutated and also returned.
        keys_to_replace: Mapping of ``old_key -> new_key``.  Entries whose
            ``old_key`` is not present in ``state_dict`` are skipped.

    Returns:
        The same ``state_dict`` object that was passed in.
    """
    for old_key, new_key in keys_to_replace.items():
        if old_key not in state_dict:
            continue
        # pop + assign moves the value under its new name; the renamed entry
        # ends up at the end of the dict's insertion order.
        state_dict[new_key] = state_dict.pop(old_key)
    return state_dict
|
|
|
|
|
|
# Stolen from https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/utils.py
|
|
def state_dict_prefix_replace(state_dict, replace_prefix):
    """Rewrite key prefixes of ``state_dict`` in place.

    The prefixes are applied one after another in the iteration order of
    ``replace_prefix``, so a key renamed by an earlier prefix can be matched
    again by a later one.

    Args:
        state_dict: Mapping to modify; it is mutated and also returned.
        replace_prefix: Mapping of ``old_prefix -> new_prefix``.

    Returns:
        The same ``state_dict`` object that was passed in.
    """
    for old_prefix, new_prefix in replace_prefix.items():
        # Snapshot the matching names first: the dict is mutated below and
        # must not be iterated while that happens.
        matching = [name for name in state_dict.keys() if name.startswith(old_prefix)]
        cut = len(old_prefix)
        for name in matching:
            state_dict[f"{new_prefix}{name[cut:]}"] = state_dict.pop(name)
    return state_dict
|
|
|
|
|
|
class ControlModel(NamedTuple):
    """Pair returned by ``build_model_by_guess``: a network and its detected kind."""

    # The constructed, pluggable network wrapper.
    model: torch.nn.Module
    # Which ControlModelType member the checkpoint was recognised as.
    type: ControlModelType
|
|
|
|
|
|
def _build_control_lora(state_dict) -> ControlModel:
    """Build a Control-LoRA network from a checkpoint carrying 'lora_controlnet'.

    The marker entry is deleted from ``state_dict``, i.e. the caller's dict is
    mutated.
    """
    # Absence of this key is what selects the SDXL preset over the default one.
    is_sdxl = "input_blocks.11.0.in_layers.0.weight" not in state_dict
    logger.info(f"Using ControlNet lora ({'SDXL' if is_sdxl else 'SD15'})")
    del state_dict['lora_controlnet']

    preset = controlnet_sdxl_config if is_sdxl else controlnet_default_config
    config = copy.deepcopy(preset)
    config['global_average_pooling'] = False
    config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1])
    config['use_fp16'] = devices.dtype_unet == torch.float16

    # NOTE(review): the source's indentation was lost; only the constructor is
    # assumed to run under the hijack -- confirm the scope of this `with`.
    with controlnet_lora_hijack():
        network = PlugableControlModel(config, state_dict=None)
    force_load_state_dict(network.control_model, state_dict)
    network.is_control_lora = True
    network.to(devices.dtype_unet)
    return ControlModel(network, ControlModelType.ControlLoRA)


def _build_sparsectrl(state_dict) -> ControlModel:
    """Build a SparseCtrl network on top of the default ControlNet preset."""
    config = copy.deepcopy(controlnet_default_config)
    # The original comments label the two variants "rgb" (hint block present)
    # and "scribble" (hint block absent).
    has_hint_block = "input_hint_block.0.weight" in state_dict
    config['use_simplified_condition_embedding'] = has_hint_block
    config['conditioning_channels'] = 5 if has_hint_block else 4
    config['use_fp16'] = devices.dtype_unet == torch.float16

    network = PlugableSparseCtrlModel(config, state_dict)
    network.to(devices.dtype_unet)
    return ControlModel(network, ControlModelType.SparseCtrl)


def _diffusers_t2i_prefix_map():
    """Return the ordered prefix map that flattens diffusers T2I-Adapter keys.

    Insertion order matters: state_dict_prefix_replace applies the prefixes
    sequentially, so the more specific 'resnets' prefixes must precede the
    per-body prefix, and the bare 'adapter.' prefix must come last.
    """
    prefix_map = {}
    for i in range(4):
        for j in range(2):
            prefix_map[f"adapter.body.{i}.resnets.{j}."] = f"body.{i * 2 + j}."
        prefix_map[f"adapter.body.{i}."] = f"body.{i * 2}."
    prefix_map["adapter."] = ""
    return prefix_map


def _regroup_safetensors_ipadapter(state_dict):
    """Split flat 'image_proj.*' / 'ip_adapter.*' keys into two nested dicts.

    The 'ip_adapter' entries are ordered by the integer that leads each key
    once the prefix has been removed.
    """
    image_proj = {}
    ip_adapter = {}
    for key in state_dict.keys():
        if key.startswith("image_proj."):
            image_proj[key.replace("image_proj.", "")] = state_dict[key]
        elif key.startswith("ip_adapter."):
            ip_adapter[key.replace("ip_adapter.", "")] = state_dict[key]

    ordered_names = sorted(ip_adapter, key=lambda name: int(name.split(".")[0]))
    return {
        "image_proj": image_proj,
        "ip_adapter": {name: ip_adapter[name] for name in ordered_names},
    }


def _build_controlnet(state_dict, unet, model_path, model_has_shuffle_in_filename) -> ControlModel:
    """Build a ControlNet-family network (plain, Union or InstantID)."""
    hint_channels = int(state_dict['input_hint_block.0.weight'].shape[1])

    if 'label_emb.0.0.bias' not in state_dict:
        config = copy.deepcopy(controlnet_default_config)
        logger.info('controlnet_default_config')
        config['global_average_pooling'] = model_has_shuffle_in_filename
        config['hint_channels'] = hint_channels
        config['context_dim'] = int(
            state_dict['input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight'].shape[1]
        )
        # 2-D projection weights get two trailing singleton axes appended.
        for key, weight in state_dict.items():
            is_projection = 'proj_in.weight' in key or 'proj_out.weight' in key
            if is_projection and len(weight.shape) == 2:
                state_dict[key] = weight[..., None, None]
    else:
        if 'input_blocks.8.1.transformer_blocks.9.norm3.weight' in state_dict:
            preset_name, preset = 'controlnet_sdxl_config', controlnet_sdxl_config
        elif 'input_blocks.8.1.transformer_blocks.0.norm3.weight' in state_dict:
            preset_name, preset = 'controlnet_sdxl_mid_config', controlnet_sdxl_mid_config
        else:
            preset_name, preset = 'controlnet_sdxl_small_config', controlnet_sdxl_small_config
        config = copy.deepcopy(preset)
        logger.info(preset_name)
        config['global_average_pooling'] = False
        config['hint_channels'] = hint_channels

    if 'difference' in state_dict and unet is not None:
        # Entries that share a name with a UNet weight are added onto a CPU
        # copy of that UNet weight; all other entries pass through untouched.
        unet_state_dict = unet.state_dict()
        state_dict = {
            key: (p + unet_state_dict[key].clone().cpu()) if key in unet_state_dict else p
            for key, p in state_dict.items()
        }

    if "control_add_embedding.linear_1.bias" in state_dict:  # Controlnet Union
        config["union_controlnet_num_control_type"] = state_dict["task_embedding"].shape[0]
        state_dict = {
            key.replace('.attn.in_proj_', '.attn.in_proj.'): value
            for key, value in state_dict.items()
        }
        control_model_type = ControlModelType.ControlNetUnion
    elif "instant_id" in model_path.lower():
        control_model_type = ControlModelType.InstantID
    else:
        control_model_type = ControlModelType.ControlNet

    config['use_fp16'] = devices.dtype_unet == torch.float16

    network = PlugableControlModel(config, state_dict)
    network.to(devices.dtype_unet)
    return ControlModel(network, control_model_type)


def _build_t2i_adapter(state_dict) -> ControlModel:
    """Build a full T2I-Adapter whose geometry is read from the checkpoint."""
    logger.info('t2i_adapter_config')
    conv_in_shape = state_dict['conv_in.weight'].shape
    cin = int(conv_in_shape[1])
    channel = int(conv_in_shape[0])
    ksize = int(state_dict['body.0.block2.weight'].shape[2])
    use_conv = any(name.endswith("down_opt.op.weight") for name in state_dict)
    is_sdxl = cin in (256, 768)

    adapter = Adapter(
        cin=cin,
        channels=[channel, channel * 2, channel * 4, channel * 4],
        nums_rb=2,
        ksize=ksize,
        sk=True,
        use_conv=use_conv,
        is_sdxl=is_sdxl,
    ).cpu()
    adapter.load_state_dict(state_dict, strict=False)
    return ControlModel(PlugableAdapter(adapter), ControlModelType.T2I_Adapter)


def build_model_by_guess(state_dict, unet, model_path: str) -> ControlModel:
    """Detect the kind of control model in ``state_dict`` and instantiate it.

    Detection relies on marker keys, tried in this order: Control-LoRA,
    SparseCtrl, ControlNet (including Union and InstantID), T2I-Adapter,
    T2I style adapter, T2I light adapter, IP-Adapter, ControlLLLite.
    Diffusers-format ControlNet / T2I-Adapter checkpoints and flat safetensors
    IP-Adapter checkpoints are converted to the native key layout first.

    Args:
        state_dict: Checkpoint mapping of names to tensors.  It may be
            mutated (marker deletion, in-place prefix rewriting).
        unet: Object providing ``state_dict()``; only consulted when the
            checkpoint has a 'difference' entry.  May be ``None``.
        model_path: Path of the checkpoint file.  Used for the 'shuffle' and
            'instant_id' filename heuristics and forwarded to
            ``IPAdapterModel.load``.

    Returns:
        A ``ControlModel`` holding the built network and its type.

    Raises:
        Exception: If no known marker key is found.
    """
    if "lora_controlnet" in state_dict:
        return _build_control_lora(state_dict)

    if "down_blocks.0.motion_modules.0.temporal_transformer.norm.weight" in state_dict:  # sparsectrl
        return _build_sparsectrl(state_dict)

    if "controlnet_cond_embedding.conv_in.weight" in state_dict:  # diffusers
        state_dict = convert_from_diffuser_state_dict(state_dict)

    if 'adapter.body.0.resnets.0.block1.weight' in state_dict:  # diffusers
        state_dict = state_dict_prefix_replace(state_dict, _diffusers_t2i_prefix_map())

    # safetensor ipadapters
    if any('image_proj.' in x for x in state_dict.keys()) and any('ip_adapter.' in x for x in state_dict.keys()):
        state_dict = _regroup_safetensors_ipadapter(state_dict)

    model_has_shuffle_in_filename = 'shuffle' in Path(os.path.abspath(model_path)).stem.lower()
    # Strip the wrapper fragments one after another (two separate passes, as
    # in the original, so key collisions resolve the same way).
    for fragment in ("control_model.", "adapter."):
        state_dict = {name.replace(fragment, ""): value for name, value in state_dict.items()}

    if 'input_hint_block.0.weight' in state_dict:
        return _build_controlnet(state_dict, unet, model_path, model_has_shuffle_in_filename)

    if 'conv_in.weight' in state_dict:
        return _build_t2i_adapter(state_dict)

    if 'style_embedding' in state_dict:
        config = copy.deepcopy(t2i_adapter_style_config)
        logger.info('t2i_adapter_style_config')
        adapter = StyleAdapter(**config).cpu()
        adapter.load_state_dict(state_dict, strict=False)
        return ControlModel(PlugableAdapter(adapter), ControlModelType.T2I_StyleAdapter)

    if 'body.0.in_conv.weight' in state_dict:
        config = copy.deepcopy(t2i_adapter_light_config)
        logger.info('t2i_adapter_light_config')
        config['cin'] = int(state_dict['body.0.in_conv.weight'].shape[1])
        adapter = Adapter_light(**config).cpu()
        adapter.load_state_dict(state_dict, strict=False)
        return ControlModel(PlugableAdapter(adapter), ControlModelType.T2I_Adapter)

    if 'ip_adapter' in state_dict:
        network = PlugableIPAdapter(IPAdapterModel.load(state_dict, model_path))
        return ControlModel(network, ControlModelType.IPAdapter)

    if any('lllite' in k for k in state_dict.keys()):
        network = PlugableControlLLLite(state_dict)
        network.to('cpu')
        return ControlModel(network, ControlModelType.Controlllite)

    raise Exception('[ControlNet Error] Cannot recognize the ControlModel!')
|