commit 571798cb1d3ec470f964f8a8e5ea515f3ea75b9a Author: hnmr293 Date: Mon Jan 2 20:40:08 2023 +0900 add implementation of U-net dumping diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a065928 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +images/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2a3c197 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +MIT Licence diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/javascript/size.js b/javascript/size.js new file mode 100644 index 0000000..087843d --- /dev/null +++ b/javascript/size.js @@ -0,0 +1,86 @@ +onUiUpdate(() => { + if (globalThis.DumpUnet) return; + const DumpUnet = {}; + globalThis.DumpUnet = DumpUnet; + + DumpUnet.applySizeCallback = function () { + if (globalThis.DumpUnet.applySizeCallbackCalled) return; + + const app = gradioApp(); + if (!app || app === document) return; + + const labels = Array.of(...app.querySelectorAll('#tab_txt2img label')); + const width_label = labels.find(x => x.textContent.trim() === "Width"); + const height_label = labels.find(x => x.textContent.trim() === "Height"); + const steps_label = labels.find(x => x.textContent.trim() === "Sampling Steps"); + if (!width_label || !height_label || !steps_label) return; + + const width_slider = app.querySelector(`#${width_label.htmlFor}`); + const height_slider = app.querySelector(`#${height_label.htmlFor}`); + const steps_slider = app.querySelector(`#${steps_label.htmlFor}`) + if (!width_slider || !height_slider || !steps_slider) return; + //if (+width_slider.dataset.dumpunetHooked && +height_slider.dataset.dumpunetHooked) return + // + //const value_hook = ele => { + // const proto = Object.getPrototypeOf(ele); + // const old_desc = Object.getOwnPropertyDescriptor(proto, 'value'); + // Object.defineProperty(ele, 'value', { + // get: function () { return old_desc.get.apply(this, arguments); }, + // set: function () { + // const old_value = 
this.value; + // old_desc.set.apply(this, arguments); + // const new_value = this.value; + // const ev = new CustomEvent('imagesizesliderchange', { detail: { old_value: old_value }, bubbles: true }); + // ele.dispatchEvent(ev); + // } + // }); + // ele.dataset.dumpunetHooked = 1; + //}; + // + //value_hook(width_slider); + //value_hook(height_slider); + + globalThis.DumpUnet.applySizeCallbackCalled = true; + + const update_info = () => { + const layer = app.querySelector('#dumpunet-layer select').value; + const info = JSON.parse(app.querySelector('#dumpunet-layer_setting').textContent)[layer]; + const + w = +width_slider.value, + h = +height_slider.value, + steps = +steps_slider.value, + iw = Math.max(1, Math.ceil(w / 64)), + ih = Math.max(1, Math.ceil(h / 64)), + ch = info[1][0], + nx = +app.querySelector('#dumpunet-gridx input').value, + ny = +app.querySelector('#dumpunet-gridy input').value, + n = Math.ceil(ch / (nx * ny)); + info[0][1] *= ih; + info[0][2] *= iw; + info[1][1] *= ih; + info[1][2] *= iw; + app.querySelector('#dumpunet-layerinfo').innerHTML = ` +[Layer Info]
+Name:   ${layer}
+Input:  (${info[0].join(',')})
+Output: (${info[1].join(',')})&#10;
+[Output Images Info]
+N:      ${n} x ${steps}steps
+Width:  ${nx == 1 ? info[1][2] : ((info[1][2] + 1) * nx - 1)}px
+Height: ${ny == 1 ? info[1][1] : ((info[1][1] + 1) * ny - 1)}px
+`.trim(); + }; + + //app.addEventListener('imagesizesliderchange', e => { + // //console.log(e.detail.old_value, e.target.value); + // update_info(); + //}, false); + + app.addEventListener('input', update_info, false); + app.addEventListener('change', update_info, false); + + update_info(); + }; + + onUiUpdate(DumpUnet.applySizeCallback); +}); diff --git a/scripts/dumpunet.py b/scripts/dumpunet.py new file mode 100644 index 0000000..ecd6c38 --- /dev/null +++ b/scripts/dumpunet.py @@ -0,0 +1,304 @@ +import os +import time +import json +import re +import numpy as np +from torch import nn, Tensor +import gradio as gr +from PIL import Image + +import modules.scripts as scripts +from modules.processing import process_images, Processed, StableDiffusionProcessing +from modules import shared + +re_num = re.compile(r"^\s*\+?\s*\d+\s*$") +re_range = re.compile(r"^\s*(\+?\s*\d+)\s*-\s*(\+?\s*\d+)\s*(?:\(\s*\+?\s*(\d+)\s*\))?\s*$") + +class Script(scripts.Script): + + def title(self): + return "Dump U-net features" + + def show(self, is_img2img): + return not is_img2img + + def ui(self, is_img2img): + settings = { + # input shape output shape + "IN00": ( ( 4, 8, 8), ( 320, 8, 8) ), + "IN01": ( ( 320, 8, 8), ( 320, 8, 8) ), + "IN02": ( ( 320, 8, 8), ( 320, 8, 8) ), + "IN03": ( ( 320, 8, 8), ( 320, 4, 4) ), + "IN04": ( ( 320, 4, 4), ( 640, 4, 4) ), + "IN05": ( ( 640, 4, 4), ( 640, 4, 4) ), + "IN06": ( ( 640, 4, 4), ( 640, 2, 2) ), + "IN07": ( ( 640, 2, 2), (1280, 2, 2) ), + "IN08": ( (1280, 2, 2), (1280, 2, 2) ), + "IN09": ( (1280, 2, 2), (1280, 1, 1) ), + "IN10": ( (1280, 1, 1), (1280, 1, 1) ), + "IN11": ( (1280, 1, 1), (1280, 1, 1) ), + "M00": ( (1280, 1, 1), (1280, 1, 1) ), + "OUT00": ( (2560, 1, 1), (1280, 1, 1) ), + "OUT01": ( (2560, 1, 1), (1280, 1, 1) ), + "OUT02": ( (2560, 1, 1), (1280, 2, 2) ), + "OUT03": ( (2560, 2, 2), (1280, 2, 2) ), + "OUT04": ( (2560, 2, 2), (1280, 2, 2) ), + "OUT05": ( (1920, 2, 2), (1280, 4, 4) ), + "OUT06": ( (1920, 4, 4), ( 640, 4, 4) ), + 
"OUT07": ( (1280, 4, 4), ( 640, 4, 4) ), + "OUT08": ( ( 960, 4, 4), ( 640, 8, 8) ), + "OUT09": ( ( 960, 8, 8), ( 320, 8, 8) ), + "OUT10": ( ( 640, 8, 8), ( 320, 8, 8) ), + "OUT11": ( ( 640, 8, 8), ( 320, 8, 8) ), + } + + with gr.Blocks(elem_id="dumpunet"): + layer = gr.Dropdown([f"IN{i:02}" for i in range(12)] + ["M00"] + [f"OUT{i:02}" for i in range(12)], label="Layer", value="M00", elem_id="dumpunet-layer") + layer_setting_hidden = gr.HTML(json.dumps(settings), visible=False, elem_id="dumpunet-layer_setting") + + with gr.Row(): + grid_x = gr.Slider(1, 512, value=1, step=1, label="Grid X", elem_id="dumpunet-gridx") + grid_y = gr.Slider(1, 512, value=1, step=1, label="Grid Y", elem_id="dumpunet-gridy") + + steps = gr.Textbox(label="Image saving steps") + + color = gr.Checkbox(False, label="Use red/blue color map (red=POSITIVE, black=ZERO, blue=NEGATIVE)") + + with gr.Blocks(): + path_on = gr.Checkbox(False, label="Dump tensor to files") + path = gr.Textbox(label="Output path") + + layer_info = gr.HTML(elem_id="dumpunet-layerinfo") + + return [layer, grid_x, grid_y, steps, color, path_on, path] + + def run(self, + p: StableDiffusionProcessing, + layer: str, + grid_x: float, + grid_y: float, + step_input: str, + color: bool, + path_on: bool, + path: str): + + IN = [ f"IN{i:02}" for i in range(12) ] + OUT = [ f"OUT{i:02}" for i in range(12) ] + + assert p.n_iter == 1, "[DumpUnet] Batch count must be 1." + assert p.batch_size == 1, "[DumpUnet] Batch size must be 1." + assert layer is not None and layer != "", "[DumpUnet] must not be empty." + assert 1 <= grid_x, "[DumpUnet] must not be positive integer." + assert 1 <= grid_y, "[DumpUnet] must not be positive integer." + if path_on: + assert path is not None and path != "", "[DumpUnet] must not be empty." 
+ + steps : list[int]|None = [] + step_input_tokens = (step_input or "").split(",") + for token in step_input_tokens: + if token == "": + continue + m1 = re_num.fullmatch(token) + m2 = re_range.fullmatch(token) + if m1: + steps1 = eval("[" + m1.group(0) + "]") + elif m2: + n1 = eval(m2.group(1)) + n2 = eval(m2.group(2)) + n3 = eval(m2.group(3)) if m2.group(3) else 1 + steps1 = list(range(n1, n2+1, n3)) + else: + raise ValueError("[DumpUnet] Invalid input for .") + steps.extend(steps1) + + steps = list(set(steps)) + if len(steps) == 0: + steps = None # all steps + else: + steps.sort() + + grid_x = int(grid_x) + grid_y = int(grid_y) + + unet = p.sd_model.model.diffusion_model # type: ignore + + #time_embed : nn.modules.container.Sequential + #input_blocks : nn.modules.container.ModuleList + #middle_block : ldm.modules.diffusionmodules.openaimodel.TimestepEmbedSequential + #output_blocks : nn.modules.container.ModuleList + #time_embed = unet.time_embed + #input_blocks = unet.input_blocks + #middle_block = unet.middle_block + #output_blocks = unet.output_blocks + #summary(unet, (4, 512, 512)) + + # mkdir -p path + if path_on: + if os.path.exists(path): + assert os.path.isdir(path), "[DumpUnet] already exists and is not a directory." + else: + os.makedirs(path, exist_ok=True) + + target : nn.modules.Module + if layer in IN: + idx = IN.index(layer) + target = unet.input_blocks[idx] + elif layer == "M00": + target = unet.middle_block + elif layer in OUT: + idx = OUT.index(layer) + target = unet.output_blocks[idx] + else: + assert False, "[DumpUnet] Invalid value." 
+ + features = [] + current_step = [0] + def create_hook(features, name): + def forward_hook(module, inputs, outputs): + #print(f"{name}\t{inputs[0].size()}\t{outputs.size()}") + current_step[0] += 1 + if steps is None or current_step[0] in steps: + features.append({ + "steps": current_step[0], + "name": name, + "input_dims": [ x.size() for x in inputs if type(x) == Tensor ], + "output_dims": outputs.size(), + "outputs": outputs.detach().clone(), + }) + return forward_hook + + handles = [] + handles.append(target.register_forward_hook(create_hook(features, layer))) + + #for idx, mod in enumerate(input_blocks.children()): + # handles.append(mod.register_forward_hook(create_hook(features, f"IN{idx:02}"))) + # + #handles.append(middle_block.register_forward_hook(create_hook(features, "M00"))) + # + #for idx, mod in enumerate(output_blocks.children()): + # handles.append(mod.register_forward_hook(create_hook(features, f"OUT{idx:02}"))) + + t0 = int(time.time()) + try: + proc = process_images(p) + finally: + for handle in handles: + handle.remove() + + if proc: + assert len(proc.images) == 1, f"[DumpUnet] internal (#images={len(proc.images)}))" + images = [proc.images[-1]] + + for step, feature in enumerate(features, 1): + if shared.state.interrupted: + break + + tensors = feature["outputs"] + assert len(tensors.size()) == 4 + for idx in range(tensors.size()[0]): + # two same outputs??? 
+ tensor = tensors[idx] + basename = f"{layer}-{step:03}-{{ch:04}}-{t0}" + canvases = process(tensor, grid_x, grid_y, tensor.size(), color, path, basename, path_on) + images.extend(canvases) + break + + else: + images = proc.images + + N = lambda x: [x] * len(images) + return Processed( + p, + images, + seed=proc.seed, + info=proc.info, + subseed=proc.subseed, + all_seeds=N(proc.seed), + all_subseeds=N(proc.subseed), + all_prompts=N(proc.prompt), + all_negative_prompts=N(proc.negative_prompt), + infotexts=[proc.infotexts[0]] + [f"{proc.infotexts[0]}\nFeature Steps: {n}" for n in (steps or range(1, current_step[0]+1))] + ) + +def process(tensor: Tensor, + grid_x: int, + grid_y: int, + dims: tuple[int,int,int], + color: bool, + save_dir: str, + basename: str, + save_bin: bool = False + ): + # Regardless of wheather --opt-channelslast is enabled or not, + # feature.size() seems to return (batch, ch, h, w). + # Is this intended result??? + + max_ch, ih, iw = dims + width = (grid_x * (iw + 1) - 1) + height = (grid_y * (ih + 1) - 1) + + def each_slice(it: range, n: int): + cur = [] + for x in it: + cur.append(x) + if n == len(cur): + yield cur + cur = [] + if 0 < len(cur): + yield cur + + canvases = [] + color_format = "RGB" if color else "L" + + for chs in each_slice(range(max_ch), grid_x * grid_y): + chs = list(chs) + + canvas = Image.new(color_format, (width, height), 0) + for iy in range(grid_y): + if len(chs) == 0: + break + + for ix in range(grid_x): + if shared.state.interrupted: + break + + if len(chs) == 0: + break + + ch = chs.pop(0) + array = tensor[ch].cpu().numpy().astype(np.float32) + filename = basename.format(x=ix, y=iy, ch=ch) + + # create image + x = (iw+1) * ix + y = (ih+1) * iy + image = tensor_to_image(array, color) + canvas.paste(Image.fromarray(image, color_format), (x, y)) + + # save binary + if save_bin: + assert save_dir is not None + binpath = os.path.join(save_dir, filename + ".bin") + with open(binpath, "wb") as io: + 
io.write(bytearray(array)) + + canvases.append(canvas) + return canvases + +def tensor_to_image(array: np.ndarray, color: bool): + # array := (-∞, ∞) + + if color: + def colorize(v: float): + # v = -1 .. 1 を + # v < 0 のとき青 (0, 0, 1) + # v > 0 のとき赤 (1 ,0, 0) + # にする + rgb = (v if v > 0.0 else 0.0, 0.0, -v if v < 0.0 else 0.0) + return rgb + colorize2 = np.vectorize(colorize, otypes=[np.float32, np.float32, np.float32]) + rgb = colorize2(np.clip(array, -1.0, 1.0)) + return np.clip((np.dstack(rgb) * 256), 0, 255).astype(np.uint8) + + else: + return np.clip(np.abs(array) * 256, 0, 255).astype(np.uint8) diff --git a/scripts/networks.txt b/scripts/networks.txt new file mode 100644 index 0000000..fd2dffd --- /dev/null +++ b/scripts/networks.txt @@ -0,0 +1,32 @@ +M00 := (1280, ceil(H/64), ceil(H/64)) + +------------------------------------------------------------------ +512x512 +------------------------------------------------------------------ +name modules input size (ch,h,w) output size (ch,h,w) +------------------------------------------------------------------ +IN00 Conv2D ( 4, 64, 64) ( 320, 64, 64) +IN01 Res+Trans ( 320, 64, 64) ( 320, 64, 64) +IN02 Res+Trans ( 320, 64, 64) ( 320, 64, 64) +IN03 Down ( 320, 64, 64) ( 320, 32, 32) +IN04 Res+Trans ( 320, 32, 32) ( 640, 32, 32) +IN05 Res+Trans ( 640, 32, 32) ( 640, 32, 32) +IN06 Down ( 640, 32, 32) ( 640, 16, 16) +IN07 Res+Trans ( 640, 16, 16) (1280, 16, 16) +IN08 Res+Trans (1280, 16, 16) (1280, 16, 16) +IN09 Down (1280, 16, 16) (1280, 8, 8) +IN10 Res (1280, 8, 8) (1280, 8, 8) +IN11 Res (1280, 8, 8) (1280, 8, 8) +M00 Res+Trans+Res (1280, 8, 8) (1280, 8, 8) +OUT00 ⊕IN11/Res (2560, 8, 8) (1280, 8, 8) +OUT01 ⊕IN10/Res (2560, 8, 8) (1280, 8, 8) +OUT02 ⊕IN09/Res+Up (2560, 8, 8) (1280, 16, 16) +OUT03 ⊕IN08/Res+Trans (2560, 16, 16) (1280, 16, 16) +OUT04 ⊕IN07/Res+Trans (2560, 16, 16) (1280, 16, 16) +OUT05 ⊕IN06/Res+Trans+Up (1920, 16, 16) (1280, 32, 32) +OUT06 ⊕IN05/Res+Trans (1920, 32, 32) ( 640, 32, 32) +OUT07 
⊕IN04/Res+Trans (1280, 32, 32) ( 640, 32, 32) +OUT08 ⊕IN03/Res+Trans+Up ( 960, 32, 32) ( 640, 64, 64) +OUT09 ⊕IN02/Res+Trans ( 960, 64, 64) ( 320, 64, 64) +OUT10 ⊕IN01/Res+Trans ( 640, 64, 64) ( 320, 64, 64) +OUT11 ⊕IN00/Res+Trans ( 640, 64, 64) ( 320, 64, 64) diff --git a/scripts/p.sd_model.model.txt b/scripts/p.sd_model.model.txt new file mode 100644 index 0000000..f14ef5c --- /dev/null +++ b/scripts/p.sd_model.model.txt @@ -0,0 +1,1151 @@ +DiffusionWrapper( + (diffusion_model): UNetModel( + (time_embed): Sequential( + (0): Linear(in_features=320, out_features=1280, bias=True) + (1): SiLU() + (2): Linear(in_features=1280, out_features=1280, bias=True) + ) + (input_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): 
FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, 
inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): 
Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, 
out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, 
eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): 
Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + 
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + ) + (middle_block): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, 
out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (output_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, 
eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): 
Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 
1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + 
) + (2): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), 
eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): 
LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, 
out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): 
Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + 
(to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): 
Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + (out): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) +) \ No newline at end of file diff --git a/scripts/p.sd_model.txt b/scripts/p.sd_model.txt new file mode 100644 index 0000000..2512bb6 --- /dev/null +++ b/scripts/p.sd_model.txt @@ -0,0 +1,1619 @@ +LatentDiffusion( + (model): DiffusionWrapper( + (diffusion_model): UNetModel( + (time_embed): Sequential( + (0): Linear(in_features=320, out_features=1280, bias=True) + (1): SiLU() + (2): Linear(in_features=1280, out_features=1280, bias=True) + ) + (input_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, 
bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): 
Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, 
affine=True) + (1): SiLU() + (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): 
TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): 
Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + ) + (middle_block): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 
1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, 
inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (output_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, 
inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + 
(0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, 
out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): 
CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): 
GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): 
Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): 
Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, 
out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): 
BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, 
kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + (out): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (first_stage_model): AutoencoderKL( + (encoder): Encoder( + (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (down): ModuleList( + (0): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (1): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (2): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 
512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (3): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + ) + ) + (mid): Module( + (block_1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (attn_1): MemoryEfficientAttnBlock( + (norm): GroupNorm(32, 512, eps=1e-06, affine=True) + (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (block_2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (decoder): Decoder( + (conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (mid): Module( + (block_1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (attn_1): MemoryEfficientAttnBlock( + (norm): GroupNorm(32, 512, eps=1e-06, affine=True) + (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (block_2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (up): ModuleList( + (0): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, 
affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + ) + (1): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (upsample): Upsample( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (2): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + 
(dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (upsample): Upsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + 
(upsample): Upsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (loss): Identity() + (quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1)) + (post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1)) + ) + (cond_stage_model): FrozenCLIPEmbedderWithCustomWords( + (wrapped): FrozenCLIPEmbedder( + (transformer): CLIPTextModel( + (text_model): CLIPTextTransformer( + (embeddings): CLIPTextEmbeddings( + (token_embedding): EmbeddingsWithFixes( + (wrapped): Embedding(49408, 768) + ) + (position_embedding): Embedding(77, 768) + ) + (encoder): CLIPEncoder( + (layers): ModuleList( + (0): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (1): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, 
out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (2): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (3): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (4): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + 
) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (5): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (6): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (7): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): 
LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (8): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (9): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (10): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True) + ) + (11): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + ) +) \ No newline at end of file diff --git a/style.css b/style.css new file mode 100644 index 0000000..183adbe --- /dev/null +++ b/style.css @@ -0,0 +1,3 @@ +#dumpunet-layerinfo { + font-family: monospace; +} \ No newline at end of file