commit 571798cb1d3ec470f964f8a8e5ea515f3ea75b9a Author: hnmr293 Date: Mon Jan 2 20:40:08 2023 +0900 add implementation of U-net dumping diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a065928 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +images/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2a3c197 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +MIT Licence diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/javascript/size.js b/javascript/size.js new file mode 100644 index 0000000..087843d --- /dev/null +++ b/javascript/size.js @@ -0,0 +1,86 @@ +onUiUpdate(() => { + if (globalThis.DumpUnet) return; + const DumpUnet = {}; + globalThis.DumpUnet = DumpUnet; + + DumpUnet.applySizeCallback = function () { + if (globalThis.DumpUnet.applySizeCallbackCalled) return; + + const app = gradioApp(); + if (!app || app === document) return; + + const labels = Array.of(...app.querySelectorAll('#tab_txt2img label')); + const width_label = labels.find(x => x.textContent.trim() === "Width"); + const height_label = labels.find(x => x.textContent.trim() === "Height"); + const steps_label = labels.find(x => x.textContent.trim() === "Sampling Steps"); + if (!width_label || !height_label || !steps_label) return; + + const width_slider = app.querySelector(`#${width_label.htmlFor}`); + const height_slider = app.querySelector(`#${height_label.htmlFor}`); + const steps_slider = app.querySelector(`#${steps_label.htmlFor}`) + if (!width_slider || !height_slider || !steps_slider) return; + //if (+width_slider.dataset.dumpunetHooked && +height_slider.dataset.dumpunetHooked) return + // + //const value_hook = ele => { + // const proto = Object.getPrototypeOf(ele); + // const old_desc = Object.getOwnPropertyDescriptor(proto, 'value'); + // Object.defineProperty(ele, 'value', { + // get: function () { return old_desc.get.apply(this, arguments); }, + // set: function () { + // const old_value = 
this.value; + // old_desc.set.apply(this, arguments); + // const new_value = this.value; + // const ev = new CustomEvent('imagesizesliderchange', { detail: { old_value: old_value }, bubbles: true }); + // ele.dispatchEvent(ev); + // } + // }); + // ele.dataset.dumpunetHooked = 1; + //}; + // + //value_hook(width_slider); + //value_hook(height_slider); + + globalThis.DumpUnet.applySizeCallbackCalled = true; + + const update_info = () => { + const layer = app.querySelector('#dumpunet-layer select').value; + const info = JSON.parse(app.querySelector('#dumpunet-layer_setting').textContent)[layer]; + const + w = +width_slider.value, + h = +height_slider.value, + steps = +steps_slider.value, + iw = Math.max(1, Math.ceil(w / 64)), + ih = Math.max(1, Math.ceil(h / 64)), + ch = info[1][0], + nx = +app.querySelector('#dumpunet-gridx input').value, + ny = +app.querySelector('#dumpunet-gridy input').value, + n = Math.ceil(ch / (nx * ny)); + info[0][1] *= ih; + info[0][2] *= iw; + info[1][1] *= ih; + info[1][2] *= iw; + app.querySelector('#dumpunet-layerinfo').innerHTML = ` +[Layer Info]
+Name:   ${layer}
+Input:  (${info[0].join(',')})
+Output: (${info[1].join(',')})&#10;
+[Output Images Info]
+N:      ${n} x ${steps}steps
+Width:  ${nx == 1 ? info[1][2] : ((info[1][2] + 1) * nx - 1)}px
+Height: ${ny == 1 ? info[1][1] : ((info[1][1] + 1) * ny - 1)}px
+`.trim(); + }; + + //app.addEventListener('imagesizesliderchange', e => { + // //console.log(e.detail.old_value, e.target.value); + // update_info(); + //}, false); + + app.addEventListener('input', update_info, false); + app.addEventListener('change', update_info, false); + + update_info(); + }; + + onUiUpdate(DumpUnet.applySizeCallback); +}); diff --git a/scripts/dumpunet.py b/scripts/dumpunet.py new file mode 100644 index 0000000..ecd6c38 --- /dev/null +++ b/scripts/dumpunet.py @@ -0,0 +1,304 @@ +import os +import time +import json +import re +import numpy as np +from torch import nn, Tensor +import gradio as gr +from PIL import Image + +import modules.scripts as scripts +from modules.processing import process_images, Processed, StableDiffusionProcessing +from modules import shared + +re_num = re.compile(r"^\s*\+?\s*\d+\s*$") +re_range = re.compile(r"^\s*(\+?\s*\d+)\s*-\s*(\+?\s*\d+)\s*(?:\(\s*\+?\s*(\d+)\s*\))?\s*$") + +class Script(scripts.Script): + + def title(self): + return "Dump U-net features" + + def show(self, is_img2img): + return not is_img2img + + def ui(self, is_img2img): + settings = { + # input shape output shape + "IN00": ( ( 4, 8, 8), ( 320, 8, 8) ), + "IN01": ( ( 320, 8, 8), ( 320, 8, 8) ), + "IN02": ( ( 320, 8, 8), ( 320, 8, 8) ), + "IN03": ( ( 320, 8, 8), ( 320, 4, 4) ), + "IN04": ( ( 320, 4, 4), ( 640, 4, 4) ), + "IN05": ( ( 640, 4, 4), ( 640, 4, 4) ), + "IN06": ( ( 640, 4, 4), ( 640, 2, 2) ), + "IN07": ( ( 640, 2, 2), (1280, 2, 2) ), + "IN08": ( (1280, 2, 2), (1280, 2, 2) ), + "IN09": ( (1280, 2, 2), (1280, 1, 1) ), + "IN10": ( (1280, 1, 1), (1280, 1, 1) ), + "IN11": ( (1280, 1, 1), (1280, 1, 1) ), + "M00": ( (1280, 1, 1), (1280, 1, 1) ), + "OUT00": ( (2560, 1, 1), (1280, 1, 1) ), + "OUT01": ( (2560, 1, 1), (1280, 1, 1) ), + "OUT02": ( (2560, 1, 1), (1280, 2, 2) ), + "OUT03": ( (2560, 2, 2), (1280, 2, 2) ), + "OUT04": ( (2560, 2, 2), (1280, 2, 2) ), + "OUT05": ( (1920, 2, 2), (1280, 4, 4) ), + "OUT06": ( (1920, 4, 4), ( 640, 4, 4) ), + 
"OUT07": ( (1280, 4, 4), ( 640, 4, 4) ), + "OUT08": ( ( 960, 4, 4), ( 640, 8, 8) ), + "OUT09": ( ( 960, 8, 8), ( 320, 8, 8) ), + "OUT10": ( ( 640, 8, 8), ( 320, 8, 8) ), + "OUT11": ( ( 640, 8, 8), ( 320, 8, 8) ), + } + + with gr.Blocks(elem_id="dumpunet"): + layer = gr.Dropdown([f"IN{i:02}" for i in range(12)] + ["M00"] + [f"OUT{i:02}" for i in range(12)], label="Layer", value="M00", elem_id="dumpunet-layer") + layer_setting_hidden = gr.HTML(json.dumps(settings), visible=False, elem_id="dumpunet-layer_setting") + + with gr.Row(): + grid_x = gr.Slider(1, 512, value=1, step=1, label="Grid X", elem_id="dumpunet-gridx") + grid_y = gr.Slider(1, 512, value=1, step=1, label="Grid Y", elem_id="dumpunet-gridy") + + steps = gr.Textbox(label="Image saving steps") + + color = gr.Checkbox(False, label="Use red/blue color map (red=POSITIVE, black=ZERO, blue=NEGATIVE)") + + with gr.Blocks(): + path_on = gr.Checkbox(False, label="Dump tensor to files") + path = gr.Textbox(label="Output path") + + layer_info = gr.HTML(elem_id="dumpunet-layerinfo") + + return [layer, grid_x, grid_y, steps, color, path_on, path] + + def run(self, + p: StableDiffusionProcessing, + layer: str, + grid_x: float, + grid_y: float, + step_input: str, + color: bool, + path_on: bool, + path: str): + + IN = [ f"IN{i:02}" for i in range(12) ] + OUT = [ f"OUT{i:02}" for i in range(12) ] + + assert p.n_iter == 1, "[DumpUnet] Batch count must be 1." + assert p.batch_size == 1, "[DumpUnet] Batch size must be 1." + assert layer is not None and layer != "", "[DumpUnet] must not be empty." + assert 1 <= grid_x, "[DumpUnet] must not be positive integer." + assert 1 <= grid_y, "[DumpUnet] must not be positive integer." + if path_on: + assert path is not None and path != "", "[DumpUnet] must not be empty." 
+ + steps : list[int]|None = [] + step_input_tokens = (step_input or "").split(",") + for token in step_input_tokens: + if token == "": + continue + m1 = re_num.fullmatch(token) + m2 = re_range.fullmatch(token) + if m1: + steps1 = eval("[" + m1.group(0) + "]") + elif m2: + n1 = eval(m2.group(1)) + n2 = eval(m2.group(2)) + n3 = eval(m2.group(3)) if m2.group(3) else 1 + steps1 = list(range(n1, n2+1, n3)) + else: + raise ValueError("[DumpUnet] Invalid input for .") + steps.extend(steps1) + + steps = list(set(steps)) + if len(steps) == 0: + steps = None # all steps + else: + steps.sort() + + grid_x = int(grid_x) + grid_y = int(grid_y) + + unet = p.sd_model.model.diffusion_model # type: ignore + + #time_embed : nn.modules.container.Sequential + #input_blocks : nn.modules.container.ModuleList + #middle_block : ldm.modules.diffusionmodules.openaimodel.TimestepEmbedSequential + #output_blocks : nn.modules.container.ModuleList + #time_embed = unet.time_embed + #input_blocks = unet.input_blocks + #middle_block = unet.middle_block + #output_blocks = unet.output_blocks + #summary(unet, (4, 512, 512)) + + # mkdir -p path + if path_on: + if os.path.exists(path): + assert os.path.isdir(path), "[DumpUnet] already exists and is not a directory." + else: + os.makedirs(path, exist_ok=True) + + target : nn.modules.Module + if layer in IN: + idx = IN.index(layer) + target = unet.input_blocks[idx] + elif layer == "M00": + target = unet.middle_block + elif layer in OUT: + idx = OUT.index(layer) + target = unet.output_blocks[idx] + else: + assert False, "[DumpUnet] Invalid value." 
+ + features = [] + current_step = [0] + def create_hook(features, name): + def forward_hook(module, inputs, outputs): + #print(f"{name}\t{inputs[0].size()}\t{outputs.size()}") + current_step[0] += 1 + if steps is None or current_step[0] in steps: + features.append({ + "steps": current_step[0], + "name": name, + "input_dims": [ x.size() for x in inputs if type(x) == Tensor ], + "output_dims": outputs.size(), + "outputs": outputs.detach().clone(), + }) + return forward_hook + + handles = [] + handles.append(target.register_forward_hook(create_hook(features, layer))) + + #for idx, mod in enumerate(input_blocks.children()): + # handles.append(mod.register_forward_hook(create_hook(features, f"IN{idx:02}"))) + # + #handles.append(middle_block.register_forward_hook(create_hook(features, "M00"))) + # + #for idx, mod in enumerate(output_blocks.children()): + # handles.append(mod.register_forward_hook(create_hook(features, f"OUT{idx:02}"))) + + t0 = int(time.time()) + try: + proc = process_images(p) + finally: + for handle in handles: + handle.remove() + + if proc: + assert len(proc.images) == 1, f"[DumpUnet] internal (#images={len(proc.images)}))" + images = [proc.images[-1]] + + for step, feature in enumerate(features, 1): + if shared.state.interrupted: + break + + tensors = feature["outputs"] + assert len(tensors.size()) == 4 + for idx in range(tensors.size()[0]): + # two same outputs??? 
+ tensor = tensors[idx] + basename = f"{layer}-{step:03}-{{ch:04}}-{t0}" + canvases = process(tensor, grid_x, grid_y, tensor.size(), color, path, basename, path_on) + images.extend(canvases) + break + + else: + images = proc.images + + N = lambda x: [x] * len(images) + return Processed( + p, + images, + seed=proc.seed, + info=proc.info, + subseed=proc.subseed, + all_seeds=N(proc.seed), + all_subseeds=N(proc.subseed), + all_prompts=N(proc.prompt), + all_negative_prompts=N(proc.negative_prompt), + infotexts=[proc.infotexts[0]] + [f"{proc.infotexts[0]}\nFeature Steps: {n}" for n in (steps or range(1, current_step[0]+1))] + ) + +def process(tensor: Tensor, + grid_x: int, + grid_y: int, + dims: tuple[int,int,int], + color: bool, + save_dir: str, + basename: str, + save_bin: bool = False + ): + # Regardless of wheather --opt-channelslast is enabled or not, + # feature.size() seems to return (batch, ch, h, w). + # Is this intended result??? + + max_ch, ih, iw = dims + width = (grid_x * (iw + 1) - 1) + height = (grid_y * (ih + 1) - 1) + + def each_slice(it: range, n: int): + cur = [] + for x in it: + cur.append(x) + if n == len(cur): + yield cur + cur = [] + if 0 < len(cur): + yield cur + + canvases = [] + color_format = "RGB" if color else "L" + + for chs in each_slice(range(max_ch), grid_x * grid_y): + chs = list(chs) + + canvas = Image.new(color_format, (width, height), 0) + for iy in range(grid_y): + if len(chs) == 0: + break + + for ix in range(grid_x): + if shared.state.interrupted: + break + + if len(chs) == 0: + break + + ch = chs.pop(0) + array = tensor[ch].cpu().numpy().astype(np.float32) + filename = basename.format(x=ix, y=iy, ch=ch) + + # create image + x = (iw+1) * ix + y = (ih+1) * iy + image = tensor_to_image(array, color) + canvas.paste(Image.fromarray(image, color_format), (x, y)) + + # save binary + if save_bin: + assert save_dir is not None + binpath = os.path.join(save_dir, filename + ".bin") + with open(binpath, "wb") as io: + 
io.write(bytearray(array)) + + canvases.append(canvas) + return canvases + +def tensor_to_image(array: np.ndarray, color: bool): + # array := (-∞, ∞) + + if color: + def colorize(v: float): + # v = -1 .. 1 を + # v < 0 のとき青 (0, 0, 1) + # v > 0 のとき赤 (1 ,0, 0) + # にする + rgb = (v if v > 0.0 else 0.0, 0.0, -v if v < 0.0 else 0.0) + return rgb + colorize2 = np.vectorize(colorize, otypes=[np.float32, np.float32, np.float32]) + rgb = colorize2(np.clip(array, -1.0, 1.0)) + return np.clip((np.dstack(rgb) * 256), 0, 255).astype(np.uint8) + + else: + return np.clip(np.abs(array) * 256, 0, 255).astype(np.uint8) diff --git a/scripts/networks.txt b/scripts/networks.txt new file mode 100644 index 0000000..fd2dffd --- /dev/null +++ b/scripts/networks.txt @@ -0,0 +1,32 @@ +M00 := (1280, ceil(H/64), ceil(H/64)) + +------------------------------------------------------------------ +512x512 +------------------------------------------------------------------ +name modules input size (ch,h,w) output size (ch,h,w) +------------------------------------------------------------------ +IN00 Conv2D ( 4, 64, 64) ( 320, 64, 64) +IN01 Res+Trans ( 320, 64, 64) ( 320, 64, 64) +IN02 Res+Trans ( 320, 64, 64) ( 320, 64, 64) +IN03 Down ( 320, 64, 64) ( 320, 32, 32) +IN04 Res+Trans ( 320, 32, 32) ( 640, 32, 32) +IN05 Res+Trans ( 640, 32, 32) ( 640, 32, 32) +IN06 Down ( 640, 32, 32) ( 640, 16, 16) +IN07 Res+Trans ( 640, 16, 16) (1280, 16, 16) +IN08 Res+Trans (1280, 16, 16) (1280, 16, 16) +IN09 Down (1280, 16, 16) (1280, 8, 8) +IN10 Res (1280, 8, 8) (1280, 8, 8) +IN11 Res (1280, 8, 8) (1280, 8, 8) +M00 Res+Trans+Res (1280, 8, 8) (1280, 8, 8) +OUT00 ⊕IN11/Res (2560, 8, 8) (1280, 8, 8) +OUT01 ⊕IN10/Res (2560, 8, 8) (1280, 8, 8) +OUT02 ⊕IN09/Res+Up (2560, 8, 8) (1280, 16, 16) +OUT03 ⊕IN08/Res+Trans (2560, 16, 16) (1280, 16, 16) +OUT04 ⊕IN07/Res+Trans (2560, 16, 16) (1280, 16, 16) +OUT05 ⊕IN06/Res+Trans+Up (1920, 16, 16) (1280, 32, 32) +OUT06 ⊕IN05/Res+Trans (1920, 32, 32) ( 640, 32, 32) +OUT07 
⊕IN04/Res+Trans (1280, 32, 32) ( 640, 32, 32) +OUT08 ⊕IN03/Res+Trans+Up ( 960, 32, 32) ( 640, 64, 64) +OUT09 ⊕IN02/Res+Trans ( 960, 64, 64) ( 320, 64, 64) +OUT10 ⊕IN01/Res+Trans ( 640, 64, 64) ( 320, 64, 64) +OUT11 ⊕IN00/Res+Trans ( 640, 64, 64) ( 320, 64, 64) diff --git a/scripts/p.sd_model.model.txt b/scripts/p.sd_model.model.txt new file mode 100644 index 0000000..f14ef5c --- /dev/null +++ b/scripts/p.sd_model.model.txt @@ -0,0 +1,1151 @@ +DiffusionWrapper( + (diffusion_model): UNetModel( + (time_embed): Sequential( + (0): Linear(in_features=320, out_features=1280, bias=True) + (1): SiLU() + (2): Linear(in_features=1280, out_features=1280, bias=True) + ) + (input_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): 
FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, 
inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): 
Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, 
out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, 
eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): 
Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + 
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + ) + (middle_block): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, 
out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (output_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, 
eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): 
Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 
1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + 
) + (2): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), 
eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): 
LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, 
out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): 
Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + 
(to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): 
Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + (out): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) +) \ No newline at end of file diff --git a/scripts/p.sd_model.txt b/scripts/p.sd_model.txt new file mode 100644 index 0000000..2512bb6 --- /dev/null +++ b/scripts/p.sd_model.txt @@ -0,0 +1,1619 @@ +LatentDiffusion( + (model): DiffusionWrapper( + (diffusion_model): UNetModel( + (time_embed): Sequential( + (0): Linear(in_features=320, out_features=1280, bias=True) + (1): SiLU() + (2): Linear(in_features=1280, out_features=1280, bias=True) + ) + (input_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, 
bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): 
Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, 
affine=True) + (1): SiLU() + (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): 
TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): 
Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): Downsample( + (op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + ) + (middle_block): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 
1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, 
inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Identity() + ) + ) + (output_blocks): ModuleList( + (0): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (1): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (2): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, 
inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + 
(0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (4): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, 
out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (5): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=1280, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) + (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=1280, out_features=1280, bias=False) + (to_v): Linear(in_features=1280, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=1280, out_features=10240, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=5120, out_features=1280, bias=True) + ) + ) + (attn2): 
CrossAttention( + (to_q): Linear(in_features=1280, out_features=1280, bias=False) + (to_k): Linear(in_features=768, out_features=1280, bias=False) + (to_v): Linear(in_features=768, out_features=1280, bias=False) + (to_out): Sequential( + (0): Linear(in_features=1280, out_features=1280, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (6): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): 
GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (7): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): 
Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (8): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=640, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 640, eps=1e-06, affine=True) + (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=640, out_features=640, bias=False) + (to_v): Linear(in_features=640, out_features=640, bias=False) + (to_out): 
Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=640, out_features=5120, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=2560, out_features=640, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=640, out_features=640, bias=False) + (to_k): Linear(in_features=768, out_features=640, bias=False) + (to_v): Linear(in_features=768, out_features=640, bias=False) + (to_out): Sequential( + (0): Linear(in_features=640, out_features=640, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): Upsample( + (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (9): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 960, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, 
out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (10): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): 
BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (11): TimestepEmbedSequential( + (0): ResBlock( + (in_layers): Sequential( + (0): GroupNorm32(32, 640, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (h_upd): Identity() + (x_upd): Identity() + (emb_layers): Sequential( + (0): SiLU() + (1): Linear(in_features=1280, out_features=320, bias=True) + ) + (out_layers): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Dropout(p=0, inplace=False) + (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): SpatialTransformer( + (norm): GroupNorm(32, 320, eps=1e-06, affine=True) + (proj_in): Conv2d(320, 320, 
kernel_size=(1, 1), stride=(1, 1)) + (transformer_blocks): ModuleList( + (0): BasicTransformerBlock( + (attn1): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=320, out_features=320, bias=False) + (to_v): Linear(in_features=320, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (ff): FeedForward( + (net): Sequential( + (0): GEGLU( + (proj): Linear(in_features=320, out_features=2560, bias=True) + ) + (1): Dropout(p=0.0, inplace=False) + (2): Linear(in_features=1280, out_features=320, bias=True) + ) + ) + (attn2): CrossAttention( + (to_q): Linear(in_features=320, out_features=320, bias=False) + (to_k): Linear(in_features=768, out_features=320, bias=False) + (to_v): Linear(in_features=768, out_features=320, bias=False) + (to_out): Sequential( + (0): Linear(in_features=320, out_features=320, bias=True) + (1): Dropout(p=0.0, inplace=False) + ) + ) + (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) + ) + ) + (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + (out): Sequential( + (0): GroupNorm32(32, 320, eps=1e-05, affine=True) + (1): SiLU() + (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (first_stage_model): AutoencoderKL( + (encoder): Encoder( + (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (down): ModuleList( + (0): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (1): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (2): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 
512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): Downsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (3): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + ) + ) + (mid): Module( + (block_1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (attn_1): MemoryEfficientAttnBlock( + (norm): GroupNorm(32, 512, eps=1e-06, affine=True) + (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (block_2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (decoder): Decoder( + (conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (mid): Module( + (block_1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (attn_1): MemoryEfficientAttnBlock( + (norm): GroupNorm(32, 512, eps=1e-06, affine=True) + (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (block_2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (up): ModuleList( + (0): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, 
affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + ) + (1): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (upsample): Upsample( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (2): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + 
(dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (upsample): Upsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (3): Module( + (block): ModuleList( + (0): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (1): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (2): ResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + 
(upsample): Upsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True) + (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (loss): Identity() + (quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1)) + (post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1)) + ) + (cond_stage_model): FrozenCLIPEmbedderWithCustomWords( + (wrapped): FrozenCLIPEmbedder( + (transformer): CLIPTextModel( + (text_model): CLIPTextTransformer( + (embeddings): CLIPTextEmbeddings( + (token_embedding): EmbeddingsWithFixes( + (wrapped): Embedding(49408, 768) + ) + (position_embedding): Embedding(77, 768) + ) + (encoder): CLIPEncoder( + (layers): ModuleList( + (0): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (1): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, 
out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (2): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (3): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (4): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + 
) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (5): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (6): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (7): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): 
LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (8): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (9): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + (10): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True) + ) + (11): CLIPEncoderLayer( + (self_attn): CLIPAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + (mlp): CLIPMLP( + (activation_fn): QuickGELUActivation() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) + ) + ) + ) + ) +) \ No newline at end of file diff --git a/style.css b/style.css new file mode 100644 index 0000000..183adbe --- /dev/null +++ b/style.css @@ -0,0 +1,3 @@ +#dumpunet-layerinfo { + font-family: monospace; +} \ No newline at end of file