stable-diffusion-webui-dump.../scripts/p.sd_model.txt

1619 lines
75 KiB
Plaintext

LatentDiffusion(
(model): DiffusionWrapper(
(diffusion_model): UNetModel(
(time_embed): Sequential(
(0): Linear(in_features=320, out_features=1280, bias=True)
(1): SiLU()
(2): Linear(in_features=1280, out_features=1280, bias=True)
)
(input_blocks): ModuleList(
(0): TimestepEmbedSequential(
(0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(2): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(3): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(4): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(6): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(7): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(8): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(9): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(10): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
(11): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
)
(middle_block): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(2): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
(output_blocks): ModuleList(
(0): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(1): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(2): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): Upsample(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(3): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(4): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(2): Upsample(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(6): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(7): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(8): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
(2): Upsample(
(conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(9): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(10): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(11): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
)
(out): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(first_stage_model): AutoencoderKL(
(encoder): Encoder(
(conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(down): ModuleList(
(0): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
)
)
(1): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))
)
)
(2): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2))
)
)
(3): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
)
)
(mid): Module(
(block_1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(attn_1): MemoryEfficientAttnBlock(
(norm): GroupNorm(32, 512, eps=1e-06, affine=True)
(q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
)
(block_2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(decoder): Decoder(
(conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(mid): Module(
(block_1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(attn_1): MemoryEfficientAttnBlock(
(norm): GroupNorm(32, 512, eps=1e-06, affine=True)
(q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
)
(block_2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(up): ModuleList(
(0): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
)
(1): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(upsample): Upsample(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(2): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(upsample): Upsample(
(conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(3): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(upsample): Upsample(
(conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(loss): Identity()
(quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1))
(post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1))
)
(cond_stage_model): FrozenCLIPEmbedderWithCustomWords(
(wrapped): FrozenCLIPEmbedder(
(transformer): CLIPTextModel(
(text_model): CLIPTextTransformer(
(embeddings): CLIPTextEmbeddings(
(token_embedding): EmbeddingsWithFixes(
(wrapped): Embedding(49408, 768)
)
(position_embedding): Embedding(77, 768)
)
(encoder): CLIPEncoder(
(layers): ModuleList(
(0): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
)
)