1151 lines
49 KiB
Plaintext
1151 lines
49 KiB
Plaintext
DiffusionWrapper(
|
|
(diffusion_model): UNetModel(
|
|
(time_embed): Sequential(
|
|
(0): Linear(in_features=320, out_features=1280, bias=True)
|
|
(1): SiLU()
|
|
(2): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(input_blocks): ModuleList(
|
|
(0): TimestepEmbedSequential(
|
|
(0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(1): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=320, out_features=2560, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(2): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=320, out_features=2560, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(3): TimestepEmbedSequential(
|
|
(0): Downsample(
|
|
(op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
|
|
)
|
|
)
|
|
(4): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=640, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=640, out_features=5120, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=2560, out_features=640, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(5): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=640, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=640, out_features=5120, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=2560, out_features=640, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(6): TimestepEmbedSequential(
|
|
(0): Downsample(
|
|
(op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
|
|
)
|
|
)
|
|
(7): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(8): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(9): TimestepEmbedSequential(
|
|
(0): Downsample(
|
|
(op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
|
|
)
|
|
)
|
|
(10): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
)
|
|
(11): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
)
|
|
)
|
|
(middle_block): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(2): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Identity()
|
|
)
|
|
)
|
|
(output_blocks): ModuleList(
|
|
(0): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(1): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(2): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): Upsample(
|
|
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
)
|
|
(3): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(4): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(5): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=1280, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=1280, out_features=10240, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=5120, out_features=1280, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=1280, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=1280, out_features=1280, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(2): Upsample(
|
|
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
)
|
|
(6): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=640, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=640, out_features=5120, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=2560, out_features=640, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(7): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=640, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=640, out_features=5120, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=2560, out_features=640, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(8): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=640, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=640, out_features=5120, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=2560, out_features=640, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=640, out_features=640, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=640, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=640, out_features=640, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(2): Upsample(
|
|
(conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
)
|
|
(9): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=320, out_features=2560, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(10): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=320, out_features=2560, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
(11): TimestepEmbedSequential(
|
|
(0): ResBlock(
|
|
(in_layers): Sequential(
|
|
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(h_upd): Identity()
|
|
(x_upd): Identity()
|
|
(emb_layers): Sequential(
|
|
(0): SiLU()
|
|
(1): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
(out_layers): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Dropout(p=0, inplace=False)
|
|
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
(skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
(1): SpatialTransformer(
|
|
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
|
|
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
(transformer_blocks): ModuleList(
|
|
(0): BasicTransformerBlock(
|
|
(attn1): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(ff): FeedForward(
|
|
(net): Sequential(
|
|
(0): GEGLU(
|
|
(proj): Linear(in_features=320, out_features=2560, bias=True)
|
|
)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
(2): Linear(in_features=1280, out_features=320, bias=True)
|
|
)
|
|
)
|
|
(attn2): CrossAttention(
|
|
(to_q): Linear(in_features=320, out_features=320, bias=False)
|
|
(to_k): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_v): Linear(in_features=768, out_features=320, bias=False)
|
|
(to_out): Sequential(
|
|
(0): Linear(in_features=320, out_features=320, bias=True)
|
|
(1): Dropout(p=0.0, inplace=False)
|
|
)
|
|
)
|
|
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
|
|
)
|
|
)
|
|
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
|
|
)
|
|
)
|
|
)
|
|
(out): Sequential(
|
|
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
|
|
(1): SiLU()
|
|
(2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
|
)
|
|
)
|
|
) |