add Attention across Time after all (spatial) linear attention
lucidrains committed Apr 13, 2022
1 parent 174a896 commit 0d8f175
Showing 3 changed files with 15 additions and 10 deletions.
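
For orientation, the diff below interleaves attention over two different axes of a video tensor shaped (batch, channels, frames, height, width). The two einops patterns it uses describe which axis each attention variant mixes; the toy rearrangement below is not part of the commit (tensor sizes are arbitrary), it only makes the difference concrete.

```python
import torch
from einops import rearrange

video = torch.randn(2, 32, 5, 16, 16)  # (b, c, f, h, w): 5 frames of 16x16 feature maps

# spatial attention: tokens are the h*w positions within a single frame
spatial_tokens = rearrange(video, 'b c f h w -> b f (h w) c')
print(spatial_tokens.shape)  # (2, 5, 256, 32): attention mixes the 256 positions per frame

# temporal attention (what this commit adds throughout the Unet):
# tokens are the f frames observed at one spatial position
temporal_tokens = rearrange(video, 'b c f h w -> b (h w) f c')
print(temporal_tokens.shape)  # (2, 256, 5, 32): attention mixes the 5 frames per position
```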
README.md: 1 change (1 addition & 0 deletions)
@@ -80,6 +80,7 @@ sampled_videos.shape # (2, 3, 5, 32, 32)
 - [ ] find a good torchvideo-like library (torchvideo seems immature) for training on fireworks
 - [ ] consider doing a 3d version of CLIP, so one can eventually apply the lessons of DALL-E2 to video
 - [ ] add a forward keyword argument that arrests attention across time (as reported / claimed in the paper, this type of image + video simultaneous training improves results)
+- [ ] project text into 4-8 tokens, and use them as memory key / values to condition both time and space in attention blocks
 
 ## Citations
 
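The newly added todo item proposes projecting text into a handful of tokens and letting every space and time attention block attend to them as extra key / values. Nothing of this exists in the repository yet; the sketch below is a hypothetical illustration of the technique, and every name and dimension in it (AttentionWithTextMemory, text_dim, num_text_tokens, and so on) is made up. The design point is that the text memory joins only the keys and values, so the video tokens remain the queries and the output keeps its shape.

```python
import torch
from torch import nn, einsum
from einops import rearrange

class AttentionWithTextMemory(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 32, text_dim = 512, num_text_tokens = 4):
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.dim = dim
        self.num_text_tokens = num_text_tokens
        inner = heads * dim_head
        self.to_q = nn.Linear(dim, inner, bias = False)
        self.to_kv = nn.Linear(dim, inner * 2, bias = False)
        self.to_out = nn.Linear(inner, dim)
        # project a pooled text embedding into a few memory tokens
        self.text_to_tokens = nn.Linear(text_dim, num_text_tokens * dim)

    def forward(self, x, text_emb):
        # x: (batch, n, dim) tokens (spatial or temporal), text_emb: (batch, text_dim)
        mem = self.text_to_tokens(text_emb).view(x.shape[0], self.num_text_tokens, self.dim)
        kv_input = torch.cat((mem, x), dim = 1)  # text tokens join the keys / values only

        q = self.to_q(x)
        k, v = self.to_kv(kv_input).chunk(2, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), (q, k, v))

        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = sim.softmax(dim = -1)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

# usage with made-up shapes
attn = AttentionWithTextMemory(dim = 64)
tokens = torch.randn(2, 256, 64)  # e.g. the (h w) tokens of one frame
text = torch.randn(2, 512)        # a pooled text embedding
assert attn(tokens, text).shape == tokens.shape
```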
setup.py: 2 changes (1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 setup(
   name = 'video-diffusion-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.4',
+  version = '0.0.6',
   license='MIT',
   description = 'Video Diffusion - Pytorch',
   author = 'Phil Wang',
video_diffusion_pytorch/video_diffusion_pytorch.py: 22 changes (13 additions & 9 deletions)
Expand Up @@ -148,7 +148,7 @@ def forward(self, x, time_emb = None):
h = self.net(h)
return h + self.res_conv(x)

class LinearAttention(nn.Module):
class SpatialLinearAttention(nn.Module):
def __init__(self, dim, heads = 4, dim_head = 32):
super().__init__()
self.scale = dim_head ** -0.5
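
The renamed SpatialLinearAttention keeps linear attention confined to the pixels of a single frame. Its body is collapsed in this diff, so the following is a minimal sketch, assuming the usual linear-attention recipe and that the frame axis is simply folded into the batch; only the visible hyperparameters (heads = 4, dim_head = 32, the dim_head ** -0.5 scale) are taken from the diff, the rest is illustrative and may differ from the repository's actual implementation.

```python
import torch
from torch import nn, einsum
from einops import rearrange

class SpatialLinearAttentionSketch(nn.Module):
    def __init__(self, dim, heads = 4, dim_head = 32):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, f, height, width = x.shape
        x = rearrange(x, 'b c f h w -> (b f) c h w')  # fold frames into the batch

        qkv = self.to_qkv(x).chunk(3, dim = 1)
        q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)

        # linear attention: softmax over feature dim for q, over positions for k
        q = q.softmax(dim = -2)
        k = k.softmax(dim = -1)
        q = q * self.scale

        context = einsum('b h d n, b h e n -> b h d e', k, v)
        out = einsum('b h d e, b h d n -> b h e n', context, q)

        out = rearrange(out, 'b h c (x y) -> b (h c) x y', h = self.heads, x = height, y = width)
        out = self.to_out(out)
        return rearrange(out, '(b f) c h w -> b c f h w', b = b)  # unfold frames

attn = SpatialLinearAttentionSketch(dim = 64)
video = torch.randn(1, 64, 5, 16, 16)  # (b, c, f, h, w)
assert attn(video).shape == video.shape
```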
@@ -253,25 +253,26 @@ def __init__(
 
         num_resolutions = len(in_out)
         conv_next = partial(ConvNextBlock, time_emb_dim = cond_dim)
+        temporal_attn = lambda dim: EinopsToAndFrom('b c f h w', 'b (h w) f c', Attention(dim))
 
         for ind, (dim_in, dim_out) in enumerate(in_out):
             is_last = ind >= (num_resolutions - 1)
 
             self.downs.append(nn.ModuleList([
                 conv_next(dim_in, dim_out, norm = ind != 0),
                 conv_next(dim_out, dim_out),
-                Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                Residual(PreNorm(dim_out, SpatialLinearAttention(dim_out))),
+                Residual(PreNorm(dim_out, temporal_attn(dim_out))),
                 Downsample(dim_out) if not is_last else nn.Identity()
             ]))
 
         mid_dim = dims[-1]
         self.mid_block1 = conv_next(mid_dim, mid_dim)
 
         spatial_attn = EinopsToAndFrom('b c f h w', 'b f (h w) c', Attention(mid_dim))
-        temporal_attn = EinopsToAndFrom('b c f h w', 'b (h w) f c', Attention(mid_dim))
 
         self.mid_spatial_attn = Residual(PreNorm(mid_dim, spatial_attn))
-        self.mid_temporal_attn = Residual(PreNorm(mid_dim, temporal_attn))
+        self.mid_temporal_attn = Residual(PreNorm(mid_dim, temporal_attn(mid_dim)))
 
         self.mid_block2 = conv_next(mid_dim, mid_dim)
 
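The new temporal_attn lambda relies on the EinopsToAndFrom helper, whose definition sits outside the shown hunks. As a sketch of how such a wrapper typically works, the snippet below rearranges 'b c f h w' into 'b (h w) f c', applies an inner module along the frame axis, and restores the original layout; nn.Identity stands in for Attention(dim) so the example runs on its own, and the class name is made up.

```python
import torch
from torch import nn
from einops import rearrange

class EinopsToAndFromSketch(nn.Module):
    def __init__(self, from_pattern, to_pattern, fn):
        super().__init__()
        self.from_pattern = from_pattern
        self.to_pattern = to_pattern
        self.fn = fn

    def forward(self, x, **kwargs):
        # remember the original axis sizes so the inverse rearrange can rebuild them
        axes = dict(zip(self.from_pattern.split(), x.shape))
        x = rearrange(x, f'{self.from_pattern} -> {self.to_pattern}')
        x = self.fn(x, **kwargs)
        return rearrange(x, f'{self.to_pattern} -> {self.from_pattern}', **axes)

# usage: attend over time (f) independently at each of the h*w spatial positions
temporal_attn = EinopsToAndFromSketch('b c f h w', 'b (h w) f c', nn.Identity())
video = torch.randn(2, 64, 5, 32, 32)  # (batch, channels, frames, height, width)
assert temporal_attn(video).shape == video.shape
```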
@@ -281,7 +282,8 @@ def __init__(
             self.ups.append(nn.ModuleList([
                 conv_next(dim_out * 2, dim_in),
                 conv_next(dim_in, dim_in),
-                Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                Residual(PreNorm(dim_in, SpatialLinearAttention(dim_in))),
+                Residual(PreNorm(dim_in, temporal_attn(dim_in))),
                 Upsample(dim_in) if not is_last else nn.Identity()
             ]))
 
@@ -314,10 +316,11 @@ def forward(self, x, time, cond = None, null_cond_prob = 0.):
 
         h = []
 
-        for convnext, convnext2, attn, downsample in self.downs:
+        for convnext, convnext2, spatial_attn, temporal_attn, downsample in self.downs:
             x = convnext(x, t)
             x = convnext2(x, t)
-            x = attn(x)
+            x = spatial_attn(x)
+            x = temporal_attn(x)
             h.append(x)
             x = downsample(x)
 
@@ -326,11 +329,12 @@ def forward(self, x, time, cond = None, null_cond_prob = 0.):
         x = self.mid_temporal_attn(x)
         x = self.mid_block2(x, t)
 
-        for convnext, convnext2, attn, upsample in self.ups:
+        for convnext, convnext2, spatial_attn, temporal_attn, upsample in self.ups:
             x = torch.cat((x, h.pop()), dim=1)
             x = convnext(x, t)
             x = convnext2(x, t)
-            x = attn(x)
+            x = spatial_attn(x)
+            x = temporal_attn(x)
             x = upsample(x)
 
         return self.final_conv(x)
