add Attention across Time after all (spatial) linear attention
lucidrains committed Apr 13, 2022
1 parent 174a896 commit 0d8f175
Showing 3 changed files with 15 additions and 10 deletions.
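
For orientation, the diff below interleaves attention over two different axes of a video tensor shaped (batch, channels, frames, height, width). The two einops patterns it uses describe which axis each attention variant mixes; the toy rearrangement below is not part of the commit (tensor sizes are arbitrary), it only makes the difference concrete.

```python
import torch
from einops import rearrange

video = torch.randn(2, 32, 5, 16, 16)  # (b, c, f, h, w): 5 frames of 16x16 feature maps

# spatial attention: tokens are the h*w positions within a single frame
spatial_tokens = rearrange(video, 'b c f h w -> b f (h w) c')
print(spatial_tokens.shape)  # (2, 5, 256, 32): attention mixes the 256 positions per frame

# temporal attention (what this commit adds throughout the Unet):
# tokens are the f frames observed at one spatial position
temporal_tokens = rearrange(video, 'b c f h w -> b (h w) f c')
print(temporal_tokens.shape)  # (2, 256, 5, 32): attention mixes the 5 frames per position
```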
README.md: 1 change (1 addition & 0 deletions)
@@ -80,6 +80,7 @@ sampled_videos.shape # (2, 3, 5, 32, 32)
 - [ ] find a good torchvideo-like library (torchvideo seems immature) for training on fireworks
 - [ ] consider doing a 3d version of CLIP, so one can eventually apply the lessons of DALL-E2 to video
 - [ ] add a forward keyword argument that arrests attention across time (as reported / claimed in the paper, this type of image + video simultaneous training improves results)
+- [ ] project text into 4-8 tokens, and use them as memory key / values to condition both time and space in attention blocks
 
 ## Citations
 
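The newly added todo item proposes projecting text into a handful of tokens and letting every space and time attention block attend to them as extra key / values. Nothing of this exists in the repository yet; the sketch below is a hypothetical illustration of the technique, and every name and dimension in it (AttentionWithTextMemory, text_dim, num_text_tokens, and so on) is made up. The design point is that the text memory joins only the keys and values, so the video tokens remain the queries and the output keeps its shape.

```python
import torch
from torch import nn, einsum
from einops import rearrange

class AttentionWithTextMemory(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 32, text_dim = 512, num_text_tokens = 4):
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.dim = dim
        self.num_text_tokens = num_text_tokens
        inner = heads * dim_head
        self.to_q = nn.Linear(dim, inner, bias = False)
        self.to_kv = nn.Linear(dim, inner * 2, bias = False)
        self.to_out = nn.Linear(inner, dim)
        # project a pooled text embedding into a few memory tokens
        self.text_to_tokens = nn.Linear(text_dim, num_text_tokens * dim)

    def forward(self, x, text_emb):
        # x: (batch, n, dim) tokens (spatial or temporal), text_emb: (batch, text_dim)
        mem = self.text_to_tokens(text_emb).view(x.shape[0], self.num_text_tokens, self.dim)
        kv_input = torch.cat((mem, x), dim = 1)  # text tokens join the keys / values only

        q = self.to_q(x)
        k, v = self.to_kv(kv_input).chunk(2, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), (q, k, v))

        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = sim.softmax(dim = -1)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

# usage with made-up shapes
attn = AttentionWithTextMemory(dim = 64)
tokens = torch.randn(2, 256, 64)  # e.g. the (h w) tokens of one frame
text = torch.randn(2, 512)        # a pooled text embedding
assert attn(tokens, text).shape == tokens.shape
```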
setup.py: 2 changes (1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 setup(
   name = 'video-diffusion-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.4',
+  version = '0.0.6',
   license='MIT',
   description = 'Video Diffusion - Pytorch',
   author = 'Phil Wang',
video_diffusion_pytorch/video_diffusion_pytorch.py: 22 changes (13 additions & 9 deletions)
Expand Up @@ -148,7 +148,7 @@ def forward(self, x, time_emb = None):
h = self.net(h)
return h + self.res_conv(x)

class LinearAttention(nn.Module):
class SpatialLinearAttention(nn.Module):
def __init__(self, dim, heads = 4, dim_head = 32):
super().__init__()
self.scale = dim_head ** -0.5
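
The renamed SpatialLinearAttention keeps linear attention confined to the pixels of a single frame. Its body is collapsed in this diff, so the following is a minimal sketch, assuming the usual linear-attention recipe and that the frame axis is simply folded into the batch; only the visible hyperparameters (heads = 4, dim_head = 32, the dim_head ** -0.5 scale) are taken from the diff, the rest is illustrative and may differ from the repository's actual implementation.

```python
import torch
from torch import nn, einsum
from einops import rearrange

class SpatialLinearAttentionSketch(nn.Module):
    def __init__(self, dim, heads = 4, dim_head = 32):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, f, height, width = x.shape
        x = rearrange(x, 'b c f h w -> (b f) c h w')  # fold frames into the batch

        qkv = self.to_qkv(x).chunk(3, dim = 1)
        q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)

        # linear attention: softmax over feature dim for q, over positions for k
        q = q.softmax(dim = -2)
        k = k.softmax(dim = -1)
        q = q * self.scale

        context = einsum('b h d n, b h e n -> b h d e', k, v)
        out = einsum('b h d e, b h d n -> b h e n', context, q)

        out = rearrange(out, 'b h c (x y) -> b (h c) x y', h = self.heads, x = height, y = width)
        out = self.to_out(out)
        return rearrange(out, '(b f) c h w -> b c f h w', b = b)  # unfold frames

attn = SpatialLinearAttentionSketch(dim = 64)
video = torch.randn(1, 64, 5, 16, 16)  # (b, c, f, h, w)
assert attn(video).shape == video.shape
```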
@@ -253,25 +253,26 @@ def __init__(
 
         num_resolutions = len(in_out)
         conv_next = partial(ConvNextBlock, time_emb_dim = cond_dim)
+        temporal_attn = lambda dim: EinopsToAndFrom('b c f h w', 'b (h w) f c', Attention(dim))
 
         for ind, (dim_in, dim_out) in enumerate(in_out):
             is_last = ind >= (num_resolutions - 1)
 
             self.downs.append(nn.ModuleList([
                 conv_next(dim_in, dim_out, norm = ind != 0),
                 conv_next(dim_out, dim_out),
-                Residual(PreNorm(dim_out, LinearAttention(dim_out))),
+                Residual(PreNorm(dim_out, SpatialLinearAttention(dim_out))),
+                Residual(PreNorm(dim_out, temporal_attn(dim_out))),
                 Downsample(dim_out) if not is_last else nn.Identity()
             ]))
 
         mid_dim = dims[-1]
         self.mid_block1 = conv_next(mid_dim, mid_dim)
 
         spatial_attn = EinopsToAndFrom('b c f h w', 'b f (h w) c', Attention(mid_dim))
-        temporal_attn = EinopsToAndFrom('b c f h w', 'b (h w) f c', Attention(mid_dim))
 
         self.mid_spatial_attn = Residual(PreNorm(mid_dim, spatial_attn))
-        self.mid_temporal_attn = Residual(PreNorm(mid_dim, temporal_attn))
+        self.mid_temporal_attn = Residual(PreNorm(mid_dim, temporal_attn(mid_dim)))
 
         self.mid_block2 = conv_next(mid_dim, mid_dim)
 
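The new temporal_attn lambda relies on the EinopsToAndFrom helper, whose definition sits outside the shown hunks. As a sketch of how such a wrapper typically works, the snippet below rearranges 'b c f h w' into 'b (h w) f c', applies an inner module along the frame axis, and restores the original layout; nn.Identity stands in for Attention(dim) so the example runs on its own, and the class name is made up.

```python
import torch
from torch import nn
from einops import rearrange

class EinopsToAndFromSketch(nn.Module):
    def __init__(self, from_pattern, to_pattern, fn):
        super().__init__()
        self.from_pattern = from_pattern
        self.to_pattern = to_pattern
        self.fn = fn

    def forward(self, x, **kwargs):
        # remember the original axis sizes so the inverse rearrange can rebuild them
        axes = dict(zip(self.from_pattern.split(), x.shape))
        x = rearrange(x, f'{self.from_pattern} -> {self.to_pattern}')
        x = self.fn(x, **kwargs)
        return rearrange(x, f'{self.to_pattern} -> {self.from_pattern}', **axes)

# usage: attend over time (f) independently at each of the h*w spatial positions
temporal_attn = EinopsToAndFromSketch('b c f h w', 'b (h w) f c', nn.Identity())
video = torch.randn(2, 64, 5, 32, 32)  # (batch, channels, frames, height, width)
assert temporal_attn(video).shape == video.shape
```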
@@ -281,7 +282,8 @@ def __init__(
             self.ups.append(nn.ModuleList([
                 conv_next(dim_out * 2, dim_in),
                 conv_next(dim_in, dim_in),
-                Residual(PreNorm(dim_in, LinearAttention(dim_in))),
+                Residual(PreNorm(dim_in, SpatialLinearAttention(dim_in))),
+                Residual(PreNorm(dim_in, temporal_attn(dim_in))),
                 Upsample(dim_in) if not is_last else nn.Identity()
             ]))
 
@@ -314,10 +316,11 @@ def forward(self, x, time, cond = None, null_cond_prob = 0.):
 
         h = []
 
-        for convnext, convnext2, attn, downsample in self.downs:
+        for convnext, convnext2, spatial_attn, temporal_attn, downsample in self.downs:
             x = convnext(x, t)
             x = convnext2(x, t)
-            x = attn(x)
+            x = spatial_attn(x)
+            x = temporal_attn(x)
             h.append(x)
             x = downsample(x)
 
@@ -326,11 +329,12 @@ def forward(self, x, time, cond = None, null_cond_prob = 0.):
         x = self.mid_temporal_attn(x)
         x = self.mid_block2(x, t)
 
-        for convnext, convnext2, attn, upsample in self.ups:
+        for convnext, convnext2, spatial_attn, temporal_attn, upsample in self.ups:
             x = torch.cat((x, h.pop()), dim=1)
             x = convnext(x, t)
             x = convnext2(x, t)
-            x = attn(x)
+            x = spatial_attn(x)
+            x = temporal_attn(x)
             x = upsample(x)
 
         return self.final_conv(x)
