FluxML · darsnack · Feb 11, 2022 · Feb 2, 2022 · Feb 4, 2022 · Feb 4, 2022
diff --git a/src/Metalhead.jl b/src/Metalhead.jl
@@ -26,18 +26,22 @@ include("convnets/mobilenet.jl")
 # Other models
 include("other/mlpmixer.jl")
 
+# ViT-based models
+include("vit-based/vit.jl")
+
 export  AlexNet,
         VGG, VGG11, VGG13, VGG16, VGG19,
         ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152,
         GoogLeNet, Inception3, SqueezeNet,
         DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201,
         ResNeXt,
         MobileNetv2, MobileNetv3,
-        MLPMixer
+        MLPMixer,
+        ViT
 
 # use Flux._big_show to pretty print large models
 for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, 
-          :MobileNetv2, :MobileNetv3, :MLPMixer)
+          :MobileNetv2, :MobileNetv3, :MLPMixer, :ViT)
   @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model)
 end
 

diff --git a/src/layers.jl b/src/layers.jl
@@ -91,28 +91,90 @@ end
 skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes)
 
 """
-    addrelu(x, y)
+    mlpblock(planes, hidden_planes, dropout = 0., dense = Dense; activation = gelu)
+
+Feedforward block used in many vision transformer-like models.
+
+# Arguments
+- `planes`: Number of dimensions in the input and output.
+- `hidden_planes`: Number of dimensions in the intermediate layer.
+- `dropout`: Dropout rate, used after each of the two dense layers.
+- `dense`: Type of dense layer to use in the feedforward block.
+- `activation`: Activation function applied after the first dense layer.
+"""
+function mlpblock(planes, hidden_planes, dropout = 0., dense = Dense; activation = gelu)
+  Chain(dense(planes, hidden_planes, activation), Dropout(dropout),
+        dense(hidden_planes, planes), Dropout(dropout))
+end
+
+"""
+    Attention{T}
 
-Convenience function for `(x, y) -> @. relu(x + y)`.
-Useful as the `connection` argument for [`resnet`](#).
-See also [`reluadd`](#).
+Self attention layer used by transformer models. Can be instantiated with a layer that produces
+the key, value and query vectors from the input.
 """
-addrelu(x, y) = @. relu(x + y)
+struct Attention{T}
+  qkv::T  # layer whose output is split three ways along dim 1 in the forward pass
+end
+
+# Convenience constructor: a bias-free `Dense` layer mapping `in` to `3 * out` features.
+Attention(in, out) = Attention(Dense(in, 3 * out; bias = false))
+@functor Attention
+
+function (attn::Attention)(x::AbstractArray{T}) where T
+  # one pass through `qkv`, then split the result into three chunks along dim 1
+  query, key, value = chunk(attn.qkv(x), 3; dim = 1)
+  denom = convert(T, sqrt(size(query, 1)))
+  logits = batched_mul(batched_transpose(query), key) / denom
+
+  return batched_mul(value, softmax(logits))
+end
+
+struct MHAttention{Q <: Integer, S, T}
+  nheads::Q      # number of chunks the input is split into along dim 1
+  heads::S       # called with all input chunks at once (splatted)
+  projection::T  # applied to the output of `heads`
+end
 
 """
-    reluadd(x, y)
+    MHAttention(in, hidden, nheads, dropout = 0.)
+
+Multi-head self-attention layer used in many vision transformer-like models.
 
-Convenience function for `(x, y) -> @. relu(x) + relu(y)`.
-Useful as the `connection` argument for [`resnet`](#).
-See also [`addrelu`](#).
+# Arguments
+- `in`: Number of dimensions in the input.
+- `hidden`: Number of dimensions in the intermediate layer.
+- `nheads`: Number of attention heads.
+- `dropout`: Dropout rate for the projection layer.
 """
-reluadd(x, y) = @. relu(x) + relu(y)
+function MHAttention(in, hidden, nheads, dropout = 0.)
+  # a lone head whose width already equals `in` needs no output projection
+  needs_projection = nheads != 1 || hidden != in
+  headsizes = zip(chunk(1:in, nheads), chunk(1:hidden, nheads))
+  heads = Parallel(vcat, [Attention(length(i), length(o)) for (i, o) in headsizes]...)
+  projection = needs_projection ? Chain(Dense(hidden, in), Dropout(dropout)) : identity
+  return MHAttention(nheads, heads, projection)
+end
 
-# Patching layer used by many vision transformer-like models
+@functor MHAttention
+
+# Split the input along dim 1 into `nheads` chunks, run the heads, then project the result.
+function (mha::MHAttention)(x)
+  slices = chunk(x, mha.nheads; dim = 1)
+  return mha.projection(mha.heads(slices...))
+end
+
+"""
+    Patching{T <: Integer}
+
+Patching layer used by many vision transformer-like models to split the input image into patches.
+Can be instantiated with a tuple `(patch_height, patch_width)` or a single value `patch_size`.
+"""
 struct Patching{T <: Integer}
   patch_height::T
   patch_width::T
 end
+
 Patching(patch_size) = Patching(patch_size, patch_size)
 
 function (p::Patching)(x)
@@ -127,18 +189,32 @@ end
 @functor Patching
 
 """
-    mlpblock(planes, expansion_factor = 4, dropout = 0., dense = Dense)
+    PosEmbedding{T}
 
-Feedforward block used in many vision transformer-like models.
+Positional embedding layer used by many vision transformer-like models. Instantiated with an 
+embedding vector which is a learnable parameter.
+"""
+struct PosEmbedding{T}
+  embedding_vector::T
+end
+@functor PosEmbedding
+
+# Add the embedding, cropped along dim 2 to the first `size(x, 2)` entries, to `x`.
+(pe::PosEmbedding)(x) = x .+ pe.embedding_vector[:, 1:size(x, 2), :]
 
-# Arguments
-  `planes`: Number of dimensions in the input and output.
-  `hidden_planes`: Number of dimensions in the intermediate layer.
-  `dropout`: Dropout rate.
-  `dense`: Type of dense layer to use in the feedforward block.
-  `activation`: Activation function to use.
 """
-function mlpblock(planes, hidden_planes, dropout = 0., dense = Dense; activation = gelu)
-  Chain(dense(planes, hidden_planes, activation), Dropout(dropout),
-        dense(hidden_planes, planes, activation), Dropout(dropout))
+    CLSTokens{T}
+
+Prepends a class token to the input, as used for classification by many vision 
+transformer-like models. Instantiated with a class token vector which is a learnable parameter.
+"""
+struct CLSTokens{T}
+  cls_token::T
 end
+
+function (m::CLSTokens)(x)
+  # replicate the token along dim 3 to match `x`, then place it ahead of `x` along dim 2
+  batched = repeat(m.cls_token, 1, 1, size(x, 3))
+  return cat(batched, x; dims = 2)
+end
+@functor CLSTokens
diff --git a/src/utilities.jl b/src/utilities.jl
@@ -1,6 +1,28 @@
+# Split `A` along dimension `dim` into views of up to `cld(size(A, dim), k)` indices each.
+chunk(A, k::Integer; dim::Integer = 1) =
+    (selectdim(A, dim, i) for i in Iterators.partition(axes(A, dim), cld(size(A, dim), k)))
+
 # Utility function for classifier head of vision transformer-like models
 _seconddimmean(x) = mean(x, dims = 2)[:, 1, :]
 
+"""
+    addrelu(x, y)
+
+Add `x` and `y` elementwise, then apply `relu`; same as `(x, y) -> @. relu(x + y)`.
+Intended as the `connection` argument of [`resnet`](#).
+The companion function is [`reluadd`](#).
+"""
+addrelu(x, y) = relu.(x .+ y)
+
+"""
+    reluadd(x, y)
+
+Apply `relu` to `x` and `y` elementwise, then add; same as `(x, y) -> @. relu(x) + relu(y)`.
+Intended as the `connection` argument of [`resnet`](#).
+The companion function is [`addrelu`](#).
+"""
+reluadd(x, y) = relu.(x) .+ relu.(y)
+
 """
     weights(model)
 

diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl
@@ -0,0 +1,116 @@
+# Utility function for applying LayerNorm before a block: the input is normalised, then passed to `fn`
+prenorm(planes, fn) = Chain(LayerNorm(planes), fn)
+
+"""
+    Transformer(planes, depth, heads, headplanes, mlpplanes, dropout = 0.)
+
+Transformer as used in the base ViT architecture
+([reference](https://arxiv.org/abs/2010.11929)).
+
+# Arguments
+- `planes`: number of input channels
+- `depth`: number of layers
+- `heads`: number of attention heads
+- `headplanes`: number of hidden channels of the attention layer, split across the heads
+- `mlpplanes`: number of hidden channels in the MLP block
+- `dropout`: dropout rate
+"""
+function Transformer(planes, depth, heads, headplanes, mlpplanes, dropout = 0.)
+  # every layer is an attention block followed by an MLP block, each inside a residual connection
+  layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, headplanes, heads, dropout)), +),
+                  SkipConnection(prenorm(planes, mlpblock(planes, mlpplanes, dropout)), +))
+            for _ in 1:depth]
+  return Chain(layers...)
+end
+
+"""
+    vit(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 16), planes = 1024, 
+        depth = 6, heads = 16, mlpplanes = 2048, headplanes = 64, dropout = 0.1, emb_dropout = 0.1, 
+        pool = "cls", nclasses = 1000)
+
+Creates a Vision Transformer model as detailed in the paper An Image is Worth 16x16 
+Words: Transformers for Image Recognition at Scale
+([reference](https://arxiv.org/abs/2010.11929)).
+
+# Arguments
+- `imsize`: image size as `(height, width)`
+- `inchannels`: number of input channels
+- `patch_size`: size of the patches as `(height, width)`
+- `planes`: the number of channels fed into the main model
+- `depth`: number of blocks in the transformer
+- `heads`: number of attention heads in the transformer
+- `mlpplanes`: number of hidden channels in the MLP block in the transformer (the old, misspelled keyword `mlppanes` is still accepted)
+- `headplanes`: number of hidden channels of the attention layers in the transformer, split across the heads
+- `dropout`: dropout rate
+- `emb_dropout`: dropout rate for the positional embedding layer
+- `pool`: pooling type, either "cls" or "avg"
+- `nclasses`: number of classes in the output
+"""
+function vit(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 16), planes = 1024,
+  depth = 6, heads = 16, mlpplanes = 2048, mlppanes = mlpplanes, headplanes = 64, dropout = 0.1,
+  emb_dropout = 0.1, pool = "cls", nclasses = 1000)
+
+  im_height, im_width = imsize
+  patch_height, patch_width = patch_size
+  # the call form of `@assert` keeps each message attached to its condition
+  @assert((im_height % patch_height == 0) && (im_width % patch_width == 0),
+          "Image dimensions must be divisible by the patch size.")
+  @assert(pool in ("cls", "avg"),
+          "Pool type must be either cls (cls token) or avg (mean pooling)")
+
+  num_patches = (im_height ÷ patch_height) * (im_width ÷ patch_width)
+  patchplanes = inchannels * patch_height * patch_width
+
+  return Chain(Patching(patch_height, patch_width),
+               Dense(patchplanes, planes),
+               CLSTokens(rand(Float32, (planes, 1, 1))),
+               PosEmbedding(rand(Float32, (planes, num_patches + 1, 1))),
+               Dropout(emb_dropout),
+               Transformer(planes, depth, heads, headplanes, mlppanes, dropout),
+               (pool == "cls") ? x -> x[:, 1, :] : x -> _seconddimmean(x),
+               Chain(LayerNorm(planes), Dense(planes, nclasses)))
+end
+
+struct ViT
+  layers
+end
+
+"""
+    ViT(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 16), planes = 1024, 
+        depth = 6, heads = 16, mlpplanes = 2048, headplanes = 64, dropout = 0.1, emb_dropout = 0.1, 
+        pool = "cls", nclasses = 1000)
+
+Creates a Vision Transformer model as detailed in the paper An Image is Worth 16x16 
+Words: Transformers for Image Recognition at Scale
+([reference](https://arxiv.org/abs/2010.11929)).
+
+# Arguments
+- `imsize`: image size as `(height, width)`
+- `inchannels`: number of input channels
+- `patch_size`: size of the patches as `(height, width)`
+- `planes`: the number of channels fed into the main model
+- `depth`: number of blocks in the transformer
+- `heads`: number of attention heads in the transformer
+- `mlpplanes`: number of hidden channels in the MLP block in the transformer (the old, misspelled keyword `mlppanes` is still accepted)
+- `headplanes`: number of hidden channels of the attention layers in the transformer, split across the heads
+- `dropout`: dropout rate
+- `emb_dropout`: dropout rate for the positional embedding layer
+- `pool`: pooling type, either "cls" or "avg"
+- `nclasses`: number of classes in the output
+"""
+function ViT(imsize::NTuple{2} = (256, 256); inchannels = 3, patch_size = (16, 16), planes = 1024,
+  depth = 6, heads = 16, mlpplanes = 2048, mlppanes = mlpplanes, headplanes = 64, dropout = 0.1,
+  emb_dropout = 0.1, pool = "cls", nclasses = 1000)
+
+  layers = vit(imsize; inchannels, patch_size, planes, depth, heads, mlppanes, headplanes, 
+               dropout, emb_dropout, pool, nclasses)
+
+  ViT(layers)
+end
+
+(m::ViT)(x) = m.layers(x)
+# `backbone` is every layer but the last; `classifier` is the last layer.
+backbone(m::ViT) = m.layers[1:end-1]
+classifier(m::ViT) = m.layers[end]
+
+@functor ViT
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -19,3 +19,8 @@ end
 @testset verbose = true "Other" begin
   include("other.jl")
 end
+
+# ViT tests
+@testset verbose = true "ViTs" begin
+  include("vit-based.jl")
+end
diff --git a/test/vit-based.jl b/test/vit-based.jl
@@ -0,0 +1,7 @@
+using Metalhead, Test
+using Flux
+# Forward pass of the default `ViT` on a batch of two 256×256 three-channel inputs; the gradient test is skipped.
+@testset "ViT" begin
+    @test size(ViT()(rand(Float32, 256, 256, 3, 2))) == (1000, 2)
+    @test_skip gradtest(ViT(), rand(Float32, 256, 256, 3, 2))
+end