From ac20bd379f74786fa035a42086e93c1b891142fa Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Wed, 9 Mar 2022 23:08:36 -0500
Subject: [PATCH 1/4] use gpuarrays

---
 Project.toml        | 10 ++++++++--
 src/OneHotArrays.jl |  2 +-
 src/array.jl        |  8 ++++----
 test/gpu.jl         | 41 +++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl    | 25 +++++++++++++++++++++++++
 5 files changed, 79 insertions(+), 7 deletions(-)
 create mode 100644 test/gpu.jl

diff --git a/Project.toml b/Project.toml
index bbc65b4..2f52bb4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,8 @@ version = "0.1.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
@@ -14,11 +14,17 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Adapt = "3.0"
 CUDA = "3.8"
 ChainRulesCore = "1.13"
+GPUArrays = "8.2.1"
 MLUtils = "0.2"
 NNlib = "0.8"
+Zygote = "0.6.35"
+julia = "1.6"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test"]
+test = ["Test", "CUDA", "Random", "Zygote"]
diff --git a/src/OneHotArrays.jl b/src/OneHotArrays.jl
index 1c074b2..c14387e 100644
--- a/src/OneHotArrays.jl
+++ b/src/OneHotArrays.jl
@@ -2,7 +2,7 @@ module OneHotArrays
 
 using Adapt
 using ChainRulesCore
-using CUDA
+using GPUArrays
 using LinearAlgebra
 using MLUtils
 using NNlib
diff --git a/src/array.jl b/src/array.jl
index 3ac48bf..bf3fd29 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -61,13 +61,13 @@ function Base.replace_in_print_matrix(x::OneHotLike, i::Integer, j::Integer, s::
 end
 
 # copy CuArray versions back before trying to print them:
-Base.print_array(io::IO, X::OneHotLike{T, L, N, var"N+1", <:CuArray}) where {T, L, N, var"N+1"} =
+Base.print_array(io::IO, X::OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}) where {T, L, N, var"N+1"} =
   Base.print_array(io, adapt(Array, X))
-Base.print_array(io::IO, X::LinearAlgebra.AdjOrTrans{Bool, <:OneHotLike{T, L, N, var"N+1", <:CuArray}}) where {T, L, N, var"N+1"} =
+Base.print_array(io::IO, X::LinearAlgebra.AdjOrTrans{Bool, <:OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}}) where {T, L, N, var"N+1"} =
   Base.print_array(io, adapt(Array, X))
 
 _onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N}
-_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, N, <:CuArray}) where N = CuArray{Bool, N}
+_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, N, <:AbstractGPUArray}) where N = AbstractGPUArray{Bool, N}
 
 function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L
   if isone(dims) || any(x -> !_isonehot(x), (x, xs...))
@@ -90,7 +90,7 @@ MLUtils.batch(xs::AbstractArray{<:OneHotVector{<:Any, L}}) where L = OneHotMatri
 
 Adapt.adapt_structure(T, x::OneHotArray{<:Any, L}) where L = OneHotArray(adapt(T, _indices(x)), L)
 
-Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, <: CuArray}}) where N = CUDA.CuArrayStyle{N}()
+Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, T}}) where {N, T <: AbstractGPUArray} = Base.BroadcastStyle(T)
 
 Base.map(f, x::OneHotLike) = Base.broadcast(f, x)
 
diff --git a/test/gpu.jl b/test/gpu.jl
new file mode 100644
index 0000000..32a6dee
--- /dev/null
+++ b/test/gpu.jl
@@ -0,0 +1,41 @@
+
+# Tests from Flux, probably not the optimal testset organisation!
+
+@testset "CUDA" begin
+  x = randn(5, 5)
+  cx = cu(x)
+  @test cx isa CuArray
+
+  @test_broken onecold(cu([1.0, 2.0, 3.0])) == 3 # scalar indexing error?
+
+  x = onehotbatch([1, 2, 3], 1:3)
+  cx = cu(x)
+  @test cx isa OneHotMatrix && cx.indices isa CuArray
+  @test (cx .+ 1) isa CuArray
+
+  xs = rand(5, 5)
+  ys = onehotbatch(1:5,1:5)
+  @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys)
+end
+
+@testset "onehot gpu" begin
+  y = onehotbatch(ones(3), 1:2) |> cu;
+  @test (repr("text/plain", y); true)
+
+  gA = rand(3, 2) |> cu;
+  @test_broken gradient(A -> sum(A * y), gA)[1] isa CuArray # fails with JLArray, bug in Zygote?
+end
+
+@testset "onecold gpu" begin
+  y = onehotbatch(ones(3), 1:10) |> cu;
+  l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
+  @test onecold(y) isa CuArray
+  @test y[3,:] isa CuArray
+  @test onecold(y, l) == ['a', 'a', 'a']
+end
+
+@testset "onehot forward map to broadcast" begin
+  oa = OneHotArray(rand(1:10, 5, 5), 10) |> cu
+  @test all(map(identity, oa) .== oa)
+  @test all(map(x -> 2 * x, oa) .== 2 .* oa)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f23b34b..39ad146 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,3 +12,28 @@ end
 @testset "Linear Algebra" begin
   include("linalg.jl")
 end
+
+using Zygote
+import CUDA
+if CUDA.functional()
+  using CUDA # exports CuArray, etc
+  @info "starting CUDA tests"
+else
+  @info "CUDA not functional, testing via GPUArrays"
+  using GPUArrays
+  GPUArrays.allowscalar(false)
+
+  # GPUArrays provides a fake GPU array, for testing
+  jl_file = normpath(joinpath(pathof(GPUArrays), "..", "..", "test", "jlarray.jl"))
+  using Random # loaded within jl_file
+  include(jl_file)
+  using .JLArrays
+  cu = jl
+  CuArray{T,N} = JLArray{T,N}
+end
+
+@test cu(rand(3)) .+ 1 isa CuArray
+
+@testset "GPUArrays" begin
+  include("gpu.jl")
+end

From ffb6f003923c99fe23c4d012b6aeb917f2b60b64 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Thu, 10 Mar 2022 11:16:45 -0500
Subject: [PATCH 2/4] BroadcastStyle{N+1}

---
 src/array.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/array.jl b/src/array.jl
index bf3fd29..0ca50ab 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -90,7 +90,13 @@ MLUtils.batch(xs::AbstractArray{<:OneHotVector{<:Any, L}}) where L = OneHotMatri
 
 Adapt.adapt_structure(T, x::OneHotArray{<:Any, L}) where L = OneHotArray(adapt(T, _indices(x)), L)
 
-Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, N, T}}) where {N, T <: AbstractGPUArray} = Base.BroadcastStyle(T)
+function Base.BroadcastStyle(::Type{<:OneHotArray{<: Any, <: Any, <: Any, var"N+1", T}}) where {var"N+1", T <: AbstractGPUArray}
+  # We want CuArrayStyle{N+1}(). There's an AbstractGPUArrayStyle but it doesn't do what we need.
+  S = Base.BroadcastStyle(T)
+  # S has dim N not N+1. The following hack to fix it relies on the arraystyle having N as its first type parameter, which
+  # isn't guaranteed, but there are not so many GPU broadcasting styles in the wild. (Far fewer than there are array wrappers.)
+  (typeof(S).name.wrapper){var"N+1"}()
+end
 
 Base.map(f, x::OneHotLike) = Base.broadcast(f, x)
 

From 4fb6fe9a6316f52a79be76772cf9f523ee81f3d8 Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Thu, 10 Mar 2022 11:17:19 -0500
Subject: [PATCH 3/4] use N and N+1 more consistently

---
 src/array.jl  | 4 ++--
 src/onehot.jl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 0ca50ab..1fdaff2 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -66,8 +66,8 @@ Base.print_array(io::IO, X::OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}) w
 Base.print_array(io::IO, X::LinearAlgebra.AdjOrTrans{Bool, <:OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}}) where {T, L, N, var"N+1"} =
   Base.print_array(io, adapt(Array, X))
 
-_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, N, <:Union{Integer, AbstractArray}}) where N = Array{Bool, N}
-_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, N, <:AbstractGPUArray}) where N = AbstractGPUArray{Bool, N}
+_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, var"N+1", <:Union{Integer, AbstractArray}}) where {var"N+1"} = Array{Bool, var"N+1"}
+_onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, var"N+1", <:AbstractGPUArray}) where {var"N+1"} = AbstractGPUArray{Bool, var"N+1"}
 
 function Base.cat(x::OneHotLike{<:Any, L}, xs::OneHotLike{<:Any, L}...; dims::Int) where L
   if isone(dims) || any(x -> !_isonehot(x), (x, xs...))
diff --git a/src/onehot.jl b/src/onehot.jl
index 93d37f0..4ff19e8 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -57,8 +57,8 @@ nonzero elements.
 If one of the inputs in `xs` is not found in `labels`, that column is
 `onehot(default, labels)` if `default` is given, else an error.
 
-If `xs` has more dimensions, `M = ndims(xs) > 1`, then the result is an
-`AbstractArray{Bool, M+1}` which is one-hot along the first dimension,
+If `xs` has more dimensions, `N = ndims(xs) > 1`, then the result is an
+`AbstractArray{Bool, N+1}` which is one-hot along the first dimension,
 i.e. `result[:, k...] == onehot(xs[k...], labels)`.
 
 Note that `xs` can be any iterable, such as a string. And that using a tuple

From cea26a4124ee4d0b13395d0d0649908d95cad84a Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Thu, 10 Mar 2022 11:22:39 -0500
Subject: [PATCH 4/4] fix 2-arg show

---
 src/array.jl | 12 ++++++++----
 test/gpu.jl  | 10 ++++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 1fdaff2..a67ee7f 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -61,10 +61,14 @@ function Base.replace_in_print_matrix(x::OneHotLike, i::Integer, j::Integer, s::
 end
 
 # copy CuArray versions back before trying to print them:
-Base.print_array(io::IO, X::OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}) where {T, L, N, var"N+1"} =
-  Base.print_array(io, adapt(Array, X))
-Base.print_array(io::IO, X::LinearAlgebra.AdjOrTrans{Bool, <:OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}}) where {T, L, N, var"N+1"} =
-  Base.print_array(io, adapt(Array, X))
+for fun in (:show, :print_array) # print_array is used by 3-arg show
+  @eval begin
+    Base.$fun(io::IO, X::OneHotLike{T, L, N, var"N+1", <:AbstractGPUArray}) where {T, L, N, var"N+1"} =
+      Base.$fun(io, adapt(Array, X))
+    Base.$fun(io::IO, X::LinearAlgebra.AdjOrTrans{Bool, <:OneHotLike{T, L, N, <:Any, <:AbstractGPUArray}}) where {T, L, N} =
+      Base.$fun(io, adapt(Array, X))
+  end
+end
 
 _onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, var"N+1", <:Union{Integer, AbstractArray}}) where {var"N+1"} = Array{Bool, var"N+1"}
 _onehot_bool_type(::OneHotLike{<:Any, <:Any, <:Any, var"N+1", <:AbstractGPUArray}) where {var"N+1"} = AbstractGPUArray{Bool, var"N+1"}
diff --git a/test/gpu.jl b/test/gpu.jl
index 32a6dee..91fa2a8 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -39,3 +39,13 @@ end
   @test all(map(identity, oa) .== oa)
   @test all(map(x -> 2 * x, oa) .== 2 .* oa)
 end
+
+@testset "show gpu" begin
+  x = onehotbatch([1, 2, 3], 1:3)
+  cx = cu(x)
+  # 3-arg show
+  @test contains(repr("text/plain", cx), "1 ⋅ ⋅")
+  @test contains(repr("text/plain", cx), string(typeof(cx.indices)))
+  # 2-arg show, https://github.com/FluxML/Flux.jl/issues/1905
+  @test repr(cx) == "Bool[1 0 0; 0 1 0; 0 0 1]"
+end
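
Reviewer note, not part of the patches above: PATCH 2/4 rebuilds the storage array's broadcast style with one extra dimension via `(typeof(S).name.wrapper){var"N+1"}()`. The sketch below only illustrates that rebuilding step; it uses `Base.Broadcast.DefaultArrayStyle` as a stand-in for a GPU style such as `CUDA.CuArrayStyle`, so it runs on plain Julia without any GPU.

using Base.Broadcast: DefaultArrayStyle

S = Base.BroadcastStyle(Array{Int, 2})  # DefaultArrayStyle{2}(), i.e. dimension N = 2
W = typeof(S).name.wrapper              # the UnionAll DefaultArrayStyle, parameter stripped
Sup = W{3}()                            # DefaultArrayStyle{3}(), i.e. dimension N+1 = 3
@assert Sup isa DefaultArrayStyle{3}

With a `CuArray{Int, 2}` in place of `Array{Int, 2}`, the same steps should yield `CUDA.CuArrayStyle{3}()`, which is what the patch's comment says it wants; as the patch notes, this relies on the style's first type parameter being its dimension.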