Add unique(AbstractArray, dim)

Efficiently finds the unique columns, rows, etc. of an array. The algorithm first hashes each row, then finds the unique hashes, and finally checks for hash collisions by comparing each row against the first row with the same hash. It is roughly O(n) in the number of elements in the matrix. This is my first time using Cartesian. A version written without Cartesian is presently about 10% faster for finding unique rows of a matrix, but the overhead is probably worth it for the generality.
JuliaLang · Feb 14, 2014 · db3b28d · timholy · Feb 14, 2014 · simonster
1 parent d15ce97
commit db3b28d
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 0 deletions.
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
@@ -410,3 +410,70 @@ for (V, PT, BT) in [((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)]
         return P
     end
 end
+
+## unique across dim
+
+immutable Prehashed  # wrapper around a hash value that has already been computed
+    hash::Uint  # the stored hash value
+end
+hash(x::Prehashed) = x.hash  # hashing a Prehashed returns the stored value as-is
+
+@ngenerate N typeof(A) function unique{T,N}(A::AbstractArray{T,N}, dim::Int)  # A restricted to its distinct slices along dim
+    1 <= dim <= N || return copy(A)  # dim outside 1:N: return an unmodified copy of A
+    hashes = zeros(Uint, size(A, dim))  # one running hash per index along dim
+
+    # Fold the hash of every element into the hash of the slice (index j along dim) it belongs to
+    j = 0
+    @nloops N i A d->(if d == dim; j = i_d; end) begin
+       @inbounds hashes[j] = bitmix(hashes[j], hash((@nref N A i)))
+    end
+
+    # uniquerow[j] = index of the first slice whose hash equals the hash of slice j
+    uniquerow = Array(Int, size(A, dim))
+    firstrow = Dict{Prehashed,Int}()
+    for j = 1:size(A, dim)
+        uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j)
+    end
+    uniquerows = collect(values(firstrow))  # indices of the slices kept so far
+
+    # Flag slices that differ from their representative; isequal (not !=) so that NaN matches NaN, as hash does
+    collided = falses(size(A, dim))
+    @inbounds begin
+        @nloops N i A d->(if d == dim; j = i_d; end) begin
+            if !isequal((@nref N A d->ifelse(d == dim, uniquerow[j], i_d)), (@nref N A i))
+                collided[j] = true
+            end
+        end
+    end
+
+    if any(collided)
+        nowcollided = BitArray(size(A, dim))  # scratch flags, swapped with collided after each round
+        while any(collided)
+            # Among the still-collided slices, the first one per hash becomes a new representative
+            empty!(firstrow)
+            for j = 1:size(A, dim)
+                collided[j] || continue
+                uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j)
+            end
+            for v in values(firstrow)
+                push!(uniquerows, v)
+            end
+
+            # Re-check the remaining collided slices against their new representative
+            fill!(nowcollided, false)
+            @nloops N i A d->begin
+                                 if d == dim
+                                     j = i_d
+                                     (!collided[j] || uniquerow[j] == j) && continue  # not collided, or its own representative
+                                 end
+                             end begin
+                if !isequal((@nref N A d->ifelse(d == dim, uniquerow[j], i_d)), (@nref N A i))
+                    nowcollided[j] = true
+                end
+            end
+            (collided, nowcollided) = (nowcollided, collided)
+        end
+    end
+
+    @nref N A d->d == dim ? sort!(uniquerows) : (1:size(A, d))  # index A by the kept slices, in their original order
+end
diff --git a/test/arrayops.jl b/test/arrayops.jl
@@ -331,6 +331,32 @@ for i = tensors
     @test isequal(i,permutedims(ipermutedims(i,perm),perm))
 end
 
+## unique across dim ##
+
+# All rows and columns unique: ones everywhere except a shuffled 1..10 on the diagonal
+A = ones(10, 10)
+A[diagind(A)] = shuffle!([1:10])
+@test unique(A, 1) == A
+@test unique(A, 2) == A
+
+# 10 repeats of each row of A, in shuffled order
+B = A[shuffle!(repmat(1:10, 10)), :]
+C = unique(B, 1)
+@test sortrows(C) == sortrows(A)  # compare sorted: B's rows are shuffled, so C's row order may differ from A's
+@test unique(B, 2) == B
+@test unique(B.', 2).' == C  # unique columns of the transpose must match the unique rows
+
+# Along third dimension: D stacks two copies of B
+D = cat(3, B, B)
+@test unique(D, 1) == cat(3, C, C)
+@test unique(D, 3) == cat(3, B)  # the two layers are identical, so a single layer remains
+
+# With hash collisions: every HashCollision hashes to 0, so all rows share one hash
+immutable HashCollision
+    x::Float64
+end
+Base.hash(::HashCollision) = uint(0)
+@test map(x->x.x, unique(map(HashCollision, B), 1)) == C
 
 ## reduce ##