diff --git a/README.md b/README.md
index 9ebd5d47ff48f8..02dd68eb5cf421 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. `["A", "B", "C"]` or `[1:3]`.
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/Stats.jl b/src/Stats.jl
index cc091672dac976..af0d77ed4833f0 100644
--- a/src/Stats.jl
+++ b/src/Stats.jl
@@ -45,6 +45,7 @@ module Stats
         confint,
         ecdf,
         findat,
+        indicators,
         inverse_rle,
         loglikelihood,
         nobs,
diff --git a/src/others.jl b/src/others.jl
index 8ff5e745f4399a..86e2e53907286b 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -97,6 +97,77 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
+# Build a one-hot (indicator) matrix for a matrix whose rows are features and
+# whose columns are samples.
+#
+# `categories`, when non-empty, must hold one collection of admissible values
+# per feature (row); when empty, each row's categories default to its sorted
+# unique values. The encodings of the features are stacked vertically in row
+# order, so the result has one row per (feature, category) pair and one column
+# per sample. `sparse=true` allocates the result with `spzeros`, not `zeros`.
+function indicators{T}(input::AbstractMatrix{T},
+                       categories::Array{Any,1}={};
+                       sparse::Bool=false)
+    nfeatures, nsamples = size(input)
+    if length(categories) != 0 && length(categories) != nfeatures
+        error("You must provide either categories for each feature or no categories")
+    end
+    # Copy first: the push! below must not grow the caller's array.
+    internal_categories = copy(categories)
+    if length(internal_categories) != nfeatures
+        for i in 1:nfeatures
+            push!(internal_categories, sort(unique(input[i, :])))
+        end
+    end
+    noutrows = 0
+    for i in 1:nfeatures
+        noutrows += length(internal_categories[i])
+    end
+    output = sparse ? spzeros(noutrows, nsamples) : zeros(noutrows, nsamples)
+    # `offset` is the first output row of the block that belongs to feature i.
+    offset = 1
+    for i in 1:nfeatures
+        indicators!(output, offset, slice(input, i, :), internal_categories[i])
+        offset += length(internal_categories[i])
+    end
+    return output
+end
+
+# Build a one-hot (indicator) matrix for a vector of samples.
+#
+# Row k of the result corresponds to categories[k] and column j to input[j].
+# `categories` defaults to the sorted unique values of `input`; a listed value
+# that never occurs in `input` yields an all-zero row. `sparse=true` allocates
+# the result with `spzeros` instead of `zeros`.
+function indicators{T}(input::AbstractVector{T},
+                       categories::Array{T,1}=sort(unique(input));
+                       sparse::Bool=false)
+    nrows, ncols = length(categories), length(input)
+    output = sparse ? spzeros(nrows, ncols) : zeros(nrows, ncols)
+    indicators!(output, 1, input, categories)
+    return output
+end
+
+# Write the one-hot encoding of `input` into `output`, in place.
+#
+# Sample i sets output[offset - 1 + k, i] to one(S), where k is the position
+# of input[i] in `categories`; no other entry is written, so `output` should
+# already be zeroed. A value of `input` that is missing from `categories`
+# raises a KeyError from the dictionary lookup. Returns nothing.
+function indicators!{S<:Real,T}(output::AbstractArray{S},
+                                offset::Integer,
+                                input::AbstractVector{T},
+                                categories::Array{T,1}=sort(unique(input)))
+    # Category value => position in `categories`. The positions come from
+    # 1:length(categories), so the concrete Int is used as the value type.
+    indices = (T=>Int)[categories[i]=>i for i in 1:length(categories)]
+    lo = offset-1
+    for i in 1:length(input)
+        output[indices[input[i]]+lo, i] = one(S)
+    end
+    return
+end
+
 abstract StatisticalModel
 
 coef(obj::StatisticalModel) = error("No method defined")
diff --git a/test/01.jl b/test/01.jl
index 6a4a3f81d1db06..fdcfab0e0bcef7 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -31,3 +31,22 @@ fnecdf = ecdf(randn(10000000))
 
 fnecdf = ecdf([0.5])
 @test fnecdf([zeros(5000), ones(5000)]) == [zeros(5000), ones(5000)]
+
+y = [1, 2, 1, 3, 2]
+expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
+@test indicators(y) == expected
+@test indicators(y, [1:3], sparse=true) == expected
+y = [2, 3, 2, 4, 3]
+@test indicators(y) == expected
+X = [1 2 3; 1 1 1; 2 1 1]
+expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
+@test indicators(X) == expected
+expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
+@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
+y = ["A", "B", "C", "B", "A"]
+expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
+@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
+X = ["A" "B" "C"; "B" "A" "C"]
+cats = ["A", "B", "C", "D"]
+expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
+@test indicators(X, {cats, cats}, sparse=false) == expected