Added support for sparse data matrices. (#25)

ararslan · web-flow · commit f3d7cf19e768 · 2017-10-02T21:38:21.000-07:00
diff --git a/src/GLMNet.jl b/src/GLMNet.jl
@@ -283,6 +283,37 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
 
     @check_and_return
 end
+function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
+             family::Normal=Normal();
+             weights::Vector{Float64}=ones(length(y)),
+             naivealgorithm::Bool=(size(X, 2) >= 500), alpha::Real=1.0,
+             penalty_factor::Vector{Float64}=ones(size(X, 2)),
+             constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
+             dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
+             lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
+             lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
+             intercept::Bool=true, maxit::Int=1000000)
+    @validate_and_init
+
+    ccall((:spelnet_, libglmnet), Void,
+          (Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
+           Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
+           Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32}),
+          (naivealgorithm ? 2 : 1), alpha, size(X, 1), size(X, 2), X.nzval, X.colptr,
+          X.rowval, y, weights, 0, penalty_factor, constraints, dfmax, pmax, nlambda,
+          lambda_min_ratio, lambda, tol, standardize, intercept, maxit, lmu, a0, ca, ia,
+          nin, fdev, alm, nlp, jerr)
+
+    null_dev = 0.0
+    mu = mean(y)
+    for i = 1:length(y)
+        null_dev += abs2(null_dev-mu)
+    end
+
+    @check_and_return
+end
 
 function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
              family::Binomial;
@@ -330,6 +361,54 @@ function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
     null_dev = null_dev[1]
     @check_and_return
 end
+function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Matrix{Float64},
+             family::Binomial;
+             offsets::Union{Vector{Float64},Void}=nothing,
+             weights::Vector{Float64}=ones(size(y, 1)),
+             alpha::Real=1.0,
+             penalty_factor::Vector{Float64}=ones(size(X, 2)),
+             constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
+             dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
+             lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
+             lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
+             intercept::Bool=true, maxit::Int=1000000, algorithm::Symbol=:newtonraphson)
+    @validate_and_init
+    size(y, 2) == 2 || error("glmnet for logistic models requires a two-column matrix with "*
+                             "counts of negative responses in the first column and positive "*
+                             "responses in the second")
+    kopt = algorithm == :newtonraphson ? 0 :
+           algorithm == :modifiednewtonraphson ? 1 :
+           algorithm == :nzsame ? 2 : error("unknown algorithm ")
+    offsets::Vector{Float64} = isa(offsets, @compat Void) ? zeros(size(y, 1)) : copy(offsets)
+    length(offsets) == size(y, 1) || error("length of offsets must match length of y")
+
+    null_dev = Vector{Float64}(1)
+
+    # The Fortran code expects positive responses in first column, but
+    # this convention is evidently unacceptable to the authors of the R
+    # code, and, apparently, to us
+    for i = 1:size(y, 1)
+        a = y[i, 1]
+        b = y[i, 2]
+        y[i, 1] = b*weights[i]
+        y[i, 2] = a*weights[i]
+    end
+
+    ccall((:splognet_, libglmnet), Void,
+          (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
+           Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64},
+           Ptr{Float64}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
+           Ptr{Int32}, Ptr{Int32}),
+          alpha, size(X, 1), size(X, 2), 1, X.nzval, X.colptr, X.rowval, y, copy(offsets),
+          0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
+          tol, standardize, intercept, maxit, kopt, lmu, a0, ca, ia, nin, null_dev, fdev,
+          alm, nlp, jerr)
+
+    null_dev = null_dev[1]
+    @check_and_return
+end
 
 function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
              family::Poisson;
@@ -361,13 +440,49 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
     null_dev = null_dev[1]
     @check_and_return
 end
+function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
+             family::Poisson;
+             offsets::Union{Vector{Float64},Void}=nothing,
+             weights::Vector{Float64}=ones(length(y)),
+             alpha::Real=1.0,
+             penalty_factor::Vector{Float64}=ones(size(X, 2)),
+             constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
+             dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
+             lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
+             lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
+             intercept::Bool=true, maxit::Int=1000000)
+    @validate_and_init
+    null_dev = Vector{Float64}(1)
+
+    offsets::Vector{Float64} = isa(offsets, Void) ? zeros(length(y)) : copy(offsets)
+    length(offsets) == length(y) || error("length of offsets must match length of y")
+
+    ccall((:spfishnet_, libglmnet), Void,
+          (Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32},
+           Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
+           Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
+           Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32},
+           Ptr{Int32}),
+          alpha, size(X, 1), size(X, 2), X.nzval, X.colptr, X.rowval, y, offsets, weights,
+          0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
+          tol, standardize, intercept, maxit, lmu, a0, ca, ia, nin, null_dev, fdev, alm,
+          nlp, jerr)
+
+    null_dev = null_dev[1]
+    @check_and_return
+end
 
 glmnet(X::Matrix{Float64}, y::Vector{Float64}, family::Distribution=Normal(); kw...) =
     glmnet!(copy(X), copy(y), family; kw...)
 glmnet(X::AbstractMatrix, y::AbstractVector, family::Distribution=Normal(); kw...) =
     glmnet(convert(Matrix{Float64}, X), convert(Vector{Float64}, y), family; kw...)
+glmnet(X::SparseMatrixCSC, y::AbstractVector, family::Distribution=Normal(); kw...) =
+    glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Vector{Float64}, y), family; kw...)
 glmnet(X::Matrix{Float64}, y::Matrix{Float64}, family::Binomial; kw...) =
     glmnet!(copy(X), copy(y), family; kw...)
+glmnet(X::SparseMatrixCSC, y::AbstractMatrix, family::Binomial; kw...) =
+    glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Matrix{Float64}, y), family; kw...)
 glmnet(X::Matrix, y::Matrix, family::Binomial; kw...) =
     glmnet(convert(Matrix{Float64}, X), convert(Matrix{Float64}, y), family; kw...)
 
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -85,6 +85,22 @@ path = glmnet(X, y)
                               69.396354069847447,12.253877034216755,81.104769545494065,
                               17.808632244707766]
 
+# sparse
+path = glmnet(sparse(X), y)
+@test nactive(path.betas) == df_true
+@test path.dev_ratio ≈ dev_ratio
+@test path.lambda ≈ lambda
+@test path.a0 ≈ a0
+@test path.betas[:, models] ≈ betas
+@test predict(path, X, 16) ≈ [25.67451533276341,62.52776614976348,78.11952611080198,
+                              69.61492976841734,20.00478443784032,58.98418434043655,
+                              64.65391523535965,25.67451533276341,77.41080974893660,
+                              14.33505354291723]
+@test predict(path, X, 62) ≈ [9.688463955335449,65.513664866328625,89.537586892649699,
+                              84.299096349896985,5.711287928399321,61.686267113123805,
+                              69.396354069847447,12.253877034216755,81.104769545494065,
+                              17.808632244707766]
+
 # Cross-validation
 cv = glmnetcv(X, y; folds=[1,1,1,1,2,2,2,3,3,3])
 @test cv.meanloss ≈ [1196.1831818915,1054.30217435069,882.722957995572,741.473677317198,
@@ -211,6 +227,22 @@ path = glmnet(X, convert(Matrix{Float64}, yl), Binomial())
                               6.514278565882654,-5.352339338181535,10.448596815251006,
                               -8.571939893775767]
 
+# sparse
+path = glmnet(sparse(X), yl, Binomial())
+@test nactive(path.betas) == df_true
+@test path.dev_ratio ≈ dev_ratio
+@test path.lambda ≈ lambda
+@test path.a0 ≈ a0
+@test path.betas[:, models] ≈ betas
+@test predict(path, X, 16) ≈ [-1.315519391606169,1.722425698139390,3.007710159185589,
+                              2.306645907705844,-1.782895559259332,1.430315593356164,
+                              1.897691761009327,-1.315519391606169,2.949288138228943,
+                              -2.250271726912495]
+@test predict(path, X, 62) ≈ [-5.328152634222764,5.936301834664929,10.640103977849353,
+                              8.017225144937120,-6.891307125813680,5.068602350662909,
+                              6.514278565882654,-5.352339338181535,10.448596815251006,
+                              -8.571939893775767]
+
 # Cross-validation
 cv = glmnetcv(X, yl, Binomial(); folds=[1,1,1,1,2,2,2,3,3,3])
 @test cv.meanloss ≈ [1.49234494227531,1.38615874813194,1.21206306885012,
@@ -356,6 +388,22 @@ path = glmnet(X, y, Poisson())
 path = glmnet([1 1; 2 2; 3 4], [0, 0, 1], Poisson())
 @test !any(isnan, GLMNet.loss(path, [1 1; 2 2; 4 4], [0, 0, 1]))
 
+# sparse
+path = glmnet(sparse(X), y, Poisson())
+@test nactive(path.betas) == df_true
+@test path.dev_ratio ≈ dev_ratio
+@test path.lambda ≈ lambda
+@test path.a0 ≈ a0
+@test path.betas[:, models] ≈ betas
+@test predict(path, X, 16) ≈ [3.195043339285695,4.063275663211828,4.430604723334422,
+                              4.230243417813007,3.061469135604752,3.979791785911238,
+                              4.113365989592181,3.195043339285695,4.413907947874304,
+                              2.927894931923808]
+@test predict(path, X, 62) ≈ [2.108607974403907,4.125962319203899,4.481867295351227,
+                              4.492300995443095,2.410180465556811,4.082152789977005,
+                              4.183424906852268,2.381251991983247,4.446875861428943,
+                              2.829218161240957]
+
 # Cross-validation
 cv = glmnetcv(X, y, Poisson(); folds=[1,1,1,1,2,2,2,3,3,3])
 @test cv.meanloss ≈ [29.7515432302351,26.9128224177608,23.2024175329887,