Skip to content

Commit f3d7cf1

Browse files
authored
Added support for sparse data matrices. (#25)
1 parent c6c46f5 commit f3d7cf1

File tree

2 files changed

+163
-0
lines changed

2 files changed

+163
-0
lines changed

src/GLMNet.jl

+115
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,37 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
283283

284284
@check_and_return
285285
end
286+
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
287+
family::Normal=Normal();
288+
weights::Vector{Float64}=ones(length(y)),
289+
naivealgorithm::Bool=(size(X, 2) >= 500), alpha::Real=1.0,
290+
penalty_factor::Vector{Float64}=ones(size(X, 2)),
291+
constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
292+
dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
293+
lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
294+
lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
295+
intercept::Bool=true, maxit::Int=1000000)
296+
@validate_and_init
297+
298+
ccall((:spelnet_, libglmnet), Void,
299+
(Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
300+
Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
301+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
302+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
303+
Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32}),
304+
(naivealgorithm ? 2 : 1), alpha, size(X, 1), size(X, 2), X.nzval, X.colptr,
305+
X.rowval, y, weights, 0, penalty_factor, constraints, dfmax, pmax, nlambda,
306+
lambda_min_ratio, lambda, tol, standardize, intercept, maxit, lmu, a0, ca, ia,
307+
nin, fdev, alm, nlp, jerr)
308+
309+
null_dev = 0.0
310+
mu = mean(y)
311+
for i = 1:length(y)
312+
null_dev += abs2(null_dev-mu)
313+
end
314+
315+
@check_and_return
316+
end
286317

287318
function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
288319
family::Binomial;
@@ -330,6 +361,54 @@ function glmnet!(X::Matrix{Float64}, y::Matrix{Float64},
330361
null_dev = null_dev[1]
331362
@check_and_return
332363
end
364+
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Matrix{Float64},
365+
family::Binomial;
366+
offsets::Union{Vector{Float64},Void}=nothing,
367+
weights::Vector{Float64}=ones(size(y, 1)),
368+
alpha::Real=1.0,
369+
penalty_factor::Vector{Float64}=ones(size(X, 2)),
370+
constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
371+
dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
372+
lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
373+
lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
374+
intercept::Bool=true, maxit::Int=1000000, algorithm::Symbol=:newtonraphson)
375+
@validate_and_init
376+
size(y, 2) == 2 || error("glmnet for logistic models requires a two-column matrix with "*
377+
"counts of negative responses in the first column and positive "*
378+
"responses in the second")
379+
kopt = algorithm == :newtonraphson ? 0 :
380+
algorithm == :modifiednewtonraphson ? 1 :
381+
algorithm == :nzsame ? 2 : error("unknown algorithm ")
382+
offsets::Vector{Float64} = isa(offsets, @compat Void) ? zeros(size(y, 1)) : copy(offsets)
383+
length(offsets) == size(y, 1) || error("length of offsets must match length of y")
384+
385+
null_dev = Vector{Float64}(1)
386+
387+
# The Fortran code expects positive responses in first column, but
388+
# this convention is evidently unacceptable to the authors of the R
389+
# code, and, apparently, to us
390+
for i = 1:size(y, 1)
391+
a = y[i, 1]
392+
b = y[i, 2]
393+
y[i, 1] = b*weights[i]
394+
y[i, 2] = a*weights[i]
395+
end
396+
397+
ccall((:splognet_, libglmnet), Void,
398+
(Ref{Float64}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32},
399+
Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
400+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
401+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64},
402+
Ptr{Float64}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
403+
Ptr{Int32}, Ptr{Int32}),
404+
alpha, size(X, 1), size(X, 2), 1, X.nzval, X.colptr, X.rowval, y, copy(offsets),
405+
0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
406+
tol, standardize, intercept, maxit, kopt, lmu, a0, ca, ia, nin, null_dev, fdev,
407+
alm, nlp, jerr)
408+
409+
null_dev = null_dev[1]
410+
@check_and_return
411+
end
333412

334413
function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
335414
family::Poisson;
@@ -361,13 +440,49 @@ function glmnet!(X::Matrix{Float64}, y::Vector{Float64},
361440
null_dev = null_dev[1]
362441
@check_and_return
363442
end
443+
function glmnet!(X::SparseMatrixCSC{Float64,Int32}, y::Vector{Float64},
444+
family::Poisson;
445+
offsets::Union{Vector{Float64},Void}=nothing,
446+
weights::Vector{Float64}=ones(length(y)),
447+
alpha::Real=1.0,
448+
penalty_factor::Vector{Float64}=ones(size(X, 2)),
449+
constraints::Array{Float64, 2}=[x for x in (-Inf, Inf), y in 1:size(X, 2)],
450+
dfmax::Int=size(X, 2), pmax::Int=min(dfmax*2+20, size(X, 2)), nlambda::Int=100,
451+
lambda_min_ratio::Real=(length(y) < size(X, 2) ? 1e-2 : 1e-4),
452+
lambda::Vector{Float64}=Float64[], tol::Real=1e-7, standardize::Bool=true,
453+
intercept::Bool=true, maxit::Int=1000000)
454+
@validate_and_init
455+
null_dev = Vector{Float64}(1)
456+
457+
offsets::Vector{Float64} = isa(offsets, Void) ? zeros(length(y)) : copy(offsets)
458+
length(offsets) == length(y) || error("length of offsets must match length of y")
459+
460+
ccall((:spfishnet_, libglmnet), Void,
461+
(Ref{Float64}, Ref{Int32}, Ref{Int32}, Ptr{Float64}, Ptr{Int32}, Ptr{Int32},
462+
Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ptr{Float64}, Ptr{Float64},
463+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ptr{Float64}, Ref{Float64},
464+
Ref{Int32}, Ref{Int32}, Ref{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64},
465+
Ptr{Int32}, Ptr{Int32}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Int32},
466+
Ptr{Int32}),
467+
alpha, size(X, 1), size(X, 2), X.nzval, X.colptr, X.rowval, y, offsets, weights,
468+
0, penalty_factor, constraints, dfmax, pmax, nlambda, lambda_min_ratio, lambda,
469+
tol, standardize, intercept, maxit, lmu, a0, ca, ia, nin, null_dev, fdev, alm,
470+
nlp, jerr)
471+
472+
null_dev = null_dev[1]
473+
@check_and_return
474+
end
364475

365476
glmnet(X::Matrix{Float64}, y::Vector{Float64}, family::Distribution=Normal(); kw...) =
366477
glmnet!(copy(X), copy(y), family; kw...)
367478
glmnet(X::AbstractMatrix, y::AbstractVector, family::Distribution=Normal(); kw...) =
368479
glmnet(convert(Matrix{Float64}, X), convert(Vector{Float64}, y), family; kw...)
480+
glmnet(X::SparseMatrixCSC, y::AbstractVector, family::Distribution=Normal(); kw...) =
481+
glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Vector{Float64}, y), family; kw...)
369482
glmnet(X::Matrix{Float64}, y::Matrix{Float64}, family::Binomial; kw...) =
370483
glmnet!(copy(X), copy(y), family; kw...)
484+
glmnet(X::SparseMatrixCSC, y::AbstractMatrix, family::Binomial; kw...) =
485+
glmnet!(convert(SparseMatrixCSC{Float64,Int32}, X), convert(Matrix{Float64}, y), family; kw...)
371486
glmnet(X::Matrix, y::Matrix, family::Binomial; kw...) =
372487
glmnet(convert(Matrix{Float64}, X), convert(Matrix{Float64}, y), family; kw...)
373488

test/runtests.jl

+48
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,22 @@ path = glmnet(X, y)
8585
69.396354069847447,12.253877034216755,81.104769545494065,
8686
17.808632244707766]
8787

88+
# sparse
89+
path = glmnet(sparse(X), y)
90+
@test nactive(path.betas) == df_true
91+
@test path.dev_ratio dev_ratio
92+
@test path.lambda lambda
93+
@test path.a0 a0
94+
@test path.betas[:, models] betas
95+
@test predict(path, X, 16) [25.67451533276341,62.52776614976348,78.11952611080198,
96+
69.61492976841734,20.00478443784032,58.98418434043655,
97+
64.65391523535965,25.67451533276341,77.41080974893660,
98+
14.33505354291723]
99+
@test predict(path, X, 62) [9.688463955335449,65.513664866328625,89.537586892649699,
100+
84.299096349896985,5.711287928399321,61.686267113123805,
101+
69.396354069847447,12.253877034216755,81.104769545494065,
102+
17.808632244707766]
103+
88104
# Cross-validation
89105
cv = glmnetcv(X, y; folds=[1,1,1,1,2,2,2,3,3,3])
90106
@test cv.meanloss [1196.1831818915,1054.30217435069,882.722957995572,741.473677317198,
@@ -211,6 +227,22 @@ path = glmnet(X, convert(Matrix{Float64}, yl), Binomial())
211227
6.514278565882654,-5.352339338181535,10.448596815251006,
212228
-8.571939893775767]
213229

230+
# sparse
231+
path = glmnet(sparse(X), yl, Binomial())
232+
@test nactive(path.betas) == df_true
233+
@test path.dev_ratio dev_ratio
234+
@test path.lambda lambda
235+
@test path.a0 a0
236+
@test path.betas[:, models] betas
237+
@test predict(path, X, 16) [-1.315519391606169,1.722425698139390,3.007710159185589,
238+
2.306645907705844,-1.782895559259332,1.430315593356164,
239+
1.897691761009327,-1.315519391606169,2.949288138228943,
240+
-2.250271726912495]
241+
@test predict(path, X, 62) [-5.328152634222764,5.936301834664929,10.640103977849353,
242+
8.017225144937120,-6.891307125813680,5.068602350662909,
243+
6.514278565882654,-5.352339338181535,10.448596815251006,
244+
-8.571939893775767]
245+
214246
# Cross-validation
215247
cv = glmnetcv(X, yl, Binomial(); folds=[1,1,1,1,2,2,2,3,3,3])
216248
@test cv.meanloss [1.49234494227531,1.38615874813194,1.21206306885012,
@@ -356,6 +388,22 @@ path = glmnet(X, y, Poisson())
356388
path = glmnet([1 1; 2 2; 3 4], [0, 0, 1], Poisson())
357389
@test !any(isnan, GLMNet.loss(path, [1 1; 2 2; 4 4], [0, 0, 1]))
358390

391+
# sparse
392+
path = glmnet(sparse(X), y, Poisson())
393+
@test nactive(path.betas) == df_true
394+
@test path.dev_ratio dev_ratio
395+
@test path.lambda lambda
396+
@test path.a0 a0
397+
@test path.betas[:, models] betas
398+
@test predict(path, X, 16) [3.195043339285695,4.063275663211828,4.430604723334422,
399+
4.230243417813007,3.061469135604752,3.979791785911238,
400+
4.113365989592181,3.195043339285695,4.413907947874304,
401+
2.927894931923808]
402+
@test predict(path, X, 62) [2.108607974403907,4.125962319203899,4.481867295351227,
403+
4.492300995443095,2.410180465556811,4.082152789977005,
404+
4.183424906852268,2.381251991983247,4.446875861428943,
405+
2.829218161240957]
406+
359407
# Cross-validation
360408
cv = glmnetcv(X, y, Poisson(); folds=[1,1,1,1,2,2,2,3,3,3])
361409
@test cv.meanloss [29.7515432302351,26.9128224177608,23.2024175329887,

0 commit comments

Comments
 (0)