# ========================================================================================================
# plsda: perform a PLS-DA
# this function is a particular setting of internal_mint.block; the formatting of the input is checked in internal_wrapper.mint
# ========================================================================================================
#' Partial Least Squares Discriminant Analysis (PLS-DA).
#'
#' Function to perform standard Partial Least Squares regression to classify
#' samples.
#'
#' The \code{plsda} function fits PLS models with \eqn{1,...,}\code{ncomp}
#' components to the factor or class vector \code{Y}. The appropriate indicator
#' matrix is created (see the sketch below).
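#'
#' As an illustration only (the function itself relies on the internal
#' \code{unmap} helper, as in the source below), the indicator matrix codes
#' each class of \code{Y} as a 0/1 dummy column:
#' \preformatted{
#' Y <- factor(c("A", "B", "A", "C"))
#' Y.mat <- model.matrix(~ Y - 1)   # one 0/1 column per class
#' colnames(Y.mat) <- levels(Y)
#' }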
#'
#' Logratio transformation and multilevel analysis are performed sequentially as
#' internal pre-processing steps, through \code{\link{logratio.transfo}} and
#' \code{\link{withinVariation}} respectively. The logratio transformation can
#' only be applied if the data do not contain any zero value (for count data, we
#' thus advise normalising the raw data with a 1 offset, as sketched below).
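#'
#' For count data, a minimal sketch of this advised pre-processing (object names
#' are illustrative only):
#' \preformatted{
#' X <- raw.counts + 1   # offset so no zero remains before the CLR transformation
#' result <- plsda(X, Y, ncomp = 2, logratio = "CLR")
#' }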
#'
#' The type of deflation used is \code{'regression'} for discriminant algorithms,
#' i.e. no deflation is performed on Y.
#'
#' @inheritParams pls
#' @param Y a factor or a class vector for the discrete outcome.
#' @param multilevel sample information for multilevel decomposition for
#' repeated measurements. A numeric matrix or data frame indicating the
#' repeated measures on each individual, i.e. the individual IDs. See examples
#' in \code{?splsda}.
#' @return \code{plsda} returns an object of class \code{"plsda"}, a list that
#' contains the following components:
#'
#' \item{X}{the centered and standardized original predictor matrix.}
#' \item{Y}{the centered and standardized indicator response vector or matrix.}
#' \item{ind.mat}{the indicator matrix.} \item{ncomp}{the number of components
#' included in the model.} \item{variates}{list containing the \code{X} and
#' \code{Y} variates.} \item{loadings}{list containing the estimated loadings
#' associated with each component/variate. The loading weights, multiplied by the
#' deflated (residual) matrix, give the variate.} \item{loadings.star}{list
#' containing the estimated loadings associated with each component/variate. The
#' loading weights are projected so that, when multiplied by the original
#' matrix, we obtain the variate.} \item{names}{list containing the names to be
#' used for individuals and variables.} \item{nzv}{list containing the zero- or
#' near-zero predictors information.} \item{tol}{the tolerance used in the
#' iterative algorithm, used for subsequent S3 methods} \item{max.iter}{the
#' maximum number of iterations, used for subsequent S3 methods}
#' \item{iter}{Number of iterations of the algorithm for each component}
#' \item{prop_expl_var}{The proportion of the variance explained by each
#' variate / component divided by the total variance in the \code{data} (after
#' removing the possible missing values) using the definition of 'redundancy'.
#' Note that, contrary to \code{PCA}, this amount may not decrease in the
#' following components, as the aim of the method is not to maximise the
#' variance but the covariance between data sets (including the dummy matrix
#' representation of the outcome variable in the case of supervised
#' approaches).} \item{mat.c}{matrix of coefficients from the regression of X /
#' residual matrices X on the X-variates, to be used internally by
#' \code{predict}.} \item{defl.matrix}{residual matrices X for each dimension.}
#' @author Ignacio González, Kim-Anh Lê Cao, Florian Rohart, Al J Abadi
#' @seealso \code{\link{splsda}}, \code{\link{summary}},
#' \code{\link{plotIndiv}}, \code{\link{plotVar}}, \code{\link{predict}},
#' \code{\link{perf}}, \code{\link{mint.block.plsda}},
#' \code{\link{block.plsda}} and http://mixOmics.org for more details.
#' @references On PLSDA: Barker M and Rayens W (2003). Partial least squares
#' for discrimination. \emph{Journal of Chemometrics} \bold{17}(3), 166-173.
#' Perez-Enciso, M. and Tenenhaus, M. (2003). Prediction of clinical outcome
#' with microarray data: a partial least squares discriminant analysis (PLS-DA)
#' approach. \emph{Human Genetics} \bold{112}, 581-592. Nguyen, D. V. and
#' Rocke, D. M. (2002). Tumor classification by partial least squares using
#' microarray gene expression data. \emph{Bioinformatics} \bold{18}, 39-50. On
#' log ratio transformation: Filzmoser, P., Hron, K., Reimann, C.: Principal
#' component analysis for compositional data with outliers. Environmetrics
#' 20(6), 621-632 (2009) Lê Cao K.-A., Costello ME, Lakis VA, Bartolo F, Chua
#' XY, Brazeilles R, Rondeau P. MixMC: Multivariate insights into Microbial
#' Communities. PLoS ONE, 11(8): e0160169 (2016). On multilevel decomposition:
#' Westerhuis, J.A., van Velzen, E.J., Hoefsloot, H.C., Smilde, A.K.:
#' Multivariate paired data analysis: multilevel plsda versus oplsda.
#' Metabolomics 6(1), 119-128 (2010) Liquet, B., Lê Cao K.-A., Hocini, H.,
#' Thiebaut, R.: A novel approach for biomarker selection and the integration
#' of repeated measures experiments from two assays. BMC bioinformatics 13(1),
#' 325 (2012)
#' @keywords regression multivariate
#' @export
#' @example ./examples/plsda-examples.R
plsda <- function(X,
                  Y,
                  ncomp = 2,
                  scale = TRUE,
                  tol = 1e-06,
                  max.iter = 100,
                  near.zero.var = FALSE,
                  logratio = c("none", "CLR"),
                  multilevel = NULL,
                  all.outputs = TRUE)
{
    #-- argument validation --#
    # most of the checks are done in the internal_wrapper.mint function
    if (is.null(multilevel))
    {
        if (is.null(Y))
            stop("'Y' has to be something other than NULL.")
        if (is.null(dim(Y)))
        {
            Y = factor(Y)
        } else {
            stop("'Y' should be a factor or a class vector.")
        }
        if (nlevels(Y) == 1)
            stop("'Y' should be a factor with more than one level.")
        Y.mat = unmap(Y)
        colnames(Y.mat) = levels(Y)
    } else {
        # we expect a vector or a 2-column matrix in 'Y' and the repeated measurements in 'multilevel'
        multilevel = data.frame(multilevel)
        if ((nrow(X) != nrow(multilevel)))
            stop("unequal number of rows in 'X' and 'multilevel'.")
        if (ncol(multilevel) != 1)
            stop("'multilevel' should have a single column for the repeated measurements, other factors should be included in 'Y'.")
        if (!is.null(ncol(Y)) && !ncol(Y) %in% c(0, 1, 2)) # multilevel 1 or 2 factors
            stop("'Y' should either be a factor, a single column data.frame containing a factor, or a 2-columns data.frame containing 2 factors.")
        multilevel = data.frame(multilevel, Y)
        multilevel[, 1] = as.numeric(factor(multilevel[, 1])) # we want numbers for the repeated measurements
        Y.mat = NULL
    }
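    # At this point (illustrative summary): in the multilevel case, 'multilevel'
    # holds the numeric individual IDs in its first column and the outcome
    # factor(s) from 'Y' in the remaining column(s); the indicator matrix is then
    # created downstream by internal_wrapper.mint, hence Y.mat = NULL here.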
    logratio <- match.arg(logratio)
    # call to 'internal_wrapper.mint'
    result = internal_wrapper.mint(
        X = X,
        Y = Y.mat,
        ncomp = ncomp,
        scale = scale,
        near.zero.var = near.zero.var,
        mode = 'regression',
        max.iter = max.iter,
        tol = tol,
        logratio = logratio,
        multilevel = multilevel,
        DA = TRUE,
        all.outputs = all.outputs
    )
    # choose the desired output from 'result'
    out = list(
        call = match.call(),
        X = result$A[-result$indY][[1]],
        Y = if (is.null(multilevel))
        {
            Y
        } else {
            result$Y.factor
        },
        ind.mat = result$A[result$indY][[1]],
        ncomp = result$ncomp,
        mode = result$mode,
        variates = result$variates,
        loadings = result$loadings,
        loadings.star = result$loadings.star,
        names = result$names,
        tol = result$tol,
        iter = result$iter,
        max.iter = result$max.iter,
        nzv = result$nzv,
        scale = scale,
        logratio = logratio,
        prop_expl_var = result$prop_expl_var,
        input.X = result$input.X,
        mat.c = result$mat.c
    )
    class(out) = c("mixo_plsda", "mixo_pls", "DA")
    # output if multilevel analysis
    if (!is.null(multilevel))
    {
        out$multilevel = multilevel
        class(out) = c("mixo_mlplsda", class(out))
    }
    return(invisible(out))
}
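
# ========================================================================================================
# Minimal usage sketch (illustrative only, kept as comments; assumes the
# 'breast.tumors' data set shipped with mixOmics):
#   data(breast.tumors)
#   X <- breast.tumors$gene.exp
#   Y <- breast.tumors$sample$treatment
#   res <- plsda(X, Y, ncomp = 2)
#   plotIndiv(res, ind.names = FALSE, legend = TRUE)
# ========================================================================================================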