Skip to content

Commit

Permalink
add snp_projectSelfPCA()
Browse files Browse the repository at this point in the history
  • Loading branch information
privefl committed Sep 24, 2024
1 parent 9355f9b commit 5c0cc1a
Show file tree
Hide file tree
Showing 13 changed files with 173 additions and 24 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Encoding: UTF-8
Package: bigsnpr
Type: Package
Title: Analysis of Massive SNP Arrays
Version: 1.12.15
Date: 2024-09-20
Version: 1.12.16
Date: 2024-09-24
Authors@R: c(
person("Florian", "Privé", email = "[email protected]", role = c("aut", "cre")),
person("Michael", "Blum", role = "ths"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ export(snp_plinkKINGQC)
export(snp_plinkQC)
export(snp_plinkRmSamples)
export(snp_prodBGEN)
export(snp_projectSelfPCA)
export(snp_pruning)
export(snp_qq)
export(snp_readBGEN)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## bigsnpr 1.12.16

- Add function `snp_projectSelfPCA()` (only `bed_projectSelfPCA()` existed).

## bigsnpr 1.12.14

- Add a new `min.maf = 0.02` parameter to `snp_autoSVD()` and `bed_autoSVD()`. Then variants are now discarded when they have either a small MAC or a small MAF.
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ prod_bgen2 <- function(filename, offsets, XY, Y, ind_row, decode, dosage, N, max
.Call(`_bigsnpr_prod_bgen2`, filename, offsets, XY, Y, ind_row, decode, dosage, N, max_size, ncores)
}

prod_and_rowSumsSq2 <- function(BM, ind_row, ind_col, center, scale, V) {
.Call(`_bigsnpr_prod_and_rowSumsSq2`, BM, ind_row, ind_col, center, scale, V)
}

read_bgen <- function(filename, offsets, BM, ind_row, ind_col, decode, dosage, N, ncores) {
.Call(`_bigsnpr_read_bgen`, filename, offsets, BM, ind_row, ind_col, decode, dosage, N, ncores)
}
Expand Down
82 changes: 69 additions & 13 deletions R/bed-projectPCA.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ part_prod <- function(X, ind, ind.row, ind.col, center, scale, V, XV, X_norm) {

################################################################################

OADP_proj <- function(XV, X_norm, sval, ncores) {
bigparallelr::split_parapply(function(XV, X_norm, sval, ind) {
bigutilsr::pca_OADP_proj2(XV[ind, , drop = FALSE], X_norm[ind], sval)
}, ind = rows_along(XV), XV = XV, X_norm = X_norm, sval = sval,
.combine = "rbind", ncores = ncores)
}

################################################################################

#' Projecting PCA
#'
#' Computing and projecting PCA of reference dataset to a target dataset.
Expand Down Expand Up @@ -195,16 +204,16 @@ bed_projectSelfPCA <- function(obj.svd, obj.bed, ind.row,

big_parallelize(
obj.bed$light,
p.FUN = part_prod,
ind = seq_along(ind.col),
ncores = ncores,
p.FUN = part_prod,
ind = seq_along(ind.col),
ncores = ncores,
ind.row = ind.row,
ind.col = ind.col,
center = obj.svd$center,
scale = obj.svd$scale,
V = obj.svd$v,
XV = XV,
X_norm = X_norm
center = obj.svd$center,
scale = obj.svd$scale,
V = obj.svd$v,
XV = XV,
X_norm = X_norm
)

list(
Expand All @@ -216,11 +225,58 @@ bed_projectSelfPCA <- function(obj.svd, obj.bed, ind.row,

################################################################################

OADP_proj <- function(XV, X_norm, sval, ncores) {
bigparallelr::split_parapply(function(XV, X_norm, sval, ind) {
bigutilsr::pca_OADP_proj2(XV[ind, , drop = FALSE], X_norm[ind], sval)
}, ind = rows_along(XV), XV = XV, X_norm = X_norm, sval = sval,
.combine = "rbind", ncores = ncores)
part_prod2 <- function(X, ind, ind.row, ind.col, center, scale, V, XV, X_norm) {

res <- prod_and_rowSumsSq2(
BM = X,
ind_row = ind.row,
ind_col = ind.col[ind],
center = center[ind],
scale = scale[ind],
V = V[ind, , drop = FALSE]
)

big_increment(XV, res[[1]], use_lock = TRUE)
big_increment(X_norm, res[[2]], use_lock = TRUE)
}

################################################################################

#' @rdname bed_projectSelfPCA
#'
#' @param G The [FBM.code256][bigstatsr::FBM.code256-class] that was used to
#' compute `obj.svd`.
#'
#' @export
snp_projectSelfPCA <- function(obj.svd, G, ind.row,
ind.col = attr(obj.svd, "subset"),
ncores = 1) {

check_args()
assert_lengths(rows_along(obj.svd$v), ind.col)

X_norm <- FBM(length(ind.row), 1, init = 0)
XV <- FBM(length(ind.row), ncol(obj.svd$v), init = 0)

big_parallelize(
G,
p.FUN = part_prod2,
ind = seq_along(ind.col),
ncores = ncores,
ind.row = ind.row,
ind.col = ind.col,
center = obj.svd$center,
scale = obj.svd$scale,
V = obj.svd$v,
XV = XV,
X_norm = X_norm
)

list(
obj.svd.ref = obj.svd,
simple_proj = XV[, , drop = FALSE],
OADP_proj = OADP_proj(XV, X_norm, obj.svd$d, ncores = ncores)
)
}

################################################################################
7 changes: 3 additions & 4 deletions R/clumping.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ snp_pruning <- function(G, infos.chr,

#' Long-range LD regions
#'
#' 34 long-range Linkage Disequilibrium (LD) regions for the human genome
#' based on some [wiki table](https://goo.gl/0Ou7uI).
#' 34 long-range Linkage Disequilibrium (LD) regions for the human genome based on
#' [this wiki table](https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)).
#'
#' @format A data frame with 34 rows (regions) and 4 variables:
#' - `Chr`: region's chromosome
Expand All @@ -167,8 +167,7 @@ snp_pruning <- function(G, infos.chr,
"LD.wiki34"

#' @param LD.regions A `data.frame` with columns "Chr", "Start" and "Stop".
#' Default use the table of 34 long-range LD regions that you can find
#' [there](https://goo.gl/0Ou7uI).
#' Default use [LD.wiki34].
#'
#' @import foreach
#' @export
Expand Down
6 changes: 5 additions & 1 deletion docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/LD.wiki34.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/bed_projectSelfPCA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/snp_clumping.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,22 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// prod_and_rowSumsSq2
List prod_and_rowSumsSq2(Environment BM, const IntegerVector& ind_row, const IntegerVector& ind_col, const NumericVector& center, const NumericVector& scale, const NumericMatrix& V);
RcppExport SEXP _bigsnpr_prod_and_rowSumsSq2(SEXP BMSEXP, SEXP ind_rowSEXP, SEXP ind_colSEXP, SEXP centerSEXP, SEXP scaleSEXP, SEXP VSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type ind_row(ind_rowSEXP);
Rcpp::traits::input_parameter< const IntegerVector& >::type ind_col(ind_colSEXP);
Rcpp::traits::input_parameter< const NumericVector& >::type center(centerSEXP);
Rcpp::traits::input_parameter< const NumericVector& >::type scale(scaleSEXP);
Rcpp::traits::input_parameter< const NumericMatrix& >::type V(VSEXP);
rcpp_result_gen = Rcpp::wrap(prod_and_rowSumsSq2(BM, ind_row, ind_col, center, scale, V));
return rcpp_result_gen;
END_RCPP
}
// read_bgen
List read_bgen(std::string filename, const NumericVector& offsets, const Environment& BM, const IntegerVector& ind_row, const IntegerVector& ind_col, const RawVector& decode, bool dosage, int N, int ncores);
RcppExport SEXP _bigsnpr_read_bgen(SEXP filenameSEXP, SEXP offsetsSEXP, SEXP BMSEXP, SEXP ind_rowSEXP, SEXP ind_colSEXP, SEXP decodeSEXP, SEXP dosageSEXP, SEXP NSEXP, SEXP ncoresSEXP) {
Expand Down Expand Up @@ -605,6 +621,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_bigsnpr_multLinReg", (DL_FUNC) &_bigsnpr_multLinReg, 5},
{"_bigsnpr_extract_submat_bgen", (DL_FUNC) &_bigsnpr_extract_submat_bgen, 8},
{"_bigsnpr_prod_bgen2", (DL_FUNC) &_bigsnpr_prod_bgen2, 10},
{"_bigsnpr_prod_and_rowSumsSq2", (DL_FUNC) &_bigsnpr_prod_and_rowSumsSq2, 6},
{"_bigsnpr_read_bgen", (DL_FUNC) &_bigsnpr_read_bgen, 9},
{"_bigsnpr_readbina", (DL_FUNC) &_bigsnpr_readbina, 3},
{"_bigsnpr_readbina2", (DL_FUNC) &_bigsnpr_readbina2, 5},
Expand Down
45 changes: 45 additions & 0 deletions src/project-utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/******************************************************************************/

#include <bigstatsr/BMCodeAcc.h>

using namespace Rcpp;

/******************************************************************************/

// the one for bed files is in bed-fun.cpp

// [[Rcpp::export]]
List prod_and_rowSumsSq2(Environment BM,
const IntegerVector& ind_row,
const IntegerVector& ind_col,
const NumericVector& center,
const NumericVector& scale,
const NumericMatrix& V) {

XPtr<FBM> xpBM = BM["address"];
SubBMCode256Acc macc(xpBM, ind_row, ind_col, BM["code256"], 1);

int n = macc.nrow();
int m = macc.ncol();
myassert_size(m, V.rows());
myassert_size(m, center.size());
myassert_size(m, scale.size());
int K = V.cols();

NumericMatrix XV(n, K);
NumericVector rowSumsSq(n);

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
double x = (macc(i, j) - center[j]) / scale[j];
rowSumsSq[i] += x*x;
for (int k = 0; k < K; k++) {
XV(i, k) += x * V(j, k);
}
}
}

return List::create(XV, rowSumsSq);
}

/******************************************************************************/
8 changes: 8 additions & 0 deletions tests/testthat/test-2-pca-project.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ pred2 <- unlist(by(proj$OADP_proj[ind.test, 2:3], pop[ind.test], geom_med))
expect_gt(sum(ref^2), sum(pred1^2))
expect_lt(sum((ref - pred2)^2), sum((ref - pred1)^2))

G <- snp_attach(snp_readBed(obj.bed$bedfile, backingfile = tempfile()))$genotypes
proj.2 <- snp_projectSelfPCA(obj.svd, G, ind.row = ind.test,
ind.col = cols_along(G), ncores = 2)
expect_identical(proj.2$obj.svd.ref, proj$obj.svd.ref)
expect_equal(proj.2$simple_proj, proj$simple_proj[ind.test, ])
expect_equal(proj.2$OADP_proj, proj$OADP_proj[ind.test, ])


proj2 <- bed_projectPCA(obj.bed, obj.bed,
ind.row.new = ind.test,
ind.row.ref = ind.row,
Expand Down

0 comments on commit 5c0cc1a

Please sign in to comment.