Skip to content

Commit

Permalink
some tests to save
Browse files Browse the repository at this point in the history
  • Loading branch information
privefl committed Jan 20, 2025
1 parent 725eb15 commit 74f7004
Show file tree
Hide file tree
Showing 5 changed files with 563 additions and 0 deletions.
68 changes: 68 additions & 0 deletions tmp-save/maybe-fortuto-qcss.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
library(testthat)
library(bigsnpr)

bedfile <- file.path(tempdir(), "tmp-data/public-data3.bed")
if (!file.exists(rdsfile <- sub_bed(bedfile, ".rds"))) {
zip <- tempfile(fileext = ".zip")
download.file(
"https://github.com/privefl/bigsnpr/blob/master/data-raw/public-data3.zip?raw=true",
destfile = zip, mode = "wb")
unzip(zip, exdir = tempdir())
rds <- snp_readBed(bedfile)
expect_identical(normalizePath(rds), normalizePath(rdsfile))
}

obj.bigSNP <- snp_attach(rdsfile)
G <- obj.bigSNP$genotypes
y <- obj.bigSNP$fam$affection
POS2 <- obj.bigSNP$map$genetic.dist + 1000 * obj.bigSNP$map$chromosome

sumstats <- bigreadr::fread2(file.path(tempdir(), "tmp-data/public-data3-sumstats.txt"))
sumstats$n_eff <- sumstats$N
map <- setNames(obj.bigSNP$map[-3], c("chr", "rsid", "pos", "a1", "a0"))
df_beta <- snp_match(sumstats, map, join_by_pos = FALSE)

ind_var <- df_beta$`_NUM_ID_`
corr0 <- snp_cor_extendedThr(
G, ind.col = ind_var, thr_r2 = 0.2,
infos.pos = POS2[ind_var], size = 1 / 1000, ncores = 2)
corr <- as_SFBM(corr0)

maf <- snp_MAF(G, ind.col = ind_var)
sd0 <- sqrt(2 * maf * (1 - maf))

expect_equal(
snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 2),
snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 2)
)
#
# expect_equal(
# snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 1),
# snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 1, ncores = 2)
# )
# 1 iter is fine
# 2 iter is fine with ncores = 1
# any is fine with ncores = 1
# the problem comes from ncores = 2 and max_run > 1

# test0 <- replicate(10, snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 2))
# #18378 / 10129 / 19061 is TRUE

expect_equal(
test1 <- snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 2, ncores = 3),
test2 <- snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, max_run = 2, ncores = 3)
)

# only r2_max is the same
# thr are all 0.2 / keep are all TRUE

expect_equal(
test3 <- snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, ncores = 3),
test4 <- snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, ncores = 3)
)

plot(lengths(test1$ind_imp), lengths(test2$ind_imp))

res <- snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, ncores = 2)
expect_false(any(res$rm_qc, na.rm = TRUE))
expect_equal(snp_qcimp_sumstats(corr, df_beta, sd0 = sd0, ncores = 2), res)
21 changes: 21 additions & 0 deletions tmp-tests/test-gwas-combi.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
library(bigsnpr)
obj <- snp_attachExtdata()
G <- obj$genotypes
set.seed(1)
y <- rnorm(nrow(G))
w <- runif(5); w <- w / sum(w)
G_w <- G[, 1:5] %*% w

sapply(1:5, function(j) {
summary(lm(y ~ G[, j]))$coef[2, 1:3]
})

gwas <- big_univLinReg(G, y, ind.col = 1:5)
gwas$score

(Z_w <- crossprod(gwas$score, w))
summary(mylm <- lm(y ~ G_w))$coef # 0.570 instead of 0.266
# crossprod(scale(G_w), scale(y)) / (sd(mylm$residuals) * sqrt(nrow(G)))

var <- big_colstats(G, ind.col = 1:5)$var
(beta_w <- crossprod(gwas$estim * var, w) / var(G_w)) # both 0.0739
137 changes: 137 additions & 0 deletions tmp-tests/test-gwas-multipop.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
library(bigsnpr)

celiac <- snp_attach("../Dubois2010_data/FinnuncorrNLITUK1UK3hap300_QC_norel.rds")
dim(G <- celiac$genotypes) # 15155 x 281122
CHR <- celiac$map$chromosome
POS <- celiac$map$physical.pos
y <- celiac$fam$affection - 1
NCORES <- nb_cores()

pop <- snp_getSampleInfos(celiac, "../Dubois2010_data/FinnNLITUK1UK3.clusterv2")[[1]]
pop2 <- c("Netherlands", "Italy", "UK1", "UK2", "Finland")[pop]
table(pop2, exclude = NULL)
# Finland Italy Netherlands UK1 UK2
# 2436 1035 1623 3325 6736

big_counts(G, ind.col = 1:10)
G2 <- snp_fastImputeSimple(G, method = "mean2", ncores = NCORES)

obj.svd <- snp_autoSVD(G2, CHR, POS, ncores = NCORES)
plot(obj.svd)
plot(obj.svd, type = "scores", scores = 1:6, coef = 0.6)
PC <- predict(obj.svd)[, 1:6]

pop_centers <- bigutilsr::geometric_median(PC, by_grp = pop2)

ldist_to_centers <- apply(pop_centers, 1, function(center) {
log(rowSums(sweep(PC, 2, center, '-')^2))
})
hist(ldist_to_centers[pop2 != "Finland", "Finland"])
hist(ldist_to_centers[pop2 == "Italy", "Italy"])
hist(ldist_to_centers[pop2 != "Italy", "Italy"])
hist(ldist_to_centers[pop2 != "UK2", "UK2"])

ind_NW <- which(ldist_to_centers[, "UK2"] < 6)
ind_Fin <- which(ldist_to_centers[, "Finland"] < 7)
ind_It <- which(ldist_to_centers[, "Italy"] < 6)

run_GWAS <- function(ind) {
big_univLinReg(G2, y.train = y[ind], ind.train = ind,
covar.train = cbind(celiac$fam$sex, PC)[ind, ],
ncores = NCORES)
}

gwas_NW <- run_GWAS(ind_NW)
gwas_Fin <- run_GWAS(ind_Fin)
gwas_It <- run_GWAS(ind_It)

plot_grid(
snp_manhattan(gwas_NW, CHR, POS, npoints = 20e3) +
ggplot2::scale_y_log10(),
snp_manhattan(gwas_Fin, CHR, POS, npoints = 20e3) +
ggplot2::scale_y_log10(),
snp_manhattan(gwas_It, CHR, POS, npoints = 20e3) +
ggplot2::scale_y_log10(),
ncol = 1
)

ind_sig <- which(predict(gwas_NW, log10 = FALSE) < 1e-20)
plot(gwas_NW$estim[ind_sig], gwas_Fin$estim[ind_sig]); abline(0, 1, col = "red")
plot(gwas_NW$estim[ind_sig], gwas_It$estim[ind_sig]); abline(0, 1, col = "red")

reg <- deming::deming(gwas_NW$estim[ind_sig] ~ gwas_Fin$estim[ind_sig] + 0,
xstd = gwas_Fin$std.err[ind_sig], ystd = gwas_NW$std.err[ind_sig])
reg

deming::deming(gwas_NW$estim[ind_sig] ~ gwas_It$estim[ind_sig] + 0,
xstd = gwas_It$std.err[ind_sig], ystd = gwas_NW$std.err[ind_sig])


ind_chr6 <- which(CHR == 6)
POS2 <- snp_asGeneticPos(CHR[ind_chr6], POS[ind_chr6])
corr_chr6_NW <- snp_cor(G, ind.row = ind_NW, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)
corr_chr6_Fin <- snp_cor(G, ind.row = ind_Fin, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)
corr_chr6_It <- snp_cor(G, ind.row = ind_It, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)

ind <- Matrix::which(corr_chr6_NW^2 > 0.1 & corr_chr6_Fin^2 > 0.1, arr.ind = TRUE)
ind2 <- ind[ind[, 1] > ind[, 2], ]
plot(corr_chr6_Fin[ind2], corr_chr6_NW[ind2], pch = 20); abline(0, 1, col = "red")


ind3 <- Matrix::which(corr_chr6_NW^2 > 0.1 & corr_chr6_It^2 > 0.1, arr.ind = TRUE)
ind4 <- ind3[ind3[, 1] > ind3[, 2], ]
plot(corr_chr6_It[ind4], corr_chr6_NW[ind4], pch = 20); abline(0, 1, col = "red")

plot(corr_chr6_It[ind4], corr_chr6_Fin[ind4], pch = 20); abline(0, 1, col = "red")


ind_UK2 <- which(ldist_to_centers[, "UK2"] < 5 & pop2 == "UK2")

gwas_UK1 <- run_GWAS(ind_UK1)
gwas_UK2 <- run_GWAS(ind_UK2)

plot_grid(
snp_manhattan(gwas_UK1, CHR, POS, npoints = 20e3) +
ggplot2::scale_y_log10(),
snp_manhattan(gwas_UK2, CHR, POS, npoints = 20e3) +
ggplot2::scale_y_log10(),
ncol = 1
)
corr_chr6_UK2 <- snp_cor(G, ind.row = ind_UK2, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)
plot(corr_chr6_UK1[ind2], corr_chr6_UK2[ind2], pch = 20); abline(0, 1, col = "red")


ind_Net <- which(ldist_to_centers[, "Netherlands"] < 5 & pop2 == "Netherlands")
corr_chr6_Net <- snp_cor(G, ind.row = ind_Net, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)
plot(corr_chr6_Net[ind2], corr_chr6_UK2[ind2], pch = 20); abline(0, 1, col = "red")


ind_UK1 <- sample(which(ldist_to_centers[, "UK1"] < 5 & pop2 == "UK1"), length(ind_Net))
corr_chr6_UK1 <- snp_cor(G, ind.row = ind_UK1, ind.col = ind_chr6,
infos.pos = POS2, size = 3 / 1000, ncores = NCORES)
plot(corr_chr6_UK1[ind2], corr_chr6_UK2[ind2], pch = 20); abline(0, 1, col = "red")


ind_max <- which.min(predict(gwas_NW))
ind5 <- ind_max + c(-4:2, 4, 10)
plot_grid(
snp_manhattan(gwas_NW[ind5, ], CHR[ind5], POS[ind5]) +
ggplot2::scale_y_log10(),
snp_manhattan(gwas_Fin[ind5, ], CHR[ind5], POS[ind5]) +
ggplot2::scale_y_log10(),
snp_manhattan(gwas_It[ind5, ], CHR[ind5], POS[ind5]) +
ggplot2::scale_y_log10(),
ncol = 1
)

ind6 <- match(ind5, ind_chr6)
options(max.print = 200)
rbind(
corr_chr6_NW[ind6, ind6[5]],
corr_chr6_Fin[ind6, ind6[5]],
corr_chr6_It[ind6, ind6[5]])
Loading

0 comments on commit 74f7004

Please sign in to comment.