Skip to content

Latest commit

 

History

History
249 lines (218 loc) · 9.93 KB

local_clump.md

File metadata and controls

249 lines (218 loc) · 9.93 KB

记录如何本地clump

加载需要的包

library(TwoSampleMR)
library(data.table)
library(ieugwasr)
library(plinkbinr)  #这个包是帮你装个plink的,自己装plink也行,就是要注意运行时是否有权限

读取数据

gene_blood_exp_snp<-readRDS("gene_blood_exp_snp.rds") #eQTL数据
snp_epilepsy_out<-readRDS("snp_epilepsy_out.rds")   #你的gwas数据
exp_dat1<-gene_blood_exp_snp
genelist <- unique(exp_dat1$gene)
head(gene_blood_exp_snp)
 gene hgnc_symbol        SNP chr.exposure pos.exposure effect_allele.exposure other_allele.exposure eaf.exposure z.exposure
1 ENSG00000000938         FGR rs34806307            1     27958339                      T                     C   0.05812083    11.1724
2 ENSG00000000938         FGR  rs4908343            1     27931698                      G                     A   0.18086010     8.3407
3 ENSG00000000938         FGR rs12726763            1     27958245                      G                     A   0.94275336     7.9185
4 ENSG00000000938         FGR rs12749647            1     27895517                      G                     A   0.11428375     6.9687
5 ENSG00000000938         FGR rs72886321            1     27909007                      A                     C   0.01914212     6.5440
6 ENSG00000000938         FGR rs59900503            1     27907201                      C                     T   0.01919834     6.5196
  beta.exposure se.exposure pval.exposure id.exposure exposure         F
1    0.18878560  0.01689750    5.5656e-29       blood exposure 124.82252
2    0.08574590  0.01028042    7.3915e-17       blood exposure  69.56728
3    0.13488910  0.01703468    2.4059e-15       blood exposure  62.70264
4    0.08669946  0.01244127    3.2011e-12       blood exposure  48.56278
5    0.18905513  0.02888984    5.9975e-11       blood exposure  42.82394
6    0.18808058  0.02884848    7.0495e-11       blood exposure  42.50518

head(snp_epilepsy_out)
chr.outcome pos.outcome        SNP effect_allele.outcome other_allele.outcome eaf.outcome z.outcome beta.outcome  se.outcome outcome
1           6   130840091  rs2326918                     A                    G      0.8470    -0.279 -0.002586613 0.009271014 outcome
2           7   145771806  rs6977693                     T                    C      0.8587    -0.874 -0.008373962 0.009581192 outcome
3          11   100009976 rs12364336                     A                    G      0.8772    -1.506 -0.015313714 0.010168469 outcome
4           1   166367755 rs12562373                     A                    G      0.7678     0.760  0.006007180 0.007904185 outcome
5          14    86737556  rs2135099                     A                    G      0.1652    -0.779 -0.007000891 0.008987023 outcome
6           2   201527977 rs57502521                     A                    G      0.9649    -0.088 -0.001595890 0.018135114 outcome
  mr_keep.outcome pval.outcome pval_origin.outcome id.outcome
1            TRUE    0.7802448            inferred     AIkFiR
2            TRUE    0.3821183            inferred     AIkFiR
3            TRUE    0.1320672            inferred     AIkFiR
4            TRUE    0.4472546            inferred     AIkFiR
5            TRUE    0.4359797            inferred     AIkFiR
6            TRUE    0.9298767            inferred     AIkFiR

为了在后面不报错,我魔改了一下这个函数

修改部分主要是在fun2这个部分,报错主要也是集中在这个部分

Important

注意这个是服务器版本的,windows版的在下面

ld_clump_local <- function (dat, clump_kb, clump_r2, clump_p, bfile, plink_bin) {
  shell <- ifelse(Sys.info()["sysname"] == "Windows", "cmd", 
                  "sh")
  fn <- tempfile()
  write.table(data.frame(SNP = dat[["rsid"]], P = dat[["pval"]]), 
              file = fn, row.names = F, col.names = T, quote = F)
  fun2 <- paste0(shQuote(plink_bin, type = shell), " --bfile ", 
                 shQuote(bfile, type = shell), " --clump ", shQuote(fn, 
                                                                    type = shell), " --clump-p1 ", clump_p, " --clump-r2 ", 
                 clump_r2, " --clump-kb ", clump_kb, " --out ", shQuote(fn, 
                                                                        type = shell))
  system_output <- try(system(paste(fun2, "2>&1"), intern = TRUE))
  # Check for the specific warning message
  warning_message <- "Warning: No significant --clump results.  Skipping."
  if (warning_message %in% system_output) {
    message("Encountered the specific warning message: ", warning_message)
    return(data.frame()) # return an empty data frame
  } 
  res <- read.table(paste(fn, ".clumped", sep = ""), header = T)
  unlink(paste(fn, "*", sep = ""))
  y <- subset(dat, !dat[["rsid"]] %in% res[["SNP"]])
  if (nrow(y) > 0) {
    message("Removing ", length(y[["rsid"]]), " of ", nrow(dat), 
            " variants due to LD with other variants or absence from LD reference panel")
  }
  return(subset(dat, dat[["rsid"]] %in% res[["SNP"]]))
}

需要下载plink这个软件

https://www.cog-genomics.org/plink/ 根据自己的操作系统下载

需要下载LD 参考数据集

http://fileserve.mrcieu.ac.uk/ld/1kg.v3.tgz 要解压,里面有五个人群的,一般选欧洲

Windows本地版的

ld_clump_local<- function (dat, clump_kb, clump_r2, clump_p, bfile, plink_bin) {
  shell <- ifelse(Sys.info()["sysname"] == "Windows", "cmd", 
                  "sh")
  fn <- tempfile()
  write.table(data.frame(SNP = dat[["rsid"]], P = dat[["pval"]]), 
              file = fn, row.names = F, col.names = T, quote = F)
  fun2 <- paste0(shQuote(plink_bin, type = shell), " --bfile ", 
                 shQuote(bfile, type = shell), " --clump ", gsub('\\\\',"/",fn), " --clump-p1 ", clump_p, " --clump-r2 ", 
                 clump_r2, " --clump-kb ", clump_kb, " --out ", gsub('\\\\',"/",fn))
  system(fun2)
  res <- read.table(paste(fn, ".clumped", sep = ""), header = T)
  unlink(paste(fn, "*", sep = ""))
  y <- subset(dat, !dat[["rsid"]] %in% res[["SNP"]])
  if (nrow(y) > 0) {
    message("Removing ", length(y[["rsid"]]), " of ", nrow(dat), 
            " variants due to LD with other variants or absence from LD reference panel")
  }
  return(subset(dat, dat[["rsid"]] %in% res[["SNP"]]))
}

Important

还不行的话,最简单的方法是fun2修改成fun2 <- paste0(plink_bin, " --bfile ",bfile, " --clump ",fn," --clump-p1 ", clump_p, " --clump-r2 ", clump_r2, " --clump-kb ", clump_kb, " --out ",fn)

循环执行函数

reslist <- list() # 获取基因列表
for (i in genelist) {
  exp_dat <- exp_dat1[exp_dat1$gene == i,]     
  epilepsy <- snp_epilepsy_out[snp_epilepsy_out$SNP %in% exp_dat$SNP, ]
  exp_dat <- exp_dat[exp_dat$SNP %in% epilepsy$SNP, ]

  ld <- ld_clump_local(dplyr::tibble(rsid=exp_dat$SNP, pval=exp_dat$pval.exposure),
                       clump_r2=0.001,clump_kb=10000,clump_p=1, 
                       bfile = "/GPUFS/gyfyy_jxhe_1/User/zyt/mr_gy/0127/1000genomes/EUR", 
                       plink_bin = plinkbinr::get_plink_exe())  #这个是天河服务器版本的,你要改自己的路径哦
  exp_dat <- exp_dat[exp_dat$SNP %in% ld$rsid, ]

if (nrow(exp_dat) == 0) {
  next
}       #这个是为了当没有数据时执行下一个循环

dat <- harmonise_data(exposure_dat=exp_dat, outcome_dat=epilepsy)
if (nrow(dat) == 0) {
  next
}
  res <- mr(dat)     #执行mr
  reslist[[i]] <- res   #把结果写入列表
  
}

result <- data.frame()   
for (i in names(reslist)) {
  if (nrow(reslist[[i]]) == 0) {
    next
  }    #当结果为空时跳到下一个循环
  r1 <- reslist[[i]]
  r1$gene  <-  i
  result <- rbind(result,r1)
}

我有一计可以多线程运行

Important

函数需要根据自己项目的不同进行修改哦 数据名和上面的稍微有点区别,其实也就加了个format_data,应该很好理解

process_gene <- function(i) {
  cough1 <- cough[cough$Gene == i,]    #这个是eqtl文件
  cough1 <- cough1[cough1$p<5E-08]
  if (nrow(cough1) == 0) {
    return(NULL)
  }
  exp_dat <- format_data(
    dat=cough1,
    type = 'exposure',
    snp_col = "SNP",
    beta_col = "b",
    se_col = "SE",
    eaf_col = "Freq",
    effect_allele_col = "A1",
    other_allele_col = "A2",
    # samplesize_col = "all_meta_sample_N",
    pval_col = "p",
    chr_col = "Chr",
    pos_col = "BP"
  )
  
  asthma2 <- disease[disease$rsid %in% exp_dat$SNP, ]     #这个是你的gwas文件
  if (nrow(asthma2) == 0) {
    return(NULL)
  }
  asthma <- format_data(
    dat=asthma2,
    type = "outcome",
    header = TRUE,
    snps = exp_dat$SNP,
    snp_col = "rsid",
    beta_col = "inv_var_meta_beta",
    se_col = "inv_var_meta_sebeta",
    eaf_col = "all_meta_AF",
    effect_allele_col = "ALT",
    other_allele_col = "REF",
    # samplesize_col = "all_meta_sample_N",
    pval_col = "inv_var_meta_p",
    chr_col = "CHR",
    pos_col = "POS"
  )
  
  exp_dat <- exp_dat[exp_dat$SNP %in% asthma$SNP, ]
  
  
  ld <- ld_clump_local(dplyr::tibble(rsid=exp_dat$SNP, pval=exp_dat$pval.exposure),
                       clump_r2=0.001,clump_kb=10000,clump_p=1, bfile = "./GWAS_data/1000genomic/EUR", plink_bin = "./lin/projects/bin/plink")   #执行本地clump
  exp_dat <- exp_dat[exp_dat$SNP %in% ld$rsid, ]
  
  if (nrow(exp_dat) == 0) {
    return(NULL)
  }
  
  dat <- harmonise_data(exposure_dat=exp_dat, outcome_dat=asthma)
  if (nrow(dat) == 0) {
    return(NULL)
  }
  res <- mr(dat)
  
  return(res)
}

运行函数

library(parallel)
# 您可以根据您的机器的核心数调整mc.cores的值
reslist <- mclapply(genelist, process_gene, mc.cores = 8)

# 清理结果,移除NULL元素
reslist <- reslist[!sapply(reslist, is.null)]

到这里这个list就是结果啦,自己转成dataframe哦,给你们举个例子差不多是这样写,自己去改哈哈哈

result <- data.frame()
for (i in 1:length(reslist)) {
  result <- rbind(result,c(reslist[[i]]$summary,reslist[[i]]$gene))
}