- Author(s): Sara Bandres-Ciga, Mary B. Makarious, Monica Diez-Fairen
- Project: LIN28A Letter (IPDGC)
- PI(s): Mike Nalls, Andrew Singleton
- Collaborators: Cornelis Blauwendraat
- Date Last Updated: 21.04.2020
- Working Title: Assessment of LIN28A variants in Parkinson's disease
- Aim: To scrutinize whether LIN28A (LOF) mutations are causative for Parkinson's disease
- Brief Description: A loss-of-function LIN28A mutation has been reported to cause early onset Parkinson's disease. However, replication in a large cohort is needed. So we were inspired to look into the IPDGC + AMP PD cohorts
- Original Publication: https://www.ncbi.nlm.nih.gov/pubmed/31750563
- Generate
--mac 3
for statistics,--mac 1
for burdens
# Initializing some variables
## REMOVED paths to files
COV_NAMES="SEX,AGE,FAMILY_HISTORY,EDUCATION,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,STUDY_BioFIND,STUDY_PDBP,STUDY_PPMI"
module load plink #v1.9
module load samtools #v1.9
module load annovar #v2018-04-16
module load rvtests #v2.1.0
- BioFind, PDBP, and PPMI cohorts
--geno 0.05
- European ancestry
- 28195229 variants
- 2927 people total -
- 1647 cases, 1050 controls (230 missing phenotype)
# Create a working directory for the LIN28A paper
# Change into directory
cd $WORKING_DIR
# Extract the LIN28A region
# These are hg38 co-ordinates
# --mac 1
plink --bfile $AMP_PLINK \
--chr 1 --from-bp 26410817 --to-bp 26429728 \
--mac 1 \
--make-bed --out $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1
# Done 21.04.2020
# PLINK output:
# Total genotyping rate is 0.999946.
# 108 variants and 2927 people pass filters and QC.
# Among remaining phenotypes, 1647 are cases and 1050 are controls. (230 phenotypes are missing.)
# --mac 3
plink --bfile $AMP_PLINK \
--chr 1 --from-bp 26410817 --to-bp 26429728 \
--mac 3 \
--make-bed --out $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3
# wc -l LIN28A_WGS_AMP_PD_mac3.bim
# 83 LIN28A_WGS_AMP_PD_mac3.bim
# 83 variants total pass --mac 3
We run --model and --assoc to get the frequencies of the SNPs and distribution of alleles of the dataset
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3 --model \
--out LIN28A_WGS_AMP_PD_mac3_noCov
# File generated @ LIN28A_WGS_AMP_PD_mac3.model
# --model does not take covariates
# Shows distribution of hom/het/hom
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3 --assoc \
--pheno $PHENO \
--ci 0.95 \
--out LIN28A_WGS_AMP_PD_mac3_noCov
# File generated @ LIN28A_WGS_AMP_PD_mac3_noCov.assoc
# --assoc is a quick way to get frequencies
# Does not take covariates
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3 --logistic \
--pheno $PHENO \
--covar $COV_FILE \
--covar-name SEX,AGE,FAMILY_HISTORY,EDUCATION,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
--ci 0.95 \
--hide-covar \
--out LIN28A_WGS_AMP_PD_mac3_wCovs
# File generated @ LIN28A_WGS_AMP_PD_mac3_wCovs.assoc.logistic
# --logistic does take covariates but wasn't run here
# --hide-covar flag to remove each and every individual covariate test from the final output
# Adding study covariates with PLINK did not work... so were removed
# But did work for RVTests, see section 7
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3 --assoc fisher \
--pheno $PHENO \
--ci 0.95 \
--out LIN28A_WGS_AMP_PD_mac3
# File generated @ LIN28A_WGS_AMP_PD_mac3.assoc.fisher
# --assoc fisher does not take covariates
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1 \
--mac 1 \
--recode 'vcf-fid' --out $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3 \
--mac 3 \
--recode 'vcf-fid' --out $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3
# Done 21.04.2020
bgzip $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf
tabix -f -p vcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf.gz
# Done 21.04.2020
bgzip $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3.vcf
tabix -f -p vcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3.vcf.gz
# Done 21.04.2020
# For burdens, only need --mac 1
table_annovar.pl $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf.gz \
$ANNOVAR_DATA/hg38 --thread 20 -buildver hg38 \
-out $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.annovar \
-arg '-splicing 15',,, \
-remove -protocol refGene,ljb26_all,gnomad211_genome,clinvar_20190305 \
-operation g,f,f,f -nastring . -vcfinput -polish
# Remove the VCF
rm $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.annovar.hg38_multianno.vcf
# Done 21.04.2020
wc -l LIN28A_WGS_AMP_PD.annovar.hg38_multianno.txt
- 135 LIN28A_WGS_AMP_PD.annovar.hg38_multianno.txt
- Matches variant numbers above
head -1 LIN28A_WGS_AMP_PD_mac1.annovar.hg38_multianno.txt > header.txt
colct="$(wc -w header.txt| cut -f1 -d' ')"
cut -f1-$colct LIN28A_WGS_AMP_PD_mac1.annovar.hg38_multianno.txt > LIN28A.trimmed.annotation.txt
For burdens, only need --mac 1
- synonymous + non-synonymous
- From trimmed file
grep "exonic" $WORKING_DIR/LIN28A.trimmed.annotation.txt > $WORKING_DIR/CODING_regions.txt
cut -f 1,2,3,7 $WORKING_DIR/CODING_regions.txt > CODING.txt
# Information:
# wc -l CODING.txt
# 4 CODING.txt
# head CODING.txt
# 1 26411441 26411441 LIN28A
# 1 26425455 26425455 LIN28A
# 1 26426394 26426394 LIN28A
# 1 26426443 26426443 LIN28A
plink --bfile $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1 --extract range CODING.txt --make-bed --out $WORKING_DIR/LIN28A_exonic_AMP_PD
# Files generated @ LIN28A_exonic_AMP_PD.*
# Done 21.04.2020
plink --bfile $WORKING_DIR/LIN28A_exonic_AMP_PD --recode 'vcf-fid' --out $WORKING_DIR/LIN28A_exonic_AMP_PD
# Done 21.04.2020
# bgzip and tabix
bgzip $WORKING_DIR/LIN28A_exonic_AMP_PD.vcf
tabix -f -p vcf $WORKING_DIR/LIN28A_exonic_AMP_PD.vcf.gz
# Done 21.04.2020
## ALL VARIANTS ##
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf.gz \
--pheno $COV_FILE \
--pheno-name PHENO \
--covar $COV_FILE \
--covar-name $COV_NAMES \
--kernel skat,skato --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--out AMP_PD_BURDEN.LIN28A.NOFREQTHRESHOLD
# Files generated @ AMP_PD_BURDEN.LIN28A.NOFREQTHRESHOLD.*.assoc
# Done 21.04.2020
## CODING VARIANTS ##
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_exonic_AMP_PD.vcf.gz \
--pheno $COV_FILE --covar $COV_FILE \
--pheno-name PHENO \
--covar-name $COV_NAMES \
--kernel skat,skato --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--out AMP_PD_BURDEN.LIN28A.NOFREQTHRESHOLD_CODING
# Files generated @ AMP_PD_BURDEN.LIN28A.NOFREQTHRESHOLD_CODING.*.assoc
# Done 21.04.2020
## ALL VARIANTS ##
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf.gz \
--pheno $COV_FILE \
--pheno-name PHENO \
--covar $COV_FILE \
--covar-name $COV_NAMES \
--kernel skat,skato --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--freqUpper 0.01 --out AMP_PD_BURDEN.LIN28A.maf001
# Files generated @ AMP_PD_BURDEN.LIN28A.maf001.*.assoc
# Done 21.04.2020
## CODING VARIANTS ##
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_exonic_AMP_PD.vcf.gz \
--pheno $COV_FILE \
--pheno-name PHENO \
--covar $COV_FILE \
--covar-name $COV_NAMES \
--kernel skat,skato --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--freqUpper 0.01 --out AMP_PD_BURDEN.LIN28A.maf001_CODING
# Files generated @ AMP_PD_BURDEN.LIN28A.maf001_CODING.*.assoc
# Done 21.04.2020
## ALL VARIANTS ##
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac1.vcf.gz \
--pheno $COV_FILE \
--pheno-name PHENO \
--covar $COV_FILE \
--covar-name $COV_NAMES \
--kernel skat,skato --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--freqUpper 0.03 --out AMP_PD_BURDEN.LIN28A.maf003
# Files generated @ AMP_PD_BURDEN.LIN28A.maf003.*.assoc
# Done 21.04.2020
## CODING VARIANTS ##
rvtest --noweb --inVcf $WORKING_DIR/LIN28A_exonic_AMP_PD.vcf.gz \
--pheno $COV_FILE --covar $COV_FILE \
--pheno-name PHENO \
--covar-name $COV_NAMES \
--kernel skat,skato \
--geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--freqUpper 0.03 --out AMP_PD_BURDEN.LIN28A.maf003_CODING
# Files generated @ AMP_PD_BURDEN.LIN28A.maf003_CODING.*.assoc
# Done 21.04.2020
For statistics, need --mac 3: Reason being if there is one allele, then not enough information to do stats, it is assigned a 0 and results in an NA
# Tests done on single variants done on ALL variants
# Wald = same as logistic in PLINK, better for common variants
# score = better for rare variants
rvtest --noweb --hide-covar --inVcf $WORKING_DIR/LIN28A_WGS_AMP_PD_mac3.vcf.gz \
--pheno $COV_FILE \
--pheno-name PHENO \
--covar $COV_FILE \
--covar-name $COV_NAMES \
--single wald,score --geneFile /data/LNG/makariousmb/refFlat_hg38.txt \
--out AMP_PD_mac3.LIN28A.LOGISTIC.allPCs
# Files generated at AMP_PD_mac3.LIN28A.LOGISTIC.allPCs.*.assoc
# --hide-covar flag to remove each and every individual covariate test from the final output
# The variant line reported includes all PCs
# Done 21.04.2020
A. Getting Started
B. Subset PLINK Binaries + Convert to VCFs
C. Output Frequency with PLINK
D. Logistic Regression with PLINK
E. Annotate VCFs with Annovar + KGGSeq
F. RVTESTS Burden tests
mkdir $WORKING_DIR/
cd $WORKING_DIR/
mkdir hardcallsNoNeuroX hardcallsNoNeuroX/bin
mkdir hardcallsNoNeuroX/freq hardcallsNoNeuroX/score
mkdir hardcallsNoNeuroX/burden hardcallsNoNeuroX/vcf
mkdir hardcallsNoNeuroX/annotation hardcallsNoNeuroX/burden
mkdir hardcallsNoNeuroX/logistic hardcallsNoNeuroX/freq
# ---------Remove NeuroX + keep males + genotype quality of 15% or less missingness
plink --bfile $WORKING_DIR/HARDCALLS_PD_september_2018_no_cousins \
--remove-fam $WORKING_DIR/NeuroX.fID.txt \
--chr 1 --geno 0.15 --from-bp 26737269 --to-bp 26756213 \
--make-bed --out hardcallsNoNeuroX/bin/LIN28A.GWAS
plink --bfile hardcallsNoNeuroX/bin/LIN28A.GWAS --recode vcf --out hardcallsNoNeuroX/vcf/LIN28A.GWAS
cd hardcallsNoNeuroX/vcf
bgzip LIN28A.GWAS.vcf
tabix -f -p vcf LIN28A.GWAS.vcf.gz
cd $WORKING_DIR/
## Overall
plink --bfile $WORKING_DIR/HARDCALLS_PD_september_2018_no_cousins --chr 1 --from-bp 26737269 --to-bp 26756213 --remove-fam $WORKING_DIR/NeuroX.fID.txt --freq --geno 0.15 --out hardcallsNoNeuroX/freq/LIN28A
## Case-control
plink --bfile $WORKING_DIR/HARDCALLS_PD_september_2018_no_cousins --chr 1 --from-bp 26737269 --to-bp 26756213 --remove-fam $WORKING_DIR/NeuroX.fID.txt --freq case-control --geno 0.15 --out hardcallsNoNeuroX/freq/LIN28A
plink --bfile $WORKING_DIR/HARDCALLS_PD_september_2018_no_cousins \
--remove-fam $WORKING_DIR/NeuroX.fID.txt \
--chr 1 --from-bp 26737269 --to-bp 26756213 \
--geno 0.15 --covar $WORKING_DIR/IPDGC_all_samples_covariates.tab \
--covar-name sex,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,DUTCH,FINLAND,GERMANY,MCGILL,MF,NIA,OSLO,PROBAND,PROPARK,SHULMAN,SPAIN3,SPAIN4,TUBI,UK_GWAS,VANCE \
--assoc --out hardcallsNoNeuroX/logistic/LIN28A
mkdir $WORKING_DIR/annovar
# Downloading databases
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene humandb/
# annotate_variation.pl -buildver hg19 -downdb cytoBand humandb/
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar ensGene humandb/
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar exac03 humandb/
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar avsnp147 humandb/
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar dbnsfp30a humandb/
# annotate_variation.pl -buildver hg19 -downdb -webfrom annovar gnomad211_genome humandb/
table_annovar.pl $WORKING_DIR/hardcallsNoNeuroX/vcf/LIN28A.GWAS.vcf.gz $WORKING_DIR/annovar/humandb/ -buildver hg19 \
--thread 16 \
-out $WORKING_DIR/hardcallsNoNeuroX/annotation/LIN28A.annovar \
-remove -protocol avsnp147,refGene,ensGene,gnomad211_genome \
-operation f,g,g,f \
-nastring . \
-vcfinput
cd $WORKING_DIR/hardcallsNoNeuroX/annotation
head -1 LIN28A.annovar.hg19_multianno.txt > header.txt
colct="$(wc -w header.txt| cut -f1 -d' ')"
cut -f1-$colct LIN28A.annovar.hg19_multianno.txt > LIN28A.trimmed.annotation.txt
Note that here we do not adjust by AGE since the model does not fit in RVTests
cd $WORKING_DIR/LIN28A
rvtest --noweb --inVcf $WORKING_DIR/hardcallsNoNeuroX/vcf/LIN28A.GWAS.vcf.gz --pheno $WORKING_DIR/IPDGC_all_samples_covariates.vcf.tab --covar $WORKING_DIR/IPDGC_all_samples_covariates.vcf.tab --covar-name sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,DUTCH,FINLAND,GERMANY,MCGILL,MF,NIA,OSLO,PROBAND,PROPARK,SHULMAN,SPAIN3,SPAIN4,TUBI,UK_GWAS,VANCE --burden cmc,zeggini,mb,fp,cmcWald --kernel skat,skato --geneFile $WORKING_DIR/refFlat_hg19.txt --freqUpper 0.03 --out hardcallsNoNeuroX/burden/BURDEN.LIN28A.maf03