To activate conda (once installed) on ponderosa
source ~/anaconda3/bin/activate
Create conda env just for entropy (if it doesn't already exist)
conda create -n entropy
conda activate entropy
Either add the bioconda channel and install:
conda config --append channels bioconda
conda install popgen-entropy
Or install via:
conda install bioconda::popgen-entropy
As of 10/02/2024 this will install GSL 2.7, but entropy needs libgsl.so.25, which ships with GSL 2.6. Downgrade via:
conda install gsl=2.6
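To confirm the downgrade worked, you can check for the shared library directly (a minimal sketch; the path assumes a default ~/anaconda3 install):
ls ~/anaconda3/envs/entropy/lib/libgsl.so*
# libgsl.so.25 should appear in the listing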
Make an entropy directory within your project directory
mkdir entropy
cd PATH/entropy
cp PATH/KRLA.01.30.2.15.a70.i40.recode.vcf ./
The first script below produces a file named KRLA.01.30.2.15.a70.i40.recode.mpgl
Both scripts can be found on ponderosa in /mnt/parchman_lab/tfaske/denovo/src/perl_scripts/
perl PATH/vcf2mpgl_universal.pl KRLA.01.30.2.15.a70.i40.recode.vcf
perl PATH/gl2genest_universal.pl KRLA.01.30.2.15.a70.i40.recode.mpgl mean
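As a quick sanity check (assuming vcf2mpgl_universal.pl writes one line per locus, which it appears to since the header is added separately below), the .mpgl line count should match the SNP count of the VCF:
wc -l KRLA.01.30.2.15.a70.i40.recode.mpgl
grep -vc '^#' KRLA.01.30.2.15.a70.i40.recode.vcf # SNP count, excluding VCF header lines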
Do the following in R:
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(MASS)
library(LEA)
# change as appropriate
setwd('PATH')
makePopId <- function(fileIndv){
  # read the .indv file (one sample name per line) and pull out population
  # and individual IDs; assumes names of the form prefix_Population_ID
  PopIDdf = read.table(fileIndv, sep = "\t") %>%
    as.data.frame() %>%
    rename(All = V1) %>%
    mutate(Population = str_split_i(All, '_', 2),
           ID = str_split_i(All, '_', 3))
  return(PopIDdf)
}
PCA_entropy <- function(g){
  # center each locus, then scale by sqrt(p*(1-p)), where p = colmean/2
  # is the allele frequency (genotypes are 0-2 point estimates)
  colmean = apply(g, 2, mean, na.rm = T)
  normalize = matrix(nrow = nrow(g), ncol = ncol(g))
  af = colmean/2
  for (m in 1:length(af)){
    nr = g[,m] - colmean[m]
    dn = sqrt(af[m]*(1-af[m]))
    normalize[,m] = nr/dn
  }
  normalize[is.na(normalize)] = 0 # invariant or missing loci produce NaN; zero them out
  method1 = prcomp(normalize, scale. = F, center = F)
  pca_df = method1$x[,1:27] # keeps the first 27 PCs; adjust to your sample size
  return(pca_df)
}
Need the .indv output from vcftools. It will come with generating a .012 file.
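If you haven't generated those yet, a minimal vcftools sketch (the --out prefix is chosen so the .indv name matches the R call below):
vcftools --vcf KRLA.01.30.2.15.a70.i40.recode.vcf --012 --out KRLA.01.30.2.15.a70.i40.recode.vcf
# writes .012, .012.indv, and .012.pos files under the --out prefix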
PopID <- makePopId('KRLA.01.30.2.15.a70.i40.recode.vcf.012.indv')
g <- read.table('pntest_mean_KRLA.01.30.2.15.a70.i40.recode.txt', header = F)
pca_df <- PCA_entropy(t(g)) %>%
.[,1:10] %>%
cbind(PopID)
Do k-means clustering and LDA with the desired k-values
writeLDAfile <- function(pcaDF, k){
  # k-means on the first 5 PCs, then LDA with the clusters as groups;
  # the cross-validated posterior becomes the starting q file for entropy (-q)
  kCluster = kmeans(pcaDF[,1:5], k, iter.max = 10, nstart = 10, algorithm = 'Hartigan-Wong')
  ldaOut = lda(x = pcaDF[,1:5], grouping = kCluster$cluster, CV = T)
  write.table(round(ldaOut$posterior, 5),
              file = paste('ldak', as.character(k), '.txt', sep = ''),
              quote = F, row.names = F, col.names = F)
}
writeLDAfile(pca_df, 2)
writeLDAfile(pca_df, 3)
writeLDAfile(pca_df, 4)
writeLDAfile(pca_df, 5)
writeLDAfile(pca_df, 6)
writeLDAfile(pca_df, 7)
writeLDAfile(pca_df, 8)
Create a header for the .mpgl file
PopID_list <- paste(PopID$Population, PopID$ID, sep = '_')
header <- data.frame(dims = NA, PopID_list)
df <- t(header) # row 1 holds the dimensions, row 2 the individual IDs
dims <- paste(dim(g)[2], dim(g)[1], sep = " ") # number of individuals, number of loci
df[1,1] <- dims
write.table(df, 'entropy_header.txt',
            sep = " ", na = "",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
Back to terminal
cat entropy_header.txt KRLA.01.30.2.15.a70.i40.recode.mpgl > entropy.mpgl
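A quick check that the header landed as expected: the first line should be the dimensions written from R, the second line the individual IDs, followed by one line per locus:
head -n 2 entropy.mpgl | cut -c 1-80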
Example for running it on k2...
In Trevor's workflow, he runs 5 chains per k-value and averages. For POMA I ran 3 chains for k 2-7, which is still probably overkill.
Also, -r is only necessary if running different chains of the same k-value at the same time (i.e. you need different seeds to initialize the MCMC); otherwise the seed is set from the clock. Replace RANDOMINT with an integer of your choosing.
entropy -i entropy.mpgl -o entropy_k2_c1.hdf5 -r RANDOMINT -n 2 -l 60000 -b 10000 -t 10 -s 50 -e .01 -k 2 -q ldak2.txt -m 1 -w 0
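To run all k-values and chains, a hedged bash sketch using the same flags as above; bash's $RANDOM supplies a distinct seed per chain (adjust the k range, chain count, and any job scheduling for your system):
for k in {2..7}; do
  for c in 1 2 3; do
    entropy -i entropy.mpgl -o entropy_k${k}_c${c}.hdf5 -r $RANDOM \
      -n 2 -l 60000 -b 10000 -t 10 -s 50 -e .01 \
      -k $k -q ldak${k}.txt -m 1 -w 0
  done
done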
Using Python to work with estpost (run this in IPython/Jupyter; the ! lines are shell escapes)
import sys
import pandas as pd
import numpy as np
import scipy as sp
import glob
import re
import random
np.set_printoptions(precision=8) # increase numpy's printed float precision
pd.set_option("display.precision", 8) # same for pandas
hdf5_files = !find . -name '*hdf5'
hdf5_files = sorted(hdf5_files) # list.sort() returns None, so use sorted() here
hdf5_files
estpost = '~/anaconda3/envs/entropy/bin/estpost.entropy'
# make a DIC file for each chain
for f in hdf5_files:
    k = f.split('_')[1] # set this to match your file naming
    c = f.split('_')[2].split('.hdf5')[0]
    #print(k,c)
    dic = "DIC_%s_%s.txt" % (k, c)
    !$estpost $f -s 3 -p deviance > $dic
dic_files = !find . -name 'DIC*'
dic_files
for d in dic_files:
    !cat $d
    print('\n')
dic_list = []
for d in dic_files:
    k = d.split('_k')[1].split('_')[0] # set this to match your file naming
    c = d.split('_c')[1].split('.txt')[0]
    #print(k,c)
    dic = !grep 'DIC' $d
    dic = float(re.search(r'(\d+\.\d+)', str(dic)).group(0)) # raw string; escape the dot
    #print(dic)
    dic_list.append([k, dic, c])
dic_df = pd.DataFrame(dic_list,columns=['k','DIC','chain'])
dic_df.head()
dic_df.to_csv('dic_list.csv')
dic_sum = dic_df.groupby('k').describe().DIC
dic_sum.sort_values('mean')
dic_sum.to_csv('dic_sum.csv')
Generate ancestry coefficients. The glob passes every chain for a given k to estpost in one call, so the chains are combined.
# ancestry coefficients
!$estpost *k2*.hdf5 -p q -s 0 -o q2.txt
!$estpost *k3*.hdf5 -p q -s 0 -o q3.txt
!$estpost *k4*.hdf5 -p q -s 0 -o q4.txt
!$estpost *k5*.hdf5 -p q -s 0 -o q5.txt
!$estpost *k6*.hdf5 -p q -s 0 -o q6.txt
#MCMC diagnostics
!$estpost *k2*.hdf5 -p q -s 4 -o MCMC_k2.txt
!$estpost *k3*.hdf5 -p q -s 4 -o MCMC_k3.txt
!$estpost *k4*.hdf5 -p q -s 4 -o MCMC_k4.txt
!$estpost *k5*.hdf5 -p q -s 4 -o MCMC_k5.txt
!$estpost *k6*.hdf5 -p q -s 4 -o MCMC_k6.txt
Generate the genotype probability file for k2 (again, the glob passes all chains for that k)
!$estpost *k2*.hdf5 -p gprob -s 0 -o gprob2.txt
Can also generate a gprob file that averages gprobs across all k-values that were run
hdf5_files = []
num_k = [2, 3, 4, 5, 6]
num_c = 4
for k in num_k:
    for c in range(1, num_c + 1):
        f = '../entropy_k' + str(k) + '_c' + str(c) + '.hdf5'
        hdf5_files.append(f)
hdf5_files
gprob_cmd = estpost + ' ' + ' '.join(hdf5_files) + ' -p gprob -s 0 -o ../gprobAll.txt'
gprob_cmd
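gprob_cmd is only built above, not run; in the same IPython session it can be executed with the $-interpolation used earlier:
!$gprob_cmd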