-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsitewise_recode_constant_sites.py
executable file
·91 lines (78 loc) · 3.69 KB
/
sitewise_recode_constant_sites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
#
# sitewise_recode_constant_sites.py created 2018-02-02
'''sitewise_recode_constant_sites.py last modified 2018-03-12
recode constant sites as null from site-wise RAxML or phylobayes tabular output
sitewise_recode_constant_sites.py -a matrix1.aln -l RAxML_perSiteLLs.matrix1.tab > RAxML_perSiteLLs.matrix1_w_const.tab
matrix can be in alternate formats (use -f), and gzipped
generate the tabular sitewise results using:
sitewise_ll_to_columns.py RAxML_perSiteLLs.matrix1 > RAxML_perSiteLLs.matrix1.tab
tabular likelihood output (-l) is assumed to look like:
site T1 T2 ...
1 -12.345 -12.456 ...
'''
import sys
import argparse
import time
import gzip
from collections import Counter
from Bio import AlignIO
def determine_constant_sites(fullalignment, alignformat):
'''read large alignment, and return a dict where keys are constant positions'''
if fullalignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
print >> sys.stderr, "# reading alignment {} as gzipped".format(fullalignment), time.asctime()
else: # otherwise assume normal open
opentype = open
print >> sys.stderr, "# reading alignment {}".format(fullalignment), time.asctime()
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
numtaxa = len(alignedseqs)
al_length = alignedseqs.get_alignment_length()
print >> sys.stderr, "# Alignment contains {} taxa for {} sites, including gaps".format( numtaxa, al_length )
constsites = {}
print >> sys.stderr, "# determining constant sites", time.asctime()
for i in range(al_length):
alignment_column = alignedseqs[:,i] # all letters per site
numgaps = alignment_column.count("-")
nogap_alignment_column = alignment_column.replace("-","").replace("X","") # excluding gaps
num_nogap_taxa = len(nogap_alignment_column)
aa_counter = Counter( nogap_alignment_column )
mostcommonaa = aa_counter.most_common(1)[0][0]
if len(aa_counter)==1: # meaning site has more than 1 possible AA, so use lnL
constsites[i+1] = True # use index plus 1 to match positions
mostcommoncount = aa_counter.most_common(1)[0][1]
print >> sys.stderr, "# Counted {} constant sites".format( len(constsites) )
return constsites
def read_tabular_ln(lntabular, constsites, wayout):
'''read tabular log-likelihood results and print a modified table where constants sites are recoded'''
linecounter = 0
recodecount = 0
print >> sys.stderr, "# Reading log-likelihood by site from {}".format(lntabular), time.asctime()
for line in open(lntabular,'r'):
line = line.strip()
if line: # ignore blank lines
linecounter += 1
if linecounter < 2:
print >> wayout, line
continue
lsplits = line.split('\t')
pos = int(lsplits[0]) # sites begin at 1
if constsites.get(pos, False): # meaning site is constant, so recode
recodecount += 1
recodedsites = ["const"] * len(lsplits[1:])
print >> wayout, "{}\t{}".format( pos, "\t".join(recodedsites) )
else:
print >> wayout, line
print >> sys.stderr, "# Read {} lines, recoded {} sites".format( linecounter,recodecount ), time.asctime()
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-l','--log-likelihood', help="tabular log-likelihood data file from RAxML")
args = parser.parse_args(argv)
constsitedict = determine_constant_sites(args.alignment, args.format)
read_tabular_ln(args.log_likelihood, constsitedict, wayout)
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)