-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathentrez_qiime.py
executable file
·505 lines (371 loc) · 21.8 KB
/
entrez_qiime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
#!/usr/bin/python
# Author: Chris Baker
# Email: [email protected]
# The original version of this script was written while the author was in the
# Pierce Lab, Department of Organismic and Evolutionary Biology, Harvard University
# This update, which allows the script to work with NCBI accession.version numbers instead
# of GI numbers, was written while the author was in the Pringle and Tarnita Labs,
# Department of Ecology and Evolutionary Biology, Princeton University
# Last modified: 6 November 2017
#################################################
import os, argparse
from textwrap import dedent
from time import strftime
from time import localtime
from sys import exit
from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles
from cogent.parse.ncbi_taxonomy import NcbiTaxonNode
from cogent.core.tree import TreeNode
#################################################
# parse command line options
parser = argparse.ArgumentParser(description='Generates QIIME-compatible files from a suitable FASTA file or list of accession numbers, plus the NCBI taxonomy database.')
# Exactly one of -i / -L must be supplied (mutually exclusive, required).
inputfilepath_args = parser.add_mutually_exclusive_group(required=True)
# Path including filename for input FASTA file. Expects deflines that look like ">XX######.# ..." where "XX######.# " is the NCBI accession.version number that uniquely identifies the sequence, and "..." is arbitrary text. Does NOT work with other defline formats, including the old deflines with the format ">gi|######|...". For files with these old-style deflines see v.1.0 of this script.
inputfilepath_args.add_argument('-i', '--inputfasta',
    metavar='path/file.ext',
    dest='infile_fasta_path',
    action='store',
    help='The path, including filename, of an input FASTA file. Expects deflines that look like ">XX######.# ..." where "XX######.# " is the NCBI accession.version number that uniquely identifies the sequence, and "..." is arbitrary text. Does NOT work with other defline formats, including the old deflines with the format ">gi|######|...". [REQUIRED, unless an accession number list is supplied with -L or --List. If both are supplied, the script fails.]')
# The path, including filename, of an input accession number list as an alternative to an input FASTA file.
inputfilepath_args.add_argument('-L', '--List',
    metavar='path/file.ext',
    dest='infile_list_path',
    action='store',
    help='The path, including filename, of an input accession number list (one accession number per line) as an alternative to supplying an input FASTA file. [REQUIRED, unless a FASTA file is supplied with -i or --inputfasta. If both are supplied, the script fails.]')
# Path to output file (defaults are derived from the input file path if omitted).
parser.add_argument('-o', '--outputfile',
    metavar='path/file.ext',
    dest='outfile_path',
    action='store',
    help='The path, including filename, for the accession-number-to-taxonomy output file. Will be created if it does not exist. Defaults to same directory as input FASTA or list file, with file name derived from the input file\'s name, if not supplied here.')
# Path for output log file (same defaulting behavior as -o).
parser.add_argument('-g', '--outputlog',
    metavar='path/file.ext',
    dest='logfile_path',
    action='store',
    help='The path, including filename, for the output log file. Will be created if it does not exist. Defaults to same directory as input FASTA or list file, with file name derived from the input file\'s name, if not supplied here.')
# Force overwrite if output file already exists? (Otherwise a timestamp is appended.)
parser.add_argument('-f', '--force',
    dest='force_overwrite',
    action='store_true',
    default=False,
    help='If -f or --force option are passed, output file will overwrite any existing file with the same name. Without this option, output filename will be modified by the addition of a date/time string in the event that a file with the same name already exists. [Default: False]')
# Directory containing the NCBI taxdump files (nodes.dmp etc., from taxdump.tar.gz).
parser.add_argument('-n', '--nodes',
    metavar='path/',
    dest='ncbi_taxonomy_dir',
    action='store',
    default='./',
    help='The directory where the files nodes.dmp, names.dmp, merged.dmp and delnodes.dmp are located. The files are typically downloaded from the NCBI in the compressed archive taxdump.tar.gz. [Default: ./]')
# Filename and path for NCBI accession number - taxid list (e.g. from nucl_gb.accession2taxid.gz)
parser.add_argument('-a', '--acc2taxid',
    metavar='path/file.ext',
    dest='infile_acc2taxid_path',
    action='store',
    default='./nucl_gb.accession2taxid',
    help='The path, including filename, of the (uncompressed) input accession number - taxid file, typically downloaded from the NCBI as nucl_gb.accession2taxid.gz and uncompressed to nucl_gb.accession2taxid. [Default: ./nucl_gb.accession2taxid]')
# Comma-separated list of taxonomic ranks to retain in the output lineage strings.
parser.add_argument('-r', '--ranks',
    metavar='rank1,rank2,...',
    dest='output_ranks',
    action='store',
    default='phylum,class,order,family,genus,species',
    help=dedent('''\
A comma-separated list (no spaces) of the taxonomic ranks you want to keep in the taxon names output. Ranks can be provided in any order, but names output will be generated in that order, so should typically be in descending order. Any taxonomic names assigned to one of the ranks in the list will be retained; other names will be discarded. If a taxon has no name for one of the ranks in the list, then NA will be inserted in the output. No sequences or taxa will be discarded -- this option only affects the names that are output for each taxon or sequence. The following ranks are available in the NCBI taxonomy database as at 2 June 2012 (in alphabetical order):
class,
family,
forma,
genus,
infraclass,
infraorder,
kingdom,
no_rank,
order,
parvorder,
phylum,
species,
species_group,
species_subgroup,
subclass,
subfamily,
subgenus,
subkingdom,
suborder,
subphylum,
subspecies,
subtribe,
superclass,
superfamily,
superkingdom,
superorder,
superphylum,
tribe,
varietas.
Note the use of underscores in three of the names. You should include these on the command line but they will be replaced by spaces when the script runs so that the names match the NCBI ranks. [Default: phylum,class,order,family,genus,species]'''))
# Module-level parse: running this file always consumes sys.argv.
args = parser.parse_args()
#################################################
def main():
    """Top-level driver: validate inputs, build the taxonomy lookups, and
    write the accession-to-taxonomy mapping plus a log file."""
    args.infile_nodesdmp_path, args.infile_namesdmp_path, args.infile_mergeddmp_path, \
        args.infile_delnodesdmp_path = input_files()
    args.outfile_path, args.logfile_path = output_files()
    # Turn the comma-separated -r option into a list of rank names.
    # Underscores are replaced with SPACES so that names such as 'no_rank'
    # and 'species_group' match the NCBI rank strings 'no rank' and
    # 'species group' (as documented in the -r help text).
    # Bug fix: the previous replace('_', ',') turned each multi-word rank
    # into two bogus comma-separated ranks.
    args.output_ranks = args.output_ranks.replace('_', ' ').split(',')
    accessions_in_input_file = get_accessions_from_input_file()
    ncbi_full_taxonomy = NcbiTaxonomyFromFiles(open(args.infile_nodesdmp_path),
                                               open(args.infile_namesdmp_path))
    merged_taxids = get_merged_nodes()
    deleted_taxids = get_deleted_nodes()
    included_nodes, taxid, missing_accessions = obtain_nodes_for_each_accession(
        accessions_in_input_file, args.infile_acc2taxid_path,
        ncbi_full_taxonomy, merged_taxids, deleted_taxids)
    taxid_taxonomy, missing_taxonomy = generate_taxonid_taxonomy(
        included_nodes, ncbi_full_taxonomy, args.output_ranks)
    generate_output_files(args.outfile_path, taxid_taxonomy, taxid,
                          missing_accessions, missing_taxonomy)
    log_file = open(args.logfile_path, 'a')
    log_file.write('Finished running entrez_qiime.py at '
                   + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
#################################################
def input_files():
    """Validate that all required input files exist, exiting with a message
    on the first missing one.

    Checks the four NCBI taxdump files in args.ncbi_taxonomy_dir, the input
    FASTA file (-i) or accession list (-L), and the acc2taxid file (-a).

    Returns:
        (nodes.dmp path, names.dmp path, merged.dmp path, delnodes.dmp path)
    """
    if not os.path.isdir(args.ncbi_taxonomy_dir):
        # (message typo fixed: previously ended with a doubled close-paren)
        exit('Error: path supplied via -n does not appear to be a valid directory (' + args.ncbi_taxonomy_dir + ')')
    # All four taxdump files are checked the same way; collect their paths.
    dmp_paths = []
    for dmp_name in ('nodes.dmp', 'names.dmp', 'merged.dmp', 'delnodes.dmp'):
        dmp_path = os.path.join(args.ncbi_taxonomy_dir, dmp_name)
        if not os.path.isfile(dmp_path):
            exit('Error: cannot find file ' + dmp_name + ' at ' + dmp_path)
        dmp_paths.append(dmp_path)
    if args.infile_fasta_path is not None:
        if not os.path.isfile(args.infile_fasta_path):
            exit('Error: cannot find input FASTA file at ' + args.infile_fasta_path)
    elif args.infile_list_path is not None:
        if not os.path.isfile(args.infile_list_path):
            exit('Error: cannot find input accession number list at ' + args.infile_list_path)
    else:
        # Should be unreachable: argparse requires one of -i / -L.
        # Exit (was a Python-2 print statement) rather than continue.
        exit("Need to supply either FASTA file or accession number list as input.")
    if not os.path.isfile(args.infile_acc2taxid_path):
        exit('Error: cannot find input accession number - TaxonID file at ' + args.infile_acc2taxid_path)
    return (dmp_paths[0],
            dmp_paths[1],
            dmp_paths[2],
            dmp_paths[3])
#################################################
def output_files():
    """Work out the output and log file paths, create their directories if
    needed, resolve name clashes, and initialize the log file.

    Paths not supplied on the command line (or supplied as a directory) are
    derived from the input FASTA/list filename. Existing files are either
    deleted (with -f/--force) or avoided by appending a timestamp.

    Returns:
        (outfile_path, logfile_path)
    """
    timestamp = strftime("%Y%m%d%H%M%S", localtime())
    # Base directory/name for defaults come from whichever input was given.
    if args.infile_fasta_path is not None:
        inputfile_dir = os.path.dirname(args.infile_fasta_path)
        # rstrip in case there are multiple periods at the end of the filename
        inputfile_filename = os.path.splitext(os.path.basename(args.infile_fasta_path))[0].rstrip(".")
    else:
        inputfile_dir = os.path.dirname(args.infile_list_path)
        # rstrip in case there are multiple periods at the end of the filename
        inputfile_filename = os.path.splitext(os.path.basename(args.infile_list_path))[0].rstrip(".")
    if args.outfile_path is None:
        # No output filepath supplied: save to the input directory with a
        # name derived from the input filename.
        outfile_path = os.path.join(inputfile_dir, inputfile_filename + '_accession_taxonomy.txt')
    elif os.path.isdir(args.outfile_path):
        # Supplied output filepath is actually a directory: construct the
        # filename from the input filename.
        outfile_path = os.path.join(args.outfile_path, inputfile_filename + '_accession_taxonomy.txt')
    else:
        # Supplied path is not an existing directory: use it as the file path.
        outfile_path = args.outfile_path
    if args.logfile_path is None:
        # Same defaulting rules as for the output file, with a .log suffix.
        logfile_path = os.path.join(inputfile_dir, inputfile_filename + '.log')
    elif os.path.isdir(args.logfile_path):
        logfile_path = os.path.join(args.logfile_path, inputfile_filename + '.log')
    else:
        logfile_path = args.logfile_path
    # Bare filenames have no dirname; anchor them to the current directory so
    # the directory-existence checks below have something to test.
    if not os.path.dirname(outfile_path):
        outfile_path = './' + outfile_path
    if not os.path.dirname(logfile_path):
        logfile_path = './' + logfile_path
    # Create output directories if they do not exist yet.
    if not os.path.exists(os.path.dirname(outfile_path)):
        os.makedirs(os.path.dirname(outfile_path))
    if not os.path.exists(os.path.dirname(logfile_path)):
        os.makedirs(os.path.dirname(logfile_path))
    # Existing files: delete them under --force, otherwise rename the paths
    # we are about to use by inserting a timestamp before the extension.
    if args.force_overwrite:
        if os.path.isfile(outfile_path):
            os.unlink(outfile_path)
        if os.path.isfile(logfile_path):
            os.unlink(logfile_path)
    else:
        if os.path.isfile(outfile_path):
            outfile_path = os.path.splitext(outfile_path)[0] + '_' + timestamp + os.path.splitext(outfile_path)[1]
        if os.path.isfile(logfile_path):
            logfile_path = os.path.splitext(logfile_path)[0] + '_' + timestamp + os.path.splitext(logfile_path)[1]
    # If the two paths collide, disambiguate with different extensions.
    if outfile_path == logfile_path:
        outfile_path = outfile_path + '.txt'
        logfile_path = logfile_path + '.log'
    # Initiate the log file (truncating any stale content).
    with open(logfile_path, 'w') as log_file:
        log_file.write('Log file for entrez_qiime.py run ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
        log_file.write('Using entrez_qiime v.2.0' + '\n')
    return (outfile_path, logfile_path)
#################################################
def get_accessions_from_input_file():
    """Return a dict whose keys are the accession numbers in the input file.

    If a FASTA file was supplied (-i), the accession is the first
    whitespace-delimited token of each '>' defline (the leading '>' is
    dropped). If an accession list was supplied (-L), each non-blank line is
    taken as one accession. A dict is used as a set for O(1) membership tests.
    """
    included_accessions = {}
    if args.infile_fasta_path is not None:
        # 'with' guarantees the file is closed even if parsing fails.
        with open(args.infile_fasta_path, 'r') as input_file:
            for curr_line in input_file:
                if curr_line.startswith('>'):
                    # ">XX######.# description" -> "XX######.#"
                    accession = curr_line.split(' ', 1)[0][1:].rstrip()
                    included_accessions[accession] = True
    else:
        with open(args.infile_list_path, 'r') as input_file:
            for curr_line in input_file:
                accession = curr_line.rstrip()
                if accession:  # skip blank lines
                    included_accessions[accession] = True
    log_file = open(args.logfile_path, 'a')
    log_file.write('get_accessions_from_input_file() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
    return included_accessions
#################################################
def get_merged_nodes():
    """Read merged.dmp and return a dict mapping old TaxonID -> new TaxonID.

    merged.dmp is pipe-delimited: the first field is the retired taxid, the
    second is the taxid it was merged into. Both are kept as strings to match
    the taxids read from the acc2taxid file.
    """
    merged = {}
    with open(args.infile_mergeddmp_path, 'r') as mergeddmp:
        for curr_line in mergeddmp:
            # Split once per line (the original split the same line twice).
            fields = curr_line.split('|')
            merged[fields[0].strip()] = fields[1].strip()
    log_file = open(args.logfile_path, 'a')
    log_file.write('get_merged_nodes() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
    return (merged)
#################################################
def get_deleted_nodes():
    """Read delnodes.dmp and return a dict whose keys are deleted TaxonIDs.

    delnodes.dmp is pipe-delimited with the deleted taxid as the first field;
    the dict is used as a set for membership tests.
    """
    deleted = {}
    with open(args.infile_delnodesdmp_path, 'r') as delnodesdmp:
        for curr_line in delnodesdmp:
            deleted[curr_line.split('|')[0].strip()] = True
    log_file = open(args.logfile_path, 'a')
    log_file.write('get_deleted_nodes() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
    return (deleted)
#################################################
def obtain_nodes_for_each_accession(accessions_in_input_file, infile_acc2taxid_path,
                                    ncbi_full_taxonomy, merged_taxids, deleted_taxids):
    """Resolve each input accession to a TaxonID and collect taxonomy nodes.

    Streams the acc2taxid file once. For each accession present in the input,
    its taxid is looked up in ncbi_full_taxonomy; taxids that fail the lookup
    are remapped via merged.dmp, or reassigned to the root (taxid 1) when
    deleted or unknown, with each remap recorded in the log file.

    Returns:
        included_nodes: list of taxonomy node objects, one per distinct taxid
        taxid: dict mapping accession -> taxid (string, or 1 for root)
        missing_accessions: dict keyed by accessions absent from acc2taxid
    """
    included_nodes = []
    node_numbers = {}        # taxids already processed (set semantics)
    taxid = {}               # accession -> resolved taxid
    failed_taxids = {}       # taxids that could not be resolved at all
    missing_accessions = {}  # input accessions never seen in acc2taxid
    with open(infile_acc2taxid_path, 'r') as acc2taxid:
        next(acc2taxid)  # discard the header line
        for curr_line in acc2taxid:
            # acc2taxid columns: accession, accession.version, taxid, gi
            curr_accession, curr_taxid = curr_line.rstrip().split('\t')[1:3]
            if curr_accession in accessions_in_input_file:
                if curr_taxid not in node_numbers:
                    try:
                        # Probe only -- the single append happens below.
                        # Bug fix: the node used to be appended both here and
                        # after the remap, so every successfully-found node
                        # appeared twice in included_nodes.
                        ncbi_full_taxonomy[curr_taxid]
                    except KeyError:
                        old_taxid = curr_taxid
                        if curr_taxid in merged_taxids:
                            curr_taxid = merged_taxids[curr_taxid]
                            message = ('The following TaxonID was changed:\n acc2taxid gives accession='
                                       + str(curr_accession) + ' with taxid=' + str(old_taxid)
                                       + ' --> changed to taxid=' + str(curr_taxid) + '.\n')
                        elif curr_taxid in deleted_taxids:
                            curr_taxid = 1  # assigns to root
                            message = ('The following accession number was assigned to a deleted TaxonID:\n acc2taxid gives accession='
                                       + str(curr_accession) + ' with taxid=' + str(old_taxid)
                                       + ' --> changed to taxid=' + str(curr_taxid) + '.\n')
                        else:
                            curr_taxid = 1  # assigns to root
                            message = ('The following accession number was assigned to a TaxonID that could not be found:\n acc2taxid gives accession='
                                       + str(curr_accession) + ' with taxid=' + str(old_taxid)
                                       + ' --> changed to taxid=' + str(curr_taxid) + '.\n')
                        log_file = open(args.logfile_path, 'a')
                        log_file.write(message)
                        log_file.close()
                    node_numbers[curr_taxid] = True
                    try:
                        included_nodes.append(ncbi_full_taxonomy[curr_taxid])
                    except KeyError:
                        failed_taxids[curr_taxid] = True
                        log_file = open(args.logfile_path, 'a')
                        log_file.write('The following taxid could not be added to included_nodes: taxid=' + str(curr_taxid) + '.\n')
                        log_file.close()
                taxid[curr_accession] = curr_taxid
    # Determine any accession numbers from the input FASTA or list file that
    # do not have taxid information.
    for curr_accession in accessions_in_input_file:
        if curr_accession not in taxid:
            missing_accessions[curr_accession] = True
    log_file = open(args.logfile_path, 'a')
    log_file.write('obtain_nodes_for_each_accession() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
    return included_nodes, taxid, missing_accessions
#################################################
def generate_taxonid_taxonomy(included_nodes, ncbi_full_taxonomy, output_ranks):
    """Build a TaxonID -> lineage mapping for every node in included_nodes.

    Each lineage is a list parallel to output_ranks; a rank with no name in
    the node's ancestry is left as 'NA'. The walk up the tree stops early
    when it reaches an ancestor whose lineage was already computed, copying
    the cached names into any still-unfilled ranks.

    Returns:
        taxid_taxonomy: dict mapping TaxonID -> lineage list
        missing_taxonomy: the all-'NA' lineage used for accessions without
            TaxonID information
    """
    taxid_taxonomy = {}
    ranks_lookup = dict([(r, idx) for idx, r in enumerate(output_ranks)])
    for node in included_nodes:
        lineage = ['NA'] * len(ranks_lookup)
        curr = node
        lineage_complete = False
        while lineage_complete is False:
            if curr.Rank in ranks_lookup:
                lineage[ranks_lookup[curr.Rank]] = curr.Name
            curr = curr.Parent
            if curr is None:
                # Walked past the root: lineage is as complete as it gets.
                lineage_complete = True
            elif curr.TaxonId in taxid_taxonomy:
                # Reuse the cached ancestor lineage for any unfilled ranks.
                cached = taxid_taxonomy[curr.TaxonId]
                for level in range(0, len(lineage)):
                    # Bug fix: compare string VALUES with ==/!=. The original
                    # used 'is'/'is not', which tests object identity and only
                    # works by the accident of CPython string interning.
                    if (cached[level] != 'NA') and (lineage[level] == 'NA'):
                        lineage[level] = cached[level]
                lineage_complete = True
        taxid_taxonomy[node.TaxonId] = lineage
    # Taxonomy string to be used in the output file for any accessions in the
    # input FASTA or list file that do not have corresponding TaxonID info.
    missing_taxonomy = ['NA'] * len(ranks_lookup)
    log_file = open(args.logfile_path, 'a')
    log_file.write('generate_taxonid_taxonomy() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
    return taxid_taxonomy, missing_taxonomy
#################################################
def generate_output_files(outfile_path, taxid_taxonomy, taxid, missing_accessions, missing_taxonomy):
    """Write the accession-to-taxonomy mapping file (e.g. for QIIME).

    Each line is 'accession<TAB>rank1;rank2;...'. Accessions with no TaxonID
    get the all-'NA' lineage, and each such case is noted in the log file.
    """
    mapping_file = open(outfile_path, 'w')
    for accession in taxid:
        # taxid values may be strings from the acc2taxid file; taxid_taxonomy
        # is keyed by integer TaxonIDs, hence the int() conversion.
        lineage_string = ';'.join(taxid_taxonomy[int(taxid[accession])])
        mapping_file.write(accession + '\t' + lineage_string + '\n')
    # Fill in the taxonomy file with NAs for accessions missing a taxid,
    # logging each one.
    log_file = open(args.logfile_path, 'a')
    na_lineage = ';'.join(missing_taxonomy)
    for accession in missing_accessions:
        mapping_file.write(accession + '\t' + na_lineage + '\n')
        log_file.write('The following accession number has no taxid associated with it: ' + accession + '\n')
    log_file.close()
    mapping_file.close()
    log_file = open(args.logfile_path, 'a')
    log_file.write('generate_output_files() finished ' + strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
    log_file.close()
#################################################
# Run the pipeline only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
#################################################