Skip to content

Commit ac2addd

Browse files
committed
Add OmpK gene checks
1 parent 2e9505d commit ac2addd

18 files changed

+250
-33
lines changed

kleborate/data/OmpK.aa

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
>OmpK35
2+
MMKRNILAVVIPALLVAGAANAAEIYNKNGNKLDFYGKMVGEHVWTTNGDTSSDDTTYARIGLKGETQINDQLIGYGQWEYNMDASNVEGSQTTKTRLAFAGLKAGEYGSFDYGRNYGAIYDVEAATDMLVEWGGDGWNYTDNYMTGRTNGVATYRNSDFFGLVDGLSFALQYQGKNDHDRAIRKQNGDGFSTAATYAFDNGIALSAGYSSSNRSVDQKADGNGDKAEAWATSAKYDANNIYAAVMYSQTYNMTPEEDNHFAGKTQNFEAVVQYQFDFGLRPSIGYVQTKGKDLQSRAGFSGGDADLVKYIEVGTWYYFNKNMNVYAAYKFNQLDDNDYTKAAGVATDDQAAVGIVYQF
3+
>OmpK36
4+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
5+
>OmpK36GD
6+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
7+
>OmpK36TD
8+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF

kleborate/kleborate.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -439,8 +439,9 @@ def get_resistance_results(data_folder, contigs, args, res_headers):
439439
gene_info, _, _ = read_class_file(data_folder + '/ARGannot_clustered80_r3.csv')
440440
qrdr = data_folder + '/QRDR_120.aa'
441441
trunc = data_folder + '/MgrB_and_PmrB.aa'
442+
omp = data_folder + '/OmpK.aa'
442443
seqs = data_folder + '/ARGannot_r3.fasta'
443-
res_hits = resblast_one_assembly(contigs, gene_info, qrdr, trunc, seqs, 80.0, 90.0)
444+
res_hits = resblast_one_assembly(contigs, gene_info, qrdr, trunc, omp, seqs, 80.0, 90.0)
444445
return {r: ';'.join(sorted(res_hits[r])) if r in res_hits else '-'
445446
for r in res_headers}
446447
else:

kleborate/resBLAST.py

+80-27
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,20 @@ def main():
2828
print_header(res_classes, bla_classes)
2929

3030
for contigs in args.assemblies:
31-
hits_dict = resblast_one_assembly(contigs, gene_info, args.qrdr, args.trunc, args.seqs,
32-
args.mincov, args.minident)
31+
hits_dict = resblast_one_assembly(contigs, gene_info, args.qrdr, args.trunc, args.omp,
32+
args.seqs, args.mincov, args.minident)
3333
print_results(contigs, res_classes, bla_classes, hits_dict)
3434

3535

36-
def resblast_one_assembly(contigs, gene_info, qrdr, trunc, seqs, mincov, minident):
37-
build_blast_databases(seqs, qrdr, trunc)
36+
def resblast_one_assembly(contigs, gene_info, qrdr, trunc, omp, seqs, mincov, minident):
37+
build_blast_databases(seqs, qrdr, trunc, omp)
3838
hits_dict = blast_against_all(seqs, mincov, minident, contigs, gene_info)
3939
if qrdr:
4040
check_for_qrdr_mutations(hits_dict, contigs, qrdr)
4141
if trunc:
42-
check_for_gene_truncations(hits_dict, contigs, trunc)
42+
check_for_mgrb_pmrb_gene_truncations(hits_dict, contigs, trunc)
43+
if omp:
44+
check_omp_genes(hits_dict, contigs, omp)
4345
return hits_dict
4446

4547

@@ -63,6 +65,9 @@ def parse_arguments():
6365
additional_screening_args.add_argument('-r', '--trunc', type=str,
6466
help='MgrB and PmrB genes for which truncation can '
6567
'cause colistin resistance')
68+
additional_screening_args.add_argument('-o', '--omp', type=str,
69+
help='OmpK genes for which truncation/mutation can '
70+
'cause carbapenem resistance')
6671

6772
settings_args = parser.add_argument_group('Settings')
6873
settings_args.add_argument('-m', '--minident', type=float, default=90.0,
@@ -77,7 +82,7 @@ def parse_arguments():
7782
return parser.parse_args()
7883

7984

80-
def build_blast_databases(seqs, qrdr, trunc):
85+
def build_blast_databases(seqs, qrdr, trunc, omp):
8186
if not os.path.exists(seqs + '.nin'):
8287
with open(os.devnull, 'w') as devnull:
8388
subprocess.check_call('makeblastdb -dbtype nucl -in ' + seqs,
@@ -92,6 +97,11 @@ def build_blast_databases(seqs, qrdr, trunc):
9297
with open(os.devnull, 'w') as devnull:
9398
subprocess.check_call('makeblastdb -dbtype prot -in ' + trunc,
9499
stdout=devnull, shell=True)
100+
if omp:
101+
if not os.path.exists(omp + '.pin'):
102+
with open(os.devnull, 'w') as devnull:
103+
subprocess.check_call('makeblastdb -dbtype prot -in ' + omp,
104+
stdout=devnull, shell=True)
95105

96106

97107
def read_class_file(res_class_file):
@@ -223,7 +233,7 @@ def check_for_exact_aa_match(seqs, gene_nucl_seq):
223233

224234
def blastx_results_as_xml_tree(database, query):
225235
blastx_cmd = 'blastx -db ' + database + ' -query ' + query + ' -query_gencode 11' + \
226-
' -outfmt 5 -ungapped -comp_based_stats F -culling_limit 1 -max_hsps 1 -seg no'
236+
' -outfmt 5 -comp_based_stats F -culling_limit 1 -max_hsps 1 -seg no'
227237
process = subprocess.Popen(blastx_cmd, stdout=subprocess.PIPE, stderr=None, shell=True)
228238
blast_output = process.communicate()[0]
229239
if not isinstance(blast_output, str):
@@ -232,11 +242,12 @@ def blastx_results_as_xml_tree(database, query):
232242

233243

234244
def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
235-
qrdr_loci = {'GyrA': [(83, 'S'), (87, 'D')], 'ParC': [(80, 'S'), (84, 'E')]}
245+
qrdr_loci = {'GyrA': [(83, 'S'), (87, 'D')],
246+
'ParC': [(80, 'S'), (84, 'E')]}
236247

237248
# key = (locus, pos), value = allele,
238249
# if found in a simple hit starting at position 1 of the protein seq
239-
complete_hits, incomplete_hits = {}, {}
250+
complete_hits, incomplete_hits = collections.defaultdict(list), collections.defaultdict(list)
240251

241252
root = blastx_results_as_xml_tree(qrdr, contigs)
242253
for query in root[8]:
@@ -250,29 +261,22 @@ def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
250261
hsp_qseq = hsp[14].text
251262
hsp_hseq = hsp[15].text
252263

253-
for (pos, wt) in qrdr_loci[gene_id]:
254-
if hsp_hit_to >= pos and (hsp_gaps == 0) and (hsp_hit_from == 1):
264+
for pos, wt in qrdr_loci[gene_id]:
265+
if hsp_hit_to >= pos and hsp_gaps == 0 and hsp_hit_from == 1:
255266
# simple alignment
256-
if (gene_id, pos) in complete_hits:
257-
complete_hits[(gene_id, pos)].append(hsp_qseq[pos - 1])
258-
else:
259-
complete_hits[(gene_id, pos)] = [hsp_qseq[pos - 1]]
267+
complete_hits[(gene_id, pos)].append(hsp_qseq[pos - 1])
260268
else:
261269
# not a simple alignment, need to align query and hit and extract loci
262270
# manually
263-
if (pos >= hsp_hit_from) and (pos <= hsp_hit_to) and \
264-
(hsp_hit_eval <= 0.00001):
271+
if hsp_hit_from <= pos <= hsp_hit_to and hsp_hit_eval <= 0.00001:
265272
# locus is within aligned area, set evalue to filter out the junk
266273
# alignments
267274
pos_in_aln = get_gapped_position(hsp_hseq, pos - hsp_hit_from + 1)
268-
if (gene_id, pos) in incomplete_hits:
269-
incomplete_hits[(gene_id, pos)].append(hsp_qseq[pos_in_aln - 1])
270-
else:
271-
incomplete_hits[(gene_id, pos)] = [hsp_qseq[pos_in_aln - 1]]
275+
incomplete_hits[(gene_id, pos)].append(hsp_qseq[pos_in_aln - 1])
272276
snps = []
273277

274278
for locus in qrdr_loci:
275-
for (pos, wt) in qrdr_loci[locus]:
279+
for pos, wt in qrdr_loci[locus]:
276280
if (locus, pos) in complete_hits:
277281
if complete_hits[(locus, pos)][0] != wt:
278282
snps.append(locus + '-' + str(pos) +
@@ -283,12 +287,10 @@ def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
283287
snps.append(locus + '-' + str(pos) +
284288
incomplete_hits[(locus, pos)][0]) # record SNP at this site
285289
if snps:
286-
if 'Flq' not in hits_dict:
287-
hits_dict['Flq'] = []
288290
hits_dict['Flq'] += snps
289291

290292

291-
def check_for_gene_truncations(hits_dict, contigs, trunc):
293+
def check_for_mgrb_pmrb_gene_truncations(hits_dict, contigs, trunc):
292294
best_mgrb_cov, best_pmrb_cov = 0.0, 0.0
293295

294296
root = blastx_results_as_xml_tree(trunc, contigs)
@@ -297,12 +299,10 @@ def check_for_gene_truncations(hits_dict, contigs, trunc):
297299
gene_id = hit[2].text
298300
assert gene_id == 'MgrB' or gene_id == 'PmrB'
299301
gene_len = int(hit[4].text)
300-
301302
for hsp in hit[5]:
302303
hsp_qseq = hsp[14].text
303304
hit_length = max(len(x) for x in hsp_qseq.split('*'))
304305
coverage = 100.0 * float(hit_length) / gene_len
305-
306306
if gene_id == 'MgrB' and coverage > best_mgrb_cov:
307307
best_mgrb_cov = coverage
308308
elif gene_id == 'PmrB' and coverage > best_pmrb_cov:
@@ -320,6 +320,59 @@ def check_for_gene_truncations(hits_dict, contigs, trunc):
320320
hits_dict['Col'] += truncations
321321

322322

323+
def check_omp_genes(hits_dict, contigs, omp):
324+
check_for_omp_gene_truncations(hits_dict, contigs, omp)
325+
check_for_ompk36_mutations(hits_dict, contigs, omp)
326+
327+
328+
def check_for_omp_gene_truncations(hits_dict, contigs, omp):
329+
best_ompk35_cov, best_ompk36_cov = 0.0, 0.0
330+
331+
root = blastx_results_as_xml_tree(omp, contigs)
332+
for query in root[8]:
333+
for hit in query[4]:
334+
gene_id = hit[2].text
335+
gene_len = int(hit[4].text)
336+
for hsp in hit[5]:
337+
hsp_qseq = hsp[14].text
338+
hit_length = max(len(x) for x in hsp_qseq.split('*'))
339+
coverage = 100.0 * float(hit_length) / gene_len
340+
341+
if gene_id == 'OmpK35' and coverage > best_ompk35_cov:
342+
best_ompk35_cov = coverage
343+
elif (gene_id == 'OmpK36' or gene_id == 'OmpK36GD' or
344+
gene_id == 'OmpK36TD') and coverage > best_ompk36_cov:
345+
best_ompk36_cov = coverage
346+
347+
truncations = []
348+
if best_ompk35_cov < 90.0:
349+
truncations.append('OmpK35-' + ('%.0f' % best_ompk35_cov) + '%')
350+
if best_ompk36_cov < 90.0:
351+
truncations.append('OmpK36-' + ('%.0f' % best_ompk36_cov) + '%')
352+
353+
if truncations:
354+
if 'Bla_Carb' not in hits_dict:
355+
hits_dict['Bla_Carb'] = []
356+
hits_dict['Bla_Carb'] += truncations
357+
358+
359+
def check_for_ompk36_mutations(hits_dict, contigs, omp):
360+
root = blastx_results_as_xml_tree(omp, contigs)
361+
for query in root[8]:
362+
for hit in query[4]:
363+
gene_id = hit[2].text
364+
gene_len = int(hit[4].text)
365+
for hsp in hit[5]:
366+
hsp_qseq = hsp[14].text
367+
hit_length = max(len(x) for x in hsp_qseq.split('*'))
368+
coverage = 100.0 * float(hit_length) / gene_len
369+
if coverage >= 90.0:
370+
if gene_id == 'OmpK36GD' and 'GDGDTY' in hsp_qseq:
371+
hits_dict['Bla_Carb'].append('OmpK36GD')
372+
if gene_id == 'OmpK36TD' and 'GDTDTY' in hsp_qseq:
373+
hits_dict['Bla_Carb'].append('OmpK36TD')
374+
375+
323376
def get_strain_name(full_path):
324377
filename = os.path.split(full_path)[1]
325378
if filename.endswith('_temp_decompress.fasta'):

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def run(self):
7777
'iro_alleles.fasta', 'iuc_alleles.fasta',
7878
'Klebsiella_pneumoniae.fasta', 'wzi.fasta', 'ybt_alleles.fasta']:
7979
build_blast_db(data_dir, fasta, 'nucl')
80-
for fasta in ['MgrB_and_PmrB.aa', 'QRDR_120.aa']:
80+
for fasta in ['MgrB_and_PmrB.aa', 'QRDR_120.aa', 'OmpK.aa']:
8181
build_blast_db(data_dir, fasta, 'prot')
8282
except subprocess.CalledProcessError:
8383
print('\n')

test/res_test/data/OmpK.aa

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
>OmpK35
2+
MMKRNILAVVIPALLVAGAANAAEIYNKNGNKLDFYGKMVGEHVWTTNGDTSSDDTTYARIGLKGETQINDQLIGYGQWEYNMDASNVEGSQTTKTRLAFAGLKAGEYGSFDYGRNYGAIYDVEAATDMLVEWGGDGWNYTDNYMTGRTNGVATYRNSDFFGLVDGLSFALQYQGKNDHDRAIRKQNGDGFSTAATYAFDNGIALSAGYSSSNRSVDQKADGNGDKAEAWATSAKYDANNIYAAVMYSQTYNMTPEEDNHFAGKTQNFEAVVQYQFDFGLRPSIGYVQTKGKDLQSRAGFSGGDADLVKYIEVGTWYYFNKNMNVYAAYKFNQLDDNDYTKAAGVATDDQAAVGIVYQF
3+
>OmpK36
4+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
5+
>OmpK36GD
6+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
7+
>OmpK36TD
8+
MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF

0 commit comments

Comments
 (0)