emollier
diff --git a/‎kleborate/data/OmpK.aa
+8 b/‎kleborate/data/OmpK.aa
+8
diff --git a/‎kleborate/kleborate.py
+2-1 b/‎kleborate/kleborate.py
+2-1
diff --git a/‎kleborate/resBLAST.py
+80-27 b/‎kleborate/resBLAST.py
+80-27
diff --git a/‎setup.py
+1-1 b/‎setup.py
+1-1
diff --git a/‎test/res_test/data/OmpK.aa
+8 b/‎test/res_test/data/OmpK.aa
+8
@@ -0,0 +1,8 @@
+>OmpK35
+MMKRNILAVVIPALLVAGAANAAEIYNKNGNKLDFYGKMVGEHVWTTNGDTSSDDTTYARIGLKGETQINDQLIGYGQWEYNMDASNVEGSQTTKTRLAFAGLKAGEYGSFDYGRNYGAIYDVEAATDMLVEWGGDGWNYTDNYMTGRTNGVATYRNSDFFGLVDGLSFALQYQGKNDHDRAIRKQNGDGFSTAATYAFDNGIALSAGYSSSNRSVDQKADGNGDKAEAWATSAKYDANNIYAAVMYSQTYNMTPEEDNHFAGKTQNFEAVVQYQFDFGLRPSIGYVQTKGKDLQSRAGFSGGDADLVKYIEVGTWYYFNKNMNVYAAYKFNQLDDNDYTKAAGVATDDQAAVGIVYQF
+>OmpK36
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
+>OmpK36GD
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
+>OmpK36TD
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
@@ -439,8 +439,9 @@ def get_resistance_results(data_folder, contigs, args, res_headers):
         gene_info, _, _ = read_class_file(data_folder + '/ARGannot_clustered80_r3.csv')
         qrdr = data_folder + '/QRDR_120.aa'
         trunc = data_folder + '/MgrB_and_PmrB.aa'
+        omp = data_folder + '/OmpK.aa'
         seqs = data_folder + '/ARGannot_r3.fasta'
-        res_hits = resblast_one_assembly(contigs, gene_info, qrdr, trunc, seqs, 80.0, 90.0)
+        res_hits = resblast_one_assembly(contigs, gene_info, qrdr, trunc, omp, seqs, 80.0, 90.0)
         return {r: ';'.join(sorted(res_hits[r])) if r in res_hits else '-'
                 for r in res_headers}
     else:
 
@@ -28,18 +28,27 @@ def main():
     print_header(res_classes, bla_classes)
 
     for contigs in args.assemblies:
-        hits_dict = resblast_one_assembly(contigs, gene_info, args.qrdr, args.trunc, args.seqs,
-                                          args.mincov, args.minident)
+        hits_dict = resblast_one_assembly(contigs, gene_info, args.qrdr, args.trunc, args.omp,
+                                          args.seqs, args.mincov, args.minident)
         print_results(contigs, res_classes, bla_classes, hits_dict)
 
 
-def resblast_one_assembly(contigs, gene_info, qrdr, trunc, seqs, mincov, minident):
-    build_blast_databases(seqs, qrdr, trunc)
+def resblast_one_assembly(contigs, gene_info, qrdr, trunc, omp, seqs, mincov, minident):
+    """
+    Run every resistance screen on one assembly and return the resulting hits_dict.
+
+    The BLAST databases are built first if needed, then contigs are searched against seqs.
+    The QRDR, MgrB/PmrB truncation and OmpK checks each run only when their protein file
+    (qrdr, trunc, omp) is given, and add their findings to hits_dict.
+    """
+    build_blast_databases(seqs, qrdr, trunc, omp)
     hits_dict = blast_against_all(seqs, mincov, minident, contigs, gene_info)
     if qrdr:
         check_for_qrdr_mutations(hits_dict, contigs, qrdr)
     if trunc:
-        check_for_gene_truncations(hits_dict, contigs, trunc)
+        check_for_mgrb_pmrb_gene_truncations(hits_dict, contigs, trunc)
+    if omp:
+        check_omp_genes(hits_dict, contigs, omp)
     return hits_dict
 
 
@@ -63,6 +65,9 @@ def parse_arguments():
     additional_screening_args.add_argument('-r', '--trunc', type=str,
                                            help='MgrB and PmrB genes for which truncation can '
                                                 'cause colistin resistance')
+    additional_screening_args.add_argument('-o', '--omp', type=str,
+                                           help='OmpK genes for which truncation/mutation can '
+                                                'cause carbapenem resistance')
 
     settings_args = parser.add_argument_group('Settings')
     settings_args.add_argument('-m', '--minident', type=float, default=90.0,
@@ -77,7 +82,7 @@ def parse_arguments():
     return parser.parse_args()
 
 
-def build_blast_databases(seqs, qrdr, trunc):
+def build_blast_databases(seqs, qrdr, trunc, omp):
     if not os.path.exists(seqs + '.nin'):
         with open(os.devnull, 'w') as devnull:
             subprocess.check_call('makeblastdb -dbtype nucl -in ' + seqs,
@@ -92,6 +97,11 @@ def build_blast_databases(seqs, qrdr, trunc):
             with open(os.devnull, 'w') as devnull:
                 subprocess.check_call('makeblastdb -dbtype prot -in ' + trunc,
                                       stdout=devnull, shell=True)
+    if omp:
+        if not os.path.exists(omp + '.pin'):
+            with open(os.devnull, 'w') as devnull:
+                subprocess.check_call('makeblastdb -dbtype prot -in ' + omp,
+                                      stdout=devnull, shell=True)
 
 
 def read_class_file(res_class_file):
@@ -223,7 +233,7 @@ def check_for_exact_aa_match(seqs, gene_nucl_seq):
 
 def blastx_results_as_xml_tree(database, query):
     blastx_cmd = 'blastx -db ' + database + ' -query ' + query + ' -query_gencode 11' + \
-                 ' -outfmt 5 -ungapped -comp_based_stats F -culling_limit 1 -max_hsps 1 -seg no'
+                 ' -outfmt 5 -comp_based_stats F -culling_limit 1 -max_hsps 1 -seg no'
     process = subprocess.Popen(blastx_cmd, stdout=subprocess.PIPE, stderr=None, shell=True)
     blast_output = process.communicate()[0]
     if not isinstance(blast_output, str):
@@ -232,11 +242,12 @@ def blastx_results_as_xml_tree(database, query):
 
 
 def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
-    qrdr_loci = {'GyrA': [(83, 'S'), (87, 'D')], 'ParC': [(80, 'S'), (84, 'E')]}
+    qrdr_loci = {'GyrA': [(83, 'S'), (87, 'D')],
+                 'ParC': [(80, 'S'), (84, 'E')]}
 
     # key = (locus, pos), value = allele,
     # if found in a simple hit starting at position 1 of the protein seq
-    complete_hits, incomplete_hits = {}, {}
+    complete_hits, incomplete_hits = collections.defaultdict(list), collections.defaultdict(list)
 
     root = blastx_results_as_xml_tree(qrdr, contigs)
     for query in root[8]:
@@ -250,29 +261,22 @@ def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
                 hsp_qseq = hsp[14].text
                 hsp_hseq = hsp[15].text
 
-                for (pos, wt) in qrdr_loci[gene_id]:
-                    if hsp_hit_to >= pos and (hsp_gaps == 0) and (hsp_hit_from == 1):
+                for pos, wt in qrdr_loci[gene_id]:
+                    if hsp_hit_to >= pos and hsp_gaps == 0 and hsp_hit_from == 1:
                         # simple alignment
-                        if (gene_id, pos) in complete_hits:
-                            complete_hits[(gene_id, pos)].append(hsp_qseq[pos - 1])
-                        else:
-                            complete_hits[(gene_id, pos)] = [hsp_qseq[pos - 1]]
+                        complete_hits[(gene_id, pos)].append(hsp_qseq[pos - 1])
                     else:
                         # not a simple alignment, need to align query and hit and extract loci
                         # manually
-                        if (pos >= hsp_hit_from) and (pos <= hsp_hit_to) and \
-                                (hsp_hit_eval <= 0.00001):
+                        if hsp_hit_from <= pos <= hsp_hit_to and hsp_hit_eval <= 0.00001:
                             # locus is within aligned area, set evalue to filter out the junk
                             # alignments
                             pos_in_aln = get_gapped_position(hsp_hseq, pos - hsp_hit_from + 1)
-                            if (gene_id, pos) in incomplete_hits:
-                                incomplete_hits[(gene_id, pos)].append(hsp_qseq[pos_in_aln - 1])
-                            else:
-                                incomplete_hits[(gene_id, pos)] = [hsp_qseq[pos_in_aln - 1]]
+                            incomplete_hits[(gene_id, pos)].append(hsp_qseq[pos_in_aln - 1])
     snps = []
 
     for locus in qrdr_loci:
-        for (pos, wt) in qrdr_loci[locus]:
+        for pos, wt in qrdr_loci[locus]:
             if (locus, pos) in complete_hits:
                 if complete_hits[(locus, pos)][0] != wt:
                     snps.append(locus + '-' + str(pos) +
@@ -283,12 +287,10 @@ def check_for_qrdr_mutations(hits_dict, contigs, qrdr):
                         snps.append(locus + '-' + str(pos) +
                                     incomplete_hits[(locus, pos)][0])  # record SNP at this site
     if snps:
-        if 'Flq' not in hits_dict:
-            hits_dict['Flq'] = []
         hits_dict['Flq'] += snps
 
 
-def check_for_gene_truncations(hits_dict, contigs, trunc):
+def check_for_mgrb_pmrb_gene_truncations(hits_dict, contigs, trunc):
     best_mgrb_cov, best_pmrb_cov = 0.0, 0.0
 
     root = blastx_results_as_xml_tree(trunc, contigs)
@@ -297,12 +299,10 @@ def check_for_gene_truncations(hits_dict, contigs, trunc):
             gene_id = hit[2].text
             assert gene_id == 'MgrB' or gene_id == 'PmrB'
             gene_len = int(hit[4].text)
-
             for hsp in hit[5]:
                 hsp_qseq = hsp[14].text
                 hit_length = max(len(x) for x in hsp_qseq.split('*'))
                 coverage = 100.0 * float(hit_length) / gene_len
-
                 if gene_id == 'MgrB' and coverage > best_mgrb_cov:
                     best_mgrb_cov = coverage
                 elif gene_id == 'PmrB' and coverage > best_pmrb_cov:
@@ -320,6 +320,79 @@ def check_for_gene_truncations(hits_dict, contigs, trunc):
         hits_dict['Col'] += truncations
 
 
+def check_omp_genes(hits_dict, contigs, omp):
+    """
+    Screen one assembly for OmpK35/OmpK36 truncations and the OmpK36GD/OmpK36TD variants.
+
+    blastx is run a single time here and the parsed XML tree is handed to both checks, so the
+    contigs are not searched against the same protein database twice. Any findings are added
+    to hits_dict['Bla_Carb'].
+    """
+    root = blastx_results_as_xml_tree(omp, contigs)
+    check_for_omp_gene_truncations(hits_dict, contigs, omp, root)
+    check_for_ompk36_mutations(hits_dict, contigs, omp, root)
+
+
+def get_omp_hsp_coverages(root):
+    """
+    Yield (gene_id, coverage, hsp_qseq) for every HSP in a parsed blastx XML tree.
+
+    coverage is the longest stop-free stretch of the translated query alignment, as a
+    percentage of the reference protein length.
+    """
+    for query in root[8]:
+        for hit in query[4]:
+            gene_id = hit[2].text
+            gene_len = int(hit[4].text)
+            for hsp in hit[5]:
+                hsp_qseq = hsp[14].text
+                # '*' is a stop codon in the translated query, so only the longest piece
+                # between stops is counted
+                hit_length = max(len(x) for x in hsp_qseq.split('*'))
+                yield gene_id, 100.0 * float(hit_length) / gene_len, hsp_qseq
+
+
+def check_for_omp_gene_truncations(hits_dict, contigs, omp, root=None):
+    """
+    Report OmpK35 and OmpK36 as truncated when their best coverage is below 90%.
+
+    A gene with no hit at all is reported with 0% coverage. root is an optional, already
+    parsed blastx XML tree for (omp, contigs); when omitted, blastx is run here.
+    """
+    if root is None:
+        root = blastx_results_as_xml_tree(omp, contigs)
+
+    best_cov = {'OmpK35': 0.0, 'OmpK36': 0.0}
+    for gene_id, coverage, _ in get_omp_hsp_coverages(root):
+        # the GD and TD references are OmpK36 variants, so their hits count towards OmpK36
+        gene = 'OmpK36' if gene_id in ('OmpK36GD', 'OmpK36TD') else gene_id
+        if gene in best_cov and coverage > best_cov[gene]:
+            best_cov[gene] = coverage
+
+    truncations = ['%s-%.0f%%' % (gene, best_cov[gene])
+                   for gene in ('OmpK35', 'OmpK36') if best_cov[gene] < 90.0]
+    if truncations:
+        hits_dict.setdefault('Bla_Carb', []).extend(truncations)
+
+
+def check_for_ompk36_mutations(hits_dict, contigs, omp, root=None):
+    """
+    Report OmpK36GD/OmpK36TD when a hit to that reference has at least 90% coverage and the
+    translated query contains the variant's motif.
+
+    root is an optional, already parsed blastx XML tree for (omp, contigs); when omitted,
+    blastx is run here.
+    """
+    if root is None:
+        root = blastx_results_as_xml_tree(omp, contigs)
+
+    motifs = {'OmpK36GD': 'GDGDTY', 'OmpK36TD': 'GDTDTY'}
+    for gene_id, coverage, hsp_qseq in get_omp_hsp_coverages(root):
+        if coverage >= 90.0 and gene_id in motifs and motifs[gene_id] in hsp_qseq:
+            # setdefault so this also works when no Bla_Carb hit was recorded earlier
+            hits_dict.setdefault('Bla_Carb', []).append(gene_id)
+
+
 def get_strain_name(full_path):
     filename = os.path.split(full_path)[1]
     if filename.endswith('_temp_decompress.fasta'):
 
@@ -77,7 +77,7 @@ def run(self):
                               'iro_alleles.fasta', 'iuc_alleles.fasta',
                               'Klebsiella_pneumoniae.fasta', 'wzi.fasta', 'ybt_alleles.fasta']:
                     build_blast_db(data_dir, fasta, 'nucl')
-                for fasta in ['MgrB_and_PmrB.aa', 'QRDR_120.aa']:
+                for fasta in ['MgrB_and_PmrB.aa', 'QRDR_120.aa', 'OmpK.aa']:
                     build_blast_db(data_dir, fasta, 'prot')
             except subprocess.CalledProcessError:
                 print('\n')
 
@@ -0,0 +1,8 @@
+>OmpK35
+MMKRNILAVVIPALLVAGAANAAEIYNKNGNKLDFYGKMVGEHVWTTNGDTSSDDTTYARIGLKGETQINDQLIGYGQWEYNMDASNVEGSQTTKTRLAFAGLKAGEYGSFDYGRNYGAIYDVEAATDMLVEWGGDGWNYTDNYMTGRTNGVATYRNSDFFGLVDGLSFALQYQGKNDHDRAIRKQNGDGFSTAATYAFDNGIALSAGYSSSNRSVDQKADGNGDKAEAWATSAKYDANNIYAAVMYSQTYNMTPEEDNHFAGKTQNFEAVVQYQFDFGLRPSIGYVQTKGKDLQSRAGFSGGDADLVKYIEVGTWYYFNKNMNVYAAYKFNQLDDNDYTKAAGVATDDQAAVGIVYQF
+>OmpK36
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
+>OmpK36GD
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDGDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF
+>OmpK36TD
+MKVKVLSLLVPALLVAGAANAAEIYNKDGNKLDLYGKIDGLHYFSDDKSVDGDQTYMRVGVKGETQINDQLTGYGQWEYNVQANNTESSSDQAWTRLAFAGLKFGDAGSFDYGRNYGVVYDVTSWTDVLPEFGGDTDTYGSDNFLQSRANGVATYRNSDFFGLVDGLNFALQYQGKNGSVSGEGALSPTNNGRTALKQNGDGYGTSLTYDIYDGISAGFAYSNSKRLGDQNSKLALGRGDNAETYTGGLKYDANNIYLATQYTQTYNATRAGSLGFANKAQNFEVVAQYQFDFGLRPSVAYLQSKGKDLEGYGDQDILKYVDVGATYYFNKNMSTYVDYKINLLDDNSFTHNAGISTDDVVALGLVYQF