updates to handle Ter and Var snps

arpcard · Feb 11, 2025 · 5a370a4 · 5a370a4
1 parent 5ffacf8
commit 5a370a4
Show file tree

Hide file tree

Showing 8 changed files with 601 additions and 504 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ on:
   workflow_dispatch:
 
 env:
-  BLAST_VERSION: 2.14.1
+  BLAST_VERSION: 2.16.0
 
 jobs:
   build:
@@ -38,9 +38,9 @@ jobs:
         sudo mv ncbi-blast-${BLAST_VERSION}+/bin/* /usr/bin
 
         echo "install bowtie2 ..."
-        wget --quiet https://github.com/BenLangmead/bowtie2/releases/download/v2.4.5/bowtie2-2.4.5-linux-x86_64.zip
-        unzip -q bowtie2-2.4.5-linux-x86_64.zip
-        sudo mv bowtie2-2.4.5-linux-x86_64/bowtie2* /usr/bin
+        wget --quiet https://github.com/BenLangmead/bowtie2/releases/download/v2.5.4/bowtie2-2.5.4-linux-x86_64.zip
+        unzip -q bowtie2-2.5.4-linux-x86_64.zip
+        sudo mv bowtie2-2.5.4-linux-x86_64/bowtie2* /usr/bin
 
         echo "install diamond ..."
         wget --quiet http://github.com/bbuchfink/diamond/releases/download/v0.8.36/diamond-linux64.tar.gz
@@ -64,11 +64,11 @@ jobs:
         wget --quiet https://github.com/pezmaster31/bamtools/archive/v2.5.2.tar.gz
         tar xzf v2.5.2.tar.gz
         pushd bamtools-2.5.2/ && mkdir build && cd build && cmake .. && make --quiet && sudo make install --quiet && popd
-        bamtools --version
 
         echo "install bedtools ..."
-        wget --quiet https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools
-        chmod +x bedtools && sudo mv bedtools /usr/bin
+        wget --quiet https://github.com/arq5x/bedtools2/releases/download/v2.31.1/bedtools-2.31.1.tar.gz
+        tar xzf bedtools-2.31.1.tar.gz
+        pushd bedtools2 && make --quiet && sudo mv bin/* /usr/bin && popd
 
         echo "install kma ..."
         git clone --quiet https://bitbucket.org/genomicepidemiology/kma.git
@@ -95,9 +95,11 @@ jobs:
           diamond --version
           jellyfish --version
           samtools --version
+          bamtools --version
           bedtools --version
           kma -v
-          rgi -h
+          rgi main --version
+          prodigal -v
     - name: Test with pytest
       run: |
         bash test.sh
diff --git a/README.rst b/README.rst
@@ -138,8 +138,8 @@ The following conda command will install all RGI dependencies (listed below):
     conda env create -f conda_env.yml
     conda activate rgi
 
-- `Python 3.6 <https://www.python.org/>`_
-- `NCBI BLAST 2.14.0 <https://blast.ncbi.nlm.nih.gov/Blast.cgi>`_
+- `Python 3.6+ <https://www.python.org/>`_
+- `NCBI BLAST 2.16.0 <https://blast.ncbi.nlm.nih.gov/Blast.cgi>`_
 - `zlib <https://bitbucket.org/gutworth/six>`_
 - `Prodigal 2.6.3 <https://github.com/hyattpd/prodigal/wiki/Installation>`_
 - `DIAMOND 0.8.36 <https://github.com/bbuchfink/diamond>`_
@@ -152,13 +152,13 @@ The following conda command will install all RGI dependencies (listed below):
 - `pyfaidx 0.5.4.1+ <https://pypi.org/project/pyfaidx/>`_
 - `pyahocorasick 1.1.7+ <https://pypi.org/project/pyahocorasick/>`_
 - `OligoArrayAux 3.8 <http://unafold.rna.albany.edu/?q=DINAMelt/OligoArrayAux>`_
-- `samtools 1.9 <https://github.com/samtools/samtools>`_
-- `bamtools 2.5.1 <https://github.com/pezmaster31/bamtools>`_
-- `bedtools 2.27.1 <https://github.com/arq5x/bedtools2>`_
+- `samtools 1.21 <https://github.com/samtools/samtools>`_
+- `bamtools 2.5.2 <https://github.com/pezmaster31/bamtools>`_
+- `bedtools 2.31.1 <https://github.com/arq5x/bedtools2>`_
 - `Jellyfish 2.2.10 <https://github.com/gmarcais/Jellyfish>`_
-- `Bowtie2 2.3.4.3 <http://bowtie-bio.sourceforge.net/bowtie2/index.shtml>`_
-- `BWA 0.7.17 (r1188) <https://github.com/lh3/bwa>`_
-- `KMA 1.3.4 <https://bitbucket.org/genomicepidemiology/kma/src/master>`_
+- `Bowtie2 2.5.4 <http://bowtie-bio.sourceforge.net/bowtie2/index.shtml>`_
+- `BWA 0.7.18 <https://github.com/lh3/bwa>`_
+- `KMA 1.4.17 <https://bitbucket.org/genomicepidemiology/kma/src/master>`_
 
 
 Install RGI

diff --git a/app/Base.py b/app/Base.py
@@ -12,6 +12,7 @@
 from Bio.Seq import Seq
 from pyfaidx import Fasta
 
+
 class RGIBase(object):
     """Interface for RGI"""
     __metaclass__ = ABCMeta
@@ -250,7 +251,7 @@ def get_orf_protein_sequence(self, input_file, input_type):
 
         return predicted_genes_dict
 
-    def results(self, blast_results, query_id, perfect, strict , loose, include_nudge=False):
+    def results(self, blast_results, query_id, perfect, strict, loose, include_nudge=False):
         """
         Sort results to perfect, strict, loose paradigm
 
@@ -271,7 +272,7 @@ def results(self, blast_results, query_id, perfect, strict , loose, include_nudg
         nudged = False
         if len(perfect) == 0 and len(strict) == 0 and len(loose) > 0:
             if include_nudge is True:
-                nudged , loose = self.nudge_loose_to_strict(loose)
+                nudged, loose = self.nudge_loose_to_strict(loose)
 
             if nudged is True and self.loose is False:
                 blast_results[query_id] = loose
@@ -282,7 +283,7 @@ def results(self, blast_results, query_id, perfect, strict , loose, include_nudg
             if len(strict) > 0:
                 # TODO:: add try catch here
                 try:
-                    nudged , strict = self.nudge_strict_to_perfect(strict)
+                    nudged, strict = self.nudge_strict_to_perfect(strict)
                     blast_results[query_id] = strict
                 except Exception as e:
                     logger.error(e)
@@ -326,7 +327,8 @@ def nudge_strict_to_perfect(self, strict):
                 partial_protein = ""
                 # Missing n-terminus or c-terminus
                 if len(query) < len(reference) and query in reference:
-                    length_nucleotides = (len(reference) - len(strict[s]["match"]))*3
+                    length_nucleotides = (
+                        len(reference) - len(strict[s]["match"]))*3
 
                     # pull nucleotides from query or submitted sequence
                     partial_bases, orf_start_, orf_end_ = self.get_part_sequence(
@@ -343,22 +345,27 @@ def nudge_strict_to_perfect(self, strict):
                             # update orf_end
                             strict[s]["orf_end_possible"] = orf_end_ + 1
                             strict[s]["orf_start_possible"] = strict[s]["orf_start"]
-                            orf_dna_sequence_ = str(Seq(partial_bases).reverse_complement()) + strict[s]["orf_dna_sequence"]
-                            partial_protein = str(Seq(partial_bases).reverse_complement().translate(table=11))
+                            orf_dna_sequence_ = str(
+                                Seq(partial_bases).reverse_complement()) + strict[s]["orf_dna_sequence"]
+                            partial_protein = str(
+                                Seq(partial_bases).reverse_complement().translate(table=11))
                             # logger.debug("Reverse strand: {}".format(partial_protein))
                         else:
                             # update orf_start
                             strict[s]["orf_start_possible"] = orf_start_
-                            strict[s]["orf_end_possible"] = int(strict[s]["orf_end"]) + 1
-                            orf_dna_sequence_ = partial_bases + strict[s]["orf_dna_sequence"]
-                            partial_protein = str(Seq(partial_bases).translate(table=11)).strip("*")
+                            strict[s]["orf_end_possible"] = int(
+                                strict[s]["orf_end"]) + 1
+                            orf_dna_sequence_ = partial_bases + \
+                                strict[s]["orf_dna_sequence"]
+                            partial_protein = str(
+                                Seq(partial_bases).translate(table=11)).strip("*")
                             # logger.debug("Forward strand: {}".format(partial_protein))
 
                         # logger.debug("Translated protein: [{}]".format(partial_protein))
                         # update start codon to M for all other alternate start codons
                         if len(partial_protein) > 0:
                             _partial_protein = partial_protein[0]
-                            if partial_protein[0] in ["L","M","I","V"]:
+                            if partial_protein[0] in ["L", "M", "I", "V"]:
                                 _partial_protein = "M"+partial_protein[1:]
 
                         combine = _partial_protein + strict[s]["match"]
@@ -412,10 +419,13 @@ def nudge_strict_to_perfect(self, strict):
 
                             if strict[s]["orf_strand"] == "-":
                                 strict[s]["orf_start_possible"] = strict[s]["orf_start"]
-                                strict[s]["orf_end_possible"] = int(strict[s]["orf_start"]) + len(strict[s]["dna_sequence_from_broadstreet"]) - 1
+                                strict[s]["orf_end_possible"] = int(
+                                    strict[s]["orf_start"]) + len(strict[s]["dna_sequence_from_broadstreet"]) - 1
                             else:
-                                strict[s]["orf_start_possible"] = int(strict[s]["orf_end"]) - len(strict[s]["dna_sequence_from_broadstreet"]) + 1
-                                strict[s]["orf_end_possible"] = int(strict[s]["orf_end"])
+                                strict[s]["orf_start_possible"] = int(
+                                    strict[s]["orf_end"]) - len(strict[s]["dna_sequence_from_broadstreet"]) + 1
+                                strict[s]["orf_end_possible"] = int(
+                                    strict[s]["orf_end"])
 
                             # pull nucleotides from query or submitted sequence
                             partial_bases, orf_start_, orf_end_ = self.get_part_sequence(
@@ -426,19 +436,21 @@ def nudge_strict_to_perfect(self, strict):
                             )
 
                             if strict[s]["orf_strand"] == "-":
-                                strict[s]["orf_dna_sequence_possible"] = str(Seq(partial_bases).reverse_complement())
+                                strict[s]["orf_dna_sequence_possible"] = str(
+                                    Seq(partial_bases).reverse_complement())
                             else:
                                 strict[s]["orf_dna_sequence_possible"] = partial_bases
 
                             if len(strict[s]["orf_dna_sequence_possible"]) % 3 == 0:
-                                orf_prot_sequence_possible = str(Seq(strict[s]["orf_dna_sequence_possible"]).translate(table=11)).strip("*")
+                                orf_prot_sequence_possible = str(
+                                    Seq(strict[s]["orf_dna_sequence_possible"]).translate(table=11)).strip("*")
                                 strict[s]["orf_prot_sequence_possible"] = orf_prot_sequence_possible
                                 if orf_prot_sequence_possible == strict[s]["sequence_from_broadstreet"]:
                                     strict[s]["type_match"] = "Perfect"
                                     nudged = True
                             else:
                                 logger.warning("incorrect open reading frame for coordinate: {}-{} on strand {} for {}".format
-                                (strict[s]["orf_start_possible"], strict[s]["orf_end_possible"], strict[s]["orf_strand"], strict[s]["orf_from"]))
+                                               (strict[s]["orf_start_possible"], strict[s]["orf_end_possible"], strict[s]["orf_strand"], strict[s]["orf_from"]))
 
                             strict[s]["nudged"] = nudged
 
@@ -505,7 +517,8 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
         genes = False
         # logger.info("[PARTIAL] ARO: {} | contig: {} | filename: {}".format(name, header, fasta_file))
         try:
-            genes = Fasta(fasta_file, sequence_always_upper=False, read_long_names=False, one_based_attributes=True)
+            genes = Fasta(fasta_file, sequence_always_upper=False,
+                          read_long_names=False, one_based_attributes=True)
         except Exception as e:
             logger.error(e)
         # logger.info(genes.records)
@@ -517,9 +530,9 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
                 # logger.debug("grep sequence from {}|-|{}-{}".format(header,_start, _stop,))
                 if nterminus == 0:
                     # logger.debug("grep sequence from {}|-|{}-{}".format(header,start, stop,))
-                    return str(genes.get_spliced_seq( header, [[start, stop]])), start, stop
+                    return str(genes.get_spliced_seq(header, [[start, stop]])), start, stop
                 else:
-                    return str(genes.get_spliced_seq( header, [[_start, _stop]])), _start, _stop
+                    return str(genes.get_spliced_seq(header, [[_start, _stop]])), _start, _stop
             elif strand == "+":
                 _start = start - nterminus
                 _stop = start - 1
@@ -530,9 +543,9 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
                 # logger.debug("grep sequence from {}|+|{}-{}".format(header,_start, _stop))
                 if nterminus == 0:
                     # logger.debug("grep sequence from {}|+|{}-{}".format(header,start, stop))
-                    return str(genes.get_spliced_seq( header, [[start, stop]])), start, stop
+                    return str(genes.get_spliced_seq(header, [[start, stop]])), start, stop
                 else:
-                    return str(genes.get_spliced_seq( header, [[_start, _stop]])), _start, _stop
+                    return str(genes.get_spliced_seq(header, [[_start, _stop]])), _start, _stop
 
     def nudge_loose_to_strict(self, loose):
         """
@@ -560,6 +573,3 @@ def nudge_loose_to_strict(self, loose):
                 loose[i]["note"] = "loose hit with at least 95 percent identity pushed strict"
 
         return nudged, loose
-
-
-