Skip to content

Commit

Permalink
updates to handle Ter and Var snps
Browse files Browse the repository at this point in the history
  • Loading branch information
raphenya committed Feb 11, 2025
1 parent 5ffacf8 commit 5a370a4
Show file tree
Hide file tree
Showing 8 changed files with 601 additions and 504 deletions.
18 changes: 10 additions & 8 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
workflow_dispatch:

env:
BLAST_VERSION: 2.14.1
BLAST_VERSION: 2.16.0

jobs:
build:
Expand Down Expand Up @@ -38,9 +38,9 @@ jobs:
sudo mv ncbi-blast-${BLAST_VERSION}+/bin/* /usr/bin
echo "install bowtie2 ..."
wget --quiet https://github.com/BenLangmead/bowtie2/releases/download/v2.4.5/bowtie2-2.4.5-linux-x86_64.zip
unzip -q bowtie2-2.4.5-linux-x86_64.zip
sudo mv bowtie2-2.4.5-linux-x86_64/bowtie2* /usr/bin
wget --quiet https://github.com/BenLangmead/bowtie2/releases/download/v2.5.4/bowtie2-2.5.4-linux-x86_64.zip
unzip -q bowtie2-2.5.4-linux-x86_64.zip
sudo mv bowtie2-2.5.4-linux-x86_64/bowtie2* /usr/bin
echo "install diamond ..."
wget --quiet http://github.com/bbuchfink/diamond/releases/download/v0.8.36/diamond-linux64.tar.gz
Expand All @@ -64,11 +64,11 @@ jobs:
wget --quiet https://github.com/pezmaster31/bamtools/archive/v2.5.2.tar.gz
tar xzf v2.5.2.tar.gz
pushd bamtools-2.5.2/ && mkdir build && cd build && cmake .. && make --quiet && sudo make install --quiet && popd
bamtools --version
echo "install bedtools ..."
wget --quiet https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools
chmod +x bedtools && sudo mv bedtools /usr/bin
wget --quiet https://github.com/arq5x/bedtools2/releases/download/v2.31.1/bedtools-2.31.1.tar.gz
tar xzf bedtools-2.31.1.tar.gz
pushd bedtools2 && make --quiet && sudo mv bin/* /usr/bin && popd
echo "install kma ..."
git clone --quiet https://bitbucket.org/genomicepidemiology/kma.git
Expand All @@ -95,9 +95,11 @@ jobs:
diamond --version
jellyfish --version
samtools --version
bamtools --version
bedtools --version
kma -v
rgi -h
rgi main --version
prodigal -v
- name: Test with pytest
run: |
bash test.sh
16 changes: 8 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ The following conda command will install all RGI dependencies (listed below):
conda env create -f conda_env.yml
conda activate rgi
- `Python 3.6 <https://www.python.org/>`_
- `NCBI BLAST 2.14.0 <https://blast.ncbi.nlm.nih.gov/Blast.cgi>`_
- `Python 3.6+ <https://www.python.org/>`_
- `NCBI BLAST 2.16.0 <https://blast.ncbi.nlm.nih.gov/Blast.cgi>`_
- `zlib <https://bitbucket.org/gutworth/six>`_
- `Prodigal 2.6.3 <https://github.com/hyattpd/prodigal/wiki/Installation>`_
- `DIAMOND 0.8.36 <https://github.com/bbuchfink/diamond>`_
Expand All @@ -152,13 +152,13 @@ The following conda command will install all RGI dependencies (listed below):
- `pyfaidx 0.5.4.1+ <https://pypi.org/project/pyfaidx/>`_
- `pyahocorasick 1.1.7+ <https://pypi.org/project/pyahocorasick/>`_
- `OligoArrayAux 3.8 <http://unafold.rna.albany.edu/?q=DINAMelt/OligoArrayAux>`_
- `samtools 1.9 <https://github.com/samtools/samtools>`_
- `bamtools 2.5.1 <https://github.com/pezmaster31/bamtools>`_
- `bedtools 2.27.1 <https://github.com/arq5x/bedtools2>`_
- `samtools 1.21 <https://github.com/samtools/samtools>`_
- `bamtools 2.5.2 <https://github.com/pezmaster31/bamtools>`_
- `bedtools 2.31.1 <https://github.com/arq5x/bedtools2>`_
- `Jellyfish 2.2.10 <https://github.com/gmarcais/Jellyfish>`_
- `Bowtie2 2.3.4.3 <http://bowtie-bio.sourceforge.net/bowtie2/index.shtml>`_
- `BWA 0.7.17 (r1188) <https://github.com/lh3/bwa>`_
- `KMA 1.3.4 <https://bitbucket.org/genomicepidemiology/kma/src/master>`_
- `Bowtie2 2.5.4 <http://bowtie-bio.sourceforge.net/bowtie2/index.shtml>`_
- `BWA 0.7.18 <https://github.com/lh3/bwa>`_
- `KMA 1.4.17 <https://bitbucket.org/genomicepidemiology/kma/src/master>`_


Install RGI
Expand Down
58 changes: 34 additions & 24 deletions app/Base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from Bio.Seq import Seq
from pyfaidx import Fasta


class RGIBase(object):
"""Interface for RGI"""
__metaclass__ = ABCMeta
Expand Down Expand Up @@ -250,7 +251,7 @@ def get_orf_protein_sequence(self, input_file, input_type):

return predicted_genes_dict

def results(self, blast_results, query_id, perfect, strict , loose, include_nudge=False):
def results(self, blast_results, query_id, perfect, strict, loose, include_nudge=False):
"""
Sort results to perfect, strict, loose paradigm
Expand All @@ -271,7 +272,7 @@ def results(self, blast_results, query_id, perfect, strict , loose, include_nudg
nudged = False
if len(perfect) == 0 and len(strict) == 0 and len(loose) > 0:
if include_nudge is True:
nudged , loose = self.nudge_loose_to_strict(loose)
nudged, loose = self.nudge_loose_to_strict(loose)

if nudged is True and self.loose is False:
blast_results[query_id] = loose
Expand All @@ -282,7 +283,7 @@ def results(self, blast_results, query_id, perfect, strict , loose, include_nudg
if len(strict) > 0:
# TODO:: add try catch here
try:
nudged , strict = self.nudge_strict_to_perfect(strict)
nudged, strict = self.nudge_strict_to_perfect(strict)
blast_results[query_id] = strict
except Exception as e:
logger.error(e)
Expand Down Expand Up @@ -326,7 +327,8 @@ def nudge_strict_to_perfect(self, strict):
partial_protein = ""
# Missing n-terminus or c-terminus
if len(query) < len(reference) and query in reference:
length_nucleotides = (len(reference) - len(strict[s]["match"]))*3
length_nucleotides = (
len(reference) - len(strict[s]["match"]))*3

# pull nucleotides from query or submitted sequence
partial_bases, orf_start_, orf_end_ = self.get_part_sequence(
Expand All @@ -343,22 +345,27 @@ def nudge_strict_to_perfect(self, strict):
# update orf_end
strict[s]["orf_end_possible"] = orf_end_ + 1
strict[s]["orf_start_possible"] = strict[s]["orf_start"]
orf_dna_sequence_ = str(Seq(partial_bases).reverse_complement()) + strict[s]["orf_dna_sequence"]
partial_protein = str(Seq(partial_bases).reverse_complement().translate(table=11))
orf_dna_sequence_ = str(
Seq(partial_bases).reverse_complement()) + strict[s]["orf_dna_sequence"]
partial_protein = str(
Seq(partial_bases).reverse_complement().translate(table=11))
# logger.debug("Reverse strand: {}".format(partial_protein))
else:
# update orf_start
strict[s]["orf_start_possible"] = orf_start_
strict[s]["orf_end_possible"] = int(strict[s]["orf_end"]) + 1
orf_dna_sequence_ = partial_bases + strict[s]["orf_dna_sequence"]
partial_protein = str(Seq(partial_bases).translate(table=11)).strip("*")
strict[s]["orf_end_possible"] = int(
strict[s]["orf_end"]) + 1
orf_dna_sequence_ = partial_bases + \
strict[s]["orf_dna_sequence"]
partial_protein = str(
Seq(partial_bases).translate(table=11)).strip("*")
# logger.debug("Forward strand: {}".format(partial_protein))

# logger.debug("Translated protein: [{}]".format(partial_protein))
# update start codon to M for all other alternate start codons
if len(partial_protein) > 0:
_partial_protein = partial_protein[0]
if partial_protein[0] in ["L","M","I","V"]:
if partial_protein[0] in ["L", "M", "I", "V"]:
_partial_protein = "M"+partial_protein[1:]

combine = _partial_protein + strict[s]["match"]
Expand Down Expand Up @@ -412,10 +419,13 @@ def nudge_strict_to_perfect(self, strict):

if strict[s]["orf_strand"] == "-":
strict[s]["orf_start_possible"] = strict[s]["orf_start"]
strict[s]["orf_end_possible"] = int(strict[s]["orf_start"]) + len(strict[s]["dna_sequence_from_broadstreet"]) - 1
strict[s]["orf_end_possible"] = int(
strict[s]["orf_start"]) + len(strict[s]["dna_sequence_from_broadstreet"]) - 1
else:
strict[s]["orf_start_possible"] = int(strict[s]["orf_end"]) - len(strict[s]["dna_sequence_from_broadstreet"]) + 1
strict[s]["orf_end_possible"] = int(strict[s]["orf_end"])
strict[s]["orf_start_possible"] = int(
strict[s]["orf_end"]) - len(strict[s]["dna_sequence_from_broadstreet"]) + 1
strict[s]["orf_end_possible"] = int(
strict[s]["orf_end"])

# pull nucleotides from query or submitted sequence
partial_bases, orf_start_, orf_end_ = self.get_part_sequence(
Expand All @@ -426,19 +436,21 @@ def nudge_strict_to_perfect(self, strict):
)

if strict[s]["orf_strand"] == "-":
strict[s]["orf_dna_sequence_possible"] = str(Seq(partial_bases).reverse_complement())
strict[s]["orf_dna_sequence_possible"] = str(
Seq(partial_bases).reverse_complement())
else:
strict[s]["orf_dna_sequence_possible"] = partial_bases

if len(strict[s]["orf_dna_sequence_possible"]) % 3 == 0:
orf_prot_sequence_possible = str(Seq(strict[s]["orf_dna_sequence_possible"]).translate(table=11)).strip("*")
orf_prot_sequence_possible = str(
Seq(strict[s]["orf_dna_sequence_possible"]).translate(table=11)).strip("*")
strict[s]["orf_prot_sequence_possible"] = orf_prot_sequence_possible
if orf_prot_sequence_possible == strict[s]["sequence_from_broadstreet"]:
strict[s]["type_match"] = "Perfect"
nudged = True
else:
logger.warning("incorrect open reading frame for coordinate: {}-{} on strand {} for {}".format
(strict[s]["orf_start_possible"], strict[s]["orf_end_possible"], strict[s]["orf_strand"], strict[s]["orf_from"]))
(strict[s]["orf_start_possible"], strict[s]["orf_end_possible"], strict[s]["orf_strand"], strict[s]["orf_from"]))

strict[s]["nudged"] = nudged

Expand Down Expand Up @@ -505,7 +517,8 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
genes = False
# logger.info("[PARTIAL] ARO: {} | contig: {} | filename: {}".format(name, header, fasta_file))
try:
genes = Fasta(fasta_file, sequence_always_upper=False, read_long_names=False, one_based_attributes=True)
genes = Fasta(fasta_file, sequence_always_upper=False,
read_long_names=False, one_based_attributes=True)
except Exception as e:
logger.error(e)
# logger.info(genes.records)
Expand All @@ -517,9 +530,9 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
# logger.debug("grep sequence from {}|-|{}-{}".format(header,_start, _stop,))
if nterminus == 0:
# logger.debug("grep sequence from {}|-|{}-{}".format(header,start, stop,))
return str(genes.get_spliced_seq( header, [[start, stop]])), start, stop
return str(genes.get_spliced_seq(header, [[start, stop]])), start, stop
else:
return str(genes.get_spliced_seq( header, [[_start, _stop]])), _start, _stop
return str(genes.get_spliced_seq(header, [[_start, _stop]])), _start, _stop
elif strand == "+":
_start = start - nterminus
_stop = start - 1
Expand All @@ -530,9 +543,9 @@ def get_part_sequence(self, fasta_file, header, start, stop, nterminus, strand,
# logger.debug("grep sequence from {}|+|{}-{}".format(header,_start, _stop))
if nterminus == 0:
# logger.debug("grep sequence from {}|+|{}-{}".format(header,start, stop))
return str(genes.get_spliced_seq( header, [[start, stop]])), start, stop
return str(genes.get_spliced_seq(header, [[start, stop]])), start, stop
else:
return str(genes.get_spliced_seq( header, [[_start, _stop]])), _start, _stop
return str(genes.get_spliced_seq(header, [[_start, _stop]])), _start, _stop

def nudge_loose_to_strict(self, loose):
"""
Expand Down Expand Up @@ -560,6 +573,3 @@ def nudge_loose_to_strict(self, loose):
loose[i]["note"] = "loose hit with at least 95 percent identity pushed strict"

return nudged, loose



Loading

0 comments on commit 5a370a4

Please sign in to comment.