From 53ae70d4731170e694c9403683eec169415302de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 29 Oct 2024 22:43:26 -0500 Subject: [PATCH 01/17] Refactor RBS module to improve code organization and readability --- src/rbs.jl | 226 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 src/rbs.jl diff --git a/src/rbs.jl b/src/rbs.jl new file mode 100644 index 0000000..5d2905e --- /dev/null +++ b/src/rbs.jl @@ -0,0 +1,226 @@ +export RBS, RBSMOTIFS + +struct RBS + motif::SeqOrView # motif + offset::UnitRange{Int64} # offset + score::Int64 + window::Symbol + + function RBS(motif::SeqOrView{A}, offset::UnitRange{Int64}, score::Int64, windowsymbol::Symbol) where {A} + return new(motif, offset, score, windowsymbol) + end + # rbsinst = RBS(biore"RRR"dna, 3:4, 1.0) +end + +const RBSMOTIFS = Dict( + ExactSearchQuery(dna"GGAGGA", iscompatible) => 27, + ExactSearchQuery(dna"GGAGG", iscompatible) => 24, + ExactSearchQuery(dna"GAGGA", iscompatible) => 22, + ExactSearchQuery(dna"GGACGA", iscompatible) => 19, + ExactSearchQuery(dna"GGATGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAAGA", iscompatible) => 19, + ExactSearchQuery(dna"GGCGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGGGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGTGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAG", iscompatible) => 16, + ExactSearchQuery(dna"GAGG", iscompatible) => 16, + ExactSearchQuery(dna"AGGA", iscompatible) => 16, + ExactSearchQuery(dna"GGTGG", iscompatible) => 14, + ExactSearchQuery(dna"GGGGG", iscompatible) => 14, + ExactSearchQuery(dna"GGCGA", iscompatible) => 14, + ExactSearchQuery(dna"AGG", iscompatible) => 13, + ExactSearchQuery(dna"GAG", iscompatible) => 13, + ExactSearchQuery(dna"GGA", iscompatible) => 13, + ExactSearchQuery(dna"GAAGA", iscompatible) => 9, + ExactSearchQuery(dna"GATGA", iscompatible) => 9, + ExactSearchQuery(dna"GACGA", iscompatible) => 9, + # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif +) + +# export RBSMOTIFS, RBSQUERYMOTIGS +# RBSMOTIFS = ( +# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", +# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", +# dna"GAAGA", dna"GATGA", dna"GACGA" +# ) + +# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + + +# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + +# ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif + +# -20 -15 -10 -5 |-> start codon +# |....|....|....|....ATG... +#EX1. GGAGGACCCCATGACACACACAACAC +# |----|:27 + +## Window checking a: +# -16 -5 |-> start codon +# ...|..........|....ATG... + +## Window checking b: +# -10 -3 |-> start codon +# ...|......|..ATG... + +## Window checking c: +# -18 -11 |-> start codon +# ...|......|..........ATG... + +export _rbswindows +function _rbswindows(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + + windowa = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) + windowb = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) + windowc = orf.strand == STRAND_POS ? (orf.first-18:orf.first-11) : (orf.last+11:orf.last+18) + # This is not searching the correct RBS in the reverse complemente strand + # windows = (a = windowa, b = windowb, c = windowc) + windows = (windowa, windowb, windowc) + return windows#filter(window -> first(window) >= 1, windows) +end + +export findrbs +function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + # s = reverse(seq) + score = 0 + wsymb = (:a, :b, :c) + rbsvect = Vector{RBS}() + windows = _rbswindows(seq, orf) + + for i in 1:length(windows) + window = windows[i] + symbol = wsymb[i] + sqv = @view seq[window] + for (rbs, scr) in pairs(RBSMOTIFS) + if occursin(rbs, sqv) + push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) + # score += scr # Use the value associated with the key + end + end + end + return rbsvect +end + +function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + # Initialize the score and the max scores dictionary + wsymb = (:a, :b, :c) + windows = _rbswindows(seq, orf) + maxscores = Dict(:a => 0, :b => 0, :c => 0) + + # Iterate over the windows and their corresponding symbols + @inbounds for i in 1:length(windows) + window = windows[i] + symbol = wsymb[i] + sqv = @view seq[window] + + # Check for RBS motifs in the sequence view + @inbounds for (rbs, scr) in pairs(RBSMOTIFS) + if occursin(rbs, sqv) + # Update the max score directly + if scr > maxscores[symbol] + maxscores[symbol] = scr + end + end + end + end + + # Sum the maximum scores of each window + total_score = sum(values(maxscores)) + + return total_score +end + + +### Another implementation + + +export RBSMOTIFS2 +RBSMOTIFS2 = ( + RBS(dna"GGAGGA", 5:10, 27, :A), + RBS(dna"GGAGGA", 3:8, 26, :B), + RBS(dna"GGAGGA", 11:16, 25, :C), + RBS(dna"GGAGG", 5:9, 24, :D), + RBS(dna"GGAGG", 3:8, 23, :E), + RBS(dna"GAGGA", 5:9, 22, :F), + RBS(dna"GAGGA", 3:8, 21, :G), + RBS(dna"GAGGA", 11:16, 20, :H), + RBS(dna"GGACGA", 5:10, 19, :I), + RBS(dna"GGATGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 5:10, 19, :I), + RBS(dna"GGCGGA", 5:10, 19, :I), + RBS(dna"GGGGGA", 5:10, 19, :I), + RBS(dna"GGTGGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 3:9, 18, :J), + RBS(dna"GGTGGA", 3:9, 18, :J), + RBS(dna"GGAAGA", 11:17, 17, :K), + RBS(dna"GGTGGA", 11:17, 17, :K), + RBS(dna"GGAG", 5:9, 16, :L), + RBS(dna"GAGG", 5:9, 16, :L), + RBS(dna"AGGA", 5:9, 15, :M), + RBS(dna"GGTGG", 5:9, 14, :N), + RBS(dna"GGGGG", 5:9, 14, :N), + RBS(dna"GGCNGG", 5:9, 14, :N), + RBS(dna"AGG", 5:8, 13, :O), + RBS(dna"GAG", 5:8, 13, :O), + RBS(dna"GGA", 5:8, 13, :O), + RBS(dna"AGGA", 11:15, 12, :P), + RBS(dna"AGGA", 3:7, 11, :Q), + RBS(dna"GAGGA", 13:19, 10, :R), + RBS(dna"GAAGA", 5:10, 9, :S), + RBS(dna"GATGA", 5:10, 9, :S), + RBS(dna"GACGA", 5:10, 9, :S), + RBS(dna"GGTGG", 3:8, 8, :T), + RBS(dna"GGTGG", 11:16, 7, :U), + RBS(dna"AGG", 11:14, 6, :V), + RBS(dna"GAAGA", 3:8, 5, :W), + RBS(dna"GAAGA", 11:16, 4, :X), + RBS(dna"AGGA", 13:17, 3, :Y), + RBS(dna"AGG", 13:16, 2, :Z), + RBS(dna"GGAAGA", 13:19, 2, :Z), + RBS(dna"GGTGG", 13:18, 2, :Z), + RBS(dna"AGG", 3:6, 1, :Z) +) + +export motifseq +function motifseq(rbs::RBS) + return rbs.motif +end + +export RBSMOTIFS2QUERY +RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) + +export offset +function offset(rbs::RBS) + return rbs.offset +end + +function motifscore(seq::SeqOrView{A}) where {A} + score = 0 + for rbs in RBSMOTIFS2 + if occursin(rbs.motif, seq) + score += rbs.score + end + end + return score +end + + +export orf_rbs_score2 +function orf_rbs_score2(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + for i in 1:length(RBSMOTIFS2) + rbs = RBSMOTIFS2[i] + rbsq = RBSMOTIFS2QUERY[i] + println(rbs) + return findprev(rbsq, seq, orf.first) + end +end + + +# An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") +# and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) +# Use the memory layout of this findings to store also the frame, strand and view of the sequence. Figuring out the +# how the strand and the location will be defined in the memory layout is the next step. + +## Another idea would be to fastly count the number of ATG in a sequence twice. This will be the approx number of ORFs +## in the sequence.Them we can use other way to store the ORFs in the memory layout. \ No newline at end of file From bc923665dc2183bd852bafaad540c57c82a4c24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 29 Oct 2024 22:43:31 -0500 Subject: [PATCH 02/17] Refactor GeneFinder module to include RBS Scoring functionality --- src/GeneFinder.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 5c670ef..0344063 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -21,7 +21,10 @@ using BioSequences: reverse_complement, randdnaseq, ncbi_trans_table, - translate + translate, + + ExactSearchQuery, + iscompatible using BioMarkovChains: BioMarkovChain, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using IterTools: takewhile, iterated @@ -44,6 +47,9 @@ include("io.jl") include("utils.jl") include("extended.jl") +# RBS Scoring +include("rbs.jl") + # Precompiled workloads include("workload.jl") From 74f7a9955d3076a5a802bb97c51a7231dba72e35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 29 Oct 2024 22:43:26 -0500 Subject: [PATCH 03/17] Refactor RBS module to improve code organization and readability --- src/rbs.jl | 226 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 src/rbs.jl diff --git a/src/rbs.jl b/src/rbs.jl new file mode 100644 index 0000000..5d2905e --- /dev/null +++ b/src/rbs.jl @@ -0,0 +1,226 @@ +export RBS, RBSMOTIFS + +struct RBS + motif::SeqOrView # motif + offset::UnitRange{Int64} # offset + score::Int64 + window::Symbol + + function RBS(motif::SeqOrView{A}, offset::UnitRange{Int64}, score::Int64, windowsymbol::Symbol) where {A} + return new(motif, offset, score, windowsymbol) + end + # rbsinst = RBS(biore"RRR"dna, 3:4, 1.0) +end + +const RBSMOTIFS = Dict( + ExactSearchQuery(dna"GGAGGA", iscompatible) => 27, + ExactSearchQuery(dna"GGAGG", iscompatible) => 24, + ExactSearchQuery(dna"GAGGA", iscompatible) => 22, + ExactSearchQuery(dna"GGACGA", iscompatible) => 19, + ExactSearchQuery(dna"GGATGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAAGA", iscompatible) => 19, + ExactSearchQuery(dna"GGCGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGGGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGTGGA", iscompatible) => 19, + ExactSearchQuery(dna"GGAG", iscompatible) => 16, + ExactSearchQuery(dna"GAGG", iscompatible) => 16, + ExactSearchQuery(dna"AGGA", iscompatible) => 16, + ExactSearchQuery(dna"GGTGG", iscompatible) => 14, + ExactSearchQuery(dna"GGGGG", iscompatible) => 14, + ExactSearchQuery(dna"GGCGA", iscompatible) => 14, + ExactSearchQuery(dna"AGG", iscompatible) => 13, + ExactSearchQuery(dna"GAG", iscompatible) => 13, + ExactSearchQuery(dna"GGA", iscompatible) => 13, + ExactSearchQuery(dna"GAAGA", iscompatible) => 9, + ExactSearchQuery(dna"GATGA", iscompatible) => 9, + ExactSearchQuery(dna"GACGA", iscompatible) => 9, + # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif +) + +# export RBSMOTIFS, RBSQUERYMOTIGS +# RBSMOTIFS = ( +# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", +# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", +# dna"GAAGA", dna"GATGA", dna"GACGA" +# ) + +# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + + +# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + +# ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif + +# -20 -15 -10 -5 |-> start codon +# |....|....|....|....ATG... +#EX1. GGAGGACCCCATGACACACACAACAC +# |----|:27 + +## Window checking a: +# -16 -5 |-> start codon +# ...|..........|....ATG... + +## Window checking b: +# -10 -3 |-> start codon +# ...|......|..ATG... + +## Window checking c: +# -18 -11 |-> start codon +# ...|......|..........ATG... + +export _rbswindows +function _rbswindows(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + + windowa = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) + windowb = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) + windowc = orf.strand == STRAND_POS ? (orf.first-18:orf.first-11) : (orf.last+11:orf.last+18) + # This is not searching the correct RBS in the reverse complemente strand + # windows = (a = windowa, b = windowb, c = windowc) + windows = (windowa, windowb, windowc) + return windows#filter(window -> first(window) >= 1, windows) +end + +export findrbs +function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + # s = reverse(seq) + score = 0 + wsymb = (:a, :b, :c) + rbsvect = Vector{RBS}() + windows = _rbswindows(seq, orf) + + for i in 1:length(windows) + window = windows[i] + symbol = wsymb[i] + sqv = @view seq[window] + for (rbs, scr) in pairs(RBSMOTIFS) + if occursin(rbs, sqv) + push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) + # score += scr # Use the value associated with the key + end + end + end + return rbsvect +end + +function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + # Initialize the score and the max scores dictionary + wsymb = (:a, :b, :c) + windows = _rbswindows(seq, orf) + maxscores = Dict(:a => 0, :b => 0, :c => 0) + + # Iterate over the windows and their corresponding symbols + @inbounds for i in 1:length(windows) + window = windows[i] + symbol = wsymb[i] + sqv = @view seq[window] + + # Check for RBS motifs in the sequence view + @inbounds for (rbs, scr) in pairs(RBSMOTIFS) + if occursin(rbs, sqv) + # Update the max score directly + if scr > maxscores[symbol] + maxscores[symbol] = scr + end + end + end + end + + # Sum the maximum scores of each window + total_score = sum(values(maxscores)) + + return total_score +end + + +### Another implementation + + +export RBSMOTIFS2 +RBSMOTIFS2 = ( + RBS(dna"GGAGGA", 5:10, 27, :A), + RBS(dna"GGAGGA", 3:8, 26, :B), + RBS(dna"GGAGGA", 11:16, 25, :C), + RBS(dna"GGAGG", 5:9, 24, :D), + RBS(dna"GGAGG", 3:8, 23, :E), + RBS(dna"GAGGA", 5:9, 22, :F), + RBS(dna"GAGGA", 3:8, 21, :G), + RBS(dna"GAGGA", 11:16, 20, :H), + RBS(dna"GGACGA", 5:10, 19, :I), + RBS(dna"GGATGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 5:10, 19, :I), + RBS(dna"GGCGGA", 5:10, 19, :I), + RBS(dna"GGGGGA", 5:10, 19, :I), + RBS(dna"GGTGGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 3:9, 18, :J), + RBS(dna"GGTGGA", 3:9, 18, :J), + RBS(dna"GGAAGA", 11:17, 17, :K), + RBS(dna"GGTGGA", 11:17, 17, :K), + RBS(dna"GGAG", 5:9, 16, :L), + RBS(dna"GAGG", 5:9, 16, :L), + RBS(dna"AGGA", 5:9, 15, :M), + RBS(dna"GGTGG", 5:9, 14, :N), + RBS(dna"GGGGG", 5:9, 14, :N), + RBS(dna"GGCNGG", 5:9, 14, :N), + RBS(dna"AGG", 5:8, 13, :O), + RBS(dna"GAG", 5:8, 13, :O), + RBS(dna"GGA", 5:8, 13, :O), + RBS(dna"AGGA", 11:15, 12, :P), + RBS(dna"AGGA", 3:7, 11, :Q), + RBS(dna"GAGGA", 13:19, 10, :R), + RBS(dna"GAAGA", 5:10, 9, :S), + RBS(dna"GATGA", 5:10, 9, :S), + RBS(dna"GACGA", 5:10, 9, :S), + RBS(dna"GGTGG", 3:8, 8, :T), + RBS(dna"GGTGG", 11:16, 7, :U), + RBS(dna"AGG", 11:14, 6, :V), + RBS(dna"GAAGA", 3:8, 5, :W), + RBS(dna"GAAGA", 11:16, 4, :X), + RBS(dna"AGGA", 13:17, 3, :Y), + RBS(dna"AGG", 13:16, 2, :Z), + RBS(dna"GGAAGA", 13:19, 2, :Z), + RBS(dna"GGTGG", 13:18, 2, :Z), + RBS(dna"AGG", 3:6, 1, :Z) +) + +export motifseq +function motifseq(rbs::RBS) + return rbs.motif +end + +export RBSMOTIFS2QUERY +RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) + +export offset +function offset(rbs::RBS) + return rbs.offset +end + +function motifscore(seq::SeqOrView{A}) where {A} + score = 0 + for rbs in RBSMOTIFS2 + if occursin(rbs.motif, seq) + score += rbs.score + end + end + return score +end + + +export orf_rbs_score2 +function orf_rbs_score2(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} + for i in 1:length(RBSMOTIFS2) + rbs = RBSMOTIFS2[i] + rbsq = RBSMOTIFS2QUERY[i] + println(rbs) + return findprev(rbsq, seq, orf.first) + end +end + + +# An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") +# and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) +# Use the memory layout of this findings to store also the frame, strand and view of the sequence. Figuring out the +# how the strand and the location will be defined in the memory layout is the next step. + +## Another idea would be to fastly count the number of ATG in a sequence twice. This will be the approx number of ORFs +## in the sequence.Them we can use other way to store the ORFs in the memory layout. \ No newline at end of file From 4ecbd48441db6a560d53e4ba36e5231cac3681df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 29 Oct 2024 22:43:31 -0500 Subject: [PATCH 04/17] Refactor GeneFinder module to include RBS Scoring functionality --- src/GeneFinder.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 5c670ef..0344063 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -21,7 +21,10 @@ using BioSequences: reverse_complement, randdnaseq, ncbi_trans_table, - translate + translate, + + ExactSearchQuery, + iscompatible using BioMarkovChains: BioMarkovChain, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using IterTools: takewhile, iterated @@ -44,6 +47,9 @@ include("io.jl") include("utils.jl") include("extended.jl") +# RBS Scoring +include("rbs.jl") + # Precompiled workloads include("workload.jl") From 98db5dbaadc431eded5d42a2af7b10645f44f937 Mon Sep 17 00:00:00 2001 From: Camilo Garcia Date: Fri, 29 Nov 2024 17:41:32 -0500 Subject: [PATCH 05/17] Refactor RBS module to streamline functions and improve motif handling --- src/rbs.jl | 140 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 44 deletions(-) diff --git a/src/rbs.jl b/src/rbs.jl index 5d2905e..28345b1 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -37,18 +37,7 @@ const RBSMOTIFS = Dict( # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif ) -# export RBSMOTIFS, RBSQUERYMOTIGS -# RBSMOTIFS = ( -# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", -# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", -# dna"GAAGA", dna"GATGA", dna"GACGA" -# ) -# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) - - -# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) - # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif # -20 -15 -10 -5 |-> start codon @@ -69,29 +58,48 @@ const RBSMOTIFS = Dict( # ...|......|..........ATG... export _rbswindows -function _rbswindows(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} - - windowa = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) +function _rbswindows(orf::ORFI{N,F}) where {N,F} + windowa = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) windowb = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) windowc = orf.strand == STRAND_POS ? (orf.first-18:orf.first-11) : (orf.last+11:orf.last+18) - # This is not searching the correct RBS in the reverse complemente strand - # windows = (a = windowa, b = windowb, c = windowc) windows = (windowa, windowb, windowc) - return windows#filter(window -> first(window) >= 1, windows) + return filter(window -> first(window) >= 1 && last(window) >= 1, windows) end -export findrbs -function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# export findrbs +# function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# # s = reverse(seq) +# score = 0 +# wsymb = (:a, :b, :c) +# rbsvect = Vector{RBS}() +# windows = _rbswindows(orf) + +# for i in 1:length(windows) +# window = windows[i] +# symbol = wsymb[i] +# sqv = @view seq[window] +# for (rbs, scr) in pairs(RBSMOTIFS) +# if occursin(rbs, sqv) +# push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) +# # score += scr # Use the value associated with the key +# end +# end +# end +# return rbsvect +# end + +export _findrbs +function _findrbs(orf::ORFI{N,F}) where {N,F} # s = reverse(seq) score = 0 - wsymb = (:a, :b, :c) + wsymb = (:A, :B, :C) rbsvect = Vector{RBS}() - windows = _rbswindows(seq, orf) + windows = _rbswindows(orf) for i in 1:length(windows) window = windows[i] symbol = wsymb[i] - sqv = @view seq[window] + sqv = @view source(orf)[window] for (rbs, scr) in pairs(RBSMOTIFS) if occursin(rbs, sqv) push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) @@ -102,17 +110,48 @@ function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} return rbsvect end -function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# export orf_rbs_score +# function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# # Initialize the score and the max scores dictionary +# wsymb = (:a, :b, :c) +# windows = _rbswindows(orf) +# maxscores = Dict(:a => 0, :b => 0, :c => 0) + +# # Iterate over the windows and their corresponding symbols +# @inbounds for i in 1:length(windows) +# window = windows[i] +# symbol = wsymb[i] +# sqv = @view seq[window] + +# # Check for RBS motifs in the sequence view +# @inbounds for (rbs, scr) in pairs(RBSMOTIFS) +# if occursin(rbs, sqv) +# # Update the max score directly +# if scr > maxscores[symbol] +# maxscores[symbol] = scr +# end +# end +# end +# end + +# # Sum the maximum scores of each window +# total_score = sum(values(maxscores)) + +# return total_score +# end + +export _orf_rbs_score +function _orf_rbs_score(orf::ORFI{N,F}) where {N,F} # Initialize the score and the max scores dictionary wsymb = (:a, :b, :c) - windows = _rbswindows(seq, orf) + windows = _rbswindows(orf) maxscores = Dict(:a => 0, :b => 0, :c => 0) # Iterate over the windows and their corresponding symbols @inbounds for i in 1:length(windows) window = windows[i] symbol = wsymb[i] - sqv = @view seq[window] + sqv = @view source(orf)[window] # Check for RBS motifs in the sequence view @inbounds for (rbs, scr) in pairs(RBSMOTIFS) @@ -131,9 +170,28 @@ function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} return total_score end +export motifseq +function motifseq(rbs::RBS) + return rbs.motif +end + +export offset +function offset(rbs::RBS) + return rbs.offset +end ### Another implementation +# export RBSMOTIFS, RBSQUERYMOTIGS +# RBSMOTIFS = ( +# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", +# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", +# dna"GAAGA", dna"GATGA", dna"GACGA" +# ) + +# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) +# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + export RBSMOTIFS2 RBSMOTIFS2 = ( @@ -182,20 +240,8 @@ RBSMOTIFS2 = ( RBS(dna"AGG", 3:6, 1, :Z) ) -export motifseq -function motifseq(rbs::RBS) - return rbs.motif -end - -export RBSMOTIFS2QUERY -RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) - -export offset -function offset(rbs::RBS) - return rbs.offset -end - -function motifscore(seq::SeqOrView{A}) where {A} +export _motifscore +function _motifscore(seq::SeqOrView{A}) where {A} score = 0 for rbs in RBSMOTIFS2 if occursin(rbs.motif, seq) @@ -205,18 +251,24 @@ function motifscore(seq::SeqOrView{A}) where {A} return score end +export RBSMOTIFS2QUERY +RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) -export orf_rbs_score2 -function orf_rbs_score2(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +export _findrbs2 +function _findrbs2(orf::ORFI{N,F}) where {N,F} + rbsvect = Vector{RBS}()#Vector{RBS}() for i in 1:length(RBSMOTIFS2) rbs = RBSMOTIFS2[i] rbsq = RBSMOTIFS2QUERY[i] - println(rbs) - return findprev(rbsq, seq, orf.first) + match = findprev(rbsq, source(orf), i) #needs to deal with strand + if match != nothing + push!(rbsvect, RBS(rbs.motif, -(match, rbs.offset), rbs.score, rbs.window)) + println("Match found at: ", match) + end end + return rbsvect end - # An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") # and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) # Use the memory layout of this findings to store also the frame, strand and view of the sequence. Figuring out the From 17eb549e6dc3c49974af95052e46fe1a307493a7 Mon Sep 17 00:00:00 2001 From: Camilo Garcia Date: Fri, 29 Nov 2024 17:44:02 -0500 Subject: [PATCH 06/17] Refactor RBS module and introduce RBSMOTIFS2 for improved motif scoring functionality --- src/rbs.jl | 88 ----------------------------------------------------- src/rbs2.jl | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 88 deletions(-) create mode 100644 src/rbs2.jl diff --git a/src/rbs.jl b/src/rbs.jl index 28345b1..bbb910b 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -180,94 +180,6 @@ function offset(rbs::RBS) return rbs.offset end -### Another implementation - -# export RBSMOTIFS, RBSQUERYMOTIGS -# RBSMOTIFS = ( -# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", -# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", -# dna"GAAGA", dna"GATGA", dna"GACGA" -# ) - -# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) -# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) - - -export RBSMOTIFS2 -RBSMOTIFS2 = ( - RBS(dna"GGAGGA", 5:10, 27, :A), - RBS(dna"GGAGGA", 3:8, 26, :B), - RBS(dna"GGAGGA", 11:16, 25, :C), - RBS(dna"GGAGG", 5:9, 24, :D), - RBS(dna"GGAGG", 3:8, 23, :E), - RBS(dna"GAGGA", 5:9, 22, :F), - RBS(dna"GAGGA", 3:8, 21, :G), - RBS(dna"GAGGA", 11:16, 20, :H), - RBS(dna"GGACGA", 5:10, 19, :I), - RBS(dna"GGATGA", 5:10, 19, :I), - RBS(dna"GGAAGA", 5:10, 19, :I), - RBS(dna"GGCGGA", 5:10, 19, :I), - RBS(dna"GGGGGA", 5:10, 19, :I), - RBS(dna"GGTGGA", 5:10, 19, :I), - RBS(dna"GGAAGA", 3:9, 18, :J), - RBS(dna"GGTGGA", 3:9, 18, :J), - RBS(dna"GGAAGA", 11:17, 17, :K), - RBS(dna"GGTGGA", 11:17, 17, :K), - RBS(dna"GGAG", 5:9, 16, :L), - RBS(dna"GAGG", 5:9, 16, :L), - RBS(dna"AGGA", 5:9, 15, :M), - RBS(dna"GGTGG", 5:9, 14, :N), - RBS(dna"GGGGG", 5:9, 14, :N), - RBS(dna"GGCNGG", 5:9, 14, :N), - RBS(dna"AGG", 5:8, 13, :O), - RBS(dna"GAG", 5:8, 13, :O), - RBS(dna"GGA", 5:8, 13, :O), - RBS(dna"AGGA", 11:15, 12, :P), - RBS(dna"AGGA", 3:7, 11, :Q), - RBS(dna"GAGGA", 13:19, 10, :R), - RBS(dna"GAAGA", 5:10, 9, :S), - RBS(dna"GATGA", 5:10, 9, :S), - RBS(dna"GACGA", 5:10, 9, :S), - RBS(dna"GGTGG", 3:8, 8, :T), - RBS(dna"GGTGG", 11:16, 7, :U), - RBS(dna"AGG", 11:14, 6, :V), - RBS(dna"GAAGA", 3:8, 5, :W), - RBS(dna"GAAGA", 11:16, 4, :X), - RBS(dna"AGGA", 13:17, 3, :Y), - RBS(dna"AGG", 13:16, 2, :Z), - RBS(dna"GGAAGA", 13:19, 2, :Z), - RBS(dna"GGTGG", 13:18, 2, :Z), - RBS(dna"AGG", 3:6, 1, :Z) -) - -export _motifscore -function _motifscore(seq::SeqOrView{A}) where {A} - score = 0 - for rbs in RBSMOTIFS2 - if occursin(rbs.motif, seq) - score += rbs.score - end - end - return score -end - -export RBSMOTIFS2QUERY -RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) - -export _findrbs2 -function _findrbs2(orf::ORFI{N,F}) where {N,F} - rbsvect = Vector{RBS}()#Vector{RBS}() - for i in 1:length(RBSMOTIFS2) - rbs = RBSMOTIFS2[i] - rbsq = RBSMOTIFS2QUERY[i] - match = findprev(rbsq, source(orf), i) #needs to deal with strand - if match != nothing - push!(rbsvect, RBS(rbs.motif, -(match, rbs.offset), rbs.score, rbs.window)) - println("Match found at: ", match) - end - end - return rbsvect -end # An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") # and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) diff --git a/src/rbs2.jl b/src/rbs2.jl new file mode 100644 index 0000000..d90b929 --- /dev/null +++ b/src/rbs2.jl @@ -0,0 +1,88 @@ +### Another implementation + +# export RBSMOTIFS, RBSQUERYMOTIGS +# RBSMOTIFS = ( +# dna"GGAGGA", dna"GGAGG", dna"GAGGA", dna"GGACGA", dna"GGATGA", dna"GGAAGA", dna"GGCGGA", dna"GGGGGA", dna"GGTGGA", +# dna"GGAG", dna"GAGG", dna"AGGA", dna"GGTGG", dna"GGGGG", dna"GGCGG", dna"AGG", dna"GAG", dna"GGA", +# dna"GAAGA", dna"GATGA", dna"GACGA" +# ) + +# RBSQUERYMOTIFS = ExactSearchQuery.(RBSMOTIFS, iscompatible) +# RBSQUERYMOTIGS = ExactSearchQuery.(RBSMOTIFS, iscompatible) + + +export RBSMOTIFS2 +RBSMOTIFS2 = ( + RBS(dna"GGAGGA", 5:10, 27, :A), + RBS(dna"GGAGGA", 3:8, 26, :B), + RBS(dna"GGAGGA", 11:16, 25, :C), + RBS(dna"GGAGG", 5:9, 24, :D), + RBS(dna"GGAGG", 3:8, 23, :E), + RBS(dna"GAGGA", 5:9, 22, :F), + RBS(dna"GAGGA", 3:8, 21, :G), + RBS(dna"GAGGA", 11:16, 20, :H), + RBS(dna"GGACGA", 5:10, 19, :I), + RBS(dna"GGATGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 5:10, 19, :I), + RBS(dna"GGCGGA", 5:10, 19, :I), + RBS(dna"GGGGGA", 5:10, 19, :I), + RBS(dna"GGTGGA", 5:10, 19, :I), + RBS(dna"GGAAGA", 3:9, 18, :J), + RBS(dna"GGTGGA", 3:9, 18, :J), + RBS(dna"GGAAGA", 11:17, 17, :K), + RBS(dna"GGTGGA", 11:17, 17, :K), + RBS(dna"GGAG", 5:9, 16, :L), + RBS(dna"GAGG", 5:9, 16, :L), + RBS(dna"AGGA", 5:9, 15, :M), + RBS(dna"GGTGG", 5:9, 14, :N), + RBS(dna"GGGGG", 5:9, 14, :N), + RBS(dna"GGCNGG", 5:9, 14, :N), + RBS(dna"AGG", 5:8, 13, :O), + RBS(dna"GAG", 5:8, 13, :O), + RBS(dna"GGA", 5:8, 13, :O), + RBS(dna"AGGA", 11:15, 12, :P), + RBS(dna"AGGA", 3:7, 11, :Q), + RBS(dna"GAGGA", 13:19, 10, :R), + RBS(dna"GAAGA", 5:10, 9, :S), + RBS(dna"GATGA", 5:10, 9, :S), + RBS(dna"GACGA", 5:10, 9, :S), + RBS(dna"GGTGG", 3:8, 8, :T), + RBS(dna"GGTGG", 11:16, 7, :U), + RBS(dna"AGG", 11:14, 6, :V), + RBS(dna"GAAGA", 3:8, 5, :W), + RBS(dna"GAAGA", 11:16, 4, :X), + RBS(dna"AGGA", 13:17, 3, :Y), + RBS(dna"AGG", 13:16, 2, :Z), + RBS(dna"GGAAGA", 13:19, 2, :Z), + RBS(dna"GGTGG", 13:18, 2, :Z), + RBS(dna"AGG", 3:6, 1, :Z) +) + +export _motifscore +function _motifscore(seq::SeqOrView{A}) where {A} + score = 0 + for rbs in RBSMOTIFS2 + if occursin(rbs.motif, seq) + score += rbs.score + end + end + return score +end + +export RBSMOTIFS2QUERY +RBSMOTIFS2QUERY = ExactSearchQuery.(motifseq.(RBSMOTIFS2)) + +export _findrbs2 +function _findrbs2(orf::ORFI{N,F}) where {N,F} + rbsvect = Vector{RBS}()#Vector{RBS}() + for i in 1:length(RBSMOTIFS2) + rbs = RBSMOTIFS2[i] + rbsq = RBSMOTIFS2QUERY[i] + match = findprev(rbsq, source(orf), i) #needs to deal with strand + if match != nothing + push!(rbsvect, RBS(rbs.motif, -(match, rbs.offset), rbs.score, rbs.window)) + println("Match found at: ", match) + end + end + return rbsvect +end From 136618b8286008281d3b70f6aa536156a0d2c0a1 Mon Sep 17 00:00:00 2001 From: Camilo Garcia Date: Fri, 29 Nov 2024 17:50:15 -0500 Subject: [PATCH 07/17] Add commented-out filter for ORF scoring in RBS module --- src/rbs.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rbs.jl b/src/rbs.jl index bbb910b..0f227b1 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -180,6 +180,7 @@ function offset(rbs::RBS) return rbs.offset end +# filter(x -> _orf_rbs_score(x) > 20 && length(x) > 100, findorfs(phi)) # An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") # and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) From 73ec805341acfed629f3cba3f9ecb61410efbf46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 17:35:05 -0500 Subject: [PATCH 08/17] Refactor RBS module to enhance motif handling and introduce reverse motif scoring --- src/rbs.jl | 148 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 65 deletions(-) diff --git a/src/rbs.jl b/src/rbs.jl index 0f227b1..6a1e6e7 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -1,18 +1,20 @@ export RBS, RBSMOTIFS struct RBS - motif::SeqOrView # motif + motif::LongSubSeq # motif offset::UnitRange{Int64} # offset - score::Int64 window::Symbol + score::Int64 + strand::Strand - function RBS(motif::SeqOrView{A}, offset::UnitRange{Int64}, score::Int64, windowsymbol::Symbol) where {A} - return new(motif, offset, score, windowsymbol) + function RBS(motif::LongSubSeq{A}, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) where {A<:NucleicAcidAlphabet} + return new(motif, offset, windowsymbol, score, strand) end # rbsinst = RBS(biore"RRR"dna, 3:4, 1.0) end -const RBSMOTIFS = Dict( +## based on https://github.com/deprekate/PHANOTATE/blob/8b4e728171adc7d900ccbc59f9606bfd585e138c/phanotate_modules/functions.py#L48 +const FORWARDRBSMOTIFS = Dict( ExactSearchQuery(dna"GGAGGA", iscompatible) => 27, ExactSearchQuery(dna"GGAGG", iscompatible) => 24, ExactSearchQuery(dna"GAGGA", iscompatible) => 22, @@ -34,34 +36,67 @@ const RBSMOTIFS = Dict( ExactSearchQuery(dna"GAAGA", iscompatible) => 9, ExactSearchQuery(dna"GATGA", iscompatible) => 9, ExactSearchQuery(dna"GACGA", iscompatible) => 9, - # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif ) +const REVERSERBSMOTIFS = Dict( + ExactSearchQuery(dna"TCCTCC", iscompatible) => 27, + ExactSearchQuery(dna"CCTCC", iscompatible) => 24, + ExactSearchQuery(dna"TCCTC", iscompatible) => 22, + ExactSearchQuery(dna"TCGTCC", iscompatible) => 19, + ExactSearchQuery(dna"TCATCC", iscompatible) => 19, + ExactSearchQuery(dna"TTCCTC", iscompatible) => 19, + ExactSearchQuery(dna"TCCGCC", iscompatible) => 19, + ExactSearchQuery(dna"TCCCCT", iscompatible) => 19, + ExactSearchQuery(dna"AGGTCC", iscompatible) => 19, + ExactSearchQuery(dna"CTCC", iscompatible) => 16, + ExactSearchQuery(dna"CCTC", iscompatible) => 16, + ExactSearchQuery(dna"TCCT", iscompatible) => 16, + ExactSearchQuery(dna"GGTCC", iscompatible) => 14, + ExactSearchQuery(dna"CCCCC", iscompatible) => 14, + ExactSearchQuery(dna"AGCCG", iscompatible) => 14, + ExactSearchQuery(dna"CCT", iscompatible) => 13, + ExactSearchQuery(dna"CTC", iscompatible) => 13, + ExactSearchQuery(dna"TCC", iscompatible) => 13, + ExactSearchQuery(dna"TCTTC", iscompatible) => 9, + ExactSearchQuery(dna"TCATC", iscompatible) => 9, + ExactSearchQuery(dna"TCGTC", iscompatible) => 9, +) # ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif -# -20 -15 -10 -5 |-> start codon -# |....|....|....|....ATG... -#EX1. GGAGGACCCCATGACACACACAACAC -# |----|:27 +## based on the Prodigal paper: +# Spacer Range | Representative RBS Motif(s) +# ------------ | --------------------------- +# 3-4 bp | GGA, GAG, AGG, AGxAG, AGGAG +# 5-10 bp | GGAG, GAGG, GGxGG, AGGAGG +# 11-12 bp | GGA, GAG, AGG, AGxAG, AGGAG +# 13-15 bp | GGA, GAG, AGG, AGGA, AGGAG ## Window checking a: -# -16 -5 |-> start codon -# ...|..........|....ATG... - -## Window checking b: # -10 -3 |-> start codon # ...|......|..ATG... +## Window checking b: +# -16 -5 |-> start codon +# ...|..........|....ATG... + ## Window checking c: -# -18 -11 |-> start codon -# ...|......|..........ATG... +# -20 -11 |-> start codon +# .|........|..........ATG... + +## Example: +# ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif + +# -20 -15 -10 -5 |-> start codon +# |....|....|....|....ATG... +#EX1. GGAGGACCCCATGACACACACAACAC +# |----|:27 export _rbswindows function _rbswindows(orf::ORFI{N,F}) where {N,F} - windowa = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) - windowb = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) - windowc = orf.strand == STRAND_POS ? (orf.first-18:orf.first-11) : (orf.last+11:orf.last+18) + windowa = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) + windowb = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) + windowc = orf.strand == STRAND_POS ? (orf.first-20:orf.first-11) : (orf.last+11:orf.last+20) windows = (windowa, windowb, windowc) return filter(window -> first(window) >= 1 && last(window) >= 1, windows) end @@ -91,71 +126,54 @@ end export _findrbs function _findrbs(orf::ORFI{N,F}) where {N,F} # s = reverse(seq) - score = 0 + # score = 0 wsymb = (:A, :B, :C) rbsvect = Vector{RBS}() windows = _rbswindows(orf) - for i in 1:length(windows) + # Iterate over the windows and their corresponding symbols + for i in 1:3#length(windows) window = windows[i] symbol = wsymb[i] - sqv = @view source(orf)[window] - for (rbs, scr) in pairs(RBSMOTIFS) - if occursin(rbs, sqv) - push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) - # score += scr # Use the value associated with the key + wsqv = view(source(orf), window) + ## here we can apply a logic to decid whether to use the reverse motif or the forward motif + motifs = orf.strand == STRAND_POS ? FORWARDRBSMOTIFS : REVERSERBSMOTIFS + for (rbs, scr) in pairs(motifs) + motifranges::Vector{UnitRange{Int}} = findall(rbs, wsqv) + if !isempty(motifranges) + for motifrange in motifranges + offset = (first(window) + first(motifrange) - 1):(first(window) + last(motifrange) - 1) + # rbseq = orf.strand == STRAND_POS ? view(source(orf), offset) : reverse_complement(view(source(orf), offset)) + rbseq = view(wsqv,motifrange) + push!(rbsvect, RBS(rbseq, offset, symbol, scr, orf.strand)) + end end + # if occursin(rbs, sqv) + # push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) + # # score += scr # Use the value associated with the key + # end end end return rbsvect end -# export orf_rbs_score -# function orf_rbs_score(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} -# # Initialize the score and the max scores dictionary -# wsymb = (:a, :b, :c) -# windows = _rbswindows(orf) -# maxscores = Dict(:a => 0, :b => 0, :c => 0) - -# # Iterate over the windows and their corresponding symbols -# @inbounds for i in 1:length(windows) -# window = windows[i] -# symbol = wsymb[i] -# sqv = @view seq[window] - -# # Check for RBS motifs in the sequence view -# @inbounds for (rbs, scr) in pairs(RBSMOTIFS) -# if occursin(rbs, sqv) -# # Update the max score directly -# if scr > maxscores[symbol] -# maxscores[symbol] = scr -# end -# end -# end -# end - -# # Sum the maximum scores of each window -# total_score = sum(values(maxscores)) - -# return total_score -# end - -export _orf_rbs_score -function _orf_rbs_score(orf::ORFI{N,F}) where {N,F} +export orf_rbs_score +function orf_rbs_score(orf::ORFI{N,F}) where {N,F} # Initialize the score and the max scores dictionary wsymb = (:a, :b, :c) windows = _rbswindows(orf) maxscores = Dict(:a => 0, :b => 0, :c => 0) # Iterate over the windows and their corresponding symbols - @inbounds for i in 1:length(windows) + @inbounds for i in 1:3 window = windows[i] symbol = wsymb[i] - sqv = @view source(orf)[window] + wsqv = view(source(orf), window) + motifs = orf.strand == STRAND_POS ? FORWARDRBSMOTIFS : REVERSERBSMOTIFS # Check for RBS motifs in the sequence view - @inbounds for (rbs, scr) in pairs(RBSMOTIFS) - if occursin(rbs, sqv) + @inbounds for (rbs, scr) in pairs(motifs) + if occursin(rbs, wsqv) # Update the max score directly if scr > maxscores[symbol] maxscores[symbol] = scr @@ -165,9 +183,9 @@ function _orf_rbs_score(orf::ORFI{N,F}) where {N,F} end # Sum the maximum scores of each window - total_score = sum(values(maxscores)) + totscore = sum(values(maxscores)) - return total_score + return totscore end export motifseq @@ -180,7 +198,7 @@ function offset(rbs::RBS) return rbs.offset end -# filter(x -> _orf_rbs_score(x) > 20 && length(x) > 100, findorfs(phi)) +# filter(x -> orf_rbs_score(x) > 9 && length(x) > 100, findorfs(phi)) # An idea of another implemetation of the a orf finder would be to use qstart = ExactSearchQuery(dna"ATG") # and then findall the start codons in the sequence and reverse sequence appended (seq * reverse_complement(seq)) From 82a3141f49b08b3e2c21ff554175c0572d162dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 17:42:48 -0500 Subject: [PATCH 09/17] Enhance RBS module with detailed documentation for RBS structure and scoring functions --- src/rbs.jl | 150 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 123 insertions(+), 27 deletions(-) diff --git a/src/rbs.jl b/src/rbs.jl index 6a1e6e7..e2158e5 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -1,5 +1,24 @@ -export RBS, RBSMOTIFS +export RBS, orf_rbs_score, motifseq, offset +public _rbswindows, _findrbs + +""" + RBS(motif::LongSubSeq, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) + +Represents a Ribosome Binding Site (RBS) in a DNA sequence. + +# Fields +- `motif::LongSubSeq`: The nucleotide sequence of the RBS motif +- `offset::UnitRange{Int64}`: The position range of the RBS in the sequence +- `window::Symbol`: Symbol indicating the window type or region where the RBS was found +- `score::Int64`: Numerical score indicating the strength or confidence of the RBS prediction +- `strand::Strand`: Indicates whether the RBS is on the forward or reverse strand + +# Constructor + RBS(motif::LongSubSeq{A}, offset::UnitRange{Int64}, windowsymbol::Symbol, score::Int64, strand::Strand) where {A<:NucleicAcidAlphabet} + +Creates a new RBS instance with the specified parameters. +""" struct RBS motif::LongSubSeq # motif offset::UnitRange{Int64} # offset @@ -62,7 +81,6 @@ const REVERSERBSMOTIFS = Dict( ExactSearchQuery(dna"TCGTC", iscompatible) => 9, ) -# ExactSearchQuery(dna"ATACG", iscompatible) => 5, ## testing motif ## based on the Prodigal paper: # Spacer Range | Representative RBS Motif(s) @@ -90,9 +108,32 @@ const REVERSERBSMOTIFS = Dict( # -20 -15 -10 -5 |-> start codon # |....|....|....|....ATG... #EX1. GGAGGACCCCATGACACACACAACAC -# |----|:27 +# |----|:RBS(dna"GGAGGA", 1:5, :A, 27, STRAND_POS) + +""" + _rbswindows(orf::ORFI{N,F}) where {N,F} -> Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}} + +Generate potential ribosome binding site (RBS) windows for a given open reading frame (ORF). -export _rbswindows +Returns a tuple of UnitRanges representing three possible RBS windows relative to the ORF's start/end position, +filtering out any windows with invalid positions (< 1). + +For positive strand ORFs: +- Window A: -10 to -3 positions upstream of start +- Window B: -16 to -5 positions upstream of start +- Window C: -20 to -11 positions upstream of start + +For negative strand ORFs: +- Window A: +3 to +10 positions downstream of end +- Window B: +5 to +16 positions downstream of end +- Window C: +11 to +20 positions downstream of end + +# Arguments +- `orf::ORFI{N,F}`: An ORF instance containing strand and position information + +# Returns +- Tuple of valid UnitRanges representing RBS windows +""" function _rbswindows(orf::ORFI{N,F}) where {N,F} windowa = orf.strand == STRAND_POS ? (orf.first-10:orf.first-3) : (orf.last+3:orf.last+10) windowb = orf.strand == STRAND_POS ? (orf.first-16:orf.first-5) : (orf.last+5:orf.last+16) @@ -101,29 +142,36 @@ function _rbswindows(orf::ORFI{N,F}) where {N,F} return filter(window -> first(window) >= 1 && last(window) >= 1, windows) end -# export findrbs -# function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} -# # s = reverse(seq) -# score = 0 -# wsymb = (:a, :b, :c) -# rbsvect = Vector{RBS}() -# windows = _rbswindows(orf) -# for i in 1:length(windows) -# window = windows[i] -# symbol = wsymb[i] -# sqv = @view seq[window] -# for (rbs, scr) in pairs(RBSMOTIFS) -# if occursin(rbs, sqv) -# push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) -# # score += scr # Use the value associated with the key -# end -# end -# end -# return rbsvect -# end +""" + _findrbs(orf::ORFI{N,F}) where {N,F} -> Vector{RBS} + +Search for Ribosome Binding Sites (RBS) in the given Open Reading Frame (ORF). -export _findrbs +This function analyzes potential RBS locations in three windows upstream of the ORF start codon. +For each window, it searches for predefined RBS motifs (either forward or reverse depending on +the strand orientation) and records their locations and scores. + +# Arguments +- `orf::ORFI{N,F}`: An Open Reading Frame Interface object containing sequence and strand information + +# Returns +- `Vector{RBS}`: A vector of RBS objects, each containing: + - The RBS sequence + - The position range in the source sequence + - A window symbol (`:A`, `:B`, or `:C`) + - A score value + - The strand orientation + +# Implementation Details +- Searches in three windows upstream of the ORF +- Uses different motif sets for positive and negative strands +- Identifies all occurrences of RBS motifs in each window +- Records both the sequence and its position information + +# Note +This is an internal and public function as indicated by the leading underscore. +""" function _findrbs(orf::ORFI{N,F}) where {N,F} # s = reverse(seq) # score = 0 @@ -157,7 +205,31 @@ function _findrbs(orf::ORFI{N,F}) where {N,F} return rbsvect end -export orf_rbs_score +""" + orf_rbs_score(orf::ORFI{N,F}) where {N,F} -> Float64 + +Calculate a ribosome binding site (RBS) score for a given open reading frame (ORF). + +This function evaluates potential ribosome binding sites in three windows upstream of +the ORF start codon by searching for known RBS motifs. It returns a total score based +on the strongest motif found in each window. + +# Arguments +- `orf::ORFI{N,F}`: An open reading frame interface object containing sequence and position information + +# Returns +- `Float64`: The sum of the maximum RBS motif scores found in each window + +# Details +The function: +1. Divides the upstream region into three windows (a, b, c) +2. Searches each window for matching RBS motifs +3. Keeps track of the highest scoring motif in each window +4. Returns the sum of the maximum scores across all windows + +RBS motifs and their scores are predefined in `FORWARDRBSMOTIFS` for positive strand +and `REVERSERBSMOTIFS` for negative strand sequences. +""" function orf_rbs_score(orf::ORFI{N,F}) where {N,F} # Initialize the score and the max scores dictionary wsymb = (:a, :b, :c) @@ -206,4 +278,28 @@ end # how the strand and the location will be defined in the memory layout is the next step. ## Another idea would be to fastly count the number of ATG in a sequence twice. This will be the approx number of ORFs -## in the sequence.Them we can use other way to store the ORFs in the memory layout. \ No newline at end of file +## in the sequence.Them we can use other way to store the ORFs in the memory layout. + +## Another idea would be to use the findall function to find all the ATG in the sequence and then use the memory layout + +# export findrbs +# function findrbs(seq::SeqOrView{A}, orf::ORFI{N,F}) where {A,N,F} +# # s = reverse(seq) +# score = 0 +# wsymb = (:a, :b, :c) +# rbsvect = Vector{RBS}() +# windows = _rbswindows(orf) + +# for i in 1:length(windows) +# window = windows[i] +# symbol = wsymb[i] +# sqv = @view seq[window] +# for (rbs, scr) in pairs(RBSMOTIFS) +# if occursin(rbs, sqv) +# push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) +# # score += scr # Use the value associated with the key +# end +# end +# end +# return rbsvect +# end \ No newline at end of file From d34de0d449c2f92d84a840251f2e98a4b6fd8511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:03:29 -0500 Subject: [PATCH 10/17] Remove log-odds ratio decision rule and iscoding functions from criteria and iscoding modules --- src/criteria/lordr.jl | 63 ------------------------------------------- src/iscoding.jl | 31 --------------------- 2 files changed, 94 deletions(-) delete mode 100644 src/criteria/lordr.jl delete mode 100644 src/iscoding.jl diff --git a/src/criteria/lordr.jl b/src/criteria/lordr.jl deleted file mode 100644 index d403c7b..0000000 --- a/src/criteria/lordr.jl +++ /dev/null @@ -1,63 +0,0 @@ -export log_odds_ratio_decision_rule, lordr #lors - -@doc raw""" - log_odds_ratio_decision_rule( - sequence::LongSequence{DNAAlphabet{4}}; - modela::BioMarkovChain, - modelb::BioMarkovChain, - η::Float64 = 1e-5 - ) - -Check if a given DNA sequence is likely to be coding based on a log-odds ratio. - The log-odds ratio is a statistical measure used to assess the likelihood of a sequence being coding or non-coding. It compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model. If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. - It is formally described as a decision rule: - -```math -S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, X_T=i_T)}} \right) \begin{cases} > \eta & \Rightarrow \text{{coding}} \\ < \eta & \Rightarrow \text{{noncoding}} \end{cases} -``` - -# Arguments -- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The DNA sequence to be evaluated. - -## Keyword Arguments -- `codingmodel::BioMarkovChain`: The transition model for coding regions, (default: `ECOLICDS`). -- `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions, (default: `ECOLINOCDS`) -- `b::Number = 2`: The base of the logarithm used to calculate the log-odds ratio (default: 2). -- `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). - -# Returns -- `true` if the sequence is likely to be coding. -- `false` if the sequence is likely to be non-coding. - -# Raises -- `ErrorException`: if the length of the sequence is not divisible by 3. -- `ErrorException`: if the sequence contains a premature stop codon. - -# Example - -``` -sequence = dna"ATGGCATCTAG" -iscoding(sequence) # Returns: true or false -``` -""" -function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - modela::BioMarkovChain = ECOLICDS, - modelb::BioMarkovChain = ECOLINOCDS, - b::Number = 2, - η::Float64 = 5e-3 -) where {N} - - scorea = log_odds_ratio_score(sequence; modela=modela, b=b) - scoreb = log_odds_ratio_score(sequence; modela=modelb, b=b) - - logodds = scorea / scoreb - - if logodds > η - return true - else - false - end -end - -const log_odds_ratio_decision_rule = lordr # criteria \ No newline at end of file diff --git a/src/iscoding.jl b/src/iscoding.jl deleted file mode 100644 index 803099c..0000000 --- a/src/iscoding.jl +++ /dev/null @@ -1,31 +0,0 @@ -export iscoding - -@doc raw""" - iscoding(sequence::LongSequence{DNAAlphabet{4}}; criteria::Function = lordr, kwargs...) -> Bool - -Check if a given DNA sequence is likely to be coding based on a scoring scheme. - -``` -sequence = dna"ATGGCATCTAG" -iscoding(sequence) # Returns: true or false -``` -""" -function iscoding end - -function iscoding( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - criteria::Function = lordr, - kwargs... -) where {N} - return criteria(sequence; kwargs...) -end - -function iscoding( - orf::ORFI{F}; - criteria::Function = lordr, - kwargs... -) where {F<:GeneFinderMethod} - return criteria(sequence(orf); kwargs...) -end - -# iscoding.(seq[i for i in allorfs]) \ No newline at end of file From e9dd91bd734f63fc457acdfe458e2e1374c1faed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:03:34 -0500 Subject: [PATCH 11/17] Update Julia version and dependencies in Manifest.toml --- docs/Manifest.toml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index f580a1b..7dc19e0 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,6 +1,6 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.11.1" +julia_version = "1.11.2" manifest_format = "2.0" project_hash = "a737ae23847074aa3f8c65ffa51f2732c1aeb154" @@ -107,9 +107,9 @@ version = "0.9.3" [[deps.Documenter]] deps = ["ANSIColoredPrinters", "AbstractTrees", "Base64", "CodecZlib", "Dates", "DocStringExtensions", "Downloads", "Git", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "MarkdownAST", "Pkg", "PrecompileTools", "REPL", "RegistryInstances", "SHA", "TOML", "Test", "Unicode"] -git-tree-sha1 = "5a1ee886566f2fa9318df1273d8b778b9d42712d" +git-tree-sha1 = "d0ea2c044963ed6f37703cead7e29f70cba13d7e" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "1.7.0" +version = "1.8.0" [[deps.DocumenterVitepress]] deps = ["ANSIColoredPrinters", "Base64", "DocStringExtensions", "Documenter", "IOCapture", "Markdown", "NodeJS_20_jll", "REPL"] @@ -130,15 +130,15 @@ version = "1.6.0" [[deps.ExceptionUnwrapping]] deps = ["Test"] -git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a" +git-tree-sha1 = "d36f682e590a83d63d1c7dbd287573764682d12a" uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4" -version = "0.1.10" +version = "0.1.11" [[deps.Expat_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "1c6317308b9dc757616f0b5cb379db10494443a7" +git-tree-sha1 = "e51db81749b0777b2147fbe7b783ee79045b8e99" uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.6.2+0" +version = "2.6.4+1" [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" @@ -164,15 +164,15 @@ version = "1.3.1" [[deps.Git_jll]] deps = ["Artifacts", "Expat_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Libiconv_jll", "OpenSSL_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "ea372033d09e4552a04fd38361cd019f9003f4f4" +git-tree-sha1 = "399f4a308c804b446ae4c91eeafadb2fe2c54ff9" uuid = "f8c6e375-362e-5223-8a59-34ff63f689eb" -version = "2.46.2+0" +version = "2.47.1+0" [[deps.HTTP]] -deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] -git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d" +deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "PrecompileTools", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] +git-tree-sha1 = "6c22309e9a356ac1ebc5c8a217045f9bae6f8d9a" uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "1.10.9" +version = "1.10.13" [[deps.IOCapture]] deps = ["Logging", "Random"] @@ -208,9 +208,9 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.4" [[deps.LazilyInitializedFields]] -git-tree-sha1 = "8f7f3cabab0fd1800699663533b6d5cb3fc0e612" +git-tree-sha1 = "0f2da712350b020bc3957f269c9caad516383ee0" uuid = "0e77f7df-68c5-4e49-93ce-4cd80f5598bf" -version = "1.2.2" +version = "1.3.0" [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] @@ -321,9 +321,9 @@ uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" version = "3.0.15+1" [[deps.OrderedCollections]] -git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" +git-tree-sha1 = "12f1439c4f986bb868acda6ea33ebc78e19b95ad" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.3" +version = "1.7.0" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] From b883807c5410341a08deea8c7885afb7b04d9a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:03:45 -0500 Subject: [PATCH 12/17] Comment out log-odds ratio decision rule and related exports in extended.jl --- src/extended.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/extended.jl b/src/extended.jl index 625cadd..36fd21e 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -14,9 +14,9 @@ import BioSequences: translate ## Methods from BioMarkovChains that expand their fuctions to this package structs import BioMarkovChains: log_odds_ratio_score -export log_odds_ratio_score, lors +# export log_odds_ratio_score, lors @inline log_odds_ratio_score(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_score(sequence(orf); kwargs...) -@inline log_odds_ratio_decision_rule(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_decision_rule(sequence(orf); kwargs) +# @inline log_odds_ratio_decision_rule(orf::ORFI{N,F}; kwargs...) where {N,F} = log_odds_ratio_decision_rule(sequence(orf); kwargs) -const lors = log_odds_ratio_score +# const lors = log_odds_ratio_score From b8d8eb07f293d720734a6714365d982899697725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:03:49 -0500 Subject: [PATCH 13/17] Add iscoding and decision rule functions to criteria module --- src/criteria.jl | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 src/criteria.jl diff --git a/src/criteria.jl b/src/criteria.jl new file mode 100644 index 0000000..8bd5f26 --- /dev/null +++ b/src/criteria.jl @@ -0,0 +1,126 @@ +export iscoding, + log_odds_ratio_decision_rule, lordr, + ribsome_binding_site_decision_rule, rbsdr + + + @doc raw""" + iscoding(orf::ORFI{N,F}; criteria::Function = lordr, kwargs...) -> Bool + +Check if the given DNA sequence of an ORF is likely to be coding based on a scoring scheme/function. + +## Scoring Criteria/Functions +- `lordr`: Log-Odds Ratio Decision Rule +- `rbsdr`: Ribosome Binding Site Decision Rule + +``` +phi = dna"GTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAAT...AGTGTTTCCAGTCCGTTCAGTTAATAGTCAGGTTAAAGATAAAAGATTGA" +orfs = findorfs(phi) + +orf = orfs[2] + ORFI{NaiveFinder}(94:126, '-', 1) + +iscoding(orf) # Returns: true +``` +""" +function iscoding( + orf::ORFI{N,F}; + criteria::Function = lordr, #rbsdr + kwargs... +) where {N,F<:GeneFinderMethod} + return criteria(orf; kwargs...) +end + +### Actual criteria functions ### + +## Log-Odds Ratio Decision Rule + +@doc raw""" + log_odds_ratio_decision_rule( + sequence::LongSequence{DNAAlphabet{4}}; + modela::BioMarkovChain, + modelb::BioMarkovChain, + η::Float64 = 1e-5 + ) + +Check if a given DNA sequence is likely to be coding based on a log-odds ratio. + The log-odds ratio is a statistical measure used to assess the likelihood of a sequence being coding or non-coding. It compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model. If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. + It is formally described as a decision rule: + +```math +S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, X_T=i_T)}} \right) \begin{cases} > \eta & \Rightarrow \text{{coding}} \\ < \eta & \Rightarrow \text{{noncoding}} \end{cases} +``` + +# Arguments +- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The DNA sequence to be evaluated. + +## Keyword Arguments +- `codingmodel::BioMarkovChain`: The transition model for coding regions, (default: `ECOLICDS`). +- `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions, (default: `ECOLINOCDS`) +- `b::Number = 2`: The base of the logarithm used to calculate the log-odds ratio (default: 2). +- `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). + +# Returns +- `true` if the sequence is likely to be coding. +- `false` if the sequence is likely to be non-coding. + +# Raises +- `ErrorException`: if the length of the sequence is not divisible by 3. +- `ErrorException`: if the sequence contains a premature stop codon. + +# Example + +``` +sequence = dna"ATGGCATCTAG" +iscoding(sequence) # Returns: true or false +``` +""" +function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr + orf::ORFI{N,F}; + modela::BioMarkovChain = ECOLICDS, + modelb::BioMarkovChain = ECOLINOCDS, + b::Number = 2, + η::Float64 = 5e-3 +) where {N,F<:GeneFinderMethod} + orfseq = sequence(orf) + scorea = log_odds_ratio_score(orfseq; modela=modela, b=b) + scoreb = log_odds_ratio_score(orfseq; modela=modelb, b=b) + + logodds = scorea / scoreb + + if logodds > η + return true + else + false + end +end + +const log_odds_ratio_decision_rule = lordr # criteria + +## Ribosome Binding Site Decision Rule + +""" + ribsome_binding_site_decision_rule(orf::ORFI{N,F}) where {N,F<:GeneFinderMethod} -> Bool + +Evaluates if an Open Reading Frame (ORF) has a significant ribosome binding site (RBS). + +The function uses the `orf_rbs_score` to calculate a score for the ORF's RBS region and returns +true if the score exceeds a threshold of 9, indicating the presence of at least one RBS. + +# Arguments +- `orf::ORFI{N,F}`: An Open Reading Frame Interface (ORFI) object parameterized by N and F, + where F is a subtype of GeneFinderMethod + +# Returns +- `Bool`: `true` if the RBS score is greater than 9, `false` otherwise + +""" +function rbsdr(orf::ORFI{N,F}) where {N,F<:GeneFinderMethod} + scr = orf_rbs_score(orf) + if scr > 9 # at least one RBS... + return true + else + return false + end +end + +const ribsome_binding_site_decision_rule = rbsdr \ No newline at end of file From f5379c9c0b1bdfdde939036656b0961592163b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:03:55 -0500 Subject: [PATCH 14/17] Refactor GeneFinder and RBS modules by removing unused criteria and cleaning up code --- src/GeneFinder.jl | 7 +++---- src/rbs.jl | 7 ------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 0344063..909d8af 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -34,13 +34,9 @@ using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, Strand, summary include("algorithms/naivefinder.jl") include("algorithms/naivecollector.jl") -# Coding Criteria -include("criteria/lordr.jl") - # Main functions include("types.jl") include("findorfs.jl") -include("iscoding.jl") include("io.jl") # Utils and extended functions @@ -50,6 +46,9 @@ include("extended.jl") # RBS Scoring include("rbs.jl") +# Coding Criteria +include("criteria.jl") + # Precompiled workloads include("workload.jl") diff --git a/src/rbs.jl b/src/rbs.jl index e2158e5..9b2f4ba 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -1,5 +1,4 @@ export RBS, orf_rbs_score, motifseq, offset - public _rbswindows, _findrbs """ @@ -81,7 +80,6 @@ const REVERSERBSMOTIFS = Dict( ExactSearchQuery(dna"TCGTC", iscompatible) => 9, ) - ## based on the Prodigal paper: # Spacer Range | Representative RBS Motif(s) # ------------ | --------------------------- @@ -142,7 +140,6 @@ function _rbswindows(orf::ORFI{N,F}) where {N,F} return filter(window -> first(window) >= 1 && last(window) >= 1, windows) end - """ _findrbs(orf::ORFI{N,F}) where {N,F} -> Vector{RBS} @@ -196,10 +193,6 @@ function _findrbs(orf::ORFI{N,F}) where {N,F} push!(rbsvect, RBS(rbseq, offset, symbol, scr, orf.strand)) end end - # if occursin(rbs, sqv) - # push!(rbsvect, RBS(rbs.seq, window, scr, symbol)) - # # score += scr # Use the value associated with the key - # end end end return rbsvect From 15b1724a8cf71e96f2b7283c319c6405ee36e975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:14:34 -0500 Subject: [PATCH 15/17] Rename orf_rbs_score function to orbs and update return type to Int64; export orbs in RBS module --- src/rbs.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/rbs.jl b/src/rbs.jl index 9b2f4ba..c68c6be 100644 --- a/src/rbs.jl +++ b/src/rbs.jl @@ -1,4 +1,4 @@ -export RBS, orf_rbs_score, motifseq, offset +export RBS, orf_rbs_score, orbs, motifseq, offset public _rbswindows, _findrbs """ @@ -199,7 +199,7 @@ function _findrbs(orf::ORFI{N,F}) where {N,F} end """ - orf_rbs_score(orf::ORFI{N,F}) where {N,F} -> Float64 + orf_rbs_score(orf::ORFI{N,F}) where {N,F} -> Int64 Calculate a ribosome binding site (RBS) score for a given open reading frame (ORF). @@ -223,7 +223,7 @@ The function: RBS motifs and their scores are predefined in `FORWARDRBSMOTIFS` for positive strand and `REVERSERBSMOTIFS` for negative strand sequences. """ -function orf_rbs_score(orf::ORFI{N,F}) where {N,F} +function orbs(orf::ORFI{N,F}) where {N,F} # Initialize the score and the max scores dictionary wsymb = (:a, :b, :c) windows = _rbswindows(orf) @@ -253,6 +253,8 @@ function orf_rbs_score(orf::ORFI{N,F}) where {N,F} return totscore end +const orf_rbs_score = orbs + export motifseq function motifseq(rbs::RBS) return rbs.motif From 9e38c6f96cfd1099db313763660337146453de02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:33:01 -0500 Subject: [PATCH 16/17] Add documentation for Ribosome Binding Site (RBS) motifs and scoring in rbs.md; update make.jl to include RBS section --- docs/make.jl | 1 + docs/src/rbs.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 docs/src/rbs.md diff --git a/docs/make.jl b/docs/make.jl index e5727db..fecf53e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,7 @@ pgs = [ "The ORF type" => "orftype.md", "Scoring ORFs" => "features.md", "A Simple Coding Rule" => "simplecodingrule.md", + "Ribosome Binding Sites" => "rbsrule.md", "Writing ORFs In Files" => "iodocs.md", ], "API" => "api.md", diff --git a/docs/src/rbs.md b/docs/src/rbs.md new file mode 100644 index 0000000..0aa3b9f --- /dev/null +++ b/docs/src/rbs.md @@ -0,0 +1,82 @@ +## Ribosome Binding Site (RBS) Motifs + +This document provides information on Ribosome Binding Site (RBS) motifs based on the Prodigal paper. The RBS motifs are categorized by spacer ranges, which indicate the number of base pairs (bp) between the RBS and the start codon. + +### Spacer Ranges and Representative RBS Motifs + +Example sequence with RBS motif: + + Spacer Range | Representative RBS Motif(s) + ------------ | --------------------------- + 3-4 bp | GGA, GAG, AGG, AGxAG, AGGAG + 5-10 bp | GGAG, GAGG, GGxGG, AGGAGG + 11-12 bp | GGA, GAG, AGG, AGxAG, AGGAG + 13-15 bp | GGA, GAG, AGG, AGGA, AGGAG + +### Window Checking Ranges + +Three different window checking ranges are defined to identify potential RBS motifs relative to the start codon (ATG): + +- Window Checking A: +``` + -10 -3 |-> start codon + ...|......|..ATG... +``` + +- Window Checking B: +``` + -16 -5 |-> start codon + ...|..........|....ATG... +``` + +- Window Checking C: +``` + -20 -11 |-> start codon + .|........|..........ATG... +``` + +### RBS Motif Scoring + +The RBS motif scoring is based on the Prodigal paper, which uses a scoring system to identify potential RBS motifs. The scoring system is as follows: + + + +### Example + +An example of an exact search query for an RBS motif is provided: + +- **ExactSearchQuery(dna"ATACG", iscompatible)** returns 5, indicating the position of the motif. + +``` + -20 -15 -10 -5 |-> start codon + |....|....|....|....ATG... + GGAGGACCCCATGACACACACAACAC + |----|:RBS(dna"GGAGGA", 1:5, :A, 27, STRAND_POS) +``` + +To inspect the RBS motifs of a given ORF, we can use the `_findrbs` function. This function returns the RBS motifs of a given ORF: + +```julia + +phi = dna"GTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTATATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTGCTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTTCTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTTATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGCCCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAACCTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGT ACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGA" + +orf = findorfs(phi)[3] + +GeneFinder._findrbs(orf) + +6-element Vector{RBS}: + RBS(GAG, 90:92, :A, 13, STRAND_POS) + RBS(GGAG, 89:92, :B, 16, STRAND_POS) + RBS(AGG, 88:90, :B, 13, STRAND_POS) + RBS(GGA, 89:91, :B, 13, STRAND_POS) + RBS(GAG, 90:92, :B, 13, STRAND_POS) + RBS(AGGA, 88:91, :B, 16, STRAND_POS) + +``` + +Now, following the scoring scheme mentioned above, we can calculate the RBS score for each motif. This criteria is implemented in the `orf_rbs_score` or simply aliased to `orbs` function: + +```julia +orf_rbs_score(orf) +29 +``` \ No newline at end of file From ff2ddd3513b0f4507624874145c93047dfcce973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 8 Dec 2024 19:44:03 -0500 Subject: [PATCH 17/17] Update link for Ribosome Binding Sites documentation in make.jl --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index fecf53e..aa1a6d7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,7 +18,7 @@ pgs = [ "The ORF type" => "orftype.md", "Scoring ORFs" => "features.md", "A Simple Coding Rule" => "simplecodingrule.md", - "Ribosome Binding Sites" => "rbsrule.md", + "Ribosome Binding Sites" => "rbs.md", "Writing ORFs In Files" => "iodocs.md", ], "API" => "api.md",