-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathredesign_sequences.py
executable file
·276 lines (249 loc) · 14.4 KB
/
redesign_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""Redesigns influenza genes to add / remove alternative start motifs.
First removes CTG, ATG, GTG motifs from all reading frames as best as possible.
These sequences are denoted as "lowCTG" because this is supposed to remove
alternative start sites.
Then adds CTG motifs in reading frame 1 of the PR8-NP-lowCTG sequence to
create PR8-NP-highCTG motif. This sequence should have lots of alternative
starts in the first reading from.
Does all this under the constraint that changes must be synonymous, and that any
new codons that are introduced are chosen to be as frequent as possible in
natural existing sequences, and must exist in at least a threshold number
of sequences.
Jesse Bloom, 2012.
Edited by Heather Machkovech, 2018"""
import os
import fasta
import align
def OtherCodons(codon):
"""Given some codon, returns all other codons for the same amino acid."""
alternatives = []
assert len(codon) == 3
aa = fasta.Translate([('codon', codon)])[0][1]
nts = ['A', 'T', 'C', 'G']
for nt1 in nts:
for nt2 in nts:
for nt3 in nts:
alternative = "%s%s%s" % (nt1, nt2, nt3)
if aa == fasta.Translate([('codon', alternative)])[0][1] and codon.upper() != alternative:
alternatives.append(alternative)
return alternatives
def CountCodonOccurrences(seqs, icodon, codons):
"""Counts the number of times a codon occurs in a sequence set.
seqs -> a list of sequences in (header, sequence) format. These
sequences should all be of the same length.
icodon -> the position of a codon found in the sequences of length
equal to those in seqs, for 1, 2, 3,... numbering of the codons.
codons -> a list of codons as three nucleotide strings.
This function counts the number of times that each of the
codons in codons occurs at position icodon in the sequences
in seqs. The returned variable is a dictionary counts. It is
keyed by each of the codons in codons, and the values are the
number of occurrences at position icodon in the sequences in seqs.
"""
counts = dict([(codon, 0) for codon in codons])
n = len(seqs[0][1])
assert n % 3 == 0
assert 1 <= icodon <= n / 3
for (head, seq) in seqs:
assert len(seq) == n
seqcodon = seq[3 * (icodon - 1) : 3 * icodon]
for codon in codons:
if codon.upper() == seqcodon.upper():
counts[codon] += 1
return counts
def CountMotifsInFrame(seq, codonmotif, iframe):
"""Counts the number of motifs in a reading frame.
'seq' is a string giving a nucleotide sequence, with length
divisible by three.
'codonmotif' is a 3-nucleotide codon motif.
'iframe' is a reading frame (1, 2, or 3).
Returns the number of occurrences of codonmotif in the
specified reading frame.
"""
n = 0
for i in range(len(seq) / 3):
codon = seq[3 * i + iframe - 1 : 3 * i + 3 + iframe - 1]
if codon.upper() == codonmotif.upper():
n += 1
return n
def RemoveCodonMotif(seq, codonmotif, icodon, iframe, comparison_seqs, count_threshold):
"""Attempts to remove a nucleotide motif of length three (codon length).
The motif is only removed if it can be done so without changing the protein
sequence, and by only using other codons that are common in a set
of comparison sequences. If multiple alternative synonymous codons
meet the threshold for being sufficiently common, we choose the most
common one.
This method assumes that seq[(icodon - 1) * 3 + iframe - 1 : icodon * 3 + iframe - 1]
== codonmotif
seq -> a string giving the sequence from which we want to remove the codon.
It must be a protein-coding sequence, and so have length divisible by 3.
codonmotif -> a three nucleotide string giving the motif that we want to remove.
icodon -> the number of the codon in which the motif starts, with
1 <= icodon <= len(seq) / 3.
iframe -> the frame in which the motif starts. It can be 1, 2, or 3. For
example, if the motif starts at the beginning of codon icodon, then
iframe would be 1.
comparison_seqs -> a list of sequences as (header, sequence) 2-tuples.
Each sequence should be aligned to seq and so be of the same length.
count_threshold -> the number of times that a codon must be found
in count_threshold before we would consider making a change to it.
The returned variable is the original sequence if the motif cannot be removed, or
a new version of seq in which the motif has been removed.
"""
seq = seq.upper()
assert seq[(icodon - 1) * 3 + iframe - 1 : icodon * 3 + iframe - 1] == codonmotif
c1 = seq[(icodon - 1) * 3 : icodon * 3]
c1_alts = OtherCodons(c1)
c1_alts_counts = CountCodonOccurrences(comparison_seqs, icodon, c1_alts)
c1_alts = [(c1_alts_counts[codon], codon) for codon in c1_alts]
c1_alts = [(count, codon) for (count, codon) in c1_alts if count >= count_threshold]
c1_alts.sort()
c1_alts.reverse() # now holds all acceptable alternatives with most common first
c1_alts = [tup[1] for tup in c1_alts]
if iframe == 1:
# in the same reading frame as the protein
if c1_alts: # remove motif to most common alternative
index = (icodon - 1) * 3
newseq = "%s%s%s" % (seq[ : index], c1_alts[0], seq[index + 3 : ])
else:
newseq = seq # motif cannot be removed
elif iframe in [2, 3]:
c2 = seq[icodon * 3 : (icodon + 1) * 3]
c2_alts = OtherCodons(c2)
c2_alts_counts = CountCodonOccurrences(comparison_seqs, icodon + 1, c2_alts)
c2_alts = [(c2_alts_counts[codon], codon) for codon in c2_alts]
c2_alts = [(count, codon) for (count, codon) in c2_alts if count >= count_threshold]
c2_alts.sort()
c2_alts.reverse() # now holds all acceptable alternatives with most common first
c2_alts = [tup[1] for tup in c2_alts]
index = (icodon - 1) * 3
for c1_alt in c1_alts:
alt = "%s%s" % (c1_alt[iframe - 1 : ], c2[ : iframe - 1])
if alt != codonmotif: # alternative works
newseq = "%s%s%s%s" % (seq[ : index], c1_alt, c2, seq[index + 6 : ])
break
else: # failed to find alternative by changing first codon; trying changing second
for c2_alt in c2_alts:
alt = "%s%s" % (c1[iframe - 1 : ], c2_alt[ : iframe - 1])
if alt != codonmotif: # alternative works
newseq = "%s%s%s%s" % (seq[ : index], c1, c2_alt, seq[index + 6 : ])
break
else:
newseq = seq # motif cannot be removed
else:
raise ValueError("Invalid frame of %r." % iframe)
return newseq # return the new sequence
def main():
"""Main body of the script."""
# input / output variables
musclepath = '/Users/jbloom/muscle3.8/'
outdir = 'redesigned_sequences' # output directory
remove_targets = { # targets for removing motifs
# name : (out name, sequence file, comparison file, codonmotifs)
'PR8-NP' : ('PR8-NP-lowCTG', 'sequences/PR8-NP-cDNA.fasta', 'comparison_sequence_sets/NP_aligned.fasta', ['GTG', 'ATG', 'CTG'])
}
addctg_targets = { # targets for adding CTG motifs in frame 1
# name : (out name, sequence file, comparison file)
'PR8-NP-lowCTG' : ('PR8-NP-highCTG', '%s/PR8-NP-lowCTG.fasta' % outdir, 'comparison_sequence_sets/NP_aligned.fasta'),
}
count_threshold = 100 # any new codons introduced must be found naturally in at least this many sequences
spliced_segments = [
# (spliced name, first gene, its start, its end, second gene, its start, first exon end, second exon start, its end) with all numbering 1, 2, 3 from beginning of spliced gene
('PR8-M', 'PR8-M1', 1, 756, 'PR8-M2', 1, 27, 716, 979),
('PR8-NS', 'PR8-NS1', 1, 693, 'PR8-NS2', 1, 30, 503, 909),
]
nonoverlap_range = {} # gives range of non-overlapping reading frame portion
for tup in spliced_segments:
nonoverlap_range[tup[1]] = (tup[6], tup[7])
s2length = tup[8] - tup[7] + tup[6] - tup[5]
nonoverlap_range[tup[4]] = (max(tup[6], tup[3] - tup[7]), s2length)
# begin removing motifs
for (name, (outname, seqfile, comparisonfile, codonmotifs)) in remove_targets.iteritems():
print "\nRedesigning %s to create %s." % (name, outname)
originalseq = fasta.Read(seqfile)
assert len(originalseq) == 1
originalseq = originalseq[0]
(head, seq) = originalseq
seq = seq.upper()
assert len(seq) % 3 == 0
ncodons = len(seq) / 3
comparison_seqs = fasta.Read(comparisonfile)
print "Will use %d comparison sequences for the redesign." % len(comparison_seqs)
for codonmotif in codonmotifs:
if len(codonmotif) != 3:
raise IOError("Method currently only works for 3-nucleotide motifs.")
for icodon in range(1, ncodons + 1):
for iframe in [3, 2, 1]:
if icodon == ncodons and iframe in [2, 3]:
continue # don't look past the end of the last codon
index = 3 * (icodon - 1) + iframe - 1
if seq[index : index + 3] == codonmotif:
newseq = RemoveCodonMotif(seq, codonmotif, icodon, iframe, comparison_seqs, count_threshold)
assert fasta.Translate([('newseq', newseq)])[0][1] == fasta.Translate([('seq', seq)])[0][1], "icodon = %d, iframe = %d, seq[index : index + 3] = %s, newseq[index : index + 3] = %s, seq[3 * (icodon - 1) : 3 * (icodon + 1)] = %s, newseq[3 * (icodon - 1) : 3 * (icodon + 1)] = %s" % (icodon, iframe, seq[index : index + 3], newseq[index : index + 3], seq[3 * (icodon - 1) : 3 * (icodon + 1)], newseq[3 * (icodon - 1) : 3 * (icodon + 1)])
seq = newseq
# if the gene has overlapping reading frames, only use recoded portions of non-overlapping
if name in nonoverlap_range:
print "Adjusting redesign of %s to account for the overlapping reading frames." % name
(start_no, end_no) = nonoverlap_range[name]
(startcodon_no, endcodon_no) = (start_no / 3 + 1, end_no / 3 - 1)
seq = "%s%s%s" % (originalseq[1][ : 3 * startcodon_no], seq[3 * startcodon_no : 3 * endcodon_no], originalseq[1][3 * endcodon_no : ])
assert fasta.Translate([originalseq])[0][1] == fasta.Translate([('seq', seq)])[0][1] # make sure the protein sequence is unchanged
header = "%s: %s redesigned to eliminate possible alternative start motifs. The protein sequence is unchanged, and codons are only used if they occur in %d of %d natural sequences." % (outname, name, count_threshold, len(comparison_seqs))
for codonmotif in codonmotifs:
for iframe in [1, 2, 3]:
s = "%s in frame %d reduced from %d to %d." % (codonmotif, iframe, CountMotifsInFrame(originalseq[1], codonmotif, iframe), CountMotifsInFrame(seq, codonmotif, iframe))
print s
header = "%s %s" % (header, s)
outfile = '%s/%s.fasta' % (outdir, outname)
print "Writing %s to %s." % (outname, outfile)
fasta.Write([(header, seq)], outfile)
# now add CTG motifs
targetcodon = 'CTG'
targetaa = fasta.Translate([('codon', targetcodon)])[0][1]
for (name, (outname, seqfile, comparisonfile)) in addctg_targets.iteritems():
print "\nRedesigning %s to create %s." % (name, outname)
originalseq = fasta.Read(seqfile)
assert len(originalseq) == 1
originalseq = originalseq[0]
(head, seq) = originalseq
seq = seq.upper()
assert len(seq) % 3 == 0
ncodons = len(seq) / 3
comparison_seqs = fasta.Read(comparisonfile)
print "Will use %d comparison sequences for the redesign." % len(comparison_seqs)
for icodon in range(ncodons):
codon = seq[icodon * 3 : icodon * 3 + 3]
aa = fasta.Translate([('codon', codon)])[0][1]
if aa == targetaa and codon != targetcodon: # a replacement possibility
ctgcounts = CountCodonOccurrences(comparison_seqs, icodon + 1, [targetcodon])[targetcodon]
if ctgcounts >= count_threshold: # replacement possible
newseq = "%s%s%s" % (seq[ : icodon * 3], targetcodon, seq[icodon * 3 + 3 : ])
assert fasta.Translate([('newseq', newseq)])[0][1] == fasta.Translate([('seq', seq)])[0][1] # make sure the protein sequence is unchanged
seq = newseq
assert fasta.Translate([originalseq])[0][1] == fasta.Translate([('seq', seq)])[0][1] # make sure the protein sequence is unchanged
s = "%s in frame 1 increased from %d to %d." % (targetcodon, CountMotifsInFrame(originalseq[1], codonmotif, 1), CountMotifsInFrame(seq, codonmotif, 1))
print s
header = "%s: %s redesigned to add %s motifs in frame 1. The protein sequence is unchanged, and codons are only used if they occur in %d of %d natural sequences." % (outname, name, targetcodon, count_threshold, len(comparison_seqs))
header = "%s %s" % (header, s)
outfile = '%s/%s.fasta' % (outdir, outname)
print "Writing %s to %s." % (outname, outfile)
fasta.Write([(header, seq)], outfile)
# write full alternatively spliced segment coding regions
for (name, g1, g1_start, g1_end, g2, g2e1_start, g2e1_end, g2e2_start, g2e2_end) in spliced_segments:
outname = "%s-lowCTG" % name
(g1name, g2name) = ('%s-lowCTG' % g1, '%s-lowCTG' % g2)
g1_file = '%s/%s.fasta' % (outdir, g1name)
g2_file = '%s/%s.fasta' % (outdir, g2name)
outfile = "%s/%s.fasta" % (outdir, outname)
print "Creating %s from %s and %s." % (outname, g1name, g2name)
(g1_head, g1_seq) = fasta.Read(g1_file)[0]
(g2_head, g2_seq) = fasta.Read(g2_file)[0]
merged = "%s%s" % (g1_seq[ : g2e2_start - 1], g2_seq[g2e1_end : ])
assert g1_seq in merged
a = align.Align([(outname, merged), (g1name, g1_seq), (g2name, g2_seq)], musclepath, 'MUSCLE')
a = align.AddDots(a)
print "Here is the alignment:\n>%s\n%s\n>%s\n%s\n>%s\n%s" % (a[0][0], a[0][1], a[1][0], a[1][1], a[2][0], a[2][1])
print "Writing %s to %s" % (outname, outfile)
header = "%s, made by merging the following: %s; %s" % (outname, g1_head, g2_head)
fasta.Write([(header, merged)], outfile)
main() # run the script