Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initial version of spliced sequence retrieval #127

Merged
merged 3 commits into from
Oct 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions pyfaidx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,14 +877,39 @@ def __iter__(self):
for rname in self.keys():
yield self[rname]

def get_seq(self, name, start, end, rc=False):
    """Return the sequence of record ``name`` over ``[start, end]``.

    Coordinates are 1-based and the ``end`` position is included in the
    result, so ``get_seq(name, 11, 20)`` covers ten positions.

    Args:
        name: Record (e.g. chromosome) name to fetch from.
        start: 1-based position of the first base to return.
        end: 1-based position of the last base to return (inclusive).
        rc: If true, return the reverse complement of the interval
            instead of the forward-strand sequence.

    Returns:
        Whatever ``self.faidx.fetch`` returns for the interval, negated
        (reverse complemented) when ``rc`` is set.
    """
    seq = self.faidx.fetch(name, start, end)
    # Unary minus on the fetched sequence yields its reverse complement.
    return -seq if rc else seq

def get_spliced_seq(self, name, intervals, rc=False):
    """Return the concatenated sequence of several intervals of a record.

    Each interval is fetched separately and the pieces are joined in the
    order given. Coordinates are 1-based and both ends of every interval
    are included.

    Args:
        name: Record (e.g. chromosome) name to fetch from.
        intervals: Iterable of ``(start, end)`` pairs. It is consumed
            once, so a generator or ``zip`` object is acceptable.
        rc: If true, return the reverse complement of the spliced
            sequence: the pieces are joined in reverse order and each
            piece is reverse complemented.

    Returns:
        A ``Sequence`` named ``name`` whose ``start`` is taken from the
        first fetched interval and whose ``end`` is taken from the last
        one, regardless of ``rc``.

    Raises:
        IndexError: If ``intervals`` is empty.
    """
    # Fetch every interval up front; this also materialises one-shot
    # iterables so the first and last pieces can be inspected below.
    chunks = [self.faidx.fetch(name, s, e) for s, e in intervals]
    if not chunks:
        raise IndexError("get_spliced_seq() requires at least one interval")

    start = chunks[0].start
    end = chunks[-1].end

    if rc:
        # Reverse complement of a concatenation: walk the pieces
        # back-to-front, reverse complementing each one.
        seq = "".join((-chunk).seq for chunk in reversed(chunks))
    else:
        seq = "".join(chunk.seq for chunk in chunks)

    return Sequence(name=name, seq=seq, start=start, end=end)

def close(self):
    """Call this object's ``__exit__`` hook with no exception information.

    This is the same hook a ``with`` block would run on exit, exposed
    for callers that manage the object's lifetime by hand.
    """
    self.__exit__()

Expand Down
2 changes: 2 additions & 0 deletions tests/data/chr17.hg19.part.fa

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/gene.bed12
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr17 6010 31420 uc010vpx.1 0 - 6012 31270 0 6 158,127,110,75,80,523, 0,5195,5861,7910,16317,24887,
2 changes: 2 additions & 0 deletions tests/data/gene.bed12.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr17:6010-31420(-)
CGGAGCGGGCAGCGGCCAAGTCAGGGCCGTCCGGGGGCGCGGCCGGCGATGCCCGCAGCCCCCgccgcgccccgccgggcctgctgagccgcccccgggccggggtcgcgccgggccgggccgcgcccggggcggggcggCGCTGCCTGCATGACCCTCCGGCGGCGCGGGGAGAAGGCGACCATCAGCATCCAGGAGCATATGGCCATCGACGTGTGCCCCGGCCCCATCCGTCCCATCAAGCAGATCTCCGACTACTTCCCCCGCTTcccgcggggcctgcccccggacgccgggccccgagccgctgcacccccggacgcccccgcgcgcccggctgtggccggtgccggccgccgcagcccctccgacggcgcccgcgAGGACGACGAGGATGTGGACCAGCTCTTCGGAGCCTAcggctccagcccgggccccagcccgggtcccagccccGCGCGGCCGCCAGCCAAGCCGCCGGAGGACGAGCCGGACGCCGACGGCTACGAGTCGGACGACTGCACTGCCCTGGGCACGCTGGACTTCAGCCTGCTGTATGACCAGGAGAACAACGCCCTCCACTGCACCATCACCAAGGCCAAGGGCCTGAAGCCAATGGACCACAATGGGCTGGCAGACCCCTACGTCAAGCTGCACCTGCTGCCAGGAGCCAGTAAGGCAAATAAGCTCAGAACAAAAACTCTCCGTAACACTCTGAACCCCACATGGAACGAGACCCTCACTTACTACGGGATCACAGATGAAGACATGATCCGCAAGACCCTGCGGATCTCTGTGTGTGACGAGGACAAATTCCGGCACAATGAGTTCATCGGGGAGACACGTGTGCCCCTGAAGAAGCTGAAACCCAACCACACCAAGACCTTCAGCATCTGCCTGGAGAAGCAGCTGCCGGTGGACAAGACTGAAGACAAGTCCCTGGAGGAGCGGGGCCGCATCCTCATCTCCCTCAAGTACAGCTCACAGAAGCAAGGCCTGCTGGTAGGCATCGTGCGGTGCGCCCACCTGGCCGCCATGGACGCCAACGGCTACTCGGACCCCTACGTGAAAAC
74 changes: 74 additions & 0 deletions tests/test_feature_spliced_seq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
from pyfaidx import Fasta
from unittest import TestCase

# Run from the directory that contains this file so the relative "data/..."
# fixture paths used by the tests resolve. abspath() guards against
# __file__ being a bare file name: dirname() would then be "" and
# os.chdir("") raises.
path = os.path.dirname(os.path.abspath(__file__))
os.chdir(path)

class TestFeatureSplicedSeq(TestCase):
    """Tests for ``Fasta.get_seq(rc=...)`` and ``Fasta.get_spliced_seq``."""

    def tearDown(self):
        """Remove FASTA index (.fai) files left next to the fixtures."""
        fais = [
            "data/gene.bed12.fasta.fai",
            "data/chr17.hg19.part.fa.fai"
        ]
        for fai in fais:
            try:
                os.remove(fai)
            except EnvironmentError:
                pass  # some tests may delete this file

    @staticmethod
    def _bed12_intervals(record):
        """Convert the blocks of a split BED12 line to fetch intervals.

        BED coordinates are 0-based and half-open; the returned
        ``(start, end)`` pairs are 1-based with both ends included, which
        only requires shifting each block start by one. The conversion
        does not depend on the strand of the feature.
        """
        chrom_start = int(record[1])
        # blockSizes / blockStarts may end with a trailing comma; skip
        # the empty field instead of blindly dropping the last element.
        sizes = [int(x) for x in record[10].split(",") if x]
        offsets = [int(x) for x in record[11].split(",") if x]

        intervals = []
        for offset, size in zip(offsets, sizes):
            block_start = chrom_start + offset  # 0-based, inclusive
            intervals.append((block_start + 1, block_start + size))
        return intervals

    def test_split_seq(self):
        """ Fetch sequence by blocks """
        fa = Fasta('data/chr17.hg19.part.fa')
        self.addCleanup(fa.close)

        gene = Fasta("data/gene.bed12.fasta")
        self.addCleanup(gene.close)
        expect = gene[list(gene.keys())[0]][:].seq

        with open("data/gene.bed12") as fi:
            record = fi.readline().strip().split("\t")

        chrom = record[0]
        strand = record[5]
        intervals = self._bed12_intervals(record)

        # Minus-strand features are read as the reverse complement.
        result = fa.get_spliced_seq(chrom, intervals, rc=(strand == "-"))

        self.assertEqual(result.seq, expect)

    def test_get_seq_rc(self):
        """ Check get_seq with rc argument """
        fa = Fasta('data/chr17.hg19.part.fa')
        self.addCleanup(fa.close)

        result = fa.get_seq("chr17", 11, 20, rc=False)
        self.assertEqual(result.seq, "CCCTGTTCCT")

        result = fa.get_seq("chr17", 11, 20, rc=True)
        self.assertEqual(result.seq, "AGGAACAGGG")