Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow reference sequence to be returned always as upper case. #71

Merged
merged 8 commits into from
Jul 29, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
venv
__pycache__
*.pyc
.project
.pydevproject

29 changes: 29 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,18 @@ Or just get a Python string:

>>> genes['NM_001282543.1'][200:230]
CTCGTTCCGCGCCCGCCATGGAACCGGATG

You can make sure that you always receive an uppercase sequence, even if your FASTA file contains lowercase characters:

.. code:: python

>>> from pyfaidx import Fasta
>>> reference = Fasta('tests/data/genes.fasta.lower', sequence_always_upper=True)
>>> reference['gi|557361099|gb|KF435150.1|'][1:70]

>gi|557361099|gb|KF435150.1|:2-70
TGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGACAGTGCTTGCAGGATCTCTCCTGGACAAA


You can also perform line-based iteration, receiving the sequence lines as they appear in the FASTA file:

Expand Down Expand Up @@ -415,6 +427,23 @@ Changelog
Please see the `releases <https://github.com/mdshw5/pyfaidx/releases>`_ for a
comprehensive list of version changes.


Contributing
------------

Create a new Pull Request containing a single feature. If you add a new feature,
please also add a relevant test.

To get the tests running on your machine:

- Create a new virtualenv and install the packages listed in ``dev-requirements.txt``.
- Download the test data by running:

python tests/data/download_gene_fasta.py

- Run the tests with:

nosetests --with-coverage --cover-package=pyfaidx

Acknowledgements
----------------

Expand Down
3 changes: 2 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ Pygments>=1
collective.checkdocs>=0.2
docutils>=0.12
six>=1.7.3
nose
nose==1.3.7
biopython==1.65
17 changes: 13 additions & 4 deletions pyfaidx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ class Faidx(object):
def __init__(self, filename, default_seq=None, key_function=None,
as_raw=False, strict_bounds=False, read_ahead=None,
mutable=False, split_char=None, filt_function=None,
one_based_attributes=True):
one_based_attributes=True,
sequence_always_upper=False):
"""
filename: name of fasta file
key_function: optional callback function which should return a unique
Expand All @@ -222,6 +223,7 @@ def __init__(self, filename, default_seq=None, key_function=None,
self.default_seq = default_seq
self.strict_bounds = strict_bounds
self.one_based_attributes = one_based_attributes
self.sequence_always_upper = sequence_always_upper
self.index = OrderedDict()
self.buffer = dict((('seq', None), ('name', None), ('start', None), ('end', None)))
if not read_ahead or isinstance(read_ahead, int):
Expand Down Expand Up @@ -422,7 +424,10 @@ def format_seq(self, seq, rname, start, end):
seq = ''.join([seq, pad_len * self.default_seq])
else: # Return less than requested range
end = start0 + len(seq)


if self.sequence_always_upper:
seq = seq.upper()

if self.as_raw:
return seq
else:
Expand Down Expand Up @@ -568,7 +573,10 @@ def __setitem__(self, n, value):


class Fasta(object):
def __init__(self, filename, default_seq=None, key_function=None, as_raw=False, strict_bounds=False, read_ahead=None, mutable=False, split_char=None, filt_function=None, one_based_attributes=True):
def __init__(self, filename, default_seq=None, key_function=None, as_raw=False,
strict_bounds=False, read_ahead=None, mutable=False, split_char=None,
filt_function=None, one_based_attributes=True,
sequence_always_upper=False):
"""
An object that provides a pygr compatible interface.
filename: name of fasta file
Expand All @@ -578,7 +586,8 @@ def __init__(self, filename, default_seq=None, key_function=None, as_raw=False,
self.faidx = Faidx(filename, key_function=key_function, as_raw=as_raw,
default_seq=default_seq, strict_bounds=strict_bounds,
read_ahead=read_ahead, mutable=mutable, split_char=split_char,
filt_function=filt_function, one_based_attributes=one_based_attributes)
filt_function=filt_function, one_based_attributes=one_based_attributes,
sequence_always_upper=sequence_always_upper)
self.keys = self.faidx.index.keys
if not self.mutable:
self.records = dict([(rname, FastaRecord(rname, self)) for rname in self.keys()])
Expand Down
2 changes: 1 addition & 1 deletion tests/data/download_gene_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def fetch_chr22_vcf(filename):
if __name__ == "__main__":
path = os.path.dirname(__file__)
os.chdir(path)
if not os.path.isfile("genes.fasta"):
if not os.path.isfile("genes.fasta") or not os.path.isfile("genes.fasta.lower"):
fetch_genes("genes.fasta")
if not os.path.isfile("chr22.vcf.gz"):
fetch_chr22_vcf("chr22.vcf.gz")
Expand Down
10 changes: 10 additions & 0 deletions tests/test_FastaRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@ def tearDown(self):
except EnvironmentError:
pass # some tests may delete this file

def test_sequence_uppercase(self):
    """Check that ``sequence_always_upper=True`` upper-cases fetched sequence.

    The same FASTA file is opened twice, once with the flag set and once
    with the defaults. The same slice of the same record is then fetched
    from both, and the flagged result must equal the default result
    passed through ``str.upper()``.
    """
    fasta_path = "data/genes.fasta.lower"
    record_name = 'gi|557361099|gb|KF435150.1|'

    upper_fasta = Fasta(fasta_path, sequence_always_upper=True)
    default_fasta = Fasta(fasta_path)

    forced_upper = upper_fasta[record_name][1:100].seq
    expected = default_fasta[record_name][1:100].seq.upper()
    assert forced_upper == expected

def test_long_names(self):
""" Test that deflines extracted using FastaRecord.long_name are
identical to deflines in the actual file.
Expand Down