mdshw5 · mdshw5 · Jul 29, 2015 · Jul 28, 2015 · Jul 29, 2015 · Jul 29, 2015
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,6 @@
 venv
 __pycache__
 *.pyc
+.project
+.pydevproject
+
diff --git a/README.rst b/README.rst
@@ -181,6 +181,18 @@ Or just get a Python string:
 
     >>> genes['NM_001282543.1'][200:230]
     CTCGTTCCGCGCCCGCCATGGAACCGGATG
+
+You can make sure that you always receive an uppercase sequence, even if your FASTA file contains lowercase characters:
+
+.. code:: python
+
+    >>> from pyfaidx import Fasta
+    >>> reference = Fasta('tests/data/genes.fasta.lower', sequence_always_upper=True)
+    >>> reference['gi|557361099|gb|KF435150.1|'][1:70]
+
+    >gi|557361099|gb|KF435150.1|:2-70
+    TGACATCATTTTCCACCTCTGCTCAGTGTTCAACATCTGACAGTGCTTGCAGGATCTCTCCTGGACAAA
+
 
 You can also perform line-based iteration, receiving the sequence lines as they appear in the FASTA file:
 
@@ -415,6 +427,23 @@ Changelog
 Please see the `releases <https://github.com/mdshw5/pyfaidx/releases>`_ for a
 comprehensive list of version changes.
 
+
+Contributing
+------------
+
+Create a new Pull Request containing a single feature. If you add a new feature,
+please also create the relevant test.
+
+To get the tests running on your machine:
+ - Create a new virtualenv and install the packages listed in ``dev-requirements.txt``.
+ - Download the test data by running::
+
+      python tests/data/download_gene_fasta.py
+
+ - Run the tests with::
+
+      nosetests --with-coverage --cover-package=pyfaidx
+
 Acknowledgements
 ----------------
 

diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -2,4 +2,5 @@ Pygments>=1
 collective.checkdocs>=0.2
 docutils>=0.12
 six>=1.7.3
-nose
+nose==1.3.7
+biopython==1.65
diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py
@@ -201,7 +201,8 @@ class Faidx(object):
     def __init__(self, filename, default_seq=None, key_function=None,
                  as_raw=False, strict_bounds=False, read_ahead=None,
                  mutable=False, split_char=None, filt_function=None,
-                 one_based_attributes=True):
+                 one_based_attributes=True, 
+                 sequence_always_upper=False):
         """
         filename: name of fasta file
         key_function: optional callback function which should return a unique
@@ -222,6 +223,7 @@ def __init__(self, filename, default_seq=None, key_function=None,
         self.default_seq = default_seq
         self.strict_bounds = strict_bounds
         self.one_based_attributes = one_based_attributes
+        self.sequence_always_upper = sequence_always_upper
         self.index = OrderedDict()
         self.buffer = dict((('seq', None), ('name', None), ('start', None), ('end', None)))
         if not read_ahead or isinstance(read_ahead, int):
@@ -422,7 +424,10 @@ def format_seq(self, seq, rname, start, end):
             seq = ''.join([seq, pad_len * self.default_seq])
         else:  # Return less than requested range
             end = start0 + len(seq)
-
+
+        if self.sequence_always_upper:
+            seq = seq.upper()
+
         if self.as_raw:
             return seq
         else:
@@ -568,7 +573,10 @@ def __setitem__(self, n, value):
 
 
 class Fasta(object):
-    def __init__(self, filename, default_seq=None, key_function=None, as_raw=False, strict_bounds=False, read_ahead=None, mutable=False, split_char=None, filt_function=None, one_based_attributes=True):
+    def __init__(self, filename, default_seq=None, key_function=None, as_raw=False, 
+                 strict_bounds=False, read_ahead=None, mutable=False, split_char=None, 
+                 filt_function=None, one_based_attributes=True, 
+                 sequence_always_upper=False):
         """
         An object that provides a pygr compatible interface.
         filename: name of fasta file
@@ -578,7 +586,8 @@ def __init__(self, filename, default_seq=None, key_function=None, as_raw=False,
         self.faidx = Faidx(filename, key_function=key_function, as_raw=as_raw,
                            default_seq=default_seq, strict_bounds=strict_bounds,
                            read_ahead=read_ahead, mutable=mutable, split_char=split_char,
-                           filt_function=filt_function, one_based_attributes=one_based_attributes)
+                           filt_function=filt_function, one_based_attributes=one_based_attributes,
+                           sequence_always_upper=sequence_always_upper)
         self.keys = self.faidx.index.keys
         if not self.mutable:
             self.records = dict([(rname, FastaRecord(rname, self)) for rname in self.keys()])

diff --git a/tests/data/download_gene_fasta.py b/tests/data/download_gene_fasta.py
@@ -66,7 +66,7 @@ def fetch_chr22_vcf(filename):
 if __name__ == "__main__":
     path = os.path.dirname(__file__)
     os.chdir(path)
-    if not os.path.isfile("genes.fasta"):
+    if not os.path.isfile("genes.fasta") or not os.path.isfile("genes.fasta.lower"):
         fetch_genes("genes.fasta")
     if not os.path.isfile("chr22.vcf.gz"):
         fetch_chr22_vcf("chr22.vcf.gz")

diff --git a/tests/test_FastaRecord.py b/tests/test_FastaRecord.py
@@ -17,6 +17,16 @@ def tearDown(self):
         except EnvironmentError:
             pass  # some tests may delete this file
 
+    def test_sequence_uppercase(self):
+        """Test that the sequence is always returned in 
+        uppercase, even if it is in lowercase in the 
+        reference genome. 
+        """
+        filename = "data/genes.fasta.lower"
+        reference_upper = Fasta(filename, sequence_always_upper=True)
+        reference_normal = Fasta(filename)
+        assert reference_upper['gi|557361099|gb|KF435150.1|'][1:100].seq == reference_normal['gi|557361099|gb|KF435150.1|'][1:100].seq.upper()
+
     def test_long_names(self):
         """ Test that deflines extracted using FastaRecord.long_name are
         identical to deflines in the actual file.