From 002b6397a86797b169d29320570d464ee3060a01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20P=C5=99=C3=ADhoda?= Date: Thu, 21 Nov 2019 10:30:52 +0100 Subject: [PATCH 1/5] Support zero-length sequences --- pyfaidx/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index 22e1927..b6a9b41 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -123,7 +123,7 @@ def __getitem__(self, n): >chr1 AC """ - if self.start is None or self.end is None: + if self.start is None or self.end is None or len(self.seq) == 0: correction_factor = 0 elif len( self.seq @@ -461,7 +461,7 @@ def read_fai(self): rname, rlen, offset, lenc, lenb = line.split('\t') rlen, offset, lenc, lenb = map(int, (rlen, offset, lenc, lenb)) - newlines = int(ceil(rlen / lenc) * (lenb - lenc)) + newlines = int(ceil(rlen / lenc) * (lenb - lenc)) if lenc else 0 bend = offset + newlines + rlen rec = IndexRecord(rlen, offset, lenc, lenb, bend, prev_bend) @@ -508,8 +508,8 @@ def build_index(self): rname = None # reference sequence name offset = 0 # binary offset of end of current line rlen = 0 # reference character length - blen = None # binary line length (includes newline) - clen = None # character line length + blen = 0 # binary line length (includes newline) + clen = 0 # character line length bad_lines = [] # lines > || < than blen thisoffset = offset valid_entry = False @@ -535,9 +535,9 @@ def build_index(self): "Inconsistent line found in >{0} at " "line {1:n}.".format( rname, bad_lines[0][0] + 1)) - blen = None + blen = 0 rlen = 0 - clen = None + clen = 0 bad_lines = [] try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258) rname = line.rstrip('\n\r')[1:].split()[ @@ -648,8 +648,8 @@ def from_file(self, rname, start, end, internals=False): # Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398) newlines_before = int( - (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0 - newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) + (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 and i.lenc else 0 + newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0 newlines_inside = newlines_to_end - newlines_before seq_blen = newlines_inside + seq_len bstart = i.offset + newlines_before + start0 From c456033a793f6d65dd78a6e5031e03d30f83e349 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 21 Nov 2019 14:28:18 -0500 Subject: [PATCH 2/5] Initial tests for #155 --- tests/test_feature_bounds_check.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_feature_bounds_check.py b/tests/test_feature_bounds_check.py index 251bcb5..ab18448 100644 --- a/tests/test_feature_bounds_check.py +++ b/tests/test_feature_bounds_check.py @@ -6,6 +6,31 @@ path = os.path.dirname(__file__) os.chdir(path) +class TestFeatureZeroLength: + """Tests for handling zero-length entries, added in #155""" + def setUp(self): + with open('data/zero_length.fasta', 'w') as fasta: + fasta.write(""">A +ATCG +>B +>C + +>D +GTA +GC""") + + def tearDown(self): + os.remove('data/zero_length.fasta') + os.remove('data/zero_length.fasta.fai') + + def test_index_zero_length(self): + fasta = Fasta('data/zero_length.fasta') + + def test_fetch_zero_length(self): + fasta = Fasta('data/zero_length.fasta') + b = fasta["B"] + assert str(b) == '' + class TestFeatureBoundsCheck: def setUp(self): pass From 9088dd198a0ac0eb81b1dedadcbce3c6149fc3d6 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 21 Nov 2019 15:11:44 -0500 Subject: [PATCH 3/5] Add tests from #93 before merging #155 --- tests/test_feature_bounds_check.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_feature_bounds_check.py b/tests/test_feature_bounds_check.py index ab18448..e9ccdc2 100644 --- a/tests/test_feature_bounds_check.py +++ b/tests/test_feature_bounds_check.py @@ -30,6 +30,28 @@ def test_fetch_zero_length(self): fasta = Fasta('data/zero_length.fasta') b = fasta["B"] assert str(b) == '' + +class TestZeroLengthSequenceSubRange(TestCase): + def setUp(self): + pass + + def tearDown(self): + try: + os.remove('data/genes.fasta.fai') + except EnvironmentError: + pass # some tests may delete this file + + def test_as_raw_zero_length_subsequence(self): + fasta = Fasta('data/genes.fasta', as_raw=True, strict_bounds=True) + expect = '' + result = fasta['gi|557361099|gb|KF435150.1|'][100:100] + assert result == expect + + def test_zero_length_subsequence(self): + fasta = Fasta('data/genes.fasta', strict_bounds=True) + expect = '' + result = fasta['gi|557361099|gb|KF435150.1|'][100:100] + assert result.seq == expect class TestFeatureBoundsCheck: def setUp(self): From 38142493cea5d3a608c31c9e4e0fa3eeec99179e Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 22 Nov 2019 10:32:07 -0500 Subject: [PATCH 4/5] Return an empty string when strict_bounds=True and query length=0 --- pyfaidx/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index b6a9b41..ead050b 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -669,12 +669,15 @@ def from_file(self, rname, start, end, internals=False): else: self.file.seek(bstart) + # If the requested sequence exceeds len(FastaRecord), return as much as possible if bstart + seq_blen > i.bend and not self.strict_bounds: seq_blen = i.bend - bstart - + # Otherwise it should be safe to read the sequence if seq_blen > 0: seq = self.file.read(seq_blen).decode() - elif seq_blen <= 0 and not self.strict_bounds: + # If the requested sequence is negative, we will pad the empty string with default_seq. + # This was changed to support #155 with strict_bounds=True. + elif seq_blen <= 0: seq = '' if not internals: From bb9b311c8e866b8901ba0e153418a52899af2fca Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 22 Nov 2019 10:46:24 -0500 Subject: [PATCH 5/5] Bump version for release to PyPI --- pyfaidx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index ead050b..2978c88 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -25,7 +25,7 @@ dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)') -__version__ = '0.5.5.2' +__version__ = '0.5.6' class KeyFunctionError(ValueError):