Merge local changes for #126

There is a lot here that doesn't work, but mainly I was trying to figure out the format of the GZI file and provide methods to unpack and pack the binary on-disk format. There are also methods for loading the GZI into an object for use by Faidx.
mdshw5 · Jun 25, 2020 · e526ce8 · e526ce8
1 parent b5d375a
commit e526ce8
Show file tree

Hide file tree

Showing 5 changed files with 155 additions and 70 deletions.
diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py
@@ -4,18 +4,18 @@
 """
 
 from __future__ import division
-
+import bisect
 import os
 import re
 import string
+import struct
 import sys
 import warnings
 from collections import namedtuple
 from itertools import islice
 from math import ceil
 from os.path import getmtime
 from threading import Lock
-
 from six import PY2, PY3, integer_types, string_types
 from six.moves import zip_longest
 
@@ -312,19 +312,32 @@ def __len__(self):
         return self.rlen
 
 
-class BGZFblock(namedtuple('BGZFblock', ['cstart', 'clen', 'ustart', 'ulen'])):
-    __slots__ = ()
-
-    def __getitem__(self, key):
-        if type(key) == str:
-            return getattr(self, key)
-        return tuple.__getitem__(self, key)
-
-    def as_bytes(self):
-        return struct.pack('<QQ', self.cstart, self.ustart)
-
-    def __len__(self):
-        return self.ulen
+class BgzfBlock(namedtuple('BgzfBlock', ['cstart', 'clen', 'ustart', 'ulen'])):
+	    __slots__ = ()
+
+	    def __getitem__(self, key):
+	        if type(key) == str:
+	            return getattr(self, key)
+	        return tuple.__getitem__(self, key)
+
+	    def as_bytes(self):
+	        return struct.pack('<QQ', self.cstart, self.ustart)
+
+	    def __lt__(self, other):
+	        if self.ustart < other:
+	            return True
+
+	    def __len__(self):
+	        return self.ulen
+
+	    @property
+	    def empty(self):
+	    """Check for the EOF marker, which is an empty BGZF block.
+	       https://github.com/biopython/biopython/blob/master/Bio/bgzf.py#L171"""
+	        if self.ulen == '0':
+	            return True
+	        else:
+	            return False
 
 
 class Faidx(object):
@@ -369,7 +382,6 @@ def __init__(self,
                     self._bgzf = True
                     self.gzi_index = OrderedDict()
                     self.gzi_indexname = filename + '.gzi'
-
             else:
                 self._fasta_opener = open
                 self._bgzf = False
@@ -438,6 +450,8 @@ def __init__(self,
                 elif build_index:
                     self.build_index()
                     self.read_fai()
+                else:
+                    self.read_fai()
                 if self._bgzf:
                     if os.path.exists(self.gzi_indexname) and getmtime(self.gzi_indexname) >= getmtime(self.gzi_indexname):
                         self.read_gzi()
@@ -569,8 +583,7 @@ def build_index(self):
                                     "Bad sequence name %s at line %s." %
                                     (line.rstrip('\n\r'), str(i)))
                             offset += line_blen
-                            thisoffset = fastafile.tell(
-                            ) if self._bgzf else offset
+                            thisoffset = offset
                         else:  # check line and advance offset
                             if not blen:
                                 blen = line_blen
@@ -614,7 +627,41 @@ def build_index(self):
                     % self.indexname)
             elif isinstance(e, FastaIndexingError):
                 raise e
-
+
+    def build_gzi(self):
+	        """ Build the htslib .gzi index format """
+	        from Bio import bgzf
+	        with open(self.filename, 'rb') as bgzf_file:
+	            self.gzi_index = []
+	            for i, values in enumerate(bgzf.BgzfBlocks(bgzf_file)):
+	                self.gzi_index.append(BgzfBlock(*values))
+	        eof = self.gzi_index.pop()
+	        if not eof.empty:
+	            raise IOError("BGZF EOF marker not found. File %s is not a valid BGZF file." % self.filename)
+
+
+    def write_gzi(self):
+	        """ Write the on disk format for the htslib .gzi index
+	        https://github.com/samtools/htslib/issues/473"""
+	        with open(self.gzi_indexname, 'wb') as bzi_file:
+	            bzi_file.write(struct.pack('<Q', len(self.gzi_index)))
+	            for block in self.gzi_index:
+	                bzi_file.write(block.as_bytes())
+
+    def read_gzi(self):
+	        """ Read the on disk format for the htslib .gzi index
+	        https://github.com/samtools/htslib/issues/473"""
+	        from ctypes import c_uint64, sizeof
+	        with open(self.gzi_indexname, 'rb') as bzi_file:
+	            number_of_blocks = struct.unpack('<Q', bzi_file.read(sizeof(c_uint64)))[0]
+	            self.gzi_index = []
+	            for i in range(number_of_blocks):
+	                cstart, ustart = struct.unpack('<QQ', bzi_file.read(sizeof(c_uint64) * 2))
+	                if cstart == '' or ustart == '':
+	                    raise IndexError("Unexpected end of .gzi file. ")
+	                else:
+	                    self.gzi_index.append(BgzfBlock(cstart, None, ustart, None))
+
     def write_fai(self):
         with self.lock:
             with open(self.indexname, 'w') as outfile:
@@ -702,7 +749,7 @@ def from_file(self, rname, start, end, internals=False):
         newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0
         newlines_inside = newlines_to_end - newlines_before
         seq_blen = newlines_inside + seq_len
-        bstart = i.offset + newlines_before + start0
+
         if seq_blen < 0 and self.strict_bounds:
             raise FetchError("Requested coordinates start={0:n} end={1:n} are "
                              "invalid.\n".format(start, end))
@@ -715,11 +762,18 @@ def from_file(self, rname, start, end, internals=False):
                              "\n".format(end, rname))
 
         with self.lock:
-            if self._bgzf:  # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
-                self.file.seek(i.offset)
-                chunk = start0 + newlines_before + newlines_inside + seq_len
-                chunk_seq = self.file.read(chunk).decode()
-                seq = chunk_seq[start0 + newlines_before:]
+            bstart = i.offset + newlines_before + start0  # uncompressed offset for the start of requested string
+            if self._bgzf: 
+                from Bio import bgzf
+                insertion_i = bisect.bisect_left(self.gzi_index, bstart)
+                if insertion_i == 0:  # bisect_left already returns index to the left if values are the same
+                    start_block = self.gzi_index[insertion_i]
+                else:
+                    start_block = self.gzi_index[insertion_i - 1]
+                within_block_offset = start_block.ustart + bstart
+                print(self.gzi_index)
+                print(locals())
+                self.file.seek(bgzf.make_virtual_offset(start_block.cstart, within_block_offset))
             else:
                 self.file.seek(bstart)
 
@@ -1337,7 +1391,38 @@ def get_valid_filename(s):
     'chromosome_6.fa'
     """
     s = str(s).strip().replace(' ', '_')
-    return re.sub(r'(?u)[^-\w.]', '', s)
+    return re.sub(r'(?u)[^-\w.]', '', s)              
+
+def unpack_gzi_to_blocks(gzi_bytes):
+    """ Unpacks the bgzip .gzi format to a tuple of 
+    (compressed offset, uncompressed offset) describing 
+    the BGZF compressed FASTA file. 
+    >>> unpack_gzi_to_blocks(b'\x02\x00\x00\x00\x00\x00\x00\x00\x1e(\x00\x00\x00\x00\x00\x00\x00\xff\x00\x00\x00\x00\x00\x00\x8a1\x00\x00\x00\x00\x00\x00\xff\x1c\x01\x00\x00\x00\x00\x00')
+    ((10270, 65280), (12682, 72959))
+    """
+    import struct
+
+    n_bytes = len(gzi_bytes)
+    gzi_ints = struct.Struct('<%sQ' % str(n_bytes // 8)).unpack(gzi_bytes)  # little-endian 64-bit unsigned integers
+    n_blocks = gzi_ints[0]
+
+    block_offsets = tuple(zip(*[iter(gzi_ints[1:])] * 2))
+    return block_offsets
+
+def pack_blocks_to_gzi(block_offsets):
+    """ Packs the bgzip .gzi format from a tuple of 
+    (compressed offset, uncompressed offset) describing 
+    the BGZF compressed FASTA file. 
+    >>> pack_blocks_to_gzi(((10270, 65280), (12682, 72959)))
+    b'\x02\x00\x00\x00\x00\x00\x00\x00\x1e(\x00\x00\x00\x00\x00\x00\x00\xff\x00\x00\x00\x00\x00\x00\x8a1\x00\x00\x00\x00\x00\x00\xff\x1c\x01\x00\x00\x00\x00\x00'
+    """
+    import struct
+    from itertools import chain
+
+    n_blocks = len(block_offsets)
+    gzi_bytes = struct.Struct('<%sQ' % str(n_blocks * 2 + 1)).pack(n_blocks, *chain(*block_offsets))  # little-endian 64-bit unsigned integers
+
+    return gzi_bytes
 
 
 if __name__ == "__main__":

diff --git a/tests/data/chr17.hg19.part.fa b/tests/data/chr17.hg19.part.fa