Skip to content

Commit

Permalink
Merge local changes for #126
Browse files Browse the repository at this point in the history
There is a lot here that doesn't work, but mainly I was trying
to figure out the format of the GZI file and provide methods to
unpack and pack the binary on-disk format. There are also methods
for loading the GZI into an object for use by Faidx.
  • Loading branch information
mdshw5 committed Jun 25, 2020
1 parent b5d375a commit e526ce8
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 70 deletions.
137 changes: 111 additions & 26 deletions pyfaidx/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
"""

from __future__ import division

import bisect
import os
import re
import string
import struct
import sys
import warnings
from collections import namedtuple
from itertools import islice
from math import ceil
from os.path import getmtime
from threading import Lock

from six import PY2, PY3, integer_types, string_types
from six.moves import zip_longest

Expand Down Expand Up @@ -312,19 +312,32 @@ def __len__(self):
return self.rlen


class BGZFblock(namedtuple('BGZFblock', ['cstart', 'clen', 'ustart', 'ulen'])):
__slots__ = ()

def __getitem__(self, key):
if type(key) == str:
return getattr(self, key)
return tuple.__getitem__(self, key)

def as_bytes(self):
return struct.pack('<QQ', self.cstart, self.ustart)

def __len__(self):
return self.ulen
class BgzfBlock(namedtuple('BgzfBlock', ['cstart', 'clen', 'ustart', 'ulen'])):
__slots__ = ()

def __getitem__(self, key):
if type(key) == str:
return getattr(self, key)
return tuple.__getitem__(self, key)

def as_bytes(self):
return struct.pack('<QQ', self.cstart, self.ustart)

def __lt__(self, other):
if self.ustart < other:
return True

def __len__(self):
return self.ulen

@property
def empty(self):
"""Check for the EOF marker, which is an empty BGZF block.
https://github.com/biopython/biopython/blob/master/Bio/bgzf.py#L171"""
if self.ulen == '0':
return True
else:
return False


class Faidx(object):
Expand Down Expand Up @@ -369,7 +382,6 @@ def __init__(self,
self._bgzf = True
self.gzi_index = OrderedDict()
self.gzi_indexname = filename + '.gzi'

else:
self._fasta_opener = open
self._bgzf = False
Expand Down Expand Up @@ -438,6 +450,8 @@ def __init__(self,
elif build_index:
self.build_index()
self.read_fai()
else:
self.read_fai()
if self._bgzf:
if os.path.exists(self.gzi_indexname) and getmtime(self.gzi_indexname) >= getmtime(self.gzi_indexname):
self.read_gzi()
Expand Down Expand Up @@ -569,8 +583,7 @@ def build_index(self):
"Bad sequence name %s at line %s." %
(line.rstrip('\n\r'), str(i)))
offset += line_blen
thisoffset = fastafile.tell(
) if self._bgzf else offset
thisoffset = offset
else: # check line and advance offset
if not blen:
blen = line_blen
Expand Down Expand Up @@ -614,7 +627,41 @@ def build_index(self):
% self.indexname)
elif isinstance(e, FastaIndexingError):
raise e


def build_gzi(self):
""" Build the htslib .gzi index format """
from Bio import bgzf
with open(self.filename, 'rb') as bgzf_file:
self.gzi_index = []
for i, values in enumerate(bgzf.BgzfBlocks(bgzf_file)):
self.gzi_index.append(BgzfBlock(*values))
eof = self.gzi_index.pop()
if not eof.empty:
raise IOError("BGZF EOF marker not found. File %s is not a valid BGZF file." % self.filename)


def write_gzi(self):
""" Write the on disk format for the htslib .gzi index
https://github.com/samtools/htslib/issues/473"""
with open(self.gzi_indexname, 'wb') as bzi_file:
bzi_file.write(struct.pack('<Q', len(self.gzi_index)))
for block in self.gzi_index:
bzi_file.write(block.as_bytes())

def read_gzi(self):
""" Read the on disk format for the htslib .gzi index
https://github.com/samtools/htslib/issues/473"""
from ctypes import c_uint64, sizeof
with open(self.gzi_indexname, 'rb') as bzi_file:
number_of_blocks = struct.unpack('<Q', bzi_file.read(sizeof(c_uint64)))[0]
self.gzi_index = []
for i in range(number_of_blocks):
cstart, ustart = struct.unpack('<QQ', bzi_file.read(sizeof(c_uint64) * 2))
if cstart == '' or ustart == '':
raise IndexError("Unexpected end of .gzi file. ")
else:
self.gzi_index.append(BgzfBlock(cstart, None, ustart, None))

def write_fai(self):
with self.lock:
with open(self.indexname, 'w') as outfile:
Expand Down Expand Up @@ -702,7 +749,7 @@ def from_file(self, rname, start, end, internals=False):
newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0
newlines_inside = newlines_to_end - newlines_before
seq_blen = newlines_inside + seq_len
bstart = i.offset + newlines_before + start0

if seq_blen < 0 and self.strict_bounds:
raise FetchError("Requested coordinates start={0:n} end={1:n} are "
"invalid.\n".format(start, end))
Expand All @@ -715,11 +762,18 @@ def from_file(self, rname, start, end, internals=False):
"\n".format(end, rname))

with self.lock:
if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
self.file.seek(i.offset)
chunk = start0 + newlines_before + newlines_inside + seq_len
chunk_seq = self.file.read(chunk).decode()
seq = chunk_seq[start0 + newlines_before:]
bstart = i.offset + newlines_before + start0 # uncompressed offset for the start of requested string
if self._bgzf:
from Bio import bgzf
insertion_i = bisect.bisect_left(self.gzi_index, bstart)
if insertion_i == 0: # bisect_left already returns index to the left if values are the same
start_block = self.gzi_index[insertion_i]
else:
start_block = self.gzi_index[insertion_i - 1]
within_block_offset = start_block.ustart + bstart
print(self.gzi_index)
print(locals())
self.file.seek(bgzf.make_virtual_offset(start_block.cstart, within_block_offset))
else:
self.file.seek(bstart)

Expand Down Expand Up @@ -1337,7 +1391,38 @@ def get_valid_filename(s):
'chromosome_6.fa'
"""
s = str(s).strip().replace(' ', '_')
return re.sub(r'(?u)[^-\w.]', '', s)
return re.sub(r'(?u)[^-\w.]', '', s)

def unpack_gzi_to_blocks(gzi_bytes):
""" Unpacks the bgzip .gzi format to a tuple of
(compressed offset, uncompressed offset) describing
the BGZF compressed FASTA file.
>>> unpack_gzi_to_blocks(b'\x02\x00\x00\x00\x00\x00\x00\x00\x1e(\x00\x00\x00\x00\x00\x00\x00\xff\x00\x00\x00\x00\x00\x00\x8a1\x00\x00\x00\x00\x00\x00\xff\x1c\x01\x00\x00\x00\x00\x00')
((10270, 65280), (12682, 72959))
"""
import struct

n_bytes = len(gzi_bytes)
gzi_ints = struct.Struct('<%sQ' % str(n_bytes // 8)).unpack(gzi_bytes) # little-endian 64-bit unsigned integers
n_blocks = gzi_ints[0]

block_offsets = tuple(zip(*[iter(gzi_ints[1:])] * 2))
return block_offsets

def pack_blocks_to_gzi(block_offsets):
""" Packs the bgzip .gzi format from a tuple of
(compressed offset, uncompressed offset) describing
the BGZF compressed FASTA file.
>>> pack_blocks_to_gzi(((10270, 65280), (12682, 72959)))
b'\x02\x00\x00\x00\x00\x00\x00\x00\x1e(\x00\x00\x00\x00\x00\x00\x00\xff\x00\x00\x00\x00\x00\x00\x8a1\x00\x00\x00\x00\x00\x00\xff\x1c\x01\x00\x00\x00\x00\x00'
"""
import struct
from itertools import chain

n_blocks = len(block_offsets)
gzi_bytes = struct.Struct('<%sQ' % str(n_blocks * 2 + 1)).pack(n_blocks, *chain(*block_offsets)) # little-endian 64-bit unsigned integers

return gzi_bytes


if __name__ == "__main__":
Expand Down
2 changes: 0 additions & 2 deletions tests/data/chr17.hg19.part.fa

This file was deleted.

Loading

0 comments on commit e526ce8

Please sign in to comment.