From 501cecdee6c599a1406caa62393107d451a95583 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 5 Feb 2018 15:32:09 -0500 Subject: [PATCH 1/5] Initial implementation of #134 --- pyfaidx/__init__.py | 394 +++++++++++++++++++++++++++++--------------- 1 file changed, 260 insertions(+), 134 deletions(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index 87f6661..7b911cf 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -1,5 +1,4 @@ # pylint: disable=R0913, R0914, C0301 - """ Fasta file -> Faidx -> Fasta -> FastaRecord -> Sequence """ @@ -11,7 +10,7 @@ from six.moves import zip_longest try: from collections import OrderedDict -except ImportError: #python 2.6 +except ImportError: #python 2.6 from ordereddict import OrderedDict from collections import namedtuple import re @@ -24,18 +23,28 @@ __version__ = '0.5.2' -class KeyFunctionError(Exception): + +class KeyFunctionError(ValueError): """Raised if the key_function argument is invalid.""" -class FastaIndexingError(Exception): + +class FastaIndexingError(IOError): """Raised if we encounter malformed FASTA that prevents indexing.""" -class FetchError(Exception): +class IndexNotFoundError(IOError): + """Raised if read_fai cannot open the index file.""" + + +class FastaNotFoundError(IOError): + """Raised if the fasta file cannot be opened.""" + + +class FetchError(IndexError): """Raised if a request to fetch a FASTA sequence cannot be fulfilled.""" -class BedError(Exception): +class BedError(ValueError): """Indicates a malformed BED entry.""" @@ -45,7 +54,7 @@ class RegionError(Exception): """A region error occurred.""" -class UnsupportedCompressionFormat(Exception): +class UnsupportedCompressionFormat(IOError): """ Raised when a FASTA file is given with a recognized but unsupported compression extension. @@ -59,6 +68,7 @@ class Sequence(object): start, end = coordinates of subsequence (optional) comp = boolean switch for complement property """ + def __init__(self, name='', seq='', start=None, end=None, comp=False): self.name = name self.seq = seq @@ -111,22 +121,26 @@ def __getitem__(self, n): """ if self.start is None or self.end is None: correction_factor = 0 - elif len(self.seq) == abs(self.end - self.start) + 1: # determine coordinate system + elif len( + self.seq + ) == abs(self.end - self.start) + 1: # determine coordinate system one_based = True correction_factor = -1 elif len(self.seq) == abs(self.end - self.start): one_based = False correction_factor = 0 elif len(self.seq) != abs(self.end - self.start): - raise ValueError("Coordinates (Sequence.start=%s and Sequence.end=%s) imply a different length than Sequence.seq (len=%s). Did you modify Sequence.seq?" % (self.start, self.end, len(self.seq))) + raise ValueError( + "Coordinates (Sequence.start=%s and Sequence.end=%s) imply a different length than Sequence.seq (len=%s). Did you modify Sequence.seq?" + % (self.start, self.end, len(self.seq))) if isinstance(n, slice): slice_start, slice_stop, slice_step = n.indices(len(self)) if self.start is None or self.end is None: # there should never be self.start != self.end == None start = None end = None - return self.__class__(self.name, self.seq[n], - start, end, self.comp) + return self.__class__(self.name, self.seq[n], start, end, + self.comp) self_end, self_start = (self.end, self.start) if abs(slice_step) > 1: start = None @@ -140,7 +154,8 @@ def __getitem__(self, n): else: start = self_start + slice_start end = self_start + slice_stop + correction_factor - return self.__class__(self.name, self.seq[n], start, end, self.comp) + return self.__class__(self.name, self.seq[n], start, end, + self.comp) elif isinstance(n, integer_types): if n < 0: n = len(self) + n @@ -205,7 +220,6 @@ def long_name(self): warnings.warn(msg, DeprecationWarning, stacklevel=2) return self.fancy_name - @property def complement(self): """ Returns the compliment of self. @@ -214,8 +228,8 @@ def complement(self): >chr1 (complement) TAGCAT """ - comp = self.__class__(self.name, complement(self.seq), - start=self.start, end=self.end) + comp = self.__class__( + self.name, complement(self.seq), start=self.start, end=self.end) comp.comp = False if self.comp else True return comp @@ -250,7 +264,6 @@ def orientation(self): else: return None - @property def gc(self): """ Return the GC content of seq as a float @@ -266,7 +279,9 @@ def gc(self): return (g + c) / len(self.seq) -class IndexRecord(namedtuple('IndexRecord', ['rlen', 'offset', 'lenc', 'lenb', 'bend', 'prev_bend'])): +class IndexRecord( + namedtuple('IndexRecord', + ['rlen', 'offset', 'lenc', 'lenb', 'bend', 'prev_bend'])): __slots__ = () def __getitem__(self, key): @@ -275,7 +290,8 @@ def __getitem__(self, key): return tuple.__getitem__(self, key) def __str__(self): - return "{rlen:n}\t{offset:n}\t{lenc:n}\t{lenb:n}\n".format(**self._asdict()) + return "{rlen:n}\t{offset:n}\t{lenc:n}\t{lenb:n}\n".format( + **self._asdict()) def __len__(self): return self.rlen @@ -283,11 +299,23 @@ def __len__(self): class Faidx(object): """ A python implementation of samtools faidx FASTA indexing """ - def __init__(self, filename, default_seq=None, key_function=lambda x: x, - as_raw=False, strict_bounds=False, read_ahead=None, - mutable=False, split_char=None, duplicate_action="stop", filt_function=lambda x: True, - one_based_attributes=True, read_long_names=False, - sequence_always_upper=False, rebuild=True): + + def __init__(self, + filename, + default_seq=None, + key_function=lambda x: x, + as_raw=False, + strict_bounds=False, + read_ahead=None, + mutable=False, + split_char=None, + duplicate_action="stop", + filt_function=lambda x: True, + one_based_attributes=True, + read_long_names=False, + sequence_always_upper=False, + rebuild=True, + build_index=True): """ filename: name of fasta file key_function: optional callback function which should return a unique @@ -298,7 +326,8 @@ def __init__(self, filename, default_seq=None, key_function=lambda x: x, """ self.filename = filename - if filename.lower().endswith('.bgz') or filename.lower().endswith('.gz'): + if filename.lower().endswith('.bgz') or filename.lower().endswith( + '.gz'): # Only try to import Bio if we actually need the bgzf reader. try: from Bio import bgzf @@ -308,7 +337,8 @@ def __init__(self, filename, default_seq=None, key_function=lambda x: x, else: self._fasta_opener = bgzf.open self._bgzf = True - elif filename.lower().endswith('.bz2') or filename.lower().endswith('.zip'): + elif filename.lower().endswith('.bz2') or filename.lower().endswith( + '.zip'): raise UnsupportedCompressionFormat( "Compressed FASTA is only supported in BGZF format. Use " "bgzip to compresss your FASTA.") @@ -317,32 +347,40 @@ def __init__(self, filename, default_seq=None, key_function=lambda x: x, self._bgzf = False try: - self.file = self._fasta_opener(filename, 'r+b' if mutable else 'rb') - except ValueError as e: + self.file = self._fasta_opener(filename, 'r+b' + if mutable else 'rb') + except (ValueError, IOError) as e: if str(e).find('BGZF') > -1: raise UnsupportedCompressionFormat( "Compressed FASTA is only supported in BGZF format. Use " "the samtools bgzip utility (instead of gzip) to " "compress your FASTA.") else: - raise + raise FastaNotFoundError( + "Cannot read FASTA file %s" % filename) self.indexname = filename + '.fai' self.read_long_names = read_long_names self.key_function = key_function try: - key_fn_test = self.key_function("TestingReturnType of_key_function") + key_fn_test = self.key_function( + "TestingReturnType of_key_function") if not isinstance(key_fn_test, string_types): - raise KeyFunctionError("key_function argument should return a string, not {0}".format(type(key_fn_test))) + raise KeyFunctionError( + "key_function argument should return a string, not {0}". + format(type(key_fn_test))) except Exception as e: pass self.filt_function = filt_function - assert duplicate_action in ("stop", "first", "last", "longest", "shortest", "drop") + assert duplicate_action in ("stop", "first", "last", "longest", + "shortest", "drop") self.duplicate_action = duplicate_action self.as_raw = as_raw self.default_seq = default_seq if self._bgzf and self.default_seq is not None: - raise FetchError("The default_seq argument is not supported with using BGZF compression. Please decompress your FASTA file and try again.") + raise FetchError( + "The default_seq argument is not supported with using BGZF compression. Please decompress your FASTA file and try again." + ) if self._bgzf: self.strict_bounds = True else: @@ -352,23 +390,33 @@ def __init__(self, filename, default_seq=None, key_function=lambda x: x, self.sequence_always_upper = sequence_always_upper self.index = OrderedDict() self.lock = Lock() - self.buffer = dict((('seq', None), ('name', None), ('start', None), ('end', None))) + self.buffer = dict((('seq', None), ('name', None), ('start', None), + ('end', None))) if not read_ahead or isinstance(read_ahead, integer_types): self.read_ahead = read_ahead elif not isinstance(read_ahead, integer_types): - raise ValueError("read_ahead value must be int, not {0}".format(type(read_ahead))) + raise ValueError("read_ahead value must be int, not {0}".format( + type(read_ahead))) self.mutable = mutable with self.lock: # lock around index generation so only one thread calls method try: - if os.path.exists(self.indexname) and getmtime(self.indexname) >= getmtime(self.filename): + if os.path.exists(self.indexname) and getmtime( + self.indexname) >= getmtime(self.filename): self.read_fai() - elif os.path.exists(self.indexname) and getmtime(self.indexname) < getmtime(self.filename) and not rebuild: + elif os.path.exists(self.indexname) and getmtime( + self.indexname) < getmtime( + self.filename) and not rebuild: self.read_fai() - warnings.warn("Index file {0} is older than FASTA file {1}.".format(self.indexname, self.filename), RuntimeWarning) - else: + warnings.warn( + "Index file {0} is older than FASTA file {1}.".format( + self.indexname, self.filename), RuntimeWarning) + elif build_index: self.build_index() self.read_fai() + else: + self.read_fai() + except FastaIndexingError: os.remove(self.indexname) self.file.close() @@ -396,45 +444,54 @@ def _index_as_string(self): yield '\t'.join([k, str(v)]) def read_fai(self): - with open(self.indexname) as index: - prev_bend = 0 - drop_keys = [] - for line in index: - line = line.rstrip() - rname, rlen, offset, lenc, lenb = line.split('\t') - rlen, offset, lenc, lenb = map(int, (rlen, offset, lenc, lenb)) - newlines = int(ceil(rlen / lenc) * (lenb - lenc)) - bend = offset + newlines + rlen - rec = IndexRecord(rlen, offset, lenc, lenb, bend, prev_bend) - if self.read_long_names: - rname = self._long_name_from_index_record(rec) - if self.split_char: - rname = filter(self.filt_function, self.key_function(rname).split(self.split_char)) - else: - # filter must act on an iterable - rname = filter(self.filt_function, [self.key_function(rname)]) - for key in rname: # mdshw5/pyfaidx/issues/64 - if key in self.index: - if self.duplicate_action == "stop": - raise ValueError('Duplicate key "%s"' % key) - elif self.duplicate_action == "first": - continue - elif self.duplicate_action == "last": - self.index[key] = rec - elif self.duplicate_action == "longest": - if len(rec) > len(self.index[key]): - self.index[key] = rec - elif self.duplicate_action == "shortest": - if len(rec) < len(self.index[key]): - self.index[key] = rec - elif self.duplicate_action == "drop": - if key not in drop_keys: - drop_keys.append(key) + try: + with open(self.indexname) as index: + prev_bend = 0 + drop_keys = [] + for line in index: + line = line.rstrip() + rname, rlen, offset, lenc, lenb = line.split('\t') + rlen, offset, lenc, lenb = map(int, + (rlen, offset, lenc, lenb)) + newlines = int(ceil(rlen / lenc) * (lenb - lenc)) + bend = offset + newlines + rlen + rec = IndexRecord(rlen, offset, lenc, lenb, bend, + prev_bend) + if self.read_long_names: + rname = self._long_name_from_index_record(rec) + if self.split_char: + rname = filter(self.filt_function, + self.key_function(rname).split( + self.split_char)) else: - self.index[key] = rec - prev_bend = bend - for dup in drop_keys: - self.index.pop(dup, None) + # filter must act on an iterable + rname = filter(self.filt_function, + [self.key_function(rname)]) + for key in rname: # mdshw5/pyfaidx/issues/64 + if key in self.index: + if self.duplicate_action == "stop": + raise ValueError('Duplicate key "%s"' % key) + elif self.duplicate_action == "first": + continue + elif self.duplicate_action == "last": + self.index[key] = rec + elif self.duplicate_action == "longest": + if len(rec) > len(self.index[key]): + self.index[key] = rec + elif self.duplicate_action == "shortest": + if len(rec) < len(self.index[key]): + self.index[key] = rec + elif self.duplicate_action == "drop": + if key not in drop_keys: + drop_keys.append(key) + else: + self.index[key] = rec + prev_bend = bend + for dup in drop_keys: + self.index.pop(dup, None) + except IOError: + raise IndexNotFoundError( + "Could not read index file %s" % self.indexname) def build_index(self): try: @@ -455,25 +512,34 @@ def build_index(self): lastline = i # write an index line if line[0] == '>': - valid_entry = check_bad_lines(rname, bad_lines, i - 1) + valid_entry = check_bad_lines( + rname, bad_lines, i - 1) if valid_entry and i > 0: - indexfile.write("{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen)) + indexfile.write( + "{0}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format( + rname, rlen, thisoffset, clen, blen)) elif not valid_entry: - raise FastaIndexingError("Line length of fasta" - " file is not " - "consistent! " - "Inconsistent line found in >{0} at " - "line {1:n}.".format(rname, bad_lines[0][0] + 1)) + raise FastaIndexingError( + "Line length of fasta" + " file is not " + "consistent! " + "Inconsistent line found in >{0} at " + "line {1:n}.".format( + rname, bad_lines[0][0] + 1)) blen = None rlen = 0 clen = None bad_lines = [] try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258) - rname = line.rstrip('\n\r')[1:].split()[0] # duplicates are detected with read_fai + rname = line.rstrip('\n\r')[1:].split()[ + 0] # duplicates are detected with read_fai except IndexError: - raise FastaIndexingError("Bad sequence name %s at line %s." % (line.rstrip('\n\r'), str(i))) + raise FastaIndexingError( + "Bad sequence name %s at line %s." % + (line.rstrip('\n\r'), str(i))) offset += line_blen - thisoffset = fastafile.tell() if self._bgzf else offset + thisoffset = fastafile.tell( + ) if self._bgzf else offset else: # check line and advance offset if not blen: blen = line_blen @@ -489,17 +555,25 @@ def build_index(self): # write the final index line, if there is one. if lastline is not None: - valid_entry = check_bad_lines(rname, bad_lines, lastline) # advance index since we're at the end of the file + valid_entry = check_bad_lines( + rname, bad_lines, lastline + ) # advance index since we're at the end of the file if valid_entry: - indexfile.write("{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format(rname, rlen, thisoffset, clen, blen)) + indexfile.write( + "{0:s}\t{1:d}\t{2:d}\t{3:d}\t{4:d}\n".format( + rname, rlen, thisoffset, clen, blen)) else: - raise FastaIndexingError("Line length of fasta" - " file is not " - "consistent! " - "Inconsistent line found in >{0} at " - "line {1:n}.".format(rname, bad_lines[0][0] + 1)) + raise FastaIndexingError( + "Line length of fasta" + " file is not " + "consistent! " + "Inconsistent line found in >{0} at " + "line {1:n}.".format(rname, + bad_lines[0][0] + 1)) except IOError: - raise IOError("%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." % self.indexname) + raise IOError( + "%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." + % self.indexname) def write_fai(self): with self.lock: @@ -550,12 +624,13 @@ def from_file(self, rname, start, end, internals=False): "Please check your FASTA file.".format(rname)) start0 = start - 1 # make coordinates [0,1) if start0 < 0: - raise FetchError("Requested start coordinate must be greater than 1.") + raise FetchError( + "Requested start coordinate must be greater than 1.") seq_len = end - start0 - # Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398) - newlines_before = int((start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0 + newlines_before = int( + (start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0 newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) newlines_inside = newlines_to_end - newlines_before seq_blen = newlines_inside + seq_len @@ -591,7 +666,9 @@ def from_file(self, rname, start, end, internals=False): def format_seq(self, seq, rname, start, end): start0 = start - 1 - if len(seq) < end - start0 and self.default_seq: # Pad missing positions with default_seq + if len( + seq + ) < end - start0 and self.default_seq: # Pad missing positions with default_seq pad_len = end - start0 - len(seq) seq = ''.join([seq, pad_len * self.default_seq]) else: # Return less than requested range @@ -606,19 +683,23 @@ def format_seq(self, seq, rname, start, end): if self.as_raw: return seq else: - return Sequence(name=rname, start=int(start), - end=int(end), seq=seq) + return Sequence( + name=rname, start=int(start), end=int(end), seq=seq) def to_file(self, rname, start, end, seq): """ Write sequence in region from start-end, overwriting current contents of the FASTA file. """ if not self.mutable: - raise IOError("Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA.") + raise IOError( + "Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA." + ) file_seq, internals = self.from_file(rname, start, end, internals=True) with self.lock: if len(seq) != len(file_seq) - internals['newlines_inside']: - raise IOError("Specified replacement sequence needs to have the same length as original.") + raise IOError( + "Specified replacement sequence needs to have the same length as original." + ) elif len(seq) == len(file_seq) - internals['newlines_inside']: line_len = internals['i'].lenc self.file.seek(internals['bstart']) @@ -653,8 +734,9 @@ def _long_name_from_bgzf(self, index_record): """ Return the full sequence defline and description. Internal method passing IndexRecord This method is present for compatibility with BGZF files, since we cannot subtract their offsets. It may be possible to implement a more efficient method. """ - raise NotImplementedError("FastaRecord.long_name and Fasta(read_long_names=True) " - "are not supported currently for BGZF compressed files.") + raise NotImplementedError( + "FastaRecord.long_name and Fasta(read_long_names=True) " + "are not supported currently for BGZF compressed files.") prev_bend = index_record.prev_bend self.file.seek(prev_bend) defline = [] @@ -732,7 +814,7 @@ def __reversed__(self): else: yield self[:end][::-1] raise StopIteration - if end == len(self): # first iteration + if end == len(self): # first iteration end -= last_line else: end -= line_len @@ -788,17 +870,20 @@ def variant_sites(self): for site in var: if site.is_snp: sample = site.genotype(self._fa.sample) - if sample.gt_type in self._fa.gt_type and eval(self._fa.filter): + if sample.gt_type in self._fa.gt_type and eval( + self._fa.filter): pos.append(site.POS) return tuple(pos) else: - raise NotImplementedError("variant_sites() only valid for FastaVariant.") + raise NotImplementedError( + "variant_sites() only valid for FastaVariant.") @property def long_name(self): """ Read the actual defline from self._fa.faidx mdshw5/pyfaidx#54 """ return self._fa.faidx.get_long_name(self.name) + class MutableFastaRecord(FastaRecord): def __init__(self, name, fa): super(MutableFastaRecord, self).__init__(name, fa) @@ -835,27 +920,50 @@ def __setitem__(self, n, value): class Fasta(object): - def __init__(self, filename, default_seq=None, key_function=lambda x: x, as_raw=False, - strict_bounds=False, read_ahead=None, mutable=False, split_char=None, - filt_function=lambda x: True, one_based_attributes=True, read_long_names=False, - duplicate_action="stop", sequence_always_upper=False, rebuild=True): + def __init__(self, + filename, + default_seq=None, + key_function=lambda x: x, + as_raw=False, + strict_bounds=False, + read_ahead=None, + mutable=False, + split_char=None, + filt_function=lambda x: True, + one_based_attributes=True, + read_long_names=False, + duplicate_action="stop", + sequence_always_upper=False, + rebuild=True, + build_index=True): """ An object that provides a pygr compatible interface. filename: name of fasta file """ self.filename = filename self.mutable = mutable - self.faidx = Faidx(filename, key_function=key_function, as_raw=as_raw, - default_seq=default_seq, strict_bounds=strict_bounds, - read_ahead=read_ahead, mutable=mutable, split_char=split_char, - filt_function=filt_function, one_based_attributes=one_based_attributes, - read_long_names=read_long_names, duplicate_action=duplicate_action, - sequence_always_upper=sequence_always_upper, rebuild=rebuild) + self.faidx = Faidx( + filename, + key_function=key_function, + as_raw=as_raw, + default_seq=default_seq, + strict_bounds=strict_bounds, + read_ahead=read_ahead, + mutable=mutable, + split_char=split_char, + filt_function=filt_function, + one_based_attributes=one_based_attributes, + read_long_names=read_long_names, + duplicate_action=duplicate_action, + sequence_always_upper=sequence_always_upper, + rebuild=rebuild) self.keys = self.faidx.index.keys if not self.mutable: - self.records = dict([(rname, FastaRecord(rname, self)) for rname in self.keys()]) + self.records = dict( + [(rname, FastaRecord(rname, self)) for rname in self.keys()]) elif self.mutable: - self.records = dict([(rname, MutableFastaRecord(rname, self)) for rname in self.keys()]) + self.records = dict([(rname, MutableFastaRecord(rname, self)) + for rname in self.keys()]) def __contains__(self, rname): """Return True if genome contains record.""" @@ -898,7 +1006,7 @@ def get_spliced_seq(self, name, intervals, rc=False): If rc is set, reverse complement will be returned. """ # Get sequence for all intervals - chunks = [self.faidx.fetch(name, s, e) for s,e in intervals] + chunks = [self.faidx.fetch(name, s, e) for s, e in intervals] start = chunks[0].start end = chunks[-1].end @@ -926,7 +1034,15 @@ class FastaVariant(Fasta): """ Return consensus sequence from FASTA and VCF inputs """ expr = set(('>', '<', '=', '!')) - def __init__(self, filename, vcf_file, sample=None, het=True, hom=True, call_filter=None, **kwargs): + + def __init__(self, + filename, + vcf_file, + sample=None, + het=True, + hom=True, + call_filter=None, + **kwargs): super(FastaVariant, self).__init__(filename, **kwargs) try: import pysam @@ -938,9 +1054,10 @@ def __init__(self, filename, vcf_file, sample=None, het=True, hom=True, call_fil raise ImportError("PyVCF must be installed for FastaVariant.") if call_filter is not None: try: - key, expr, value = call_filter.split() # 'GQ > 30' + key, expr, value = call_filter.split() # 'GQ > 30' except IndexError: - raise ValueError("call_filter must be a string in the format 'XX <>!= NN'") + raise ValueError( + "call_filter must be a string in the format 'XX <>!= NN'") assert all([x in self.expr for x in list(expr)]) assert all([x in string.ascii_uppercase for x in list(key)]) assert all([x in string.printable for x in list(value)]) @@ -956,18 +1073,21 @@ def __init__(self, filename, vcf_file, sample=None, het=True, hom=True, call_fil else: self.sample = self.vcf.samples[0] if len(self.vcf.samples) > 1 and sample is None: - warnings.warn("Using sample {0} genotypes.".format(self.sample), RuntimeWarning) + warnings.warn("Using sample {0} genotypes.".format( + self.sample), RuntimeWarning) if het and hom: self.gt_type = set((1, 2)) elif het: - self.gt_type = set((1,)) + self.gt_type = set((1, )) elif hom: - self.gt_type = set((2,)) + self.gt_type = set((2, )) else: self.gt_type = set() def __repr__(self): - return 'FastaVariant("%s", "%s", gt="%s")' % (self.filename, self.vcf.filename, str(self.gt_type)) + return 'FastaVariant("%s", "%s", gt="%s")' % (self.filename, + self.vcf.filename, + str(self.gt_type)) def get_seq(self, name, start, end): """Return a sequence by record name and interval [start, end). @@ -1000,23 +1120,26 @@ def get_seq(self, name, start, end): def wrap_sequence(n, sequence, fillvalue=''): args = [iter(sequence)] * n for line in zip_longest(fillvalue=fillvalue, *args): - yield ''.join(line + ("\n",)) + yield ''.join(line + ("\n", )) + # To take a complement, we map each character in the first string in this pair # to the corresponding character in the second string. -complement_map = ('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx', 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx') +complement_map = ('ACTGNactgnYRWSKMDVHBXyrwskmdvhbx', + 'TGACNtgacnRYWSMKHBDVXrywsmkhbdvx') invalid_characters_set = set( chr(x) for x in range(256) if chr(x) not in complement_map[0]) invalid_characters_string = ''.join(invalid_characters_set) if PY3: - complement_table = str.maketrans( - complement_map[0], complement_map[1], invalid_characters_string) - translate_arguments = (complement_table,) + complement_table = str.maketrans(complement_map[0], complement_map[1], + invalid_characters_string) + translate_arguments = (complement_table, ) elif PY2: complement_table = string.maketrans(complement_map[0], complement_map[1]) translate_arguments = (complement_table, invalid_characters_string) + def complement(seq): """ Returns the complement of seq. >>> seq = 'ATCGTA' @@ -1028,10 +1151,12 @@ def complement(seq): if len(result) != len(seq): first_invalid_position = next( i for i in range(len(seq)) if seq[i] in invalid_characters_set) - raise ValueError("Sequence contains non-DNA character '{0}' at position {1:n}\n".format( - seq[first_invalid_position], first_invalid_position + 1)) + raise ValueError( + "Sequence contains non-DNA character '{0}' at position {1:n}\n". + format(seq[first_invalid_position], first_invalid_position + 1)) return result + def translate_chr_name(from_name, to_name): chr_name_map = dict(zip(from_name, to_name)) @@ -1063,6 +1188,7 @@ def ucsc_split(region): start, end = (None, None) return (rname, start, end) + def check_bad_lines(rname, bad_lines, i): """ Find inconsistent line lengths in the middle of an entry. Allow blank lines between entries, and short lines From c7feade6f07cc245216959e5c75596637f5c8f40 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 5 Feb 2018 15:40:32 -0500 Subject: [PATCH 2/5] Tests for #134 --- tests/test_feature_indexing.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) mode change 100644 => 100755 tests/test_feature_indexing.py diff --git a/tests/test_feature_indexing.py b/tests/test_feature_indexing.py old mode 100644 new mode 100755 index c486e0a..7c28606 --- a/tests/test_feature_indexing.py +++ b/tests/test_feature_indexing.py @@ -1,6 +1,6 @@ import os from os.path import getmtime -from pyfaidx import Faidx, FastaIndexingError +from pyfaidx import Faidx, FastaIndexingError, IndexNotFoundError, FastaNotFoundError from nose.tools import raises from nose.plugins.skip import Skip, SkipTest from unittest import TestCase @@ -76,13 +76,15 @@ def test_build_issue_111(self): "gi|530364726|ref|XR_241081 1009 63849 70 71\n" "gi|530364725|ref|XR_241080 4884 65009 70 71\n" "gi|530364724|ref|XR_241079 2819 70099 70 71\n") - index = Faidx('data/genes.fasta', read_long_names=True, key_function=lambda x: x.split('.')[0]) + index = Faidx( + 'data/genes.fasta', + read_long_names=True, + key_function=lambda x: x.split('.')[0]) result_index = ''.join(index._index_as_string()) assert result_index == expect_index def test_order(self): - order = ("gi|563317589|dbj|AB821309.1|", - "gi|557361099|gb|KF435150.1|", + order = ("gi|563317589|dbj|AB821309.1|", "gi|557361099|gb|KF435150.1|", "gi|557361097|gb|KF435149.1|", "gi|543583796|ref|NR_104216.1|", "gi|543583795|ref|NR_104215.1|", @@ -227,11 +229,14 @@ def test_build_issue_96_fail_build_faidx(self): # Write simple fasta file with inconsistent sequence line lengths, # so building an index raises a 'FastaIndexingError' with open(fasta_path, 'w') as fasta_out: - fasta_out.write(">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n") + fasta_out.write( + ">seq1\nCTCCGGGCCCAT\nAACACTTGGGGGTAGCTAAAGTGAA\nATAAAGCCTAAA\n" + ) builtins_open = builtins.open - opened_files=[] + opened_files = [] + def test_open(*args, **kwargs): f = builtins_open(*args, **kwargs) opened_files.append(f) @@ -240,7 +245,9 @@ def test_open(*args, **kwargs): with mock.patch('six.moves.builtins.open', side_effect=test_open): try: Faidx(fasta_path) - self.assertFail("Faidx construction should fail with 'FastaIndexingError'.") + self.assertFail( + "Faidx construction should fail with 'FastaIndexingError'." + ) except FastaIndexingError: pass self.assertTrue(all(f.closed for f in opened_files)) @@ -265,7 +272,8 @@ def test_build_issue_96_fail_read_malformed_index_duplicate_key(self): builtins_open = builtins.open - opened_files=[] + opened_files = [] + def test_open(*args, **kwargs): f = builtins_open(*args, **kwargs) opened_files.append(f) @@ -274,9 +282,16 @@ def test_open(*args, **kwargs): with mock.patch('six.moves.builtins.open', side_effect=test_open): try: Faidx(fasta_path) - self.assertFail("Faidx construction should fail with 'ValueError'.") + self.assertFail( + "Faidx construction should fail with 'ValueError'.") except ValueError: pass self.assertTrue(all(f.closed for f in opened_files)) finally: shutil.rmtree(tmp_dir) + + @raises(IndexNotFoundError) + def test_issue_134_no_build_index(self): + """ Ensure that index file is not built when build_index=False. See mdshw5/pyfaidx#134. + """ + faidx = Faidx('data/genes.fasta', build_index=False) From aaffc88393b6637ccd02342b909857adc34de7d8 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 5 Feb 2018 15:59:26 -0500 Subject: [PATCH 3/5] unset executable bit --- tests/test_feature_indexing.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tests/test_feature_indexing.py diff --git a/tests/test_feature_indexing.py b/tests/test_feature_indexing.py old mode 100755 new mode 100644 From b83a48e65c388c9126ccced78c1b87179b8c7d1a Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 5 Feb 2018 16:09:02 -0500 Subject: [PATCH 4/5] Raise proper type of exception in build_index --- pyfaidx/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index 7b911cf..c6f9ec8 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -570,10 +570,13 @@ def build_index(self): "Inconsistent line found in >{0} at " "line {1:n}.".format(rname, bad_lines[0][0] + 1)) - except IOError: - raise IOError( - "%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." - % self.indexname) + except (IOError, FastaIndexingError) as e: + if isinstance(e, IOError): + raise IndexNotFoundError( + "%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." + % self.indexname) + elif isinstance(e, FastaIndexingError): + raise e def write_fai(self): with self.lock: From 400c8bca7019f5ca4e3ac05a030d5d27ef1957e1 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Mon, 5 Feb 2018 16:22:27 -0500 Subject: [PATCH 5/5] Don't inherit from IOError --- pyfaidx/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyfaidx/__init__.py b/pyfaidx/__init__.py index c6f9ec8..d5086c2 100644 --- a/pyfaidx/__init__.py +++ b/pyfaidx/__init__.py @@ -28,7 +28,7 @@ class KeyFunctionError(ValueError): """Raised if the key_function argument is invalid.""" -class FastaIndexingError(IOError): +class FastaIndexingError(Exception): """Raised if we encounter malformed FASTA that prevents indexing.""" @@ -572,7 +572,7 @@ def build_index(self): bad_lines[0][0] + 1)) except (IOError, FastaIndexingError) as e: if isinstance(e, IOError): - raise IndexNotFoundError( + raise IOError( "%s may not be writable. Please use Fasta(rebuild=False), Faidx(rebuild=False) or faidx --no-rebuild." % self.indexname) elif isinstance(e, FastaIndexingError):