From e9e9ac46749fa1c4df90c627274d9567c87edd79 Mon Sep 17 00:00:00 2001 From: xklinka Date: Wed, 2 Feb 2022 20:19:30 +0100 Subject: [PATCH 01/24] added new field to TRRecord which holds position of full alleles --- trtools/utils/tr_harmonizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index c996802a..1c550ec5 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -633,6 +633,7 @@ def __init__(self, record_id: str, quality_field: str, *, + harmonized_pos: Optional[int] = None, full_alleles: Optional[Tuple[str, List[str]]] = None, ref_allele_length: Optional[float] = None, alt_allele_lengths: Optional[List[float]] = None, @@ -643,10 +644,11 @@ def __init__(self, self.motif = motif self.record_id = record_id self.chrom = vcfrecord.CHROM - self.pos = vcfrecord.POS + self.pos = harmonized_pos if harmonized_pos is not None else vcfrecord.POS self.info = dict(vcfrecord.INFO) self.format = _Cyvcf2FormatDict(vcfrecord) self.full_alleles = full_alleles + self.full_alleles_pos = self.vcfrecord.POS self.ref_allele_length = ref_allele_length self.alt_allele_lengths = alt_allele_lengths self.quality_field = quality_field From 0c7b6e6d29fe231e02a67cd7b2fe8fd5b28ffb7b Mon Sep 17 00:00:00 2001 From: xklinka Date: Wed, 2 Feb 2022 20:21:41 +0100 Subject: [PATCH 02/24] HipSTR harmonization sets TRRecord position to position of record without flanking bp at the start --- trtools/utils/tr_harmonizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 1c550ec5..9f1d3bb3 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -360,6 +360,7 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant): motif, record_id, 'Q', + harmonized_pos=int(vcfrecord.INFO['START']), full_alleles=full_alleles) From 927f1f34c1235910fb9442b2a567f18a33c34d14 Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 12:00:27 +0100 Subject: [PATCH 03/24] Added correct types to mergeutils.py functions --- trtools/utils/mergeutils.py | 94 +++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 34 deletions(-) diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 5b2eb6cb..9c79689e 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -7,11 +7,18 @@ import os import sys +from cyvcf2 import cyvcf2 + import trtools.utils.common as common import trtools.utils.tr_harmonizer as trh +from typing import List, Union, Tuple, Optional + +CYVCF_RECORD = cyvcf2.Variant +CYVCF_READER = cyvcf2.VCF + -def LoadReaders(vcffiles, region=None): +def LoadReaders(vcffiles: List[str], region: Optional[str] = None) -> List[CYVCF_READER]: r"""Return list of VCF readers Parameters @@ -28,17 +35,19 @@ def LoadReaders(vcffiles, region=None): """ for f in vcffiles: if not f.endswith(".vcf.gz") and not f.endswith(".vcf.bgz"): - raise ValueError("Make sure %s is bgzipped and indexed"%f) + raise ValueError("Make sure %s is bgzipped and indexed" % f) if not os.path.isfile(f): - raise ValueError("Could not find VCF file %s"%f) - if not os.path.isfile(f+".tbi"): - raise ValueError("Could not find VCF index %s.tbi"%f) + raise ValueError("Could not find VCF file %s" % f) + if not os.path.isfile(f + ".tbi"): + raise ValueError("Could not find VCF index %s.tbi" % f) readers = [vcf.Reader(open(f, "rb")) for f in vcffiles] if region is None: return readers - else: return [r.fetch(region) for r in readers] + else: + return [r.fetch(region) for r in readers] + -def GetSharedSamples(readers): +def GetSharedSamples(readers: List[CYVCF_READER]) -> List[str]: r"""Get list of samples used in all files being merged Parameters @@ -57,7 +66,8 @@ def GetSharedSamples(readers): samples = samples.intersection(set(r.samples)) return list(samples) -def GetSamples(readers, filenames=None): + +def GetSamples(readers: List[CYVCF_READER], filenames: Optional[str] = None) -> List[str]: r"""Get list of samples used in all files being merged Parameters @@ -87,7 +97,8 @@ def GetSamples(readers, filenames=None): samples += r.samples return samples -def GetAndCheckVCFType(vcfs, vcftype): + +def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str: """Infer type of multiple VCFs If they are all the same, return that type @@ -123,9 +134,11 @@ def GetAndCheckVCFType(vcfs, vcftype): types.append(vcf_type) if len(set(types)) == 1: return types[0] - else: raise ValueError("VCF files are of mixed types.") + else: + raise ValueError("VCF files are of mixed types.") -def GetChromOrder(r, chroms): + +def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a record Parameters @@ -136,14 +149,17 @@ def GetChromOrder(r, chroms): Returns ------- - order : int - Index of r.CHROM in chroms - Return np.inf if can't find r.CHROM + order : int or float + Index of r.CHROM in chroms, int + Return np.inf if can't find r.CHROM, float """ - if r is None: return np.inf - else: return chroms.index(r.CHROM) + if r is None: + return np.inf + else: + return chroms.index(r.CHROM) -def GetChromOrderEqual(chrom_order, min_chrom): + +def GetChromOrderEqual(chrom_order: int, min_chrom: int) -> bool: r"""Check chrom order Parameters @@ -161,7 +177,8 @@ def GetChromOrderEqual(chrom_order, min_chrom): if chrom_order == np.inf: return False return chrom_order == min_chrom -def GetPos(r): + +def GetPos(r: CYVCF_RECORD) -> Union[int, float]: r"""Get the position of a record Parameters @@ -171,12 +188,15 @@ def GetPos(r): Returns ------- pos : int - If r is None, returns np.inf + If r is None, returns np.inf, which is a float """ - if r is None: return np.inf - else: return r.POS + if r is None: + return np.inf + else: + return r.POS -def CheckPos(record, chrom, pos): + +def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: r"""Check a record is at the specified position Parameters @@ -194,9 +214,10 @@ def CheckPos(record, chrom, pos): Return True if the current record is at this position """ if record is None: return False - return record.CHROM==chrom and record.POS==pos + return record.CHROM == chrom and record.POS == pos + -def GetMinRecords(record_list, chroms): +def GetMinRecords(record_list: List[CYVCF_RECORD], chroms: List[str]) -> List[bool]: r"""Check if each record is next up in sort order Return a vector of boolean set to true if @@ -222,10 +243,11 @@ def GetMinRecords(record_list, chroms): if len(allpos) > 0: min_pos = min(allpos) else: - return [False]*len(record_list) + return [False] * len(record_list) return [CheckPos(r, chroms[min_chrom], min_pos) for r in record_list] -def DoneReading(records): + +def DoneReading(records: List[CYVCF_RECORD]) -> bool: r"""Check if all records are at the end of the file Parameters @@ -241,7 +263,8 @@ def DoneReading(records): """ return all([item is None for item in records]) -def DebugPrintRecordLocations(current_records, is_min): + +def DebugPrintRecordLocations(current_records: List[CYVCF_RECORD], is_min: List[bool]) -> None: r"""Debug function to print current records for each file Parameters @@ -255,10 +278,11 @@ def DebugPrintRecordLocations(current_records, is_min): for i in range(len(is_min)): chrom = current_records[i].CHROM pos = current_records[i].POS - info.append("%s:%s:%s"%(chrom, pos, is_min[i])) - common.MSG("\t".join(info)+"\n", debug=True) + info.append("%s:%s:%s" % (chrom, pos, is_min[i])) + common.MSG("\t".join(info) + "\n", debug=True) + -def CheckMin(is_min): +def CheckMin(is_min: List[bool]) -> bool: r"""Check if we're progressing through VCFs Parameters @@ -271,11 +295,13 @@ def CheckMin(is_min): check : bool Set to True if something went wrong """ - if sum(is_min)==0: + if sum(is_min) == 0: raise ValueError("Unexpected error. Stuck in infinite loop and exiting.") return False -def GetNextRecords(readers, current_records, increment): + +def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool])\ + -> List[CYVCF_RECORD]: r"""Increment readers of each file Increment readers[i] if increment[i] set to true @@ -302,6 +328,6 @@ def GetNextRecords(readers, current_records, increment): new_records.append(next(readers[i])) except StopIteration: new_records.append(None) - else: new_records.append(current_records[i]) + else: + new_records.append(current_records[i]) return new_records - From 9ad411a433482f43583fa9a06f140b29949f13da Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 12:17:24 +0100 Subject: [PATCH 04/24] Added multipledispatch lib to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3c78cebb..0317b14f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ matplotlib==3.2.2 pysam==0.16.0.1 cython==0.29.20 cyvcf2==0.30.1 +multipledispatch==0.6.0 \ No newline at end of file From 786bfee30a867058202cab47b1f0143738fe2c92 Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 12:18:01 +0100 Subject: [PATCH 05/24] Fixed problems revealed by static type analysis and added functions that use multiple dispatch --- trtools/utils/mergeutils.py | 51 ++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 9c79689e..0d85d8be 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -13,6 +13,7 @@ import trtools.utils.tr_harmonizer as trh from typing import List, Union, Tuple, Optional +from multipledispatch import dispatch CYVCF_RECORD = cyvcf2.Variant CYVCF_READER = cyvcf2.VCF @@ -59,9 +60,9 @@ def GetSharedSamples(readers: List[CYVCF_READER]) -> List[str]: samples : list of str Samples present in all readers """ - if len(readers) == 0: return set() + if len(readers) == 0: return list() samples = set(readers[0].samples) - if len(readers) == 1: return samples + if len(readers) == 1: return list(samples) for r in readers[1:]: samples = samples.intersection(set(r.samples)) return list(samples) @@ -138,6 +139,7 @@ def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str: raise ValueError("VCF files are of mixed types.") +@dispatch(CYVCF_RECORD, List[str]) def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a record @@ -158,8 +160,29 @@ def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: else: return chroms.index(r.CHROM) +@dispatch(trh.TRRecord, List[str]) +def GetChromOrder(r: trh.TRRecord, chroms: List[str]) -> Union[int, float]: + r"""Get the chromosome order of a harmonized record -def GetChromOrderEqual(chrom_order: int, min_chrom: int) -> bool: + Parameters + ---------- + r : trh.TRRecord + chroms : list of str + Ordered list of chromosomes + + Returns + ------- + order : int or float + Index of r.CHROM in chroms, int + Return np.inf if can't find r.CHROM, float + """ + if r is None: + return np.inf + else: + return chroms.index(r.chrom) + + +def GetChromOrderEqual(chrom_order: Union[int, float], min_chrom: int) -> bool: r"""Check chrom order Parameters @@ -178,6 +201,7 @@ def GetChromOrderEqual(chrom_order: int, min_chrom: int) -> bool: return chrom_order == min_chrom +@dispatch(CYVCF_RECORD) def GetPos(r: CYVCF_RECORD) -> Union[int, float]: r"""Get the position of a record @@ -196,6 +220,25 @@ def GetPos(r: CYVCF_RECORD) -> Union[int, float]: return r.POS +@dispatch(trh.TRRecord) +def GetPos(r: trh.TRRecord) -> Union[int, float]: + r"""Get the position of a harmonized record + + Parameters + ---------- + r : trh.TRRecord + + Returns + ------- + pos : int + If r is None, returns np.inf, which is a float + """ + if r is None: + return np.inf + else: + return r.pos + + def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: r"""Check a record is at the specified position @@ -300,7 +343,7 @@ def CheckMin(is_min: List[bool]) -> bool: return False -def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool])\ +def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool]) \ -> List[CYVCF_RECORD]: r"""Increment readers of each file From 0a52fd9d5b7d6f4b1f45faec2f65430836a935b8 Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 12:21:52 +0100 Subject: [PATCH 06/24] Fixed bug in compareSTR.py --- trtools/compareSTR/compareSTR.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index 408a7265..5109911e 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -818,17 +818,15 @@ def main(args): if args.numrecords is not None and num_records >= args.numrecords: break if args.verbose: mergeutils.DebugPrintRecordLocations(current_records, is_min) if mergeutils.CheckMin(is_min): return 1 - if all([is_min]): - if (current_records[0].CHROM == current_records[1].CHROM and \ - current_records[0].POS == current_records[1].POS): - UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]), \ - trh.HarmonizeRecord(vcftype2, current_records[1]), \ - sample_idxs, - args.ignore_phasing, args.period, - format_fields, format_bins, - args.stratify_file, - overall_results, locus_results, - sample_results, bubble_results) + if all(is_min): + UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]), \ + trh.HarmonizeRecord(vcftype2, current_records[1]), \ + sample_idxs, + args.ignore_phasing, args.period, + format_fields, format_bins, + args.stratify_file, + overall_results, locus_results, + sample_results, bubble_results) current_records = mergeutils.GetNextRecords(vcfregions, current_records, is_min) is_min = mergeutils.GetMinRecords(current_records, chroms) done = mergeutils.DoneReading(current_records) From a8cf75674195176984cf0f231dcddba73fd8e3e0 Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 15:04:06 +0100 Subject: [PATCH 07/24] Added utility function to tr_harmonizer.py --- trtools/utils/tr_harmonizer.py | 66 ++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 9f1d3bb3..da8256c8 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -13,6 +13,7 @@ import trtools.utils.utils as utils + # List of supported VCF types # TODO: add Beagle # TODO: add support for tool version numbers @@ -45,13 +46,13 @@ def _ToVCFType(vcftype: Union[str, VcfTypes]): if vcftype not in VcfTypes.__members__: raise ValueError(("{} is not an excepted TR vcf type. " "Expected one of {}").format( - vcftype, list(VcfTypes.__members__))) + vcftype, list(VcfTypes.__members__))) return VcfTypes[vcftype] elif isinstance(vcftype, VcfTypes): return vcftype else: raise TypeError(("{} (of type {}) is not a vcftype" - .format(vcftype, type(vcftype)))) + .format(vcftype, type(vcftype)))) def MayHaveImpureRepeats(vcftype: Union[str, VcfTypes]): @@ -224,6 +225,34 @@ def InferVCFType(vcffile: cyvcf2.VCF, vcftype: Union[str, VcfTypes] = "auto") -> .format(possible_vcf_types, vcftype))) +def HarmonizeRecords(vcf_records: List[cyvcf2.Variant], vcf_types: List[Union[str, VcfTypes]]): + """ + Create a list of standardized TRRecord objects out of a cyvcf2.Variant + objects of possibly unknown type. + + Parameters + ---------- + vcf_records : + A list of cyvcf2.Variant Object + + vcf_types : + A list of strings or VCFTypes objects + + Returns + ------- + List of TRRecord objects + + Raises + ------- + ValueError when lengths of input lists are not equal + """ + + + if len(vcf_types) != len(vcf_records): + raise ValueError("Length of type and record lists is not equal") + + return [HarmonizeRecord(vcf_types[i], vcf_records[i]) for i in range(len(vcf_records))] + def HarmonizeRecord(vcftype: Union[str, VcfTypes], vcfrecord: cyvcf2.Variant): """ Create a standardized TRRecord object out of a cyvcf2.Variant @@ -273,7 +302,8 @@ def _HarmonizeGangSTRRecord(vcfrecord: cyvcf2.Variant): if vcfrecord.INFO.get('RU') is None: raise TypeError("This is not a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) if vcfrecord.INFO.get('VID') is not None: - raise TypeError("Trying to read an AdVNTR record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) + raise TypeError( + "Trying to read an AdVNTR record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) if vcfrecord.INFO.get('VARID') is not None: raise TypeError("Trying to read an EH record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) ref_allele = vcfrecord.REF.upper() @@ -390,7 +420,7 @@ def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant): return TRRecord(vcfrecord, ref_allele, alt_alleles, motif, record_id, 'ML') -#def _PHREDtoProb(phred: int) -> float: +# def _PHREDtoProb(phred: int) -> float: # """Convert PHRED score to probability # # Notes @@ -400,7 +430,7 @@ def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant): # return 10**(-phred/10) -#def _ConvertPLtoQualityProb(PL: List[int]) -> float: +# def _ConvertPLtoQualityProb(PL: List[int]) -> float: # """ # Convert a list of PHRED-scaled genotype probabilities to the # unscaled probability of the single most likely genotype.# @@ -499,8 +529,8 @@ def _HarmonizeEHRecord(vcfrecord: cyvcf2.Variant): alt_allele_lengths=alt_allele_lengths) -def _UpperCaseAlleles(alleles : List[str]): - #Convert the list of allele strings to upper case +def _UpperCaseAlleles(alleles: List[str]): + # Convert the list of allele strings to upper case upper_alleles = [] for allele in alleles: upper_alleles.append(allele.upper()) @@ -673,7 +703,7 @@ def __init__(self, self.ref_allele = utils.FabricateAllele(motif, ref_allele_length) else: self.has_fabricated_ref_allele = False - self.ref_allele_length = len(ref_allele)/len(motif) + self.ref_allele_length = len(ref_allele) / len(motif) if alt_allele_lengths is not None: self.has_fabricated_alt_alleles = True @@ -684,7 +714,7 @@ def __init__(self, else: self.has_fabricated_alt_alleles = False self.alt_allele_lengths = [ - len(allele)/len(motif) for allele in self.alt_alleles + len(allele) / len(motif) for allele in self.alt_alleles ] try: @@ -708,7 +738,7 @@ def _CheckRecord(self): "number of alt alleles as given to the TRRecord " "constructor. Underlying alt alleles: {}, " " constructor alt alleles: {}".format( - self.vcfrecord.ALT, self.alt_alleles)) + self.vcfrecord.ALT, self.alt_alleles)) if self.full_alleles: if len(self.full_alleles) != 2: @@ -836,7 +866,7 @@ def GetSamplePloidies(self) -> Optional[np.ndarray]: return None return ( - gt_idxs.shape[1] - 1 - np.sum(gt_idxs[:, :-1] == -2, axis = 1) + gt_idxs.shape[1] - 1 - np.sum(gt_idxs[:, :-1] == -2, axis=1) ) def GetCallRate(self, strict: bool = True) -> float: @@ -864,7 +894,7 @@ def GetCallRate(self, strict: bool = True) -> float: if called_samples is None: return None else: - return np.sum(called_samples)/called_samples.shape[0] + return np.sum(called_samples) / called_samples.shape[0] def _GetStringGenotypeArray( self, @@ -881,7 +911,6 @@ def _GetStringGenotypeArray( seq_array[:, :-1][idx_gts[:, :-1] == -2] = ',' return seq_array - def GetStringGenotypes(self) -> Optional[np.ndarray]: """ Get an array of string genotypes for each sample. @@ -1209,7 +1238,7 @@ def GetGenotypeCounts( if gts is None: return {} - gts = gts[:, :-1] #remove phasing + gts = gts[:, :-1] # remove phasing gts = np.sort(gts, axis=1) if sample_index is not None: @@ -1298,11 +1327,11 @@ def GetAlleleCounts(self, if gts is None: return {} - gts = gts[:, :-1] #remove phasing + gts = gts[:, :-1] # remove phasing if sample_index is not None: gts = gts[sample_index, :] - + # remove no calls and missing haplotypes gts = gts[gts != nocall_entry] gts = gts[gts != lowploidy_entry] @@ -1352,7 +1381,7 @@ def GetAlleleFreqs(self, fullgenotypes=fullgenotypes, sample_index=sample_index) total = float(sum(allele_counts.values())) - return {key: value/total for key, value in allele_counts.items()} + return {key: value / total for key, value in allele_counts.items()} def GetMaxAllele(self, sample_index: Optional[Any] = None) -> float: @@ -1571,5 +1600,4 @@ def __next__(self) -> TRRecord: """Iterate over TRRecord produced from the underlying vcf.""" return HarmonizeRecord(self.vcftype, next(self.vcffile)) - -#TODO check all users of this class for new options +# TODO check all users of this class for new options From e2db6cfeea2dd6b68da716c4680a1d6c9e6408af Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 15:08:28 +0100 Subject: [PATCH 08/24] Implemented improved record comparison, records are now harmonized before the ability to compare them is calculated, fixes HipSTR issue --- trtools/compareSTR/compareSTR.py | 13 ++++++++----- trtools/utils/mergeutils.py | 33 ++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index 5109911e..e184aa86 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -809,26 +809,29 @@ def main(args): ### Walk through sorted readers, merging records as we go ### current_records = [next(reader) for reader in vcfreaders] - is_min = mergeutils.GetMinRecords(current_records, chroms) - done = mergeutils.DoneReading(current_records) + num_records = 0 while not done: + + harmonized_records = trh.HarmonizeRecords(current_records, [vcftype1, vcftype2]) + # contains information about which record should be skipped in next iteration and whether it is currently comparable + is_min = mergeutils.GetMinRecords(harmonized_records, chroms) + if any([item is None for item in current_records]): break if args.numrecords is not None and num_records >= args.numrecords: break if args.verbose: mergeutils.DebugPrintRecordLocations(current_records, is_min) if mergeutils.CheckMin(is_min): return 1 if all(is_min): - UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]), \ - trh.HarmonizeRecord(vcftype2, current_records[1]), \ + UpdateComparisonResults(*harmonized_records, sample_idxs, args.ignore_phasing, args.period, format_fields, format_bins, args.stratify_file, overall_results, locus_results, sample_results, bubble_results) + current_records = mergeutils.GetNextRecords(vcfregions, current_records, is_min) - is_min = mergeutils.GetMinRecords(current_records, chroms) done = mergeutils.DoneReading(current_records) num_records += 1 diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 0d85d8be..9c62b1e3 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -139,7 +139,7 @@ def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str: raise ValueError("VCF files are of mixed types.") -@dispatch(CYVCF_RECORD, List[str]) +@dispatch(CYVCF_RECORD, list) def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a record @@ -160,7 +160,8 @@ def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: else: return chroms.index(r.CHROM) -@dispatch(trh.TRRecord, List[str]) + +@dispatch(trh.TRRecord, list) def GetChromOrder(r: trh.TRRecord, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a harmonized record @@ -239,6 +240,7 @@ def GetPos(r: trh.TRRecord) -> Union[int, float]: return r.pos +@dispatch(CYVCF_RECORD, str, int) def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: r"""Check a record is at the specified position @@ -260,7 +262,29 @@ def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: return record.CHROM == chrom and record.POS == pos -def GetMinRecords(record_list: List[CYVCF_RECORD], chroms: List[str]) -> List[bool]: +@dispatch(trh.TRRecord, str, int) +def CheckPos(record: trh.TRRecord, chrom: str, pos: int) -> bool: + r"""Check a record is at the specified position + + Parameters + ---------- + r : vcf.Record + VCF Record being checked + chrom : str + Chromosome name + pos : int + Chromosome position + + Returns + ------- + check : bool + Return True if the current record is at this position + """ + if record is None: return False + return record.chrom == chrom and record.pos == pos + + +def GetMinRecords(record_list: List[Union[CYVCF_RECORD, trh.TRRecord]], chroms: List[str]) -> List[bool]: r"""Check if each record is next up in sort order Return a vector of boolean set to true if @@ -269,8 +293,9 @@ def GetMinRecords(record_list: List[CYVCF_RECORD], chroms: List[str]) -> List[bo Parameters ---------- - record_list : list of vcf.Record + record_list : list of vcf.Record or trh.TRRecord list of current records from each file being merged + they can be already harmonized chroms : list of str Ordered list of all chromosomes From d09808990a4e79d3babc8abc44b591257ee4f401 Mon Sep 17 00:00:00 2001 From: xklinka Date: Thu, 3 Feb 2022 19:51:30 +0100 Subject: [PATCH 09/24] updated RELEASE_NOTES.rst --- RELEASE_NOTES.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 9e5a660e..78dad8f3 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,3 +1,12 @@ +Unreleased changes +----- + +Bug fixes: + +* https://github.com/gymreklab/TRTools/issues/146 fixed record positions being compared twice +* HipSTR records are now compared by starting position of the repeat, not the whole called allele + (position of the start of allele with flanking bps is no longer used) + 4.0.1 ----- From 9cec45affe12f5b9b1befee8f43f4efb0e4b5afc Mon Sep 17 00:00:00 2001 From: xklinka Date: Fri, 4 Feb 2022 13:34:19 +0100 Subject: [PATCH 10/24] added utility function to mergeutils.py --- trtools/compareSTR/compareSTR.py | 2 +- trtools/utils/mergeutils.py | 40 +++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index e184aa86..4b44dc12 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -808,7 +808,7 @@ def main(args): vcfregions = [vcfreaders[0](args.region), vcfreaders[1](args.region)] ### Walk through sorted readers, merging records as we go ### - current_records = [next(reader) for reader in vcfreaders] + current_records = mergeutils.InitReaders(vcfreaders) done = mergeutils.DoneReading(current_records) num_records = 0 diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 9c62b1e3..14f48514 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -315,7 +315,7 @@ def GetMinRecords(record_list: List[Union[CYVCF_RECORD, trh.TRRecord]], chroms: return [CheckPos(r, chroms[min_chrom], min_pos) for r in record_list] -def DoneReading(records: List[CYVCF_RECORD]) -> bool: +def DoneReading(records: List[Union[CYVCF_RECORD, trh.TRRecord]]) -> bool: r"""Check if all records are at the end of the file Parameters @@ -399,3 +399,41 @@ def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECO else: new_records.append(current_records[i]) return new_records + + +def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool]) \ + -> List[CYVCF_RECORD]: + r"""Increment readers of each file + + Increment readers[i] if increment[i] set to true + Else keep current_records[i] + + Parameters + ---------- + readers : list of vcf.Reader + List of readers for all files being merged + current_records : list of vcf.Record + List of current records for all readers + increment : list of bool + List indicating if each file should be incremented + + Returns + ------- + new_records : list of vcf.Record + List of next records for each file + """ + new_records = [] + for i in range(len(readers)): + if increment[i]: + try: + new_records.append(next(readers[i])) + except StopIteration: + new_records.append(None) + else: + new_records.append(current_records[i]) + return new_records + + +def InitReaders(readers: List[CYVCF_READER]) -> List[CYVCF_RECORD]: + return [next(reader) for reader in readers] + From 1db076d185fdd26116650e7284043a2273b60623 Mon Sep 17 00:00:00 2001 From: xklinka Date: Sat, 5 Feb 2022 15:36:32 +0100 Subject: [PATCH 11/24] Removed multiple dispatch and created new function for choosing which harmonized record should be skipped --- requirements.txt | 3 +- trtools/compareSTR/compareSTR.py | 2 +- trtools/utils/mergeutils.py | 97 ++++++++++---------------------- 3 files changed, 33 insertions(+), 69 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0317b14f..6371f059 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,4 @@ scikit-learn==0.23.1 matplotlib==3.2.2 pysam==0.16.0.1 cython==0.29.20 -cyvcf2==0.30.1 -multipledispatch==0.6.0 \ No newline at end of file +cyvcf2==0.30.1 \ No newline at end of file diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index 4b44dc12..d5851915 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -816,7 +816,7 @@ def main(args): harmonized_records = trh.HarmonizeRecords(current_records, [vcftype1, vcftype2]) # contains information about which record should be skipped in next iteration and whether it is currently comparable - is_min = mergeutils.GetMinRecords(harmonized_records, chroms) + is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) if any([item is None for item in current_records]): break if args.numrecords is not None and num_records >= args.numrecords: break diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 14f48514..5fed398a 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -12,8 +12,7 @@ import trtools.utils.common as common import trtools.utils.tr_harmonizer as trh -from typing import List, Union, Tuple, Optional -from multipledispatch import dispatch +from typing import List, Union, Any, Optional, Callable CYVCF_RECORD = cyvcf2.Variant CYVCF_READER = cyvcf2.VCF @@ -139,7 +138,6 @@ def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str: raise ValueError("VCF files are of mixed types.") -@dispatch(CYVCF_RECORD, list) def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a record @@ -161,28 +159,6 @@ def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: return chroms.index(r.CHROM) -@dispatch(trh.TRRecord, list) -def GetChromOrder(r: trh.TRRecord, chroms: List[str]) -> Union[int, float]: - r"""Get the chromosome order of a harmonized record - - Parameters - ---------- - r : trh.TRRecord - chroms : list of str - Ordered list of chromosomes - - Returns - ------- - order : int or float - Index of r.CHROM in chroms, int - Return np.inf if can't find r.CHROM, float - """ - if r is None: - return np.inf - else: - return chroms.index(r.chrom) - - def GetChromOrderEqual(chrom_order: Union[int, float], min_chrom: int) -> bool: r"""Check chrom order @@ -202,7 +178,6 @@ def GetChromOrderEqual(chrom_order: Union[int, float], min_chrom: int) -> bool: return chrom_order == min_chrom -@dispatch(CYVCF_RECORD) def GetPos(r: CYVCF_RECORD) -> Union[int, float]: r"""Get the position of a record @@ -221,26 +196,6 @@ def GetPos(r: CYVCF_RECORD) -> Union[int, float]: return r.POS -@dispatch(trh.TRRecord) -def GetPos(r: trh.TRRecord) -> Union[int, float]: - r"""Get the position of a harmonized record - - Parameters - ---------- - r : trh.TRRecord - - Returns - ------- - pos : int - If r is None, returns np.inf, which is a float - """ - if r is None: - return np.inf - else: - return r.pos - - -@dispatch(CYVCF_RECORD, str, int) def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: r"""Check a record is at the specified position @@ -262,29 +217,39 @@ def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: return record.CHROM == chrom and record.POS == pos -@dispatch(trh.TRRecord, str, int) -def CheckPos(record: trh.TRRecord, chrom: str, pos: int) -> bool: - r"""Check a record is at the specified position +def GetMinRecords(record_list: List[Optional[trh.TRRecord]], chroms: List[str]) -> List[bool]: + r"""Check if each record is next up in sort order + + Return a vector of boolean set to true if + the record is in lowest sort order of all the records + Use order in chroms to determine sort order of chromosomes Parameters ---------- - r : vcf.Record - VCF Record being checked - chrom : str - Chromosome name - pos : int - Chromosome position + record_list : list of CYVCF_RECORD + list of current records from each file being merged + + chroms : list of str + Ordered list of all chromosomes Returns ------- - check : bool - Return True if the current record is at this position + checks : list of bool + Set to True for records that are first in sort order """ - if record is None: return False - return record.chrom == chrom and record.pos == pos + chrom_order = [GetChromOrder(r, chroms) for r in record_list] + pos = [GetPos(r) for r in record_list] + min_chrom = min(chrom_order) + allpos = [pos[i] for i in range(len(pos)) if GetChromOrderEqual(chrom_order[i], min_chrom)] + if len(allpos) > 0: + min_pos = min(allpos) + else: + return [False] * len(record_list) + return [CheckPos(r, chroms[min_chrom], min_pos) for r in record_list] + -def GetMinRecords(record_list: List[Union[CYVCF_RECORD, trh.TRRecord]], chroms: List[str]) -> List[bool]: +def GetMinHarmonizedRecords(record_list: List[Optional[trh.TRRecord]], chroms: List[str]) -> List[bool]: r"""Check if each record is next up in sort order Return a vector of boolean set to true if @@ -293,9 +258,9 @@ def GetMinRecords(record_list: List[Union[CYVCF_RECORD, trh.TRRecord]], chroms: Parameters ---------- - record_list : list of vcf.Record or trh.TRRecord + record_list : trh.TRRecord list of current records from each file being merged - they can be already harmonized + chroms : list of str Ordered list of all chromosomes @@ -304,15 +269,16 @@ def GetMinRecords(record_list: List[Union[CYVCF_RECORD, trh.TRRecord]], chroms: checks : list of bool Set to True for records that are first in sort order """ - chrom_order = [GetChromOrder(r, chroms) for r in record_list] - pos = [GetPos(r) for r in record_list] + chrom_order = [np.inf if r is None else chroms.index(r.chrom) for r in record_list] + pos = [np.inf if r is None else r.pos for r in record_list] + min_chrom = min(chrom_order) allpos = [pos[i] for i in range(len(pos)) if GetChromOrderEqual(chrom_order[i], min_chrom)] if len(allpos) > 0: min_pos = min(allpos) else: return [False] * len(record_list) - return [CheckPos(r, chroms[min_chrom], min_pos) for r in record_list] + return [False if r is None else r.chrom == chroms[min_chrom] and r.pos == min_pos for r in record_list] def DoneReading(records: List[Union[CYVCF_RECORD, trh.TRRecord]]) -> bool: @@ -436,4 +402,3 @@ def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECO def InitReaders(readers: List[CYVCF_READER]) -> List[CYVCF_RECORD]: return [next(reader) for reader in readers] - From 51401ea27e1786c11c01c2e3eb2bd4ad815b7245 Mon Sep 17 00:00:00 2001 From: xklinka Date: Sun, 6 Feb 2022 16:57:45 +0100 Subject: [PATCH 12/24] added tests for additions to TRHarmonizer --- trtools/utils/tests/test_trharmonizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py index 7bd6213a..98327eb3 100644 --- a/trtools/utils/tests/test_trharmonizer.py +++ b/trtools/utils/tests/test_trharmonizer.py @@ -797,6 +797,7 @@ def test_HarmonizeRecord(vcfdir): assert not tr_rec1.HasFullStringGenotypes() assert not tr_rec1.HasFabricatedRefAllele() assert not tr_rec1.HasFabricatedAltAlleles() + assert tr_rec1.pos == tr_rec1.full_alleles_pos tr_rec2 = next(str_iter) tr_rec3 = next(str_iter) assert tr_rec3.ref_allele == 'TTTTTTTTTTTTTTT' @@ -807,6 +808,8 @@ def test_HarmonizeRecord(vcfdir): while record.record_id != "STR_125": record = next(str_iter) assert record.HasFullStringGenotypes() + # record in the test file has flanking bp at the start of the ref. allele + assert record.pos > record.full_alleles_pos # TODO this isn't really the correct behavior - # we're trimming off an extra repeat from the alt allele assert record.full_alleles == ( @@ -932,3 +935,6 @@ def test_TRRecord_Quality(vcfdir): with pytest.raises(TypeError): var.GetQualityScores() + +def test_hipstr_flanking_pb_position_harmonization(): + pass \ No newline at end of file From 924092ab3254f544087f4bb38bc8c84a15665b1c Mon Sep 17 00:00:00 2001 From: xklinka Date: Sun, 6 Feb 2022 17:02:24 +0100 Subject: [PATCH 13/24] removed duplicated function in mergeutils.py --- trtools/utils/mergeutils.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 5fed398a..408a74b6 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -367,38 +367,5 @@ def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECO return new_records -def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool]) \ - -> List[CYVCF_RECORD]: - r"""Increment readers of each file - - Increment readers[i] if increment[i] set to true - Else keep current_records[i] - - Parameters - ---------- - readers : list of vcf.Reader - List of readers for all files being merged - current_records : list of vcf.Record - List of current records for all readers - increment : list of bool - List indicating if each file should be incremented - - Returns - ------- - new_records : list of vcf.Record - List of next records for each file - """ - new_records = [] - for i in range(len(readers)): - if increment[i]: - try: - new_records.append(next(readers[i])) - except StopIteration: - new_records.append(None) - else: - new_records.append(current_records[i]) - return new_records - - def InitReaders(readers: List[CYVCF_READER]) -> List[CYVCF_RECORD]: return [next(reader) for reader in readers] From 9be775ce276781c9486785d865ebd1195801f94b Mon Sep 17 00:00:00 2001 From: xklinka Date: Sun, 6 Feb 2022 17:15:56 +0100 Subject: [PATCH 14/24] added test for getMinHarmonizedRecords from mergeutils --- trtools/utils/tests/test_mergeutils.py | 40 ++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/trtools/utils/tests/test_mergeutils.py b/trtools/utils/tests/test_mergeutils.py index b4fafbbd..311216c1 100644 --- a/trtools/utils/tests/test_mergeutils.py +++ b/trtools/utils/tests/test_mergeutils.py @@ -9,33 +9,43 @@ @pytest.fixture def mrgvcfdir(vcfdir): - return os.path.join(vcfdir, "mergeSTR_vcfs") + return os.path.join(vcfdir, "mergeSTR_vcfs") + # Set up dummy class class DummyRecord: - def __init__(self, chrom, pos, ref, alts=[], info = {}): + def __init__(self, chrom, pos, ref, alts=[], info={}): self.CHROM = chrom self.POS = pos self.REF = ref self.ALTS = alts self.INFO = info + +class DummyHarmonizedRecord: + def __init__(self, chrom, pos): + self.chrom = chrom + self.pos = pos + + def test_DebugPrintRecordLocations(capsys): - dummy_records = [] + dummy_records = [] dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={'END': 120})) dummy_records.append(DummyRecord('chr1', 150, 'CTTCTT', info={'END': 170})) - + mergeutils.DebugPrintRecordLocations(dummy_records, [True, False]) captured = capsys.readouterr() assert "chr1:100:True" in captured.err assert "chr1:150:False" in captured.err + def test_CheckMin(): assert mergeutils.CheckMin([True, False]) == False with pytest.raises(ValueError) as info: mergeutils.CheckMin([False, False]) assert "Unexpected error. Stuck in infinite loop and exiting." in str(info.value) + def test_CheckVCFType(vcfdir): snps_path = os.path.join(vcfdir, "snps.vcf") gangstr_path = os.path.join(vcfdir, "test_gangstr.vcf") @@ -49,11 +59,12 @@ def test_CheckVCFType(vcfdir): with pytest.raises(ValueError) as info: print(mergeutils.GetAndCheckVCFType([gangstr_vcf, hipstr_vcf], "auto")) assert "VCF files are of mixed types." in str(info.value) - + with pytest.raises(TypeError) as info: print(mergeutils.GetAndCheckVCFType([gangstr_vcf, snps_vcf], "auto")) assert "Could not identify the type of this vcf" in str(info.value) + # Test no such file or directory def test_WrongFile(mrgvcfdir): # Try a dummy file name. Make sure it doesn't exist before we try @@ -63,11 +74,12 @@ def test_WrongFile(mrgvcfdir): os.remove(fname1) if os.path.exists(fname2): os.remove(fname2) - print (os.path.isfile(fname1)) + print(os.path.isfile(fname1)) with pytest.raises(ValueError) as info: mergeutils.LoadReaders([fname1, fname2]) assert "Could not find VCF file" in str(info.value) + # test unzipped, unindexed VCFs return 1 def test_UnzippedUnindexedFile(mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr_unzipped1.vcf") @@ -81,3 +93,19 @@ def test_UnzippedUnindexedFile(mrgvcfdir): with pytest.raises(ValueError) as info: mergeutils.LoadReaders([fname1, fname2]) assert "Could not find VCF index" in str(info.value) + + +def test_GetMinHarmonizedRecords(): + chromosomes = ["chr1", "chr2", "chr3"] + + pair = [DummyHarmonizedRecord("chr1", 20), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, True] + + pair = [DummyHarmonizedRecord("chr1", 21), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] + + pair = [DummyHarmonizedRecord("chr2", 20), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] + + pair = [DummyHarmonizedRecord("chr1", 20), DummyHarmonizedRecord("chr1", 21)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, False] From c481ac784ae899ba54ad455d80bb4d1b9a3b9cfe Mon Sep 17 00:00:00 2001 From: xklinka Date: Sun, 6 Feb 2022 18:49:19 +0100 Subject: [PATCH 15/24] updated compareSTR.py to use harmonized values UpdateComparisonResults --- trtools/compareSTR/compareSTR.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index d5851915..2ca8b990 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -521,8 +521,8 @@ def UpdateComparisonResults(record1, record2, sample_idxs, dictionary of counts to update """ # Extract shared info - chrom = record1.vcfrecord.CHROM - pos = record1.vcfrecord.POS + chrom = record1.chrom + pos = record1.pos period = len(record1.motif) reflen = len(record1.ref_allele)/period @@ -813,7 +813,6 @@ def main(args): num_records = 0 while not done: - harmonized_records = trh.HarmonizeRecords(current_records, [vcftype1, vcftype2]) # contains information about which record should be skipped in next iteration and whether it is currently comparable is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) From 482407bdf90d814bf7c4c5ccf9f2f807e244cf71 Mon Sep 17 00:00:00 2001 From: xklinka Date: Sun, 6 Feb 2022 18:50:05 +0100 Subject: [PATCH 16/24] added tests for comparison of harmonized HipSTR records --- trtools/compareSTR/tests/test_compareSTR.py | 19 ++++++++ .../test_hipstr_flanking_bp_flanking.vcf | 41 ++++++++++++++++++ .../test_hipstr_flanking_bp_flanking.vcf.gz | Bin 0 -> 1539 bytes ...est_hipstr_flanking_bp_flanking.vcf.gz.tbi | Bin 0 -> 104 bytes .../test_hipstr_flanking_bp_non_flanking.vcf | 41 ++++++++++++++++++ ...est_hipstr_flanking_bp_non_flanking.vcf.gz | Bin 0 -> 1535 bytes ...hipstr_flanking_bp_non_flanking.vcf.gz.tbi | Bin 0 -> 104 bytes 7 files changed, 101 insertions(+) create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz create mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi diff --git a/trtools/compareSTR/tests/test_compareSTR.py b/trtools/compareSTR/tests/test_compareSTR.py index fcaa4735..4d4296a3 100644 --- a/trtools/compareSTR/tests/test_compareSTR.py +++ b/trtools/compareSTR/tests/test_compareSTR.py @@ -173,6 +173,25 @@ def test_main(tmpdir, vcfdir): retcode = main(args) assert retcode == 1 +def test_hipstr_position_harmonisation(tmpdir, vcfdir): + vcfcomp = os.path.join(vcfdir, "compareSTR_vcfs") + + hipstr_vcf_1 = os.path.join(vcfcomp, "test_hipstr_flanking_bp_flanking.vcf.gz") + hipstr_vcf_2 = os.path.join(vcfcomp, "test_hipstr_flanking_bp_non_flanking.vcf.gz") + + args = base_argparse(tmpdir) + args.vcf1 = hipstr_vcf_1 + args.region= "" + args.vcftype1 = 'hipstr' + args.vcf2 = hipstr_vcf_2 + args.vcftype2 = 'hipstr' + retcode = main(args) + assert retcode == 0 + + with open(tmpdir + "/test_compare-overall.tab", "r") as out_overall: + _, line = out_overall.readlines() + assert line == "ALL 1.0 1.0 nan 1\n" + def test_wrong_vcftype(tmpdir, vcfdir, capsys): args = base_argparse(tmpdir) vcfcomp = os.path.join(vcfdir, "compareSTR_vcfs") diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf new file mode 100644 index 00000000..898b4262 --- /dev/null +++ b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf @@ -0,0 +1,41 @@ +##fileformat=VCFv4.1 +##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt +##reference=/storage/resources/Homo_sapiens_assembly19.fasta +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GTEX-R55C-0003 +1 101673 STR_38 TCAAAAAAAAAAAAAAAA TAAAAAAAAAAAAAAAAA . . INFRAME_PGEOM=0.95;INFRAME_UP=0.05;INFRAME_DOWN=0.05;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;BPDIFFS=0;START=101675;END=101690;PERIOD=1;AN=2;REFAC=1;AC=1;NSKIP=0;NFILT=0;DP=53;DSNP=0;DSTUTTER=2;DFLANKINDEL=1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:53:0:2:1:41.95|11.05:0|0:5.08:-5.03:49:-0.39:-19|1;-15|1;-2|1;0|40:-2|1;0|39;1|1 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..1392673943a3356b7ee9ae6e78fff1dc3212b780 GIT binary patch literal 1539 zcmV+e2K@OSiwFb&00000{{{d;LjnNi1;tofZ=*OAe$M;~Yx=M+g+kMGres-(mZTF+ zN>W01_N{WjA=Us!*z_{$kKbdH8%aAQZKY`?g5lWT=W~2+930GOLT0Q;v9SNW=--{_ z=BI;$kfkZkBKsT7CxNfs8F~k4+7zc89dW@59FwD1kOwWo0w3MS{Al9)$L4AK>&dAO zVjw)jT;R2IK{zXmknp2#EM>PG=agjp7IRM0Y4T{E>D2^HD@aUP#-E#d>0iv-U-UW6 zxhQT6G9v}aLZVMe1QS!5!DJk9B*}t=b{Edp%fO&U6UKV{3nU{l1L;x-QfMiQNCM9i z7QZ|bUh3jOfSy(q?0q*1lLuaZWdA*IZMgBBtKRMSvNyUq^vZNf3fnvk9&=*%6Nbg1 zOL$n&T!3Qi&tqKRlmJ@AG@;@Vi8&T%p)C~Rk`s<_wq`-I5NyEZ4PsNSBRD^b%k%K==!f^Q1R~Oi!00A; zhtgRoG&Xya;#a^Qr1=_^P3=op_=~@`{SPxhjmM@De#_&0><;?<$&!InF&Jcm#H6Tm zIG5|?m=>Vi*~0A*n<0Y3Il>9B$*>@Bmym>@S;11EO3=zh7jNqLlfdzVt^GfXKFGqP z!XNki!Ke|1A<1Gf zm#a!+rcBUTeJRIlvP{4PPEe)<}>&yGSzH`w8!Qa|2sH@Dx z7T1d=htAnPS8Am#Uf$&UVBCb4x7stuX)gB+`&{idLQzSOVCW7Pl$D?m&dY5bc>TdJ zcpKLJoN=<&ExHFCAxQtQ!)Dm-_$}tW!wHR0m6#z|L=t65LTYu%ZwqVUy$P+txua8EQ^&zo}E(6Q4X$Al0(Q`GF2InlptYNucWe522V2*V^#DhS$I*b%3z?CJSuS# z=`daTl}tEuXv}^+#xl*lXr*;c+K@NY`;yZJ?<&6aDF^o8$m9V|EfR1x{>X;h9ZNJa zz6wYSn>*)$)d2^U~shug)96((=x3&cdqEv@X=I-e5k2aY+2MU3w*8&QR%Ddj&i(;4FtxO zv!LlCf$@o3T-xGkuJ zTh1_OUEesvmaGVSeoGb?tx5~6%b@q4=65<5nqe62PiD(B%+oLJ7C7i_`)ez>aJK$h zfwS{ShyU8A3ER-mI^Fe`0~j0YvFs$PC-vUCnzhmPN+Gjb>p-hVi)PR^x~eU+73oeF z8YFdlW^}7w#WuT+XCHS}KOhJ4qKa^sstRQIb;ow6-IZBV4i+&NCT#>m+XO{fm*>{C zHNLjou_dLgrL46fX+dJLWHz*_)L}X27C4mEpIFPYX01+L%Xl&^4@;cA{enO)87$m=m&jHeUBTHLkIy5^Jl p>3>~X=b5$>001A02m}BC000301^_}s0stET0{{R300000000^*@zDSP literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..bd849c237d67972f4060a788a825b2b461238436 GIT binary patch literal 104 zcmb2|=3rp}f&Xj_PR>jW-VEG@pHfm%5)u-ak|cPUP6bFEF=v?^H(6m%dQ*UA+w(O% n0kJ0z#B(UMt?QHZ>pgglyA(hSUCqd^1!>;D<( literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf new file mode 100644 index 00000000..c6374172 --- /dev/null +++ b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf @@ -0,0 +1,41 @@ +##fileformat=VCFv4.1 +##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt +##reference=/storage/resources/Homo_sapiens_assembly19.fasta +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GTEX-R55C-0003 +1 101675 STR_38 AAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAA . . INFRAME_PGEOM=0.95;INFRAME_UP=0.05;INFRAME_DOWN=0.05;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;BPDIFFS=0;START=101675;END=101690;PERIOD=1;AN=2;REFAC=1;AC=1;NSKIP=0;NFILT=0;DP=53;DSNP=0;DSTUTTER=2;DFLANKINDEL=1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:53:0:2:1:41.95|11.05:0|0:5.08:-5.03:49:-0.39:-19|1;-15|1;-2|1;0|40:-2|1;0|39;1|1 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..23703aadb8852718f865dbda2e2db6c03a8c2dd0 GIT binary patch literal 1535 zcmVW*{@nc)nf1f{A_&&HE4|zql(yU1 zN-M2*?>8rflx7wPLqgRZfBZcOeWTTtsxzxIl)y=zC+Fn6adb4JF`2PE!NUIcs=GMT z%}+;10ZS5`hW0m_jVHdgF!UDEv?)$FI^lxlI3g#JAP-uI1wOft_{rG!PtEh@*RyjS z#6Wn4xxgFgoN!j;0pTa#Sig6U-UW6 zxXABvG9x)j1ENn!2on>U!ektBB*}t=wg_gMWnfUD31dC_1(G3|fpj4R$+d)qB!*`( zi(Z}yFLm)CKu>E5_Msbv$$hUowEynAHr)8mb?0u>>kO}ty&{>C+%}IVj~TJMF~j26 zB|OM!CO|Rv=Mm0vLVzs-yCIrtGdV^PVemAk0pfyWh|^yLg%s34r-hvO`Jd-7Z$~u% zb2i0O8dLFz#2gE>)D{YG!3jq=-LRl(05;(A2C*sE5u6{z<#~8__`~~H0wJkIV0b%u zhtgRgG&Xya;@7|*r1=V!ZS6}}_=~@`{SPxhg~zrLe#_%>{GU%s*L?o|r zxRmQOxmv5^0cJ}`)dOr=~ zg3|?ot>avvfU%G&KzSZYpo|DAq)5hcIZLv8okGWR4QatZXPF5TI7X=wl#7+5xI8O?ybr;nQYK(NuP*O*y3SP{1b=74pspen zJ6x~o96D$BT&b0IczNUR{ZSoW-g?g*Cz;$c>~p=_5QPOnf}sV@DJwuBoEN(~@VfoM zR=x9A>pgdqLH4(nmNqqmrMfnyq?GBHE22qnsjgw*Pa-wxK;dlOoTb5G%% z!U`yxV@}fu&Q@9D?1f`@Jh`1rI{rJc@H!&WR8|mhmZj%OKPl^(9n|i?@xJ#xw=;Mz za9TxNRZp`Rr$6P+moyZ9cj#X`6UCsndx*x6uhOWB+gqAxZ}Mf0$@Gff&+4zN?H(8l_8de;^wKFHZ@!39(?PsL zU1M8I`xwtz`XvaCc#)}G1TZR1&v6zr`Wv&jb?~}54&)v9LgcA5Dq4CuHSu5*y404HHVqgMhbO?BuYD;p#^m@9nW>8}YDc<9%eDaV}q1 zGxSrYu_{)Ud3H)MM;W+AL5?AF$y8-PQi6n8zLLsL7(7i$gjLa_WZ^}=E`xzq{HVl9 zsKa#WS2E$up)vdQ7|AsEqLnr^X-(cx?@LZwyvz93r5xCUBa;U>wMf9(_#+!~_bgG% z_$nYRZSGtumff+e(}E&aF=G$N84Uc6@ubhY_Xg8@qgjTC?LI98RmeR_6r+A_{hO%S;H<`1tMqv5#GcN@WZT literal 0 HcmV?d00001 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..9c338503e3b8be27b04bcc351406d06d3ee2e7ae GIT binary patch literal 104 zcmb2|=3rp}f&Xj_PR>jW-VEG@pHfm%5)u-ak|cPUP6bFEF=v?^H(6m%dQ*UA+wnC# n0kJ0z#B(UMt?QHZ> Date: Mon, 7 Feb 2022 12:37:42 +0100 Subject: [PATCH 17/24] removed incomplete test --- trtools/utils/tests/test_trharmonizer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py index 98327eb3..6fc64171 100644 --- a/trtools/utils/tests/test_trharmonizer.py +++ b/trtools/utils/tests/test_trharmonizer.py @@ -934,7 +934,3 @@ def test_TRRecord_Quality(vcfdir): assert not var.HasQualityScores() with pytest.raises(TypeError): var.GetQualityScores() - - -def test_hipstr_flanking_pb_position_harmonization(): - pass \ No newline at end of file From e230a5648c8103e54809c5ff3f18c329db1cf4d9 Mon Sep 17 00:00:00 2001 From: xklinka Date: Mon, 7 Feb 2022 12:54:18 +0100 Subject: [PATCH 18/24] added docstring for new function in mergeutils.py --- trtools/utils/mergeutils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 408a74b6..9cd0b202 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -368,4 +368,18 @@ def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECO def InitReaders(readers: List[CYVCF_READER]) -> List[CYVCF_RECORD]: + r"""Increment readers of each file + + Returns list of first records from list of readers. + + Parameters + ---------- + readers : list of cyvcf2.VCF + List of readers for all files being merged + + Returns + ------- + list of vcf.Record + List of next records for each file + """ return [next(reader) for reader in readers] From a4cf7ce4d9b510a7ad03e679c6fad4b690a47627 Mon Sep 17 00:00:00 2001 From: Michal Klinka Date: Mon, 7 Feb 2022 17:36:13 +0100 Subject: [PATCH 19/24] Updated release notes to better describe changes --- RELEASE_NOTES.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 78dad8f3..eded94aa 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -4,8 +4,9 @@ Unreleased changes Bug fixes: * https://github.com/gymreklab/TRTools/issues/146 fixed record positions being compared twice -* HipSTR records are now compared by starting position of the repeat, not the whole called allele - (position of the start of allele with flanking bps is no longer used) +* CompareSTR: Decision on which records are comparable is now based on data from harmonized TRRecords, + and not from the records directly from VCF readers. Thanks to this, HipSTR records which have different starting positions, + but position of their repeat is at the same position are compared correctly (harmonization step removes this difference). 4.0.1 ----- From 43b5aff4b20148a5cd9ebc079acbdc8852a477c5 Mon Sep 17 00:00:00 2001 From: Jonathan Margoliash Date: Mon, 7 Feb 2022 20:49:23 -0800 Subject: [PATCH 20/24] Update docs, remove extra copies of VCFs --- README.rst | 1 - .../test_hipstr_flanking_bp_flanking.vcf | 41 ------------------- .../test_hipstr_flanking_bp_non_flanking.vcf | 41 ------------------- trtools/utils/tr_harmonizer.py | 2 +- 4 files changed, 1 insertion(+), 84 deletions(-) delete mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf delete mode 100644 trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf diff --git a/README.rst b/README.rst index b7f85252..3fd57306 100644 --- a/README.rst +++ b/README.rst @@ -116,7 +116,6 @@ Development Notes ----------------- * TRTools only currently supports diploid genotypes. Haploid calls, such as those on male chrX or chrY, are not yet supported but should be coming soon. -* CompareSTR currently only compares STR loci between two callsets if they have the same start and end coordinates. In the future we will add the capacity to take in a user specificied mapping of loci between the two callsets and use that to compare loci even if they don't completely overlap one another. Contact Us ---------- diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf deleted file mode 100644 index 898b4262..00000000 --- a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf +++ /dev/null @@ -1,41 +0,0 @@ -##fileformat=VCFv4.1 -##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt -##reference=/storage/resources/Homo_sapiens_assembly19.fasta -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GTEX-R55C-0003 -1 101673 STR_38 TCAAAAAAAAAAAAAAAA TAAAAAAAAAAAAAAAAA . . INFRAME_PGEOM=0.95;INFRAME_UP=0.05;INFRAME_DOWN=0.05;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;BPDIFFS=0;START=101675;END=101690;PERIOD=1;AN=2;REFAC=1;AC=1;NSKIP=0;NFILT=0;DP=53;DSNP=0;DSTUTTER=2;DFLANKINDEL=1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:53:0:2:1:41.95|11.05:0|0:5.08:-5.03:49:-0.39:-19|1;-15|1;-2|1;0|40:-2|1;0|39;1|1 diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf deleted file mode 100644 index c6374172..00000000 --- a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf +++ /dev/null @@ -1,41 +0,0 @@ -##fileformat=VCFv4.1 -##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt -##reference=/storage/resources/Homo_sapiens_assembly19.fasta -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GTEX-R55C-0003 -1 101675 STR_38 AAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAA . . INFRAME_PGEOM=0.95;INFRAME_UP=0.05;INFRAME_DOWN=0.05;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;BPDIFFS=0;START=101675;END=101690;PERIOD=1;AN=2;REFAC=1;AC=1;NSKIP=0;NFILT=0;DP=53;DSNP=0;DSTUTTER=2;DFLANKINDEL=1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:53:0:2:1:41.95|11.05:0|0:5.08:-5.03:49:-0.39:-19|1;-15|1;-2|1;0|40:-2|1;0|39;1|1 diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index da8256c8..53d911b3 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -606,7 +606,7 @@ class TRRecord: chrom : str The chromosome this locus is in pos : int - The bp along the chromosome that this locus is at + The bp along the chromosome that this locus is at (ignoring flanking base pairs/full alleles) info : Dict[str, Any] The dictionary of INFO fields at this locus format : Dict[str, np.ndarray] From a77ed1732da8b600ed859323429469a04f1fdec0 Mon Sep 17 00:00:00 2001 From: Jonathan Margoliash Date: Tue, 8 Feb 2022 08:18:55 -0800 Subject: [PATCH 21/24] More doc cleanup --- trtools/compareSTR/README.rst | 4 ++-- trtools/utils/tr_harmonizer.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/trtools/compareSTR/README.rst b/trtools/compareSTR/README.rst index 57b75c52..b37c0d01 100644 --- a/trtools/compareSTR/README.rst +++ b/trtools/compareSTR/README.rst @@ -11,11 +11,11 @@ Example use cases include: * Comparing calls to a "ground truth" set, e.g. from capillary electrophoresis data, to find call errors * Comparing calls for the same tool using different parameter settings to identify differences due to bioinformatic processing -* Comparing calls for different tools. This only works if they used the same set of reference TRs. Please note that compareSTR uses matching chromosomes and positions to compare TRs. Therefore, our method is not able to compare TRs if the starting coordinates is changed by TR calling method (e.g., due to phasing with a nearby variant) +* Comparing calls for different tools. This only works if they used the same set of reference TRs. Please note that compareSTR only compares TRs with matching chromosomes and allele-positions (ignoring flanking base pairs) CompareSTR optionally will stratify results based on a user-specified FORMAT field (e.g. depth, or quality score) and/or by repeat motif length. -Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), most such applications would be better performed by specialized tools. +Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), use-case specific analyses may be better performed by more specialized tools. Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file seprately. diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 53d911b3..0eb770e1 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -619,6 +619,9 @@ class TRRecord: Other Parameters ---------------- + harmonized_pos : + If this record has flanking base pairs before the repeat, set this + to note at which bp the repeat begins full_alleles : A tuple of string genotypes (ref_allele, [alt_alleles]) where each allele may contain any number of flanking From 9956a7dbe9ad08d5679b1df607ef9a1500d6b0d4 Mon Sep 17 00:00:00 2001 From: xklinka Date: Wed, 9 Feb 2022 10:14:02 +0100 Subject: [PATCH 22/24] added new test cases to test_mergeutils.py --- trtools/utils/tests/test_mergeutils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/trtools/utils/tests/test_mergeutils.py b/trtools/utils/tests/test_mergeutils.py index 311216c1..0a82e140 100644 --- a/trtools/utils/tests/test_mergeutils.py +++ b/trtools/utils/tests/test_mergeutils.py @@ -109,3 +109,12 @@ def test_GetMinHarmonizedRecords(): pair = [DummyHarmonizedRecord("chr1", 20), DummyHarmonizedRecord("chr1", 21)] assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, False] + + pair = [None, None] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, False] + + pair = [DummyHarmonizedRecord("chr1", 20), None] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, False] + + pair = [None, DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] From 834e6893d4da99b15e4587b7af3c03f801c8e7b7 Mon Sep 17 00:00:00 2001 From: xklinka Date: Wed, 9 Feb 2022 11:56:42 +0100 Subject: [PATCH 23/24] Added new test cases to test_compareSTR.py --- trtools/compareSTR/tests/test_compareSTR.py | 15 +++++++++++---- .../test_hipstr_flanking_bp_flanking.vcf.gz | Bin 1539 -> 1646 bytes ...est_hipstr_flanking_bp_flanking.vcf.gz.tbi | Bin 104 -> 119 bytes ...est_hipstr_flanking_bp_non_flanking.vcf.gz | Bin 1535 -> 1631 bytes ...hipstr_flanking_bp_non_flanking.vcf.gz.tbi | Bin 104 -> 119 bytes 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/trtools/compareSTR/tests/test_compareSTR.py b/trtools/compareSTR/tests/test_compareSTR.py index 4d4296a3..66a201d1 100644 --- a/trtools/compareSTR/tests/test_compareSTR.py +++ b/trtools/compareSTR/tests/test_compareSTR.py @@ -181,16 +181,23 @@ def test_hipstr_position_harmonisation(tmpdir, vcfdir): args = base_argparse(tmpdir) args.vcf1 = hipstr_vcf_1 - args.region= "" + args.region = "" args.vcftype1 = 'hipstr' args.vcf2 = hipstr_vcf_2 args.vcftype2 = 'hipstr' retcode = main(args) assert retcode == 0 - with open(tmpdir + "/test_compare-overall.tab", "r") as out_overall: - _, line = out_overall.readlines() - assert line == "ALL 1.0 1.0 nan 1\n" + with open(tmpdir + "/test_compare-locuscompare.tab", "r") as out_overall: + lines = out_overall.readlines() + ## vcf1 : flank bp at start of record, vcf2: no flanking bp + assert lines[1] == "1 101675 1.0 1.0 1\n" + ## vcf1 : no flanking bp, vcf2: no flanking bp + assert lines[2] == "1 111675 1.0 1.0 1\n" + ## vcf1 : flanking bp at the end, vcf2: no flanking bp + assert lines[3] == "1 112655 1.0 1.0 1\n" + ## vcf1 : flanking bp at both sides, vcf2: no flanking bp + assert lines[4] == "1 125557 1.0 1.0 1\n" def test_wrong_vcftype(tmpdir, vcfdir, capsys): args = base_argparse(tmpdir) diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz index 1392673943a3356b7ee9ae6e78fff1dc3212b780..2f1136c1eb9ea69b896fd0aadb0101db4954ef16 100644 GIT binary patch literal 1646 zcmV-!29fz6iwFb&00000{{{d;LjnL%2JKi|bDKC6ex|=d(?0A=aKH|3jF+7u#&$h* zu!G5V-!d}DSW|9;L@u5D_&oxA!ATl&nb~$`>kbhh>HBm(os0PNX+=Y_O0o!Z9(L7bY5Y`@ve79SdwQRVaH#RD7j)dr6guom@yJ9!#lOtDhJ4NMgp3|?6IL^XRz*k zZmns`cy^VM70F2Kk=ByRvvIz4snsP9V@o|nb91&oPKsQ8Vc_r>qKoWSGQ4cXrQpD&Ff_w^Qpvzng{P@>n zn3wYgfLXG{OBzyshxi(ER9OpoIA??*9Pe1r*aHoiXrN@tS_t+}!R2vyYxd24Ujja9 zL|}I5zQO28&gEqFD#LGq-^ubdD*M(KuJ8wYZu##fK#j+~2tVa2?y7P-l&lcJ-WuT$*u3CAIdi;iNlk`xnr#S?8GJSt8&jA!-fo#DtFG(m6< z77XUfV}8JO&}7q@hx@9S>42BL_&S+4;bm{v%y5*7HN!f$tM!qe6T}mAjWe3$U=YUg zLmSwm$<%!v*3CL$WM^A+12#gC{$Yj9u&w!Ph|<0`{`-JmDv0J#&CMnws|4 ziERz1Zv~F)h@0wZ72^22Sb0UGz#q+=bJHytj1M=_0`gTH)N#j^Ob-U1H5mL@yk>&^ zR{SDHZ-dhz$VG#+lt9LX{hFb4FF^rB9t+|3BFW+8^S8|Y%-q%sW5ELl5fQy~3doyJ zqBL|6`=ze2Z>5JAk172W1Vb!Oi(CXSMVemYG)(AEWU-pyjWXPpQhLSgG>tl46EZaE+WCLFN*vN`a&V3A219j2$KLG$sKqik_k`yvVj? zFwhF`itfa3!EoVMBH^szVD{rK5NYm7m3B30L*7vD3r>5y%lI~;4A_GslUvxeh{N9a zPZnf7P@<9eML?=FH_r-|qlKu`ygV#o#seU8I(3GowRmNU5QjFBU6-hZH&+J;n-D)F z@L59smdC89`DqYGq$=$87@WU=!a2UBQ6ANtJ1ywc@X@je`A}0Av8AeADtuN5QQE6K z9i@018wlK&&VrVA2yQ|UlEWv8&QIJ0wU zK3hl=OM*&RGN-O|acNE^Q4tOuNfZ}SX$5KQ4*!uIO&iFHqI5o}lB%fP&m9RI^r~|z zxdZdypX44q7?@HEezi{%hSKV3{oR)XxL0=f;v`uM&?1 z{o+_=6r{C29FU4;PwAIu6+`Wtw$bhv=K*meE=3UzLyH0#{#tWG>-0sI6gO4Og+Y74 z&``lp`uI%0(B~JrHP=ONebZOp>#0Lx(nU7Z%haKpXF52PK3eFTy{2z>ouu5Wx^l1R zYD>}K%bBiqbVYCLs(zw^DehHxt`!}$mU61gaP89k{B!d$ljCnz**%qTHV- zde!Rm`s%$Z^i+juNF9Cr(5OP|0}1qyIQH-i6V_n@pA`N1Dfuy zpq`*usn_my-(5ZB_f@a0X&d!a$#lo&p!^TInd9niJaEUqfQo|Z=Tb#zcD0R)op)04 sclE)40i6|z9rPFg03VA81ONa4009360763o02=@U00000000000Qk@xF8}}l literal 1539 zcmV+e2K@OSiwFb&00000{{{d;LjnNi1;tofZ=*OAe$M;~Yx=M+g+kMGres-(mZTF+ zN>W01_N{WjA=Us!*z_{$kKbdH8%aAQZKY`?g5lWT=W~2+930GOLT0Q;v9SNW=--{_ z=BI;$kfkZkBKsT7CxNfs8F~k4+7zc89dW@59FwD1kOwWo0w3MS{Al9)$L4AK>&dAO zVjw)jT;R2IK{zXmknp2#EM>PG=agjp7IRM0Y4T{E>D2^HD@aUP#-E#d>0iv-U-UW6 zxhQT6G9v}aLZVMe1QS!5!DJk9B*}t=b{Edp%fO&U6UKV{3nU{l1L;x-QfMiQNCM9i z7QZ|bUh3jOfSy(q?0q*1lLuaZWdA*IZMgBBtKRMSvNyUq^vZNf3fnvk9&=*%6Nbg1 zOL$n&T!3Qi&tqKRlmJ@AG@;@Vi8&T%p)C~Rk`s<_wq`-I5NyEZ4PsNSBRD^b%k%K==!f^Q1R~Oi!00A; zhtgRoG&Xya;#a^Qr1=_^P3=op_=~@`{SPxhjmM@De#_&0><;?<$&!InF&Jcm#H6Tm zIG5|?m=>Vi*~0A*n<0Y3Il>9B$*>@Bmym>@S;11EO3=zh7jNqLlfdzVt^GfXKFGqP z!XNki!Ke|1A<1Gf zm#a!+rcBUTeJRIlvP{4PPEe)<}>&yGSzH`w8!Qa|2sH@Dx z7T1d=htAnPS8Am#Uf$&UVBCb4x7stuX)gB+`&{idLQzSOVCW7Pl$D?m&dY5bc>TdJ zcpKLJoN=<&ExHFCAxQtQ!)Dm-_$}tW!wHR0m6#z|L=t65LTYu%ZwqVUy$P+txua8EQ^&zo}E(6Q4X$Al0(Q`GF2InlptYNucWe522V2*V^#DhS$I*b%3z?CJSuS# z=`daTl}tEuXv}^+#xl*lXr*;c+K@NY`;yZJ?<&6aDF^o8$m9V|EfR1x{>X;h9ZNJa zz6wYSn>*)$)d2^U~shug)96((=x3&cdqEv@X=I-e5k2aY+2MU3w*8&QR%Ddj&i(;4FtxO zv!LlCf$@o3T-xGkuJ zTh1_OUEesvmaGVSeoGb?tx5~6%b@q4=65<5nqe62PiD(B%+oLJ7C7i_`)ez>aJK$h zfwS{ShyU8A3ER-mI^Fe`0~j0YvFs$PC-vUCnzhmPN+Gjb>p-hVi)PR^x~eU+73oeF z8YFdlW^}7w#WuT+XCHS}KOhJ4qKa^sstRQIb;ow6-IZBV4i+&NCT#>m+XO{fm*>{C zHNLjou_dLgrL46fX+dJLWHz*_)L}X27C4mEpIFPYX01+L%Xl&^4@;cA{enO)87$m=m&jHeUBTHLkIy5^Jl p>3>~X=b5$>001A02m}BC000301^_}s0stET0{{R300000000^*@zDSP diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi index bd849c237d67972f4060a788a825b2b461238436..c4da03cc1d1090c7f156a1777ffdf6e1756210cf 100644 GIT binary patch delta 78 zcmc~O7nJYjU||4(|7;9S&P)tZ69q+#Qxe!FF**r|Ry_*PY*R23cxlSH=+#VxRTkXL fS|8e9wXbZRdAL!o44$Wz;M6FubtMza|5 delta 63 zcmXTV5R~ucU||4(|7;9S&P)v669q+7SSH6!R@jr?6rkDmd<{=P?1=;M97=8L`eZ$O RLZsLj_O|<#uA3Mk4*(%>6XpN_ diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz index 23703aadb8852718f865dbda2e2db6c03a8c2dd0..2fb440cb33983bec1b7874df8637d60398c17041 100644 GIT binary patch delta 1622 zcmV-c2C4b~3*QWXABzYC000000RIL6LPG)oLI&+vTW_N{6n1@cKm2K79J*B~gT9-~2+;`NEa2RYga#yu=Yh$BZX`84k#Cz{#!bV~&q+0(Lxi z-ImsApPqCS=mwovm~p)8oe`GgnMc_1mn2HA7)~jP*%fAtM9c6_>nh~{SoPmhF$3cuxGr!58Zr3;0@V`n%u{~X&UG+gU^a5WnZrsqda9xX{` zYDbH^l$gUX!TiW3%*$xX!7%2>8P0G-fGq;u5RK)PxJCg<;BH1e#5hS2qdy4pDVTvS zb20GaUyos4&Kdw_$r3MVNckP&Ys^t)E#%>x5r%MoykkLQ4>Vw+fs!Rt5bU3V%j59& z^qcd(1botn!1QwQ2BRlAmy^}248H|_C(GBU>|0;B!XNCp<-eN%H6HsS{FKMp%pMPi z^9=*5WH6372}oAwa3)hw-dF zy*n6M{U!+R!GghDdCU*E_M2=v>u_HcGac}Ma^_#hvnITp?V1^mQn6-O=XSL|@^gZC zg068!lN=1fSbk^&XE>fLUWawFP8iwQ7Tti25Tt)tVKZ!d_8NJwaY%hsCT0j0zChWK z5LVsrJHVPduR<$vZYb@rxL}4Niw37Y))<0vQ+fYlhOj1O*UzEQH^SB!`pF-ZJ|$bK5VB z1rHoVMD)@rAa6d2($GPim%7Hjl^$Y$Jf`$h5Dc+AEpidS6lr>m(=eewk;QI;H_Wgn zTHq5U;7egf6H#?MzXR-Dp#km{((fdsYe@WPN%##R8;=v%AR8Q*P)u$)JZ1bKht-5@ z9&w!4$C`b_mqi=rJ>!gZ_Pm+lJf*3dVx`KnONtpv!8LMn1er^uDix9vB+T-EkuY|Y zz}=VxxF~vxzVIO1mcc+PyeqmBUxDGmuSCLG!@=ywT_DojlPc|M(uTaD-WQzqc$e{Q zNExsPM<%zhYY~UN@t-WnexO7n@r!^|X>OerEQfPZr+Il;#Eb_()@0%iEPMXS6d?|6 zB)cwA3vaFt5Dp=JNZ_@E{4I}vSyA)TAdW~?*zGYme*uMad`qJ|syTOB(5d00WfAhB zrY>SjRl8L9tPrBKS9v;0@isOPxGtRqE${3dS6ym zwf#YpG*#<-)+KPztM;j69sC{K=v$HkpW15)Q&qZpZ}-*!uGQVOI7OCs>gVcm)ZUR- zbkcgY(`R|BItZF-uQ-yKCE*?%j*4bi?Um;gQ|nod+3FQ%0dXO|iUJ&l7R53A+A~vc z_e6#i7gfB4L3@GF)WAr8#^}trFlHBqJu^gaW7F5z>uEqzGDIdc%CuovX9hTvF`OHl zeP(QToTT1shI+3WnxY!;*38h`hHA77%{bA(6!#k3*NYCiqMjNue7B90t|6;R8@{#f zz1EX8U3|CTOT9l)jjGk|_OyFVXsPYA>`Hx0e5#*cLOE@x95hc&x5~BtS59kn^isJK zRT^31Q~iv73FQR6O1W02^X|&2zprvFUDrRC%EA7#`sM$oHR}HY>J=1A^`P0&H|n+D UNxk3I@BRfLzw_MW7?a@z9`|J#iU0rr delta 1525 zcmVIR^$QUC*N4Y?l{gUN%PO*HpNpKQ}Kwz91FD677B2|2}d~Hu%Kyw05;(A2C*sE5u6{z<#~8__`~~H z0wJkIV0b%uhtgRgG&Xya;@7|*r1=V!ZS6}}_=~@`{SPxhg~zrLe#_%>{ zGU%s*L?o|rxRmQOxmv5^0 zcJ}{&EP6i;;)2r!fvw|Qpn$QEDnNN2N}!AgDx^rpayd(~d!0hZa}Ogx(on+ZAA~>Z z`2Arm3ImcxVlG#e#!Q%?v+7cgS7(_C6F5ex5|oRTq_{jQfxHjFqf#bdKCdqCce>71 z9Rz=8!=SDr6+2w7>Kr;}_gtx!c6fQ?@BLAK9bVpg&m1S2+%xQRz1t9l1wn$L14>WbeE z*4TR!T8VQ{;he$>D4SzW(+JL1S>x=5V|P5colH9ZJF)OOBGOb=5O9{I=Se>)>zN&Y z)b7CXzV|)1Gk7m>T18w{PqP@OKjqGsG!%Y!=wCY%#h|x)h{lkw(x{5tTbgNa@@0+5 z&&6vl*{_YSV)QyV?SkCYNJ|VdE}YjKWm^dfAc|BPek+pfEaVQr9vBPu97IO+ z(kUQszKBxOLA*mgcA5Dq4CuHSu5*y404HHVqgMhbO?BuYD;p#^m@9nW> z8}YDc<9%eDaV}q1GxSrYu_{)Ud3H)MM;W+AL5?AF$y8-PQi6n8zLLsL7(7jXNrY9= zqh#SlzAl4-R{W^MNvOkg=~pu0%%L&+^%%)C_o9_HHEB)WQ144lTfEEo)}o_<<%xQgQA~(W&60sS5c}Q>)mrtX-D) zTp6O&SJ@q9cpV!Ej7w)h(?>kJyQBJB7LSzI(qEi`(Q&VQQUYF%l6hG8^6nGMr0 z&%d-9;GlQSuMKDCZ~uV~|CP@YwxM6N+MBNiFg7-0*+rI5s(p1iYpdmzLT0*hpw5@spIgl4sfWuT(9K)|WvRloz%#d=hjJGgp zD-hZyD9P$wS~u3{#&Sn&mXx+uveuTQ1xd-0nb0cJhUHvZ;7nF`Y^~0jwLWnTo5FzTN%jCVI*P06nf5IsgCw delta 63 zcmXTV5R~ucU||4(|7;9S&P)v669q+7SSH6!R@jr?6rkC5d<{=P?1=;M97=8L`eZ$O RLZsLjHZA>EyKZ8HJOCoH6p;V` From 6730e6a7d9b7bd375509bee00fd594b1a1560b73 Mon Sep 17 00:00:00 2001 From: xklinka Date: Fri, 11 Feb 2022 15:39:21 +0100 Subject: [PATCH 24/24] removed HarmonizeRecords function, reordered conditions in main loop of compareSTR.py to correctly handle None Records from getNextRecords --- trtools/compareSTR/compareSTR.py | 13 ++++++++----- trtools/utils/tr_harmonizer.py | 28 ---------------------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index 2ca8b990..8a908fed 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -810,15 +810,18 @@ def main(args): ### Walk through sorted readers, merging records as we go ### current_records = mergeutils.InitReaders(vcfreaders) done = mergeutils.DoneReading(current_records) - + vcf_types = [vcftype1, vcftype2] num_records = 0 while not done: - harmonized_records = trh.HarmonizeRecords(current_records, [vcftype1, vcftype2]) - # contains information about which record should be skipped in next iteration and whether it is currently comparable - is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) - if any([item is None for item in current_records]): break if args.numrecords is not None and num_records >= args.numrecords: break + + harmonized_records = [trh.HarmonizeRecord(vcf_types[i], current_records[i]) for i in range(len(current_records))] + # contains information about which record should be + # skipped in next iteration and whether it is currently comparable + is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) + + if args.verbose: mergeutils.DebugPrintRecordLocations(current_records, is_min) if mergeutils.CheckMin(is_min): return 1 if all(is_min): diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 0eb770e1..f24ce3bd 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -225,34 +225,6 @@ def InferVCFType(vcffile: cyvcf2.VCF, vcftype: Union[str, VcfTypes] = "auto") -> .format(possible_vcf_types, vcftype))) -def HarmonizeRecords(vcf_records: List[cyvcf2.Variant], vcf_types: List[Union[str, VcfTypes]]): - """ - Create a list of standardized TRRecord objects out of a cyvcf2.Variant - objects of possibly unknown type. - - Parameters - ---------- - vcf_records : - A list of cyvcf2.Variant Object - - vcf_types : - A list of strings or VCFTypes objects - - Returns - ------- - List of TRRecord objects - - Raises - ------- - ValueError when lengths of input lists are not equal - """ - - - if len(vcf_types) != len(vcf_records): - raise ValueError("Length of type and record lists is not equal") - - return [HarmonizeRecord(vcf_types[i], vcf_records[i]) for i in range(len(vcf_records))] - def HarmonizeRecord(vcftype: Union[str, VcfTypes], vcfrecord: cyvcf2.Variant): """ Create a standardized TRRecord object out of a cyvcf2.Variant