diff --git a/README.rst b/README.rst index b7f85252..3fd57306 100644 --- a/README.rst +++ b/README.rst @@ -116,7 +116,6 @@ Development Notes ----------------- * TRTools only currently supports diploid genotypes. Haploid calls, such as those on male chrX or chrY, are not yet supported but should be coming soon. -* CompareSTR currently only compares STR loci between two callsets if they have the same start and end coordinates. In the future we will add the capacity to take in a user specificied mapping of loci between the two callsets and use that to compare loci even if they don't completely overlap one another. Contact Us ---------- diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 9e5a660e..eded94aa 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,3 +1,13 @@ +Unreleased changes +----- + +Bug fixes: + +* https://github.com/gymreklab/TRTools/issues/146 fixed record positions being compared twice +* CompareSTR: Decision on which records are comparable is now based on data from harmonized TRRecords, + and not from the records directly from VCF readers. Thanks to this, HipSTR records which have different starting positions, + but position of their repeat is at the same position are compared correctly (harmonization step removes this difference). + 4.0.1 ----- diff --git a/requirements.txt b/requirements.txt index 3c78cebb..6371f059 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ scikit-learn==0.23.1 matplotlib==3.2.2 pysam==0.16.0.1 cython==0.29.20 -cyvcf2==0.30.1 +cyvcf2==0.30.1 \ No newline at end of file diff --git a/trtools/compareSTR/README.rst b/trtools/compareSTR/README.rst index 57b75c52..b37c0d01 100644 --- a/trtools/compareSTR/README.rst +++ b/trtools/compareSTR/README.rst @@ -11,11 +11,11 @@ Example use cases include: * Comparing calls to a "ground truth" set, e.g. from capillary electrophoresis data, to find call errors * Comparing calls for the same tool using different parameter settings to identify differences due to bioinformatic processing -* Comparing calls for different tools. This only works if they used the same set of reference TRs. Please note that compareSTR uses matching chromosomes and positions to compare TRs. Therefore, our method is not able to compare TRs if the starting coordinates is changed by TR calling method (e.g., due to phasing with a nearby variant) +* Comparing calls for different tools. This only works if they used the same set of reference TRs. Please note that compareSTR only compares TRs with matching chromosomes and allele-positions (ignoring flanking base pairs) CompareSTR optionally will stratify results based on a user-specified FORMAT field (e.g. depth, or quality score) and/or by repeat motif length. -Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), most such applications would be better performed by specialized tools. +Note: CompareSTR is designed to be used as a QC tool. While it may be able to pick up certain biological differences in some applications (e.g. identifying de novo mutations by comparing parent and child callsets or somatic mutations by comparing callsets from different tissues), use-case specific analyses may be better performed by more specialized tools. Note: CompareSTR has the ability to stratify comparisons based on quality scores. However, beware that quality scores output by different genotypers may not be directly comparable. You can use `qcSTR `_ to visualize the distribution of quality scores in each VCF file seprately. diff --git a/trtools/compareSTR/compareSTR.py b/trtools/compareSTR/compareSTR.py index 408a7265..8a908fed 100644 --- a/trtools/compareSTR/compareSTR.py +++ b/trtools/compareSTR/compareSTR.py @@ -521,8 +521,8 @@ def UpdateComparisonResults(record1, record2, sample_idxs, dictionary of counts to update """ # Extract shared info - chrom = record1.vcfrecord.CHROM - pos = record1.vcfrecord.POS + chrom = record1.chrom + pos = record1.pos period = len(record1.motif) reflen = len(record1.ref_allele)/period @@ -808,29 +808,32 @@ def main(args): vcfregions = [vcfreaders[0](args.region), vcfreaders[1](args.region)] ### Walk through sorted readers, merging records as we go ### - current_records = [next(reader) for reader in vcfreaders] - is_min = mergeutils.GetMinRecords(current_records, chroms) - + current_records = mergeutils.InitReaders(vcfreaders) done = mergeutils.DoneReading(current_records) + vcf_types = [vcftype1, vcftype2] num_records = 0 while not done: if any([item is None for item in current_records]): break if args.numrecords is not None and num_records >= args.numrecords: break + + harmonized_records = [trh.HarmonizeRecord(vcf_types[i], current_records[i]) for i in range(len(current_records))] + # contains information about which record should be + # skipped in next iteration and whether it is currently comparable + is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) + + if args.verbose: mergeutils.DebugPrintRecordLocations(current_records, is_min) if mergeutils.CheckMin(is_min): return 1 - if all([is_min]): - if (current_records[0].CHROM == current_records[1].CHROM and \ - current_records[0].POS == current_records[1].POS): - UpdateComparisonResults(trh.HarmonizeRecord(vcftype1, current_records[0]), \ - trh.HarmonizeRecord(vcftype2, current_records[1]), \ - sample_idxs, - args.ignore_phasing, args.period, - format_fields, format_bins, - args.stratify_file, - overall_results, locus_results, - sample_results, bubble_results) + if all(is_min): + UpdateComparisonResults(*harmonized_records, + sample_idxs, + args.ignore_phasing, args.period, + format_fields, format_bins, + args.stratify_file, + overall_results, locus_results, + sample_results, bubble_results) + current_records = mergeutils.GetNextRecords(vcfregions, current_records, is_min) - is_min = mergeutils.GetMinRecords(current_records, chroms) done = mergeutils.DoneReading(current_records) num_records += 1 diff --git a/trtools/compareSTR/tests/test_compareSTR.py b/trtools/compareSTR/tests/test_compareSTR.py index fcaa4735..66a201d1 100644 --- a/trtools/compareSTR/tests/test_compareSTR.py +++ b/trtools/compareSTR/tests/test_compareSTR.py @@ -173,6 +173,32 @@ def test_main(tmpdir, vcfdir): retcode = main(args) assert retcode == 1 +def test_hipstr_position_harmonisation(tmpdir, vcfdir): + vcfcomp = os.path.join(vcfdir, "compareSTR_vcfs") + + hipstr_vcf_1 = os.path.join(vcfcomp, "test_hipstr_flanking_bp_flanking.vcf.gz") + hipstr_vcf_2 = os.path.join(vcfcomp, "test_hipstr_flanking_bp_non_flanking.vcf.gz") + + args = base_argparse(tmpdir) + args.vcf1 = hipstr_vcf_1 + args.region = "" + args.vcftype1 = 'hipstr' + args.vcf2 = hipstr_vcf_2 + args.vcftype2 = 'hipstr' + retcode = main(args) + assert retcode == 0 + + with open(tmpdir + "/test_compare-locuscompare.tab", "r") as out_overall: + lines = out_overall.readlines() + ## vcf1 : flank bp at start of record, vcf2: no flanking bp + assert lines[1] == "1 101675 1.0 1.0 1\n" + ## vcf1 : no flanking bp, vcf2: no flanking bp + assert lines[2] == "1 111675 1.0 1.0 1\n" + ## vcf1 : flanking bp at the end, vcf2: no flanking bp + assert lines[3] == "1 112655 1.0 1.0 1\n" + ## vcf1 : flanking bp at both sides, vcf2: no flanking bp + assert lines[4] == "1 125557 1.0 1.0 1\n" + def test_wrong_vcftype(tmpdir, vcfdir, capsys): args = base_argparse(tmpdir) vcfcomp = os.path.join(vcfdir, "compareSTR_vcfs") diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz new file mode 100644 index 00000000..2f1136c1 Binary files /dev/null and b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz differ diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi new file mode 100644 index 00000000..c4da03cc Binary files /dev/null and b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_flanking.vcf.gz.tbi differ diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz new file mode 100644 index 00000000..2fb440cb Binary files /dev/null and b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz differ diff --git a/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi new file mode 100644 index 00000000..79b89819 Binary files /dev/null and b/trtools/testsupport/sample_vcfs/compareSTR_vcfs/test_hipstr_flanking_bp_non_flanking.vcf.gz.tbi differ diff --git a/trtools/utils/mergeutils.py b/trtools/utils/mergeutils.py index 5b2eb6cb..9cd0b202 100644 --- a/trtools/utils/mergeutils.py +++ b/trtools/utils/mergeutils.py @@ -7,11 +7,18 @@ import os import sys +from cyvcf2 import cyvcf2 + import trtools.utils.common as common import trtools.utils.tr_harmonizer as trh +from typing import List, Union, Any, Optional, Callable + +CYVCF_RECORD = cyvcf2.Variant +CYVCF_READER = cyvcf2.VCF -def LoadReaders(vcffiles, region=None): + +def LoadReaders(vcffiles: List[str], region: Optional[str] = None) -> List[CYVCF_READER]: r"""Return list of VCF readers Parameters @@ -28,17 +35,19 @@ def LoadReaders(vcffiles, region=None): """ for f in vcffiles: if not f.endswith(".vcf.gz") and not f.endswith(".vcf.bgz"): - raise ValueError("Make sure %s is bgzipped and indexed"%f) + raise ValueError("Make sure %s is bgzipped and indexed" % f) if not os.path.isfile(f): - raise ValueError("Could not find VCF file %s"%f) - if not os.path.isfile(f+".tbi"): - raise ValueError("Could not find VCF index %s.tbi"%f) + raise ValueError("Could not find VCF file %s" % f) + if not os.path.isfile(f + ".tbi"): + raise ValueError("Could not find VCF index %s.tbi" % f) readers = [vcf.Reader(open(f, "rb")) for f in vcffiles] if region is None: return readers - else: return [r.fetch(region) for r in readers] + else: + return [r.fetch(region) for r in readers] -def GetSharedSamples(readers): + +def GetSharedSamples(readers: List[CYVCF_READER]) -> List[str]: r"""Get list of samples used in all files being merged Parameters @@ -50,14 +59,15 @@ def GetSharedSamples(readers): samples : list of str Samples present in all readers """ - if len(readers) == 0: return set() + if len(readers) == 0: return list() samples = set(readers[0].samples) - if len(readers) == 1: return samples + if len(readers) == 1: return list(samples) for r in readers[1:]: samples = samples.intersection(set(r.samples)) return list(samples) -def GetSamples(readers, filenames=None): + +def GetSamples(readers: List[CYVCF_READER], filenames: Optional[str] = None) -> List[str]: r"""Get list of samples used in all files being merged Parameters @@ -87,7 +97,8 @@ def GetSamples(readers, filenames=None): samples += r.samples return samples -def GetAndCheckVCFType(vcfs, vcftype): + +def GetAndCheckVCFType(vcfs: List[CYVCF_READER], vcftype: str) -> str: """Infer type of multiple VCFs If they are all the same, return that type @@ -123,9 +134,11 @@ def GetAndCheckVCFType(vcfs, vcftype): types.append(vcf_type) if len(set(types)) == 1: return types[0] - else: raise ValueError("VCF files are of mixed types.") + else: + raise ValueError("VCF files are of mixed types.") + -def GetChromOrder(r, chroms): +def GetChromOrder(r: CYVCF_RECORD, chroms: List[str]) -> Union[int, float]: r"""Get the chromosome order of a record Parameters @@ -136,14 +149,17 @@ def GetChromOrder(r, chroms): Returns ------- - order : int - Index of r.CHROM in chroms - Return np.inf if can't find r.CHROM + order : int or float + Index of r.CHROM in chroms, int + Return np.inf if can't find r.CHROM, float """ - if r is None: return np.inf - else: return chroms.index(r.CHROM) + if r is None: + return np.inf + else: + return chroms.index(r.CHROM) -def GetChromOrderEqual(chrom_order, min_chrom): + +def GetChromOrderEqual(chrom_order: Union[int, float], min_chrom: int) -> bool: r"""Check chrom order Parameters @@ -161,7 +177,8 @@ def GetChromOrderEqual(chrom_order, min_chrom): if chrom_order == np.inf: return False return chrom_order == min_chrom -def GetPos(r): + +def GetPos(r: CYVCF_RECORD) -> Union[int, float]: r"""Get the position of a record Parameters @@ -171,12 +188,15 @@ def GetPos(r): Returns ------- pos : int - If r is None, returns np.inf + If r is None, returns np.inf, which is a float """ - if r is None: return np.inf - else: return r.POS + if r is None: + return np.inf + else: + return r.POS -def CheckPos(record, chrom, pos): + +def CheckPos(record: CYVCF_RECORD, chrom: str, pos: int) -> bool: r"""Check a record is at the specified position Parameters @@ -194,9 +214,10 @@ def CheckPos(record, chrom, pos): Return True if the current record is at this position """ if record is None: return False - return record.CHROM==chrom and record.POS==pos + return record.CHROM == chrom and record.POS == pos + -def GetMinRecords(record_list, chroms): +def GetMinRecords(record_list: List[Optional[trh.TRRecord]], chroms: List[str]) -> List[bool]: r"""Check if each record is next up in sort order Return a vector of boolean set to true if @@ -205,8 +226,9 @@ def GetMinRecords(record_list, chroms): Parameters ---------- - record_list : list of vcf.Record + record_list : list of CYVCF_RECORD list of current records from each file being merged + chroms : list of str Ordered list of all chromosomes @@ -222,10 +244,44 @@ def GetMinRecords(record_list, chroms): if len(allpos) > 0: min_pos = min(allpos) else: - return [False]*len(record_list) + return [False] * len(record_list) return [CheckPos(r, chroms[min_chrom], min_pos) for r in record_list] -def DoneReading(records): + + +def GetMinHarmonizedRecords(record_list: List[Optional[trh.TRRecord]], chroms: List[str]) -> List[bool]: + r"""Check if each record is next up in sort order + + Return a vector of boolean set to true if + the record is in lowest sort order of all the records + Use order in chroms to determine sort order of chromosomes + + Parameters + ---------- + record_list : trh.TRRecord + list of current records from each file being merged + + chroms : list of str + Ordered list of all chromosomes + + Returns + ------- + checks : list of bool + Set to True for records that are first in sort order + """ + chrom_order = [np.inf if r is None else chroms.index(r.chrom) for r in record_list] + pos = [np.inf if r is None else r.pos for r in record_list] + + min_chrom = min(chrom_order) + allpos = [pos[i] for i in range(len(pos)) if GetChromOrderEqual(chrom_order[i], min_chrom)] + if len(allpos) > 0: + min_pos = min(allpos) + else: + return [False] * len(record_list) + return [False if r is None else r.chrom == chroms[min_chrom] and r.pos == min_pos for r in record_list] + + +def DoneReading(records: List[Union[CYVCF_RECORD, trh.TRRecord]]) -> bool: r"""Check if all records are at the end of the file Parameters @@ -241,7 +297,8 @@ def DoneReading(records): """ return all([item is None for item in records]) -def DebugPrintRecordLocations(current_records, is_min): + +def DebugPrintRecordLocations(current_records: List[CYVCF_RECORD], is_min: List[bool]) -> None: r"""Debug function to print current records for each file Parameters @@ -255,10 +312,11 @@ def DebugPrintRecordLocations(current_records, is_min): for i in range(len(is_min)): chrom = current_records[i].CHROM pos = current_records[i].POS - info.append("%s:%s:%s"%(chrom, pos, is_min[i])) - common.MSG("\t".join(info)+"\n", debug=True) + info.append("%s:%s:%s" % (chrom, pos, is_min[i])) + common.MSG("\t".join(info) + "\n", debug=True) + -def CheckMin(is_min): +def CheckMin(is_min: List[bool]) -> bool: r"""Check if we're progressing through VCFs Parameters @@ -271,11 +329,13 @@ def CheckMin(is_min): check : bool Set to True if something went wrong """ - if sum(is_min)==0: + if sum(is_min) == 0: raise ValueError("Unexpected error. Stuck in infinite loop and exiting.") return False -def GetNextRecords(readers, current_records, increment): + +def GetNextRecords(readers: List[CYVCF_READER], current_records: List[CYVCF_RECORD], increment: List[bool]) \ + -> List[CYVCF_RECORD]: r"""Increment readers of each file Increment readers[i] if increment[i] set to true @@ -302,6 +362,24 @@ def GetNextRecords(readers, current_records, increment): new_records.append(next(readers[i])) except StopIteration: new_records.append(None) - else: new_records.append(current_records[i]) + else: + new_records.append(current_records[i]) return new_records + +def InitReaders(readers: List[CYVCF_READER]) -> List[CYVCF_RECORD]: + r"""Increment readers of each file + + Returns list of first records from list of readers. + + Parameters + ---------- + readers : list of cyvcf2.VCF + List of readers for all files being merged + + Returns + ------- + list of vcf.Record + List of next records for each file + """ + return [next(reader) for reader in readers] diff --git a/trtools/utils/tests/test_mergeutils.py b/trtools/utils/tests/test_mergeutils.py index b4fafbbd..0a82e140 100644 --- a/trtools/utils/tests/test_mergeutils.py +++ b/trtools/utils/tests/test_mergeutils.py @@ -9,33 +9,43 @@ @pytest.fixture def mrgvcfdir(vcfdir): - return os.path.join(vcfdir, "mergeSTR_vcfs") + return os.path.join(vcfdir, "mergeSTR_vcfs") + # Set up dummy class class DummyRecord: - def __init__(self, chrom, pos, ref, alts=[], info = {}): + def __init__(self, chrom, pos, ref, alts=[], info={}): self.CHROM = chrom self.POS = pos self.REF = ref self.ALTS = alts self.INFO = info + +class DummyHarmonizedRecord: + def __init__(self, chrom, pos): + self.chrom = chrom + self.pos = pos + + def test_DebugPrintRecordLocations(capsys): - dummy_records = [] + dummy_records = [] dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={'END': 120})) dummy_records.append(DummyRecord('chr1', 150, 'CTTCTT', info={'END': 170})) - + mergeutils.DebugPrintRecordLocations(dummy_records, [True, False]) captured = capsys.readouterr() assert "chr1:100:True" in captured.err assert "chr1:150:False" in captured.err + def test_CheckMin(): assert mergeutils.CheckMin([True, False]) == False with pytest.raises(ValueError) as info: mergeutils.CheckMin([False, False]) assert "Unexpected error. Stuck in infinite loop and exiting." in str(info.value) + def test_CheckVCFType(vcfdir): snps_path = os.path.join(vcfdir, "snps.vcf") gangstr_path = os.path.join(vcfdir, "test_gangstr.vcf") @@ -49,11 +59,12 @@ def test_CheckVCFType(vcfdir): with pytest.raises(ValueError) as info: print(mergeutils.GetAndCheckVCFType([gangstr_vcf, hipstr_vcf], "auto")) assert "VCF files are of mixed types." in str(info.value) - + with pytest.raises(TypeError) as info: print(mergeutils.GetAndCheckVCFType([gangstr_vcf, snps_vcf], "auto")) assert "Could not identify the type of this vcf" in str(info.value) + # Test no such file or directory def test_WrongFile(mrgvcfdir): # Try a dummy file name. Make sure it doesn't exist before we try @@ -63,11 +74,12 @@ def test_WrongFile(mrgvcfdir): os.remove(fname1) if os.path.exists(fname2): os.remove(fname2) - print (os.path.isfile(fname1)) + print(os.path.isfile(fname1)) with pytest.raises(ValueError) as info: mergeutils.LoadReaders([fname1, fname2]) assert "Could not find VCF file" in str(info.value) + # test unzipped, unindexed VCFs return 1 def test_UnzippedUnindexedFile(mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr_unzipped1.vcf") @@ -81,3 +93,28 @@ def test_UnzippedUnindexedFile(mrgvcfdir): with pytest.raises(ValueError) as info: mergeutils.LoadReaders([fname1, fname2]) assert "Could not find VCF index" in str(info.value) + + +def test_GetMinHarmonizedRecords(): + chromosomes = ["chr1", "chr2", "chr3"] + + pair = [DummyHarmonizedRecord("chr1", 20), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, True] + + pair = [DummyHarmonizedRecord("chr1", 21), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] + + pair = [DummyHarmonizedRecord("chr2", 20), DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] + + pair = [DummyHarmonizedRecord("chr1", 20), DummyHarmonizedRecord("chr1", 21)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, False] + + pair = [None, None] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, False] + + pair = [DummyHarmonizedRecord("chr1", 20), None] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [True, False] + + pair = [None, DummyHarmonizedRecord("chr1", 20)] + assert mergeutils.GetMinHarmonizedRecords(pair, chromosomes) == [False, True] diff --git a/trtools/utils/tests/test_trharmonizer.py b/trtools/utils/tests/test_trharmonizer.py index 7bd6213a..6fc64171 100644 --- a/trtools/utils/tests/test_trharmonizer.py +++ b/trtools/utils/tests/test_trharmonizer.py @@ -797,6 +797,7 @@ def test_HarmonizeRecord(vcfdir): assert not tr_rec1.HasFullStringGenotypes() assert not tr_rec1.HasFabricatedRefAllele() assert not tr_rec1.HasFabricatedAltAlleles() + assert tr_rec1.pos == tr_rec1.full_alleles_pos tr_rec2 = next(str_iter) tr_rec3 = next(str_iter) assert tr_rec3.ref_allele == 'TTTTTTTTTTTTTTT' @@ -807,6 +808,8 @@ def test_HarmonizeRecord(vcfdir): while record.record_id != "STR_125": record = next(str_iter) assert record.HasFullStringGenotypes() + # record in the test file has flanking bp at the start of the ref. allele + assert record.pos > record.full_alleles_pos # TODO this isn't really the correct behavior - # we're trimming off an extra repeat from the alt allele assert record.full_alleles == ( @@ -931,4 +934,3 @@ def test_TRRecord_Quality(vcfdir): assert not var.HasQualityScores() with pytest.raises(TypeError): var.GetQualityScores() - diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index c996802a..f24ce3bd 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -13,6 +13,7 @@ import trtools.utils.utils as utils + # List of supported VCF types # TODO: add Beagle # TODO: add support for tool version numbers @@ -45,13 +46,13 @@ def _ToVCFType(vcftype: Union[str, VcfTypes]): if vcftype not in VcfTypes.__members__: raise ValueError(("{} is not an excepted TR vcf type. " "Expected one of {}").format( - vcftype, list(VcfTypes.__members__))) + vcftype, list(VcfTypes.__members__))) return VcfTypes[vcftype] elif isinstance(vcftype, VcfTypes): return vcftype else: raise TypeError(("{} (of type {}) is not a vcftype" - .format(vcftype, type(vcftype)))) + .format(vcftype, type(vcftype)))) def MayHaveImpureRepeats(vcftype: Union[str, VcfTypes]): @@ -273,7 +274,8 @@ def _HarmonizeGangSTRRecord(vcfrecord: cyvcf2.Variant): if vcfrecord.INFO.get('RU') is None: raise TypeError("This is not a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) if vcfrecord.INFO.get('VID') is not None: - raise TypeError("Trying to read an AdVNTR record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) + raise TypeError( + "Trying to read an AdVNTR record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) if vcfrecord.INFO.get('VARID') is not None: raise TypeError("Trying to read an EH record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS)) ref_allele = vcfrecord.REF.upper() @@ -360,6 +362,7 @@ def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant): motif, record_id, 'Q', + harmonized_pos=int(vcfrecord.INFO['START']), full_alleles=full_alleles) @@ -389,7 +392,7 @@ def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant): return TRRecord(vcfrecord, ref_allele, alt_alleles, motif, record_id, 'ML') -#def _PHREDtoProb(phred: int) -> float: +# def _PHREDtoProb(phred: int) -> float: # """Convert PHRED score to probability # # Notes @@ -399,7 +402,7 @@ def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant): # return 10**(-phred/10) -#def _ConvertPLtoQualityProb(PL: List[int]) -> float: +# def _ConvertPLtoQualityProb(PL: List[int]) -> float: # """ # Convert a list of PHRED-scaled genotype probabilities to the # unscaled probability of the single most likely genotype.# @@ -498,8 +501,8 @@ def _HarmonizeEHRecord(vcfrecord: cyvcf2.Variant): alt_allele_lengths=alt_allele_lengths) -def _UpperCaseAlleles(alleles : List[str]): - #Convert the list of allele strings to upper case +def _UpperCaseAlleles(alleles: List[str]): + # Convert the list of allele strings to upper case upper_alleles = [] for allele in alleles: upper_alleles.append(allele.upper()) @@ -575,7 +578,7 @@ class TRRecord: chrom : str The chromosome this locus is in pos : int - The bp along the chromosome that this locus is at + The bp along the chromosome that this locus is at (ignoring flanking base pairs/full alleles) info : Dict[str, Any] The dictionary of INFO fields at this locus format : Dict[str, np.ndarray] @@ -588,6 +591,9 @@ class TRRecord: Other Parameters ---------------- + harmonized_pos : + If this record has flanking base pairs before the repeat, set this + to note at which bp the repeat begins full_alleles : A tuple of string genotypes (ref_allele, [alt_alleles]) where each allele may contain any number of flanking @@ -633,6 +639,7 @@ def __init__(self, record_id: str, quality_field: str, *, + harmonized_pos: Optional[int] = None, full_alleles: Optional[Tuple[str, List[str]]] = None, ref_allele_length: Optional[float] = None, alt_allele_lengths: Optional[List[float]] = None, @@ -643,10 +650,11 @@ def __init__(self, self.motif = motif self.record_id = record_id self.chrom = vcfrecord.CHROM - self.pos = vcfrecord.POS + self.pos = harmonized_pos if harmonized_pos is not None else vcfrecord.POS self.info = dict(vcfrecord.INFO) self.format = _Cyvcf2FormatDict(vcfrecord) self.full_alleles = full_alleles + self.full_alleles_pos = self.vcfrecord.POS self.ref_allele_length = ref_allele_length self.alt_allele_lengths = alt_allele_lengths self.quality_field = quality_field @@ -670,7 +678,7 @@ def __init__(self, self.ref_allele = utils.FabricateAllele(motif, ref_allele_length) else: self.has_fabricated_ref_allele = False - self.ref_allele_length = len(ref_allele)/len(motif) + self.ref_allele_length = len(ref_allele) / len(motif) if alt_allele_lengths is not None: self.has_fabricated_alt_alleles = True @@ -681,7 +689,7 @@ def __init__(self, else: self.has_fabricated_alt_alleles = False self.alt_allele_lengths = [ - len(allele)/len(motif) for allele in self.alt_alleles + len(allele) / len(motif) for allele in self.alt_alleles ] try: @@ -705,7 +713,7 @@ def _CheckRecord(self): "number of alt alleles as given to the TRRecord " "constructor. Underlying alt alleles: {}, " " constructor alt alleles: {}".format( - self.vcfrecord.ALT, self.alt_alleles)) + self.vcfrecord.ALT, self.alt_alleles)) if self.full_alleles: if len(self.full_alleles) != 2: @@ -833,7 +841,7 @@ def GetSamplePloidies(self) -> Optional[np.ndarray]: return None return ( - gt_idxs.shape[1] - 1 - np.sum(gt_idxs[:, :-1] == -2, axis = 1) + gt_idxs.shape[1] - 1 - np.sum(gt_idxs[:, :-1] == -2, axis=1) ) def GetCallRate(self, strict: bool = True) -> float: @@ -861,7 +869,7 @@ def GetCallRate(self, strict: bool = True) -> float: if called_samples is None: return None else: - return np.sum(called_samples)/called_samples.shape[0] + return np.sum(called_samples) / called_samples.shape[0] def _GetStringGenotypeArray( self, @@ -878,7 +886,6 @@ def _GetStringGenotypeArray( seq_array[:, :-1][idx_gts[:, :-1] == -2] = ',' return seq_array - def GetStringGenotypes(self) -> Optional[np.ndarray]: """ Get an array of string genotypes for each sample. @@ -1206,7 +1213,7 @@ def GetGenotypeCounts( if gts is None: return {} - gts = gts[:, :-1] #remove phasing + gts = gts[:, :-1] # remove phasing gts = np.sort(gts, axis=1) if sample_index is not None: @@ -1295,11 +1302,11 @@ def GetAlleleCounts(self, if gts is None: return {} - gts = gts[:, :-1] #remove phasing + gts = gts[:, :-1] # remove phasing if sample_index is not None: gts = gts[sample_index, :] - + # remove no calls and missing haplotypes gts = gts[gts != nocall_entry] gts = gts[gts != lowploidy_entry] @@ -1349,7 +1356,7 @@ def GetAlleleFreqs(self, fullgenotypes=fullgenotypes, sample_index=sample_index) total = float(sum(allele_counts.values())) - return {key: value/total for key, value in allele_counts.items()} + return {key: value / total for key, value in allele_counts.items()} def GetMaxAllele(self, sample_index: Optional[Any] = None) -> float: @@ -1568,5 +1575,4 @@ def __next__(self) -> TRRecord: """Iterate over TRRecord produced from the underlying vcf.""" return HarmonizeRecord(self.vcftype, next(self.vcffile)) - -#TODO check all users of this class for new options +# TODO check all users of this class for new options