diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 96cc3438..8ef3b011 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,11 +1,18 @@ Unreleased changes ----- -CompareSTR and mergeutils changes: +Functionality Changes: + +* MergeSTR: Decision on which records are able to be merged is now based on data from harmonized TRRecords, + and not from the records directly from VCF readers. Thanks to this, HipSTR records which have different starting positions, + but position of their repeat is at the same position are merged correctly. This difference is caused by flanking BP that are + included in the record. These flanking BPs are also removed during the merge, and merged HipSTR records no longer contain them * CompareSTR: the tool now only compares records that start and end at the same position. If overlap in records is detected, the program will output a warning to the user. This warning contains IDs of the records and their positions. +Misc: + * mergeutils: function GetMinHarmonizedRecords was transformed into GetRecordComparabilityAndIncrement, which allows the caller to define custom predicate that decides whether records are comparable. diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index d5813797..7e7bf7d7 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -17,6 +17,7 @@ import trtools.utils.utils as utils from trtools import __version__ +from typing import Tuple, Union, Any, Optional, List, TextIO NOCALLSTRING = "." @@ -24,17 +25,17 @@ # fail when all records don't have identical values for that field INFOFIELDS = { trh.VcfTypes.gangstr: [("END", True), ("RU", True), ("PERIOD", True), ("REF", True), \ - ("EXPTHRESH", True), ("STUTTERUP", False), \ - ("STUTTERDOWN", False), ("STUTTERP", False)], + ("EXPTHRESH", True), ("STUTTERUP", False), \ + ("STUTTERDOWN", False), ("STUTTERP", False)], trh.VcfTypes.hipstr: [("INFRAME_PGEOM", False), ("INFRAME_UP", False), ("INFRAME_DOWN", False), \ - ("OUTFRAME_PGEOM", False), ("OUTFRAME_UP", False), ("OUTFRAME_DOWN", False), \ - ("BPDIFFS", False), ("START", True), ("END", True), ("PERIOD", True), \ - ("AN", False), ("REFAC", False), ("AC", False), ("NSKIP", False), \ - ("NFILT", False), ("DP", False), ("DSNP", False), ("DSTUTTER", False), \ - ("DFLANKINDEL", False)], + ("OUTFRAME_PGEOM", False), ("OUTFRAME_UP", False), ("OUTFRAME_DOWN", False), \ + ("BPDIFFS", False), ("START", True), ("END", True), ("PERIOD", True), \ + ("AN", False), ("REFAC", False), ("AC", False), ("NSKIP", False), \ + ("NFILT", False), ("DP", False), ("DSNP", False), ("DSTUTTER", False), \ + ("DFLANKINDEL", False)], trh.VcfTypes.eh: [("END", True), ("REF", True), ("REPID", True), ("RL", True), \ - ("RU", True), ("SVTYPE", False), ("VARID", True)], - trh.VcfTypes.popstr: [("Motif", True)], # TODO ("RefLen", True) omitted. since it is marked as "A" incorrectly + ("RU", True), ("SVTYPE", False), ("VARID", True)], + trh.VcfTypes.popstr: [("Motif", True)], # TODO ("RefLen", True) omitted. since it is marked as "A" incorrectly trh.VcfTypes.advntr: [("END", True), ("VID", True), ("RU", True), ("RC", True)] } @@ -42,14 +43,17 @@ # Not all fields currently handled # If not listed here, it is ignored FORMATFIELDS = { - trh.VcfTypes.gangstr: ["DP","Q","REPCN","REPCI","RC","ENCLREADS","FLNKREADS","ML","INS","STDERR","QEXP"], - trh.VcfTypes.hipstr: ["GB","Q","PQ","DP","DSNP","PSNP","PDP","GLDIFF","DSTUTTER","DFLANKINDEL","AB","FS","DAB","ALLREADS","MALLREADS"], - trh.VcfTypes.eh: ["ADFL","ADIR","ADSP","LC","REPCI","REPCN","SO"], - trh.VcfTypes.popstr: ["AD","DP","PL"], - trh.VcfTypes.advntr: ["DP","SR","FR","ML"] + trh.VcfTypes.gangstr: ["DP", "Q", "REPCN", "REPCI", "RC", "ENCLREADS", "FLNKREADS", "ML", "INS", "STDERR", "QEXP"], + trh.VcfTypes.hipstr: ["GB", "Q", "PQ", "DP", "DSNP", "PSNP", "PDP", "GLDIFF", "DSTUTTER", "DFLANKINDEL", "AB", "FS", + "DAB", "ALLREADS", "MALLREADS"], + trh.VcfTypes.eh: ["ADFL", "ADIR", "ADSP", "LC", "REPCI", "REPCN", "SO"], + trh.VcfTypes.popstr: ["AD", "DP", "PL"], + trh.VcfTypes.advntr: ["DP", "SR", "FR", "ML"] } -def WriteMergedHeader(vcfw, args, readers, cmd, vcftype): + +def WriteMergedHeader(vcfw: TextIO, args: Any, readers: List[cyvcf2.VCF], cmd: str, vcftype: Union[str, trh.VcfTypes]) \ + -> Union[Tuple[List[Tuple[str, bool]], List[str]], Tuple[None, None]]: r"""Write merged header for VCFs in args.vcfs Also do some checks on the VCFs to make sure merging @@ -76,7 +80,8 @@ def WriteMergedHeader(vcfw, args, readers, cmd, vcftype): useformat: list of str List of format field strings to use downstream """ - def get_header_lines(field, reader): + + def get_header_lines(field: str, reader: cyvcf2.VCF) -> List[str]: compare_len = 3 + len(field) compare_start = '##' + field.lower() + "=" return [line for line in reader.raw_header.split('\n') if \ @@ -98,12 +103,12 @@ def get_header_lines(field, reader): for r in readers: for line in get_header_lines('command', r): vcfw.write(line + '\n') - vcfw.write("##command="+cmd+"\n") + vcfw.write("##command=" + cmd + "\n") # Update sources sources = set.union(*[set(get_header_lines('source', reader)) for reader in readers]) for src in sources: - vcfw.write(src+"\n") + vcfw.write(src + "\n") for contig in contigs: vcfw.write(contig + "\n") @@ -111,17 +116,17 @@ def get_header_lines(field, reader): # Write ALT fields if present alts = set.union(*[set(get_header_lines('alt', reader)) for reader in readers]) for alt in alts: - vcfw.write(alt+'\n') + vcfw.write(alt + '\n') # Write INFO fields, different for each tool - useinfo = [] + useinfo: List[Tuple[str, bool]] = [] infos = get_header_lines('info', readers[0]) for (field, reqd) in INFOFIELDS[vcftype]: this_info = [line for line in infos if 'ID=' + field + ',' in line] if len(this_info) == 0: - common.WARNING("Expected info field %s not found. Skipping"%field) + common.WARNING("Expected info field %s not found. Skipping" % field) elif len(this_info) >= 2: - common.WARNING("Found two header lines matching the info field %s. Skipping"%field) + common.WARNING("Found two header lines matching the info field %s. Skipping" % field) else: vcfw.write(this_info[0] + '\n') useinfo.append((field, reqd)) @@ -129,14 +134,14 @@ def get_header_lines(field, reader): # Write GT header vcfw.write("##FORMAT=\n") # Write FORMAT fields, different for each tool - useformat = [] + useformat: List[str] = [] formats = get_header_lines('format', readers[0]) for field in FORMATFIELDS[vcftype]: this_format = [line for line in formats if 'ID=' + field + ',' in line] if len(this_format) == 0: - common.WARNING("Expected format field %s not found. Skipping"%field) + common.WARNING("Expected format field %s not found. Skipping" % field) elif len(this_format) >= 2: - common.WARNING("Found two header lines matching the format field %s. Skipping"%field) + common.WARNING("Found two header lines matching the format field %s. Skipping" % field) else: vcfw.write(this_format[0] + '\n') useformat.append(field) @@ -154,10 +159,12 @@ def get_header_lines(field, reader): if len(samples) == 0: return None, None header_fields = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] - vcfw.write("#"+"\t".join(header_fields+samples)+"\n") + vcfw.write("#" + "\t".join(header_fields + samples) + "\n") return useinfo, useformat -def GetRefAllele(current_records, mergelist): + + +def GetRefAllele(current_records: List[trh.TRRecord], mergelist: List[bool], vcfType: trh.VcfTypes) -> Optional[str]: r"""Get reference allele for a set of records Parameters @@ -172,19 +179,44 @@ def GetRefAllele(current_records, mergelist): ref : str Reference allele string. Set to None if conflicting references are found. """ - refs = [] + + def DefaultKey(record: trh.TRRecord): + return record.vcfrecord.REF.upper() + + def HipstrKey(record: trh.TRRecord): + return record.ref_allele + + refs: List[str] = [] + ref_picker = DefaultKey + if vcfType == trh.VcfTypes.hipstr: + ref_picker = HipstrKey + chrom = "" pos = -1 for i in range(len(mergelist)): if mergelist[i]: - chrom = current_records[i].CHROM - pos = current_records[i].POS - refs.append(current_records[i].REF.upper()) + chrom = current_records[i].chrom + pos = current_records[i].pos + refs.append(ref_picker(current_records[i]).upper()) if len(set(refs)) != 1: return None return refs[0] -def GetAltAlleles(current_records, mergelist, vcftype): + + + +def GetAltsByKey(current_records: List[trh.TRRecord], mergelist: List[bool], key: Any): + result = set() + for i in range(len(mergelist)): + if mergelist[i]: + ralts = key(current_records[i]) + for item in ralts: + result.add(item.upper()) + return result + + +def GetAltAlleles(current_records: List[trh.TRRecord], mergelist: List[bool], vcftype: Union[str, trh.VcfTypes]) \ + -> Tuple[List[str], List[np.ndarray]]: r"""Get list of alt alleles Parameters @@ -216,12 +248,19 @@ def GetAltAlleles(current_records, mergelist, vcftype): rec_n.alleles[1] == out_rec.alleles[3] == 'AAAA' rec_n.alleles[2] == out_rec.alleles[2] == 'AAA' """ - alts = set() - for i in range(len(mergelist)): - if mergelist[i]: - ralts = current_records[i].ALT - for item in ralts: - alts.add(item.upper()) + + def DefaultKey(record: trh.TRRecord): + return record.vcfrecord.ALT + + def HipstrKey(record: trh.TRRecord): + return record.alt_alleles + + alt_picker = DefaultKey + if vcftype == trh.VcfTypes.hipstr: + alt_picker = HipstrKey + + alts = GetAltsByKey(current_records, mergelist, alt_picker) + if vcftype == trh.VcfTypes.eh: # EH alleles look like where 42 is the # number of repeat units so sort accordingly @@ -236,13 +275,14 @@ def GetAltAlleles(current_records, mergelist, vcftype): mappings = [] for i in range(len(mergelist)): if mergelist[i]: - ralts = current_records[i].ALT + ralts = alt_picker(current_records[i]) mappings.append( np.array([0] + [out_alts.index(ralt.upper()) + 1 for ralt in ralts]).astype(str) ) return out_alts, mappings -def GetID(idval): + +def GetID(idval: str) -> str: r"""Get the ID for a a record If not set, output "." @@ -257,10 +297,14 @@ def GetID(idval): idval : str Return ID. if None, return "." """ - if idval is None: return "." - else: return idval + if idval is None: + return "." + else: + return idval + -def GetInfoItem(current_records, mergelist, info_field, fail=True): +def GetInfoItem(current_records: List[trh.TRRecord], mergelist: List[bool], info_field: str, fail: bool = True) \ + -> Optional[str]: """Get INFO item for a group of records Make sure it's the same across merged records @@ -283,25 +327,27 @@ def GetInfoItem(current_records, mergelist, info_field, fail=True): infostring : str INFO string to add (key=value) """ - if not fail: return None # TODO in future implement smart merging of select fields + if not fail: return None # TODO in future implement smart merging of select fields vals = set() a_merged_rec = None for i in range(len(mergelist)): if mergelist[i]: a_merged_rec = current_records[i] - if info_field in dict(current_records[i].INFO): - vals.add(current_records[i].INFO[info_field]) + if info_field in dict(current_records[i].info): + vals.add(current_records[i].info[info_field]) else: - raise ValueError("Missing info field %s"%info_field) - if len(vals)==1: - return "%s=%s"%(info_field, vals.pop()) + raise ValueError("Missing info field %s" % info_field) + if len(vals) == 1: + return "%s=%s" % (info_field, vals.pop()) else: common.WARNING("Incompatible values %s for info field %s at position " - "%s:%i"%(vals, info_field, a_merged_rec.CHROM, - a_merged_rec.POS)) + "%s:%i" % (vals, info_field, a_merged_rec.chrom, + a_merged_rec.pos)) return None -def WriteSampleData(vcfw, record, alleles, formats, format_type, mapping): + +def WriteSampleData(vcfw: TextIO, record: cyvcf2.Variant, alleles: List[str], formats: List[str], + format_type: List[str], mapping: np.ndarray) -> None: r"""Output sample FORMAT data Writes a string representation of the GT and other format @@ -323,7 +369,7 @@ def WriteSampleData(vcfw, record, alleles, formats, format_type, mapping): mapping: np.ndarray See GetAltAlleles """ - assert "GT" not in formats # since we will add that + assert "GT" not in formats # since we will add that genotypes = record.genotype.array() not_called_samples = np.all( @@ -365,7 +411,7 @@ def WriteSampleData(vcfw, record, alleles, formats, format_type, mapping): )) # Add rest of formats - for fmt_idx, fmt in enumerate(formats): + for fmt_idx, fmt in enumerate(formats): vcfw.write(':') if format_type[fmt_idx] == 'String': vcfw.write(format_arrays[fmt][sample_idx]) @@ -375,13 +421,18 @@ def WriteSampleData(vcfw, record, alleles, formats, format_type, mapping): format_arrays[fmt][sample_idx, :] )) -def MergeRecords(readers, vcftype, num_samples, current_records, - mergelist, vcfw, useinfo, - useformat, format_type): + + +def MergeRecords(readers: cyvcf2.VCF, vcftype: Union[str, trh.VcfTypes], num_samples: List[int], + current_records: List[trh.TRRecord], + mergelist: List[bool], vcfw: TextIO, useinfo: List[Tuple[str, bool]], + useformat: List[str], format_type: List[str]) -> None: r"""Merge records from different files Merge all records with indicator set to True in mergelist Output merged record to vcfw + If the merged records were created by HipSTR and contain flanking BPs, these + BPs are removed Parameters ---------- @@ -405,12 +456,12 @@ def MergeRecords(readers, vcftype, num_samples, current_records, The type of each format field """ use_ind = [i for i in range(len(mergelist)) if mergelist[i]] - if len(use_ind)==0: return + if len(use_ind) == 0: return - chrom = current_records[use_ind[0]].CHROM - pos = str(current_records[use_ind[0]].POS) + chrom = current_records[use_ind[0]].chrom + pos = str(current_records[use_ind[0]].pos) - ref_allele = GetRefAllele(current_records, mergelist) + ref_allele = GetRefAllele(current_records, mergelist, vcftype) if ref_allele is None: common.WARNING("Conflicting refs found at {}:{}. Skipping.".format(chrom, pos)) return @@ -418,14 +469,14 @@ def MergeRecords(readers, vcftype, num_samples, current_records, alt_alleles, mappings = GetAltAlleles(current_records, mergelist, vcftype) # Set common fields - vcfw.write(chrom) #CHROM + vcfw.write(chrom) # CHROM vcfw.write('\t') - vcfw.write(pos) #POS + vcfw.write(pos) # POS vcfw.write('\t') # TODO complain if records have different IDs - vcfw.write(GetID(current_records[use_ind[0]].ID)) # ID + vcfw.write(GetID(current_records[use_ind[0]].vcfrecord.ID)) # ID vcfw.write('\t') - vcfw.write(ref_allele) # REF + vcfw.write(ref_allele) # REF vcfw.write('\t') # ALT if len(alt_alleles) > 0: @@ -434,8 +485,8 @@ def MergeRecords(readers, vcftype, num_samples, current_records, else: vcfw.write('.\t') # fields which are always set to empty - vcfw.write(".\t") # QUAL - vcfw.write(".\t") # FITLER + vcfw.write(".\t") # QUAL + vcfw.write(".\t") # FITLER # INFO first = True @@ -449,47 +500,64 @@ def MergeRecords(readers, vcftype, num_samples, current_records, vcfw.write('\t') # FORMAT - add GT to front - vcfw.write(":".join(["GT"]+useformat)) + vcfw.write(":".join(["GT"] + useformat)) # Samples - alleles = [ref_allele]+alt_alleles + alleles = [ref_allele] + alt_alleles map_iter = iter(mappings) for i in range(len(mergelist)): if mergelist[i]: - WriteSampleData(vcfw, current_records[i], alleles, useformat, + WriteSampleData(vcfw, current_records[i].vcfrecord, alleles, useformat, format_type, next(map_iter)) - else: # NOCALL + else: # NOCALL if num_samples[i] > 0: vcfw.write('\t') - vcfw.write('\t'.join([NOCALLSTRING]*num_samples[i])) + vcfw.write('\t'.join([NOCALLSTRING] * num_samples[i])) vcfw.write('\n') -def getargs(): # pragma: no cover + +def getargs() -> Any: # pragma: no cover parser = argparse.ArgumentParser( __doc__, formatter_class=utils.ArgumentDefaultsHelpFormatter ) ### Required arguments ### req_group = parser.add_argument_group("Required arguments") - req_group.add_argument("--vcfs", help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", type=str, required=True) + req_group.add_argument("--vcfs", + help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", + type=str, required=True) req_group.add_argument("--out", help="Prefix to name output files", type=str, required=True) - req_group.add_argument("--vcftype", help="Options=%s"%[str(item) for item in trh.VcfTypes.__members__], type=str, default="auto") + req_group.add_argument("--vcftype", help="Options=%s" % [str(item) for item in trh.VcfTypes.__members__], type=str, + default="auto") ### Special merge options ### spec_group = parser.add_argument_group("Special merge options") - spec_group.add_argument("--update-sample-from-file", help="Use file names, rather than sample header names, when merging", action="store_true") + spec_group.add_argument("--update-sample-from-file", + help="Use file names, rather than sample header names, when merging", action="store_true") ### Optional arguments ### opt_group = parser.add_argument_group("Optional arguments") opt_group.add_argument("--verbose", help="Print out extra info", action="store_true") opt_group.add_argument("--quiet", help="Don't print out anything", action="store_true") ## Version argument ## ver_group = parser.add_argument_group("Version") - ver_group.add_argument("--version", action="version", version = '{version}'.format(version=__version__)) + ver_group.add_argument("--version", action="version", version='{version}'.format(version=__version__)) ### Parse args ### args = parser.parse_args() return args -def main(args): + +def HarmonizeIfNotNone(records: List[Optional[trh.TRRecord]], vcf_type: trh.VcfTypes): + result = [] + for record in records: + if record is not None: + result.append(trh.HarmonizeRecord(vcf_type, record)) + else: + result.append(None) + + return result + + +def main(args: Any) -> int: if not os.path.exists(os.path.dirname(os.path.abspath(args.out))): common.WARNING("Error: The directory which contains the output location {} does" " not exist".format(args.out)) @@ -502,7 +570,7 @@ def main(args): filenames = args.vcfs.split(",") ### Check and Load VCF files ### - vcfreaders = utils.LoadReaders(filenames, checkgz = True) + vcfreaders = utils.LoadReaders(filenames, checkgz=True) if vcfreaders is None: return 1 if len(vcfreaders) == 0: return 1 @@ -529,23 +597,24 @@ def main(args): common.WARNING("Error writing merged header. Quitting") return 1 - #need to know format types to know how to convert them to strings + # need to know format types to know how to convert them to strings format_type = [] for fmt in useformat: format_type.append(vcfreaders[0].get_header_type(fmt)['Type']) ### Walk through sorted readers, merging records as we go ### - current_records = [next(reader) for reader in vcfreaders] + current_records = mergeutils.InitReaders(vcfreaders) # Check if contig ID is set in VCF header for all records done = mergeutils.DoneReading(current_records) + while not done: for vcf_num, (r, reader) in enumerate(zip(current_records, vcfreaders)): if r is None: continue if not r.CHROM in chroms: common.WARNING(( - "Error: found a record in file {} with " - "chromosome '{}' which was not found in the contig list " - "({})").format(filenames[vcf_num], r.CHROM, ", ".join(chroms))) + "Error: found a record in file {} with " + "chromosome '{}' which was not found in the contig list " + "({})").format(filenames[vcf_num], r.CHROM, ", ".join(chroms))) common.WARNING("VCF files must contain a ##contig header line for each chromosome.") common.WARNING( "If this is only a technical issue and all the vcf " @@ -554,19 +623,22 @@ def main(args): "(https://github.com/samtools/bcftools) to fix the contigs" ", e.g.: bcftools reheader -f hg19.fa.fai -o myvcf-readher.vcf.gz myvcf.vcf.gz") return 1 - is_min = mergeutils.GetMinRecords(current_records, chroms) + harmonized_records = HarmonizeIfNotNone(current_records, vcftype) + is_min = mergeutils.GetMinHarmonizedRecords(harmonized_records, chroms) if args.verbose: mergeutils.DebugPrintRecordLocations(current_records, is_min) if mergeutils.CheckMin(is_min): return 1 - MergeRecords(vcfreaders, vcftype, num_samples, current_records, is_min, vcfw, useinfo, + MergeRecords(vcfreaders, vcftype, num_samples, harmonized_records, is_min, vcfw, useinfo, useformat, format_type) current_records = mergeutils.GetNextRecords(vcfreaders, current_records, is_min) done = mergeutils.DoneReading(current_records) - return 0 + return 0 -def run(): # pragma: no cover + +def run() -> None: # pragma: no cover args = getargs() retcode = main(args) sys.exit(retcode) -if __name__ == "__main__": # pragma: no cover + +if __name__ == "__main__": # pragma: no cover run() diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index 9330bf90..618e4442 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -22,7 +22,7 @@ def args(tmpdir): @pytest.fixture def mrgvcfdir(vcfdir): - return os.path.join(vcfdir, "mergeSTR_vcfs") + return os.path.join(vcfdir, "mergeSTR_vcfs") # Set up dummy class @@ -34,6 +34,16 @@ def __init__(self, chrom, pos, ref, alts=[], info = {}): self.ALTS = alts self.INFO = info + +class DummyHarmonizedRecord: + def __init__(self, chrom, pos, ref, alts=None, info=None): + self.chrom = chrom + self.pos = pos + self.ref_allele = ref + self.alt_alleles = alts if alts is not None else [] + self.info = info if info is not None else {} + self.vcfrecord = DummyRecord(chrom, pos, ref, self.alt_alleles, self.info) + # Test right files or directory - GangSTR def test_GangSTRRightFile(args, mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_gangstr1.vcf.gz") @@ -192,23 +202,23 @@ def test_MissingFieldWarnings(capsys, args, mrgvcfdir): def test_ConflictingRefs(): # Set up dummy records dummy_records = [] - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG')) - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG')) - dummy_records.append(DummyRecord('chr1', 100, 'CAG')) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG')) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG')) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAG')) - retval = GetRefAllele(dummy_records, [True, True, True]) + retval = GetRefAllele(dummy_records, [True, True, True], None) assert retval is None - retval = GetRefAllele(dummy_records, [True, True, False]) + retval = GetRefAllele(dummy_records, [True, True, False], None) assert retval == "CAGCAG" def test_GetInfoItem(capsys): # Set up dummy records dummy_records = [] - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={'END': 120})) - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={'END': 120})) - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={'END': 110})) - dummy_records.append(DummyRecord('chr1', 100, 'CAGCAG', info={})) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG', info={'END': 120})) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG', info={'END': 120})) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG', info={'END': 110})) + dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG', info={})) GetInfoItem(dummy_records, [True, True, True, False], 'END') captured = capsys.readouterr() @@ -249,6 +259,7 @@ def test_advntr_output(args, mrgvcfdir): assert_same_vcf(args.out + '.vcf', mrgvcfdir + "/advntr_merged.vcf") + def test_eh_output(args, mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_eh1.vcf.gz") fname2 = os.path.join(mrgvcfdir, "test_file_eh2.vcf.gz") @@ -300,6 +311,13 @@ def test_hipstr_output(args, mrgvcfdir): assert main(args) == 0 assert_same_vcf(args.out + '.vcf', mrgvcfdir + "/hipstr_merged.vcf") +def test_hipstr_output_flanking_pb_harmonization(args, mrgvcfdir): + fname1 = os.path.join(mrgvcfdir, "hipstr-harmonized-merge-contains-flanking.vcf.gz") + fname2 = os.path.join(mrgvcfdir, "hipstr-harmonized-merge-no-flanking.vcf.gz") + args.vcftype = "hipstr" + args.vcfs = fname1 + "," + fname2 + assert main(args) == 0 + assert_same_vcf(args.out + '.vcf', mrgvcfdir + "/hipstr_flanking_harmonization_test_output.vcf") def test_popstr_output(args, mrgvcfdir): fname1 = os.path.join(mrgvcfdir, "test_file_popstr1.vcf.gz") diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz new file mode 100644 index 00000000..f6b5ed2d Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz differ diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz.tbi new file mode 100644 index 00000000..847cb5bc Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz.tbi differ diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz new file mode 100644 index 00000000..2046bd81 Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz differ diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz.tbi new file mode 100644 index 00000000..9396af0c Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz.tbi differ diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr_flanking_harmonization_test_output.vcf b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr_flanking_harmonization_test_output.vcf new file mode 100644 index 00000000..f00eaeca --- /dev/null +++ b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr_flanking_harmonization_test_output.vcf @@ -0,0 +1,128 @@ +##fileformat=VCFv4.1 +##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt +##command=HipSTR-v0.5 --bams /storage/gtex-data/wgs/SRR2163846.bam --fasta /storage/resources/Homo_sapiens_assembly19.fasta --regions /storage/resources/GRCh37.hipstr_reference.bed --min-reads 5 --str-vcf /storage/vcfs/SRR2163846.vcf.gz --def-stutter-model --log /storage/vcfs/SRR2163846.log.txt +##command=/home/xklinka/PycharmProjects/TRTools/trtools/mergeSTR/mergeSTR.py --vcfs /home/xklinka/PycharmProjects/TRTools/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-contains-flanking.vcf.gz,/home/xklinka/PycharmProjects/TRTools/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/hipstr-harmonized-merge-no-flanking.vcf.gz --out test --vcftype hipstr +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GTEX-R55C-0004 GTEX-R55C-0003 +1 16717 STR_2 GGTGGTGGTGGGGGCGGTGGGGGTGGTG GGTGGTGGTGGGGGCGGTGGTGGTGCTG . . START=16717;END=16744;PERIOD=3 GT:GB:Q:PQ:DP:DSNP:PSNP:PDP:GLDIFF:DSTUTTER:DFLANKINDEL:AB:FS:DAB:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2 0|1:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2 +1 16914 STR_3 CTTCTT CTTCTTCTT . . START=16914;END=16919;PERIOD=3 GT:GB:Q:PQ:DP:DSNP:PSNP:PDP:GLDIFF:DSTUTTER:DFLANKINDEL:AB:FS:DAB:ALLREADS:MALLREADS 0|1:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2 0|1:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2 +1 19924 STR_4 ATATAT ATATATAT,ATATATGTAT . . START=19924;END=19929;PERIOD=2 GT:GB:Q:PQ:DP:DSNP:PSNP:PDP:GLDIFF:DSTUTTER:DFLANKINDEL:AB:FS:DAB:ALLREADS:MALLREADS 0|2:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2 0|1:0|0:1.0:0.5:121:0:0|0:96.95|24.05:29.31:3:15:-15.66:0.0:93:-23|1;0|64;1|2;9|5:0|52;9|2