From fcabd0fef89d9f5467bf618420b35f98639ed8eb Mon Sep 17 00:00:00 2001
From: tjiang
Date: Tue, 10 Dec 2019 06:05:28 +0000
Subject: [PATCH] Update cuteSV to 1.0.4

---
 README.md                         | 14 ++++-
 setup.py                          |  4 +-
 src/cuteSV/cuteSV                 | 24 ++++----
 src/cuteSV/cuteSV_Description.py  | 33 ++++++++---
 src/cuteSV/cuteSV_resolveDUP.py   | 36 ++++++++----
 src/cuteSV/cuteSV_resolveINDEL.py | 96 ++++++++++++++++++++++++-------
 src/cuteSV/cuteSV_resolveINV.py   | 43 ++++++++++----
 src/cuteSV/cuteSV_resolveTRA.py   | 55 +++++++++++++-----
 8 files changed, 222 insertions(+), 83 deletions(-)

diff --git a/README.md b/README.md
index 3189b5b..0fa93c4 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@
 ---
 ### Introduction
 
-Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV signatures implied by the noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and it employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmark on real PacBio and ONT datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.
+Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV characteristics implied by noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long-read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmarks on real Pacific Biosciences (PacBio) and Oxford Nanopore Technologies (ONT) datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.
 
 The benchmark results of cuteSV on the HG002 human sample are below:
 
@@ -72,9 +72,13 @@ For more detailed implementation of SV benchmarks, we show an example [here](htt
 |--sample| Sample name/id |NULL|
 |--max_split_parts|Maximum number of split segments a read may be aligned before it is ignored.|7|
 |--min_mapq|Minimum mapping quality value of alignment to be taken into account.|20|
-|--min_read_len|Ignores reads that only report alignments with not longer then bp.|500|
+|--min_read_len|Ignores reads that only report alignments not longer than this length (bp).|500|
+|--merge_threshold|Maximum distance of SV signals to be merged.|500|
 |--min_support|Minimum number of reads that support a SV to be reported.|10|
 |--min_length|Minimum length of SV to be reported.|30|
+|--genotype|Enable genotype generation.|False|
+|--hom|Threshold on allele frequency for a homozygous genotype.|0.8|
+|--het|Threshold on allele frequency for a heterozygous genotype.|0.2|
 |--max_cluster_bias_INS|Maximum distance to cluster read together for insertion.|100|
 |--diff_ratio_merging_INS|Do not merge breakpoints with basepair identity more than the ratio of *default* for insertion.|0.2|
 |--diff_ratio_filtering_INS|Filter breakpoints with basepair identity less than the ratio of *default* for insertion.|0.6|
@@ -98,6 +102,12 @@ Please cite the manuscript of cuteSV before using these callsets.
 
 ---
 ### Changelog
+cuteSV (v1.0.4):
+1.Add a new option for specifically setting the threshold for merging SV signals within the same read. The default is 500 bp; you can reduce it for high-quality sequencing datasets such as PacBio HiFi (CCS).
+2.Make the genotyping function optional.
+3.Enable users to set the allele frequency thresholds for homozygous/heterozygous genotypes.
+4.Update the description of recommended parameters for processing ONT data.
+
 cuteSV (v1.0.3):
 1.Refine the genotyping model.
 2.Adjust the threshold value of heterozygosis alleles.
diff --git a/setup.py b/setup.py
index 897bf98..76addb3 100644
--- a/setup.py
+++ b/setup.py
@@ -7,8 +7,8 @@
 
 setup(
     name = "cuteSV",
-    version = "1.0.3",
-    description = "Long read based human genomic structural variation detection with cuteSV",
+    version = "1.0.4",
+    description = "Long-Read-based Human Genomic Structural Variation Detection with cuteSV",
     author = "Jiang Tao",
     author_email = "tjiang@hit.edu.cn",
     url = "https://github.com/tjiangHIT/cuteSV",
diff --git a/src/cuteSV/cuteSV b/src/cuteSV/cuteSV
index 1b893ef..d1f7a51 100644
--- a/src/cuteSV/cuteSV
+++ b/src/cuteSV/cuteSV
@@ -282,7 +282,7 @@ def generate_combine_sigs(sigs, Chr_name, read_name, svtype, candidate, merge_di
         read_name])
 
 
-def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength):
+def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold):
     if read.query_length < min_read_len:
         return 0
 
@@ -327,8 +327,8 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
             softclip_right = read.cigar[-1][1]
 
     # ************Combine signals in same read********************
-    generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, 500)
-    generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, 500)
+    generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, merge_threshold)
+    generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, merge_threshold)
 
     if process_signal == 1 or process_signal == 2:
         Tags = read.get_tags()
@@ -348,7 +348,7 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
     organize_split_signal(Chr_name, primary_info, Supplementary_info, read.query_length,
         SV_size, min_mapq, max_split_parts, read.query_name, candidate)
 
-def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength):
+def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength, merge_threshold):
     # args.input, args.min_length, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i
     candidate = dict()
     candidate["DEL"] = dict()
@@ -361,7 +361,7 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
         Chr_length = samfile.get_reference_length(Chr_name)
 
     for read in samfile.fetch(Chr_name, task[1], task[2]):
-        parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength)
+        parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold)
 
     logging.info("Finished %s:%d-%d."%(Chr_name, task[1], task[2]))
     samfile.close()
@@ -420,7 +420,7 @@ def main_ctrl(args):
     analysis_pools = Pool(processes=int(args.threads))
     os.mkdir("%ssignatures"%temporary_dir)
     for i in Task_list:
-        para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength)]
+        para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength, args.merge_threshold)]
         analysis_pools.map_async(multi_run_wrapper, para)
     analysis_pools.close()
     analysis_pools.join()
@@ -452,28 +452,30 @@ def main_ctrl(args):
         for svtype in ["DEL", "INS", "INV"]:
             if svtype == "INV":
                 para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
-                    args.max_cluster_bias_INV, args.min_size, args.input)]
+                    args.max_cluster_bias_INV, args.min_size, args.input, args.genotype,
+                    args.hom, args.het)]
                 result.append(analysis_pools.map_async(run_inv, para))
                 # pass
             if svtype == 'DEL':
                 para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
                     args.diff_ratio_merging_DEL, args.max_cluster_bias_DEL, args.diff_ratio_filtering_DEL,
-                    min(args.min_support, 5), args.input)]
+                    min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
                 result.append(analysis_pools.map_async(run_del, para))
             if svtype == 'INS':
                 para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
                     args.diff_ratio_merging_INS, args.max_cluster_bias_INS, args.diff_ratio_filtering_INS,
-                    min(args.min_support, 5), args.input)]
+                    min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
                 result.append(analysis_pools.map_async(run_ins, para))
         # DUP
         para = [("%s%s.sigs"%(temporary_dir, "DUP"), chr, args.min_support, args.max_cluster_bias_DUP,
-            args.min_size, args.input)]
+            args.min_size, args.input, args.genotype, args.hom, args.het)]
         result.append(analysis_pools.map_async(run_dup, para))
     for i in process_tra:
         # TRA
         para = [("%s%s.sigs"%(temporary_dir, "TRA"), i[0], i[1], args.min_support,
-            args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input)]
+            args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input, args.genotype
+            , args.hom, args.het)]
         result.append(analysis_pools.map_async(run_tra, para))
 
     analysis_pools.close()
diff --git a/src/cuteSV/cuteSV_Description.py b/src/cuteSV/cuteSV_Description.py
index cfe46d5..6cc69e1 100644
--- a/src/cuteSV/cuteSV_Description.py
+++ b/src/cuteSV/cuteSV_Description.py
@@ -7,7 +7,7 @@
 '''
 import argparse
 
-VERSION = '1.0.3'
+VERSION = '1.0.4'
 
 class cuteSVdp(object):
     '''
@@ -42,9 +42,14 @@ class cuteSVdp(object):
     # MinSizeDel = 'For current version of cuteSV, it can detect deletions larger than this size.'
 
 def parseArgs(argv):
-    parser = argparse.ArgumentParser(prog="cuteSV", description=cuteSVdp.USAGE,
+    parser = argparse.ArgumentParser(prog="cuteSV",
+        description=cuteSVdp.USAGE,
         formatter_class=argparse.RawDescriptionHelpFormatter)
 
+    parser.add_argument('--version', '-v',
+        action = 'version',
+        version = '%(prog)s {version}'.format(version=VERSION))
+
     # **************Parameters of input******************
     parser.add_argument("input",
         metavar="[BAM]",
@@ -71,10 +76,6 @@ def parseArgs(argv):
         help = "Sample name/id",
         default = "NULL",
         type = str)
-    # parser.add_argument('-g', '--genotype',
-    #     help = "Enable generate genotype (True/False).[%(default)s]",
-    #     default = "False",
-    #     type = str)
 
     # **************Parameters in signatures collection******************
     GroupSignaturesCollect = parser.add_argument_group('Collection of SV signatures')
@@ -87,7 +88,11 @@ def parseArgs(argv):
         default = 20,
         type = int)
     GroupSignaturesCollect.add_argument('-r', '--min_read_len',
-        help = "Ignores reads that only report alignments with not longer then bp.[%(default)s]",
+        help = "Ignores reads that only report alignments not longer than this length (bp).[%(default)s]",
         default = 500,
         type = int)
+    GroupSignaturesCollect.add_argument('-m', '--merge_threshold',
+        help = "Maximum distance of SV signals to be merged.[%(default)s]",
+        default = 500,
+        type = int)
     # The min_read_len in last version is 2000.
@@ -108,6 +113,20 @@ def parseArgs(argv):
         default = 10,
         type = int)
 
+    # **************Parameters in genotyping******************
+    GroupGenotype = parser.add_argument_group('Computing genotypes')
+    GroupGenotype.add_argument('--genotype',
+        help = "Enable genotype generation.",
+        action="store_true")
+    GroupGenotype.add_argument('--hom',
+        help = "Threshold on allele frequency for a homozygous genotype.[%(default)s]",
+        default = 0.8,
+        type = float)
+    GroupGenotype.add_argument('--het',
+        help = "Threshold on allele frequency for a heterozygous genotype.[%(default)s]",
+        default = 0.2,
+        type = float)
+
     # Just a parameter for debug.
     # Will be removed in future.
     # GroupSVCluster.add_argument('--preset',
diff --git a/src/cuteSV/cuteSV_resolveDUP.py b/src/cuteSV/cuteSV_resolveDUP.py
index 22794ba..4a26bd7 100644
--- a/src/cuteSV/cuteSV_resolveDUP.py
+++ b/src/cuteSV/cuteSV_resolveDUP.py
@@ -13,7 +13,8 @@
 *******************************************
 '''
 
-def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
+def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size,
+    bam_path, action, hom, het):
     semi_dup_cluster = list()
     semi_dup_cluster.append([0,0,''])
     candidate_single_SV = list()
@@ -36,7 +37,10 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
                     max_cluster_bias,
                     sv_size,
                     candidate_single_SV,
-                    bam_path)
+                    bam_path,
+                    action,
+                    hom,
+                    het)
                 semi_dup_cluster = []
                 semi_dup_cluster.append([pos_1, pos_2, read_id])
             else:
@@ -48,12 +52,15 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
                 max_cluster_bias,
                 sv_size,
                 candidate_single_SV,
-                bam_path)
+                bam_path,
+                action,
+                hom,
+                het)
     file.close()
     return candidate_single_SV
 
-def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv_size,
-    candidate_single_SV, bam_path):
+def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias,
+    sv_size, candidate_single_SV, bam_path, action, hom, het):
     # calculate support reads
     support_read = list(set([i[2] for i in semi_dup_cluster]))
     # print(support_read)
@@ -122,7 +129,12 @@ def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv
         # candidate_single_SV.append('%s\t%s\t%d\t%d\t%d\t%s\n' % (chr, 'DUP', i[0][0], i[1][0] - i[0][0], i[2], reliability))
         '''genotyping'''
         if i[1][0] - i[0][0] <= 100000:
-            DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]), chr, support_read, max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]),
+                    chr, support_read, max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 'DUP',
                 str(i[0][0]),
@@ -144,19 +156,19 @@ def count_coverage(chr, s, e, f):
             read_count.add(i.query_name)
     return read_count
 
-def assign_gt(a, b):
+def assign_gt(a, b, hom, het):
     if b == 0:
         return "1/1"
-    if a*1.0/b < 0.2:
+    if a*1.0/b < het:
         return "0/0"
-    elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
+    elif a*1.0/b >= het and a*1.0/b < hom:
         return "0/1"
-    elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
+    elif a*1.0/b >= hom and a*1.0/b < 1.0:
         return "1/1"
     else:
         return "1/1"
 
-def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
+def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias, hom, het):
     import pysam
     bamfile = pysam.AlignmentFile(bam_path)
     search_start = max(int(pos_1) - max_cluster_bias, 0)
@@ -167,4 +179,4 @@
     for query in querydata:
         if query not in read_id_list:
             DR += 1
-    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
+    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
diff --git a/src/cuteSV/cuteSV_resolveINDEL.py b/src/cuteSV/cuteSV_resolveINDEL.py
index 6db1976..d5f102f 100644
--- a/src/cuteSV/cuteSV_resolveINDEL.py
+++ b/src/cuteSV/cuteSV_resolveINDEL.py
@@ -14,7 +14,7 @@
 '''
 
 def resolution_DEL(path, chr, svtype, read_count, threshold_gloab, max_cluster_bias,
-    threshold_local, minimum_support_reads, bam_path):
+    threshold_local, minimum_support_reads, bam_path, action, hom, het):
     '''
     cluster DEL
@@ -66,7 +66,10 @@ def resolution_DEL(path, chr, svtype, read_count, threshold_gloab, max_cluster_b
                     minimum_support_reads,
                     candidate_single_SV,
                     bam_path,
-                    max_cluster_bias)
+                    max_cluster_bias,
+                    action,
+                    hom,
+                    het)
                 semi_del_cluster = []
                 semi_del_cluster.append([pos, indel_len, read_id])
             else:
@@ -82,13 +85,16 @@ def resolution_DEL(path, chr, svtype, read_count, threshold_gloab, max_cluster_b
                 minimum_support_reads,
                 candidate_single_SV,
                 bam_path,
-                max_cluster_bias)
+                max_cluster_bias,
+                action,
+                hom,
+                het)
     file.close()
     return candidate_single_SV
 
 def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
     threshold_gloab, threshold_local, minimum_support_reads, candidate_single_SV,
-    bam_path, max_cluster_bias):
+    bam_path, max_cluster_bias, action, hom, het):
     '''
     generate deletion
@@ -150,7 +156,12 @@ def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
 
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-1][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-1][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-1][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             # print(DV, DR, GT)
             candidate_single_SV.append([chr,
                 svtype,
@@ -176,7 +187,12 @@ def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
             #     # pass
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-2][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-2][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-2][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -196,7 +212,12 @@ def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
             signalLen_STD = np.std(alelle_sort[-1][1])
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-1][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-1][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-1][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -214,7 +235,12 @@ def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
             signalLen_STD = np.std(alelle_sort[-2][1])
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-2][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-2][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, search_threshold, chr, alelle_sort[-2][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -229,7 +255,7 @@ def generate_del_cluster(semi_del_cluster, chr, svtype, read_count,
 
 def resolution_INS(path, chr, svtype, read_count, threshold_gloab,
-    max_cluster_bias, threshold_local, minimum_support_reads, bam_path):
+    max_cluster_bias, threshold_local, minimum_support_reads, bam_path, action, hom, het):
     '''
     cluster INS
@@ -281,7 +307,10 @@ def resolution_INS(path, chr, svtype, read_count, threshold_gloab,
                     minimum_support_reads,
                     candidate_single_SV,
                     bam_path,
-                    max_cluster_bias)
+                    max_cluster_bias,
+                    action,
+                    hom,
+                    het)
                 semi_ins_cluster = []
                 semi_ins_cluster.append([pos, indel_len, read_id])
             else:
@@ -297,13 +326,16 @@ def resolution_INS(path, chr, svtype, read_count, threshold_gloab,
                 minimum_support_reads,
                 candidate_single_SV,
                 bam_path,
-                max_cluster_bias)
+                max_cluster_bias,
+                action,
+                hom,
+                het)
     file.close()
     return candidate_single_SV
 
 def generate_ins_cluster(semi_ins_cluster, chr, svtype, read_count,
     threshold_gloab, threshold_local, minimum_support_reads, candidate_single_SV,
-    bam_path, max_cluster_bias):
+    bam_path, max_cluster_bias, action, hom, het):
     '''
     generate deletion
@@ -366,7 +398,12 @@ def generate_ins_cluster(semi_ins_cluster, chr, svtype, read_count,
             # search_threshold = np.min(alelle_sort[-1][0])
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-1][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-1][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-1][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -391,7 +428,12 @@ def generate_ins_cluster(semi_ins_cluster, chr, svtype, read_count,
             # pass
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-2][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-2][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-2][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -411,7 +453,12 @@ def generate_ins_cluster(semi_ins_cluster, chr, svtype, read_count,
             # search_threshold = np.min(alelle_sort[-1][0])
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-1][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-1][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-1][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -429,7 +476,12 @@ def generate_ins_cluster(semi_ins_cluster, chr, svtype, read_count,
             # search_threshold = np.min(alelle_sort[-2][0])
             # print(chr, svtype, int(breakpointStart), int(signalLen), alelle_sort[-2][2][0], breakpointStart_STD, signalLen_STD)
             '''genotyping'''
-            DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-2][3], max_cluster_bias)
+            if action:
+                DV, DR, GT = call_gt(bam_path, int(breakpointStart), chr, alelle_sort[-2][3],
+                    max_cluster_bias, hom, het)
+            else:
+                DR = '.'
+                GT = './.'
             candidate_single_SV.append([chr,
                 svtype,
                 str(int(breakpointStart)),
@@ -455,19 +507,19 @@ def count_coverage(chr, s, e, f):
             read_count.add(i.query_name)
     return read_count
 
-def assign_gt(a, b):
+def assign_gt(a, b, hom, het):
     if b == 0:
         return "1/1"
-    if a*1.0/b < 0.2:
+    if a*1.0/b < het:
         return "0/0"
-    elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
+    elif a*1.0/b >= het and a*1.0/b < hom:
         return "0/1"
-    elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
+    elif a*1.0/b >= hom and a*1.0/b < 1.0:
         return "1/1"
     else:
         return "1/1"
 
-def call_gt(bam_path, search_threshold, chr, read_id_list, max_cluster_bias):
+def call_gt(bam_path, search_threshold, chr, read_id_list, max_cluster_bias, hom, het):
     import pysam
     bamfile = pysam.AlignmentFile(bam_path)
     search_start = max(int(search_threshold) - max_cluster_bias, 0)
@@ -478,4 +530,4 @@
     for query in querydata:
         if query not in read_id_list:
             DR += 1
-    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
+    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
diff --git a/src/cuteSV/cuteSV_resolveINV.py b/src/cuteSV/cuteSV_resolveINV.py
index 7d9018a..ab65dfd 100644
--- a/src/cuteSV/cuteSV_resolveINV.py
+++ b/src/cuteSV/cuteSV_resolveINV.py
@@ -1,7 +1,8 @@
 import sys
 import numpy as np
 
-def resolution_INV(path, chr, svtype, read_count, max_cluster_bias, sv_size, bam_path):
+def resolution_INV(path, chr, svtype, read_count, max_cluster_bias, sv_size,
+    bam_path, action, hom, het):
     '''
     cluster INV
     ************************************************************************
@@ -52,7 +53,10 @@ def resolution_INV(path, chr, svtype, read_count, max_cluster_bias, sv_size, bam
                     sv_size,
                     candidate_single_SV,
                     max_cluster_bias,
-                    bam_path)
+                    bam_path,
+                    action,
+                    hom,
+                    het)
                 semi_inv_cluster = []
                 semi_inv_cluster.append([breakpoint_1_in_read, breakpoint_2_in_read, read_id])
             else:
@@ -65,12 +69,15 @@ def resolution_INV(path, chr, svtype, read_count, max_cluster_bias, sv_size, bam
                 sv_size,
                 candidate_single_SV,
                 max_cluster_bias,
-                bam_path)
+                bam_path,
+                action,
+                hom,
+                het)
     file.close()
     return candidate_single_SV
 
 def generate_semi_inv_cluster(semi_inv_cluster, chr, svtype, read_count, sv_size,
-    candidate_single_SV, max_cluster_bias, bam_path):
+    candidate_single_SV, max_cluster_bias, bam_path, action, hom, het):
     read_id = [i[2] for i in semi_inv_cluster]
     support_read = len(list(set(read_id)))
@@ -101,7 +108,13 @@ def generate_semi_inv_cluster(semi_inv_cluster, chr, svtype, read_count, sv_size
         if inv_len >= sv_size and max_count_id >= read_count:
             # candidate_single_SV.append('%s\t%s\t%d\t%d\t%d\n'%(chr, svtype, breakpoint_1, breakpoint_2, max_count_id))
             if inv_len <= 100000:
-                DV, DR, GT = call_gt(bam_path, int(breakpoint_1), int(breakpoint_2), chr, list(temp_id.keys()), max_cluster_bias)
+                if action:
+                    DV, DR, GT = call_gt(bam_path, int(breakpoint_1),
+                        int(breakpoint_2), chr, list(temp_id.keys()),
+                        max_cluster_bias, hom, het)
+                else:
+                    DR = '.'
+                    GT = './.'
                 candidate_single_SV.append([chr,
                     svtype,
                     str(int(breakpoint_1)),
@@ -132,7 +145,13 @@ def generate_semi_inv_cluster(semi_inv_cluster, chr, svtype, read_count, sv_size
         if inv_len >= sv_size and max_count_id >= read_count:
             # candidate_single_SV.append('%s\t%s\t%d\t%d\t%d\n'%(chr, svtype, breakpoint_1, breakpoint_2, max_count_id))
             if inv_len <= 100000:
-                DV, DR, GT = call_gt(bam_path, int(breakpoint_1), int(breakpoint_2), chr, list(temp_id.keys()), max_cluster_bias)
+                if action:
+                    DV, DR, GT = call_gt(bam_path, int(breakpoint_1),
+                        int(breakpoint_2), chr, list(temp_id.keys()),
+                        max_cluster_bias, hom, het)
+                else:
+                    DR = '.'
+                    GT = './.'
                 candidate_single_SV.append([chr,
                     svtype,
                     str(int(breakpoint_1)),
@@ -153,19 +172,19 @@ def count_coverage(chr, s, e, f):
             read_count.add(i.query_name)
     return read_count
 
-def assign_gt(a, b):
+def assign_gt(a, b, hom, het):
     if b == 0:
         return "1/1"
-    if a*1.0/b < 0.2:
+    if a*1.0/b < het:
         return "0/0"
-    elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
+    elif a*1.0/b >= het and a*1.0/b < hom:
         return "0/1"
-    elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
+    elif a*1.0/b >= hom and a*1.0/b < 1.0:
         return "1/1"
     else:
         return "1/1"
 
-def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
+def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias, hom, het):
    import pysam
     bamfile = pysam.AlignmentFile(bam_path)
     search_start = max(int(pos_1) - max_cluster_bias, 0)
@@ -176,5 +195,5 @@ def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
     for query in querydata:
         if query not in read_id_list:
             DR += 1
-    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
+    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
\ No newline at end of file
diff --git a/src/cuteSV/cuteSV_resolveTRA.py b/src/cuteSV/cuteSV_resolveTRA.py
index ef345a8..95831b5 100644
--- a/src/cuteSV/cuteSV_resolveTRA.py
+++ b/src/cuteSV/cuteSV_resolveTRA.py
@@ -25,7 +25,7 @@
 *****************************
 '''
 
-def resolution_TRA(path, chr_1, chr_2, read_count, overlap_size, max_cluster_bias, bam_path):
+def resolution_TRA(path, chr_1, chr_2, read_count, overlap_size, max_cluster_bias, bam_path, action, hom, het):
     semi_tra_cluster = list()
     semi_tra_cluster.append([0,0,'','N'])
     candidate_single_SV = list()
@@ -52,7 +52,10 @@ def resolution_TRA(path, chr_1, chr_2, read_count, overlap_size, max_cluster_bia
                     overlap_size,
                     max_cluster_bias,
                     candidate_single_SV,
-                    bam_path)
+                    bam_path,
+                    action,
+                    hom,
+                    het)
                 semi_tra_cluster = []
                 semi_tra_cluster.append([pos_1, pos_2, read_id, BND_type])
             else:
@@ -66,12 +69,15 @@ def resolution_TRA(path, chr_1, chr_2, read_count, overlap_size, max_cluster_bia
                 overlap_size,
                 max_cluster_bias,
                 candidate_single_SV,
-                bam_path)
+                bam_path,
+                action,
+                hom,
+                het)
     file.close()
     return candidate_single_SV
 
 def generate_semi_tra_cluster(semi_tra_cluster, chr_1, chr_2, read_count, overlap_size,
-    max_cluster_bias, candidate_single_SV, bam_path):
+    max_cluster_bias, candidate_single_SV, bam_path, action, hom, het):
     BND_type = semi_tra_cluster[0][3]
     semi_tra_cluster = sorted(semi_tra_cluster, key = lambda x:x[1])
     read_tag = dict()
@@ -117,7 +123,14 @@ def generate_semi_tra_cluster(semi_tra_cluster, chr_1, chr_2, read_count, overla
         else:
             return
 
-        DV, DR, GT = call_gt(bam_path, int(temp[0][0]/len(temp[0][2])), int(temp[0][1]/len(temp[0][2])), chr_1, chr_2, temp[0][2], max_cluster_bias)
+        if action:
+            DV, DR, GT = call_gt(bam_path, int(temp[0][0]/len(temp[0][2])),
+                int(temp[0][1]/len(temp[0][2])), chr_1, chr_2, temp[0][2],
+                max_cluster_bias, hom, het)
+        else:
+            DR = '.'
+            GT = './.'
+
         candidate_single_SV.append([chr_1,
             TRA_1,
             str(int(temp[0][0]/len(temp[0][2]))),
@@ -127,7 +140,14 @@ def generate_semi_tra_cluster(semi_tra_cluster, chr_1, chr_2, read_count, overla
             str(DR),
             str(GT)])
 
-        DV, DR, GT = call_gt(bam_path, int(temp[1][0]/len(temp[1][2])), int(temp[1][1]/len(temp[1][2])), chr_1, chr_2, temp[1][2], max_cluster_bias)
+        if action:
+            DV, DR, GT = call_gt(bam_path, int(temp[1][0]/len(temp[1][2])),
+                int(temp[1][1]/len(temp[1][2])), chr_1, chr_2, temp[1][2],
+                max_cluster_bias, hom, het)
+        else:
+            DR = '.'
+            GT = './.'
+
         candidate_single_SV.append([chr_1,
             TRA_2,
             str(int(temp[1][0]/len(temp[1][2]))),
@@ -152,9 +172,14 @@ def generate_semi_tra_cluster(semi_tra_cluster, chr_1, chr_2, read_count, overla
         else:
             return
 
-        # print(chr_1, int(temp[0][0]/len(temp[0][2])), chr_2, int(temp[0][1]/len(temp[0][2])), len(temp[0][2]))
-        DV, DR, GT = call_gt(bam_path, int(temp[0][0]/len(temp[0][2])), int(temp[0][1]/len(temp[0][2])), chr_1, chr_2, temp[0][2], max_cluster_bias)
-        # print(DV,DR,GT)
+        if action:
+            DV, DR, GT = call_gt(bam_path, int(temp[0][0]/len(temp[0][2])),
+                int(temp[0][1]/len(temp[0][2])), chr_1, chr_2, temp[0][2],
+                max_cluster_bias, hom, het)
+        else:
+            DR = '.'
+            GT = './.'
+
         candidate_single_SV.append([chr_1,
             TRA,
             str(int(temp[0][0]/len(temp[0][2]))),
@@ -176,19 +201,19 @@ def count_coverage(chr, s, e, f, read_count):
         if i.reference_start < s and i.reference_end > e:
             read_count.add(i.query_name)
 
-def assign_gt(a, b):
+def assign_gt(a, b, hom, het):
     if b == 0:
         return "1/1"
-    if a*1.0/b < 0.2:
+    if a*1.0/b < het:
         return "0/0"
-    elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
+    elif a*1.0/b >= het and a*1.0/b < hom:
         return "0/1"
-    elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
+    elif a*1.0/b >= hom and a*1.0/b < 1.0:
         return "1/1"
     else:
         return "1/1"
 
-def call_gt(bam_path, pos_1, pos_2, chr_1, chr_2, read_id_list, max_cluster_bias):
+def call_gt(bam_path, pos_1, pos_2, chr_1, chr_2, read_id_list, max_cluster_bias, hom, het):
     import pysam
     bamfile = pysam.AlignmentFile(bam_path)
     querydata = set()
@@ -204,4 +229,4 @@ def call_gt(bam_path, pos_1, pos_2, chr_1, chr_2, read_id_list, max_cluster_bias
     for query in querydata:
         if query not in read_id_list:
             DR += 1
-    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
+    return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
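
Reviewer's sketch (not part of the patch): across all five resolve modules the genotyping change reduces to one decision rule. call_gt() counts the SV-supporting reads (DV) and the remaining reads spanning the locus (DR), and assign_gt() cuts the allele frequency DV/(DV+DR) at the new --het/--hom thresholds. The standalone Python below mirrors the patched assign_gt() (the ">= hom and < 1.0" and "else" branches both return "1/1", so they collapse to one); the read counts in the demo are invented values, not data from the patch.

def assign_gt(a, b, hom, het):
    # a: SV-supporting reads (DV); b: all reads spanning the site (DV + DR)
    if b == 0:
        return "1/1"
    af = a * 1.0 / b
    if af < het:
        return "0/0"    # mostly reference reads -> homozygous reference
    elif af < hom:
        return "0/1"    # intermediate allele frequency -> heterozygous
    else:
        return "1/1"    # nearly all reads support the SV -> homozygous variant

if __name__ == '__main__':
    # Thresholds match the new defaults (--hom 0.8, --het 0.2);
    # the (DV, DR) pairs are made-up examples.
    for dv, dr in [(2, 18), (9, 11), (19, 1)]:
        print(dv, dr, assign_gt(dv, dv + dr, 0.8, 0.2))
    # prints: 2 18 0/0, then 9 11 0/1, then 19 1 1/1

When --genotype is omitted, the patch skips this path entirely and writes DR = '.' and GT = './.', so the output columns stay well-formed without paying for the extra pysam fetch in call_gt().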