Update cuteSV to 1.0.4
tjiangHIT committed Dec 10, 2019
1 parent ac5577d commit fcabd0f
Showing 8 changed files with 222 additions and 83 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -30,7 +30,7 @@

---
### Introduction
Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV signatures implied by the noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and it employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmark on real PacBio and ONT datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.
Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV characteristics implied by noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long-read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmarks on real Pacific Biosciences (PacBio) and Oxford Nanopore Technology (ONT) datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.

The benchmark results of cuteSV on the HG002 human sample are below:

@@ -72,9 +72,13 @@ For more detailed implementation of SV benchmarks, we show an example [here](htt
|--sample|Sample name/id|NULL|
|--max_split_parts|Maximum number of split segments a read may be aligned in before it is ignored.|7|
|--min_mapq|Minimum mapping quality of an alignment for it to be taken into account.|20|
|--min_read_len|Ignores reads that only report alignments with not longer then bp.|500|
|--min_read_len|Ignores reads that only report alignments not longer than this length (bp).|500|
|--merge_threshold|Maximum distance between SV signals to be merged.|500|
|--min_support|Minimum number of reads that support an SV for it to be reported.|10|
|--min_length|Minimum length of SV to be reported.|30|
|--genotype|Enable the generation of genotypes.|False|
|--hom|Threshold on allele frequency for homozygous calls.|0.8|
|--het|Threshold on allele frequency for heterozygous calls.|0.2|
|--max_cluster_bias_INS|Maximum distance to cluster read together for insertion.|100|
|--diff_ratio_merging_INS|Do not merge breakpoints with basepair identity more than the ratio of *default* for insertion.|0.2|
|--diff_ratio_filtering_INS|Filter breakpoints with basepair identity less than the ratio of *default* for insertion.|0.6|
@@ -98,6 +102,12 @@ Please cite the manuscript of cuteSV before using these callsets.
---
### Changelog

cuteSV (v1.0.4):
1. Add a new option to specifically set the threshold for merging SV signals within the same read (a usage sketch follows this list). The default is 500 bp; you can reduce it for high-quality sequencing datasets such as PacBio HiFi (CCS).
2. Make the genotyping function optional.
3. Enable users to set the SV allele-frequency thresholds for homozygous/heterozygous calls.
4. Update the description of the recommended parameters for processing ONT data.
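
A minimal usage sketch for the options added in this release. Only the `input [BAM]` positional and the flags below appear in this diff; the file path and the threshold value are placeholders:

```python
# Hypothetical cuteSV v1.0.4 invocation via subprocess; adjust the path and
# any remaining positional arguments to your own setup.
import subprocess

subprocess.run([
    "cuteSV", "sample.bam",          # input alignments (the [BAM] positional)
    "--merge_threshold", "200",      # tighter within-read signal merging, e.g. for HiFi/CCS
    "--genotype",                    # genotyping is now opt-in (off by default)
    "--hom", "0.8", "--het", "0.2",  # allele-frequency cutoffs for 1/1 and 0/1 calls
], check=True)
```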

cuteSV (v1.0.3):
1. Refine the genotyping model.
2. Adjust the threshold value for heterozygous alleles.
4 changes: 2 additions & 2 deletions setup.py
@@ -7,8 +7,8 @@

setup(
name = "cuteSV",
version = "1.0.3",
description = "Long read based human genomic structural variation detection with cuteSV",
version = "1.0.4",
description = "Long-Read-based Human Genomic Structural Variation Detection with cuteSV",
author = "Jiang Tao",
author_email = "[email protected]",
url = "https://github.com/tjiangHIT/cuteSV",
24 changes: 13 additions & 11 deletions src/cuteSV/cuteSV
@@ -282,7 +282,7 @@ def generate_combine_sigs(sigs, Chr_name, read_name, svtype, candidate, merge_di
read_name])


def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength):
def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold):
if read.query_length < min_read_len:
return 0

@@ -327,8 +327,8 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
softclip_right = read.cigar[-1][1]

# ************Combine signals in same read********************
generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, 500)
generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, 500)
generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, merge_threshold)
generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, merge_threshold)

if process_signal == 1 or process_signal == 2:
Tags = read.get_tags()
@@ -348,7 +348,7 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
organize_split_signal(Chr_name, primary_info, Supplementary_info, read.query_length,
SV_size, min_mapq, max_split_parts, read.query_name, candidate)

def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength):
def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength, merge_threshold):
# args.input, args.min_length, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i
candidate = dict()
candidate["DEL"] = dict()
@@ -361,7 +361,7 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
Chr_length = samfile.get_reference_length(Chr_name)

for read in samfile.fetch(Chr_name, task[1], task[2]):
parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength)
parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold)

logging.info("Finished %s:%d-%d."%(Chr_name, task[1], task[2]))
samfile.close()
@@ -420,7 +420,7 @@ def main_ctrl(args):
analysis_pools = Pool(processes=int(args.threads))
os.mkdir("%ssignatures"%temporary_dir)
for i in Task_list:
para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength)]
para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength, args.merge_threshold)]
analysis_pools.map_async(multi_run_wrapper, para)
analysis_pools.close()
analysis_pools.join()
@@ -452,28 +452,30 @@ def main_ctrl(args):
for svtype in ["DEL", "INS", "INV"]:
if svtype == "INV":
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.max_cluster_bias_INV, args.min_size, args.input)]
args.max_cluster_bias_INV, args.min_size, args.input, args.genotype,
args.hom, args.het)]
result.append(analysis_pools.map_async(run_inv, para))
# pass
if svtype == 'DEL':
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.diff_ratio_merging_DEL, args.max_cluster_bias_DEL, args.diff_ratio_filtering_DEL,
min(args.min_support, 5), args.input)]
min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_del, para))
if svtype == 'INS':
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.diff_ratio_merging_INS, args.max_cluster_bias_INS, args.diff_ratio_filtering_INS,
min(args.min_support, 5), args.input)]
min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_ins, para))
# DUP
para = [("%s%s.sigs"%(temporary_dir, "DUP"), chr, args.min_support, args.max_cluster_bias_DUP,
args.min_size, args.input)]
args.min_size, args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_dup, para))

for i in process_tra:
# TRA
para = [("%s%s.sigs"%(temporary_dir, "TRA"), i[0], i[1], args.min_support,
args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input)]
args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input, args.genotype
, args.hom, args.het)]
result.append(analysis_pools.map_async(run_tra, para))

analysis_pools.close()
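The merge_threshold value threaded through parse_read and single_pipe above replaces the hard-coded 500 bp previously passed to generate_combine_sigs. A minimal sketch of the underlying idea, not cuteSV's actual implementation: same-read signals of one SV type are fused when their positions lie within merge_threshold bp of each other.

```python
# Illustrative only: fuse nearby (position, length) SV signals from one read.
def combine_signals(signals, merge_threshold=500):
    """signals: (position, length) tuples from a single read, sorted by position."""
    combined = []
    for pos, length in signals:
        if combined and pos - combined[-1][0] <= merge_threshold:
            prev_pos, prev_len = combined[-1]
            combined[-1] = (prev_pos, prev_len + length)  # merge into one event
        else:
            combined.append((pos, length))
    return combined

# Two insertion signals 300 bp apart merge at the default threshold but stay
# separate at --merge_threshold 200 (e.g., for high-accuracy HiFi/CCS reads).
print(combine_signals([(1000, 60), (1300, 40)]))                       # [(1000, 100)]
print(combine_signals([(1000, 60), (1300, 40)], merge_threshold=200))  # [(1000, 60), (1300, 40)]
```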
33 changes: 26 additions & 7 deletions src/cuteSV/cuteSV_Description.py
@@ -7,7 +7,7 @@
'''
import argparse

VERSION = '1.0.3'
VERSION = '1.0.4'

class cuteSVdp(object):
'''
@@ -42,9 +42,14 @@ class cuteSVdp(object):
# MinSizeDel = 'For current version of cuteSV, it can detect deletions larger than this size.'

def parseArgs(argv):
parser = argparse.ArgumentParser(prog="cuteSV", description=cuteSVdp.USAGE,
parser = argparse.ArgumentParser(prog="cuteSV",
description=cuteSVdp.USAGE,
formatter_class=argparse.RawDescriptionHelpFormatter)

parser.add_argument('--version', '-v',
action = 'version',
version = '%(prog)s {version}'.format(version=VERSION))

# **************Parameters of input******************
parser.add_argument("input",
metavar="[BAM]",
@@ -71,10 +76,6 @@ def parseArgs(argv):
help = "Sample name/id",
default = "NULL",
type = str)
# parser.add_argument('-g', '--genotype',
# help = "Enable generate genotype (True/False).[%(default)s]",
# default = "False",
# type = str)

# **************Parameters in signatures collection******************
GroupSignaturesCollect = parser.add_argument_group('Collection of SV signatures')
@@ -87,7 +88,11 @@ def parseArgs(argv):
default = 20,
type = int)
GroupSignaturesCollect.add_argument('-r', '--min_read_len',
help = "Ignores reads that only report alignments with not longer then bp.[%(default)s]",
help = "Ignores reads that only report alignments with not longer than bp.[%(default)s]",
default = 500,
type = int)
GroupSignaturesCollect.add_argument('-m', '--merge_threshold',
help = "Maximum distance of SV signals to be merged.[%(default)s]",
default = 500,
type = int)
# The min_read_len in last version is 2000.
@@ -108,6 +113,20 @@ def parseArgs(argv):
default = 10,
type = int)

# **************Parameters in genotyping******************
GroupGenotype = parser.add_argument_group('Computing genotypes')
GroupGenotype.add_argument('--genotype',
help = "Enable to generate genotypes.",
action="store_true")
GroupGenotype.add_argument('--hom',
help = "Threshold on allele frequency for homozygous.[%(default)s]",
default = 0.8,
type = float)
GroupGenotype.add_argument('--het',
help = "Threshold on allele frequency for heterozygous.[%(default)s].",
default = 0.2,
type = float)

# Just a parameter for debug.
# Will be removed in future.
# GroupSVCluster.add_argument('--preset',
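The --version action added above relies on argparse's built-in version handling. A self-contained sketch of the same pattern; the prog name and format string mirror the parser definition shown in this diff:

```python
# Prints "cuteSV 1.0.4" and exits when invoked with -v/--version.
import argparse

VERSION = '1.0.4'
parser = argparse.ArgumentParser(prog="cuteSV")
parser.add_argument('--version', '-v',
                    action='version',
                    version='%(prog)s {version}'.format(version=VERSION))
parser.parse_args(['-v'])  # -> "cuteSV 1.0.4", then SystemExit
```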
36 changes: 24 additions & 12 deletions src/cuteSV/cuteSV_resolveDUP.py
@@ -13,7 +13,8 @@
*******************************************
'''

def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size,
bam_path, action, hom, het):
semi_dup_cluster = list()
semi_dup_cluster.append([0,0,''])
candidate_single_SV = list()
@@ -36,7 +37,10 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
max_cluster_bias,
sv_size,
candidate_single_SV,
bam_path)
bam_path,
action,
hom,
het)
semi_dup_cluster = []
semi_dup_cluster.append([pos_1, pos_2, read_id])
else:
@@ -48,12 +52,15 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
max_cluster_bias,
sv_size,
candidate_single_SV,
bam_path)
bam_path,
action,
hom,
het)
file.close()
return candidate_single_SV

def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv_size,
candidate_single_SV, bam_path):
def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias,
sv_size, candidate_single_SV, bam_path, action, hom, het):
# calculate support reads
support_read = list(set([i[2] for i in semi_dup_cluster]))
# print(support_read)
@@ -122,7 +129,12 @@ def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv
# candidate_single_SV.append('%s\t%s\t%d\t%d\t%d\t%s\n' % (chr, 'DUP', i[0][0], i[1][0] - i[0][0], i[2], reliability))
'''genotyping'''
if i[1][0] - i[0][0] <= 100000:
DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]), chr, support_read, max_cluster_bias)
if action:
DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]),
chr, support_read, max_cluster_bias, hom, het)
else:
DR = '.'
GT = './.'
candidate_single_SV.append([chr,
'DUP',
str(i[0][0]),
Expand All @@ -144,19 +156,19 @@ def count_coverage(chr, s, e, f):
read_count.add(i.query_name)
return read_count

def assign_gt(a, b):
def assign_gt(a, b, hom, het):
if b == 0:
return "1/1"
if a*1.0/b < 0.2:
if a*1.0/b < het:
return "0/0"
elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
elif a*1.0/b >= het and a*1.0/b < hom:
return "0/1"
elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
elif a*1.0/b >= hom and a*1.0/b < 1.0:
return "1/1"
else:
return "1/1"

def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias, hom, het):
import pysam
bamfile = pysam.AlignmentFile(bam_path)
search_start = max(int(pos_1) - max_cluster_bias, 0)
@@ -167,4 +179,4 @@ def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
for query in querydata:
if query not in read_id_list:
DR += 1
return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
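
For reference, a standalone sketch of the thresholding that assign_gt above applies with the v1.0.4 defaults (hom=0.8, het=0.2). The function name genotype_from_af is hypothetical, but the branch logic mirrors the code in this diff; note that when --genotype is not set, the resolver skips call_gt entirely and reports DR as '.' and GT as './.'.

```python
def genotype_from_af(dv, total, hom=0.8, het=0.2):
    """dv: variant-supporting reads; total: dv plus reference-supporting reads."""
    if total == 0:
        return "1/1"  # mirrors assign_gt's b == 0 branch
    af = dv / total
    if af < het:
        return "0/0"
    elif af < hom:
        return "0/1"
    else:
        return "1/1"

assert genotype_from_af(1, 10) == "0/0"  # AF 0.1 < 0.2 -> homozygous reference
assert genotype_from_af(3, 10) == "0/1"  # 0.2 <= AF 0.3 < 0.8 -> heterozygous
assert genotype_from_af(9, 10) == "1/1"  # AF 0.9 >= 0.8 -> homozygous variant
```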