update cuteSV to 1.0.3

tjiangHIT · Nov 28, 2019 · d442666 · d442666
1 parent 57632be
commit d442666
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -73,7 +73,7 @@ For more detailed implementation of SV benchmarks, we show an example [here](htt
 |--max_split_parts|Maximum number of split segments a read may be aligned before it is ignored.|7|
 |--min_mapq|Minimum mapping quality value of alignment to be taken into account.|20|
 |--min_read_len|Ignores reads whose reported alignments are all no longer than this many bp.|500|
-|--min_support|Minimum number of reads that support a SV to be reported.|3|
+|--min_support|Minimum number of reads that support a SV to be reported.|10|
 |--min_length|Minimum length of SV to be reported.|30|
 |--max_cluster_bias_INS|Maximum distance to cluster read together for insertion.|100|
 |--diff_ratio_merging_INS|Do not merge breakpoints with basepair identity more than the ratio of *default* for insertion.|0.2|
@@ -98,6 +98,10 @@ Please cite the manuscript of cuteSV before using these callsets.
 ---
 ### Changelog
 
+	cuteSV (v1.0.3):
+	1.Refine the genotyping model.
+	2.Adjust the threshold value for heterozygous alleles.
+
 	cuteSV (v1.0.2):
 	1.Improve the genotyping performance and enable it to be default option.
 	2.Make the description of parameters better.

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name = "cuteSV",
-    version = "1.0.2",
+    version = "1.0.3",
     description = "Long read based human genomic structural variation detection with cuteSV",
     author = "Jiang Tao",
     author_email = "[email protected]",

diff --git a/src/benchmarks/cmp_NA19240.py b/src/benchmarks/cmp_NA19240.py
@@ -30,7 +30,7 @@ def pase_base_info(seq):
 			except:
 				pass
 		if i.split('=')[0] == "SVTYPE":
-			info[i.split('=')[0]] = i.split('=')[1]
+			info[i.split('=')[0]] = i.split('=')[1][0:3]
 	return info
 
 
@@ -45,6 +45,8 @@ def load_base(base_path):
 		chr = seq[0]
 		pos = int(seq[1])
 		ALT = seq[4][1:4]
+		if ALT not in ["INS", "INV", "DEL", "DUP"]:
+			continue
 		if ALT == "DUP":
 			ALT = "INS"
 		info = pase_base_info(seq[7])
@@ -184,6 +186,8 @@ def cmp_callsets(base, call, flag, Bias, Offect):
 				total_base += 1
 				if i[3] == flag:
 					tp_base += 1
+				# else:
+				# 	print(flag, svtype, chr, i[0], i[1], i[2])
 	# logging.info("Base count: %d"%(total_base))
 	# logging.info("TP-base count: %d"%(tp_base))
 	logging.info("====%s===="%(callset[flag]))

diff --git a/src/benchmarks/eva_trio.py b/src/benchmarks/eva_trio.py
@@ -43,6 +43,23 @@ def pase_info(seq):
 			info[i.split('=')[0]] = i.split('=')[1]
 	return info
 
def pase_info_2(seq, seq2):
	'''
	Parse a VCF INFO string and, for breakend records, the mate chromosome.

	seq:  the semicolon-separated INFO column, e.g. "SVTYPE=DEL;SVLEN=-120".
	seq2: the ALT column of the same record; only read when SVTYPE is BND.

	Returns a dict with the keys
		SVLEN, END, SUPPORT - absolute integer values (0 when the field is
		                      missing or not numeric),
		SVTYPE              - the raw type string ('' when missing),
		CHR2                - mate chromosome extracted from the ALT field of
		                      a BND record ('' for every other type).
	'''
	info = {'SVLEN': 0, 'END': 0, "SVTYPE": '', "SUPPORT": 0, "CHR2": ''}
	for field in seq.split(';'):
		parts = field.split('=')
		key = parts[0]
		# Flag-style INFO entries carry no '=' and therefore no value.
		value = parts[1] if len(parts) > 1 else ''

		if key in ("SVLEN", "END", "SUPPORT"):
			try:
				info[key] = abs(int(value))
			except ValueError:
				# Best effort: keep the default of 0 for non-numeric values.
				pass
		elif key == "SVTYPE":
			info[key] = value
			if value == 'BND':
				# The mate locus lives in the ALT field (seq2), not in INFO.
				# Breakend ALT looks like "N[chr:pos[" / "N]chr:pos]" (base
				# first) or "]chr:pos]N" / "[chr:pos[N" (bracket first).
				# NOTE(review): assumes the reference base is written as 'N',
				# as the original check did - confirm for other callers.
				mate = seq2.split(':')[0]
				if seq2.startswith('N'):
					info['CHR2'] = mate[2:]	# drop base + bracket
				else:
					info['CHR2'] = mate[1:]	# drop leading bracket
	return info
+
 def load_callset_cuteSV(path, filter, confbed):
 	callset = dict()
 	file = open(path, 'r')
@@ -148,29 +165,47 @@ def load_callset_svim(path, filter, confbed):
 
 		chr = seq[0]
 		pos = int(seq[1])
-		ALT = seq[4][1:4]
-
-		if ALT == "DUP":
-			ALT = "INS"
-		info = pase_info(seq[7])
+		info = pase_info_2(seq[7], seq[4])
 
 		if len(confbed) > 0:
 			if chr not in confbed:
 				continue
 			if judge_bed(pos, info["END"], confbed[chr]) == 0:
 				continue
 
-		if ALT not in base_call:
-			base_call[ALT] = dict()
-
-		if chr not in base_call[ALT]:
-			base_call[ALT][chr] = list()
+		svtype = info["SVTYPE"]
+		if svtype == "BND":
+			chr_2, pos_2 = parse_BND(seq[4])
+			if len(confbed) > 0:
+				if chr not in confbed:
+					continue
+				if judge_bed(pos, pos_2, confbed[chr]) == 0:
+					continue
 
-		if ALT == "INV":
-			base_call[ALT][chr].append([pos, info["END"] - pos + 1, info["END"], 0])
+			if svtype not in base_call:
+				base_call[svtype] = dict()
+			if chr not in base_call[svtype]:
+				base_call[svtype][chr] = dict()
+			if chr_2 not in base_call[svtype][chr]:
+				base_call[svtype][chr][chr_2] = list()
+
+			if info["SUPPORT"] >= filter:
+				base_call[svtype][chr][chr_2].append([pos, pos_2, 0])
+
+		elif info["SVTYPE"] == "INV":
+			if info["SVTYPE"] not in base_call:
+				base_call[info["SVTYPE"]] = dict()
+			if chr not in base_call[info["SVTYPE"]]:
+				base_call[info["SVTYPE"]][chr] = list()
+			if info["END"] - pos + 1 >= 50:
+				base_call[info["SVTYPE"]][chr].append([pos, info["END"] - pos + 1, info["END"], 0])
 		else:
+			if info["SVTYPE"] not in base_call:
+				base_call[info["SVTYPE"]] = dict()
+			if chr not in base_call[info["SVTYPE"]]:
+				base_call[info["SVTYPE"]][chr] = list()
 			if info["SVLEN"] >= 50:
-				base_call[ALT][chr].append([pos, info["SVLEN"], info["END"], 0])
+				base_call[info["SVTYPE"]][chr].append([pos, info["SVLEN"], info["END"], 0])
 	file.close()
 	return base_call
 
@@ -265,6 +300,7 @@ def eva_record(call_A, call_B, bias, offect):
 					else:
 						for i in call_A[svtype][chr]:
 							for j in call_B[svtype][chr]:
+								# if min(i[2], j[2]) >= max(i[0], j[0]):
 								if i[0] - offect <= j[0] <= i[2] + offect or i[0] - offect <= j[2] <= i[2] + offect or j[0] - offect <= i[0] <= j[2] + offect:
 									if min(i[1], j[1])*1.0/max(i[1], j[1]) >= bias:
 										i[3] = 1
@@ -377,7 +413,7 @@ def main_ctrl(args):
 		logging.info("Evaluate accuracy and sensitivity.")
 		eva_record(call_child, call_father, args.bias, args.offect)
 		eva_record(call_child, call_mother, args.bias, args.offect)
-		svtype = ["DEL", "INS", "INV"]
+		svtype = ["DEL", "INS", "INV", "BND"]
 		for i in svtype:
 			child_r, child_tr = statistics_true_possitive(call_child, i)
 			father_r, father_tr = statistics_true_possitive(call_father, i)

diff --git a/src/cuteSV/cuteSV b/src/cuteSV/cuteSV
@@ -454,7 +454,7 @@ def main_ctrl(args):
 				para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support, 
 					args.max_cluster_bias_INV, args.min_size, args.input)]
 				result.append(analysis_pools.map_async(run_inv, para))
-				pass
+				# pass
 			if svtype == 'DEL':
 				para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support, 
 					args.diff_ratio_merging_DEL, args.max_cluster_bias_DEL, args.diff_ratio_filtering_DEL, 

diff --git a/src/cuteSV/cuteSV_Description.py b/src/cuteSV/cuteSV_Description.py
@@ -7,7 +7,7 @@
 '''
 import argparse
 
-VERSION = '1.0.2'
+VERSION = '1.0.3'
 
 class cuteSVdp(object):
 	'''
@@ -97,7 +97,7 @@ def parseArgs(argv):
 	GroupSVCluster = parser.add_argument_group('Generation of SV clusters')
 	GroupSVCluster.add_argument('-s', '--min_support', 
 		help = "Minimum number of reads that support a SV to be reported.[%(default)s]", 
-		default = 3, 
+		default = 10, 
 		type = int)
 	GroupSVCluster.add_argument('-l', '--min_size', 
 		help = "Minimum size of SV to be reported.[%(default)s]",