Update cuteSV to 1.0.4
tjiangHIT committed Dec 10, 2019
1 parent ac5577d commit fcabd0f
Showing 8 changed files with 222 additions and 83 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -30,7 +30,7 @@

---
### Introduction
Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV signatures implied by the noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and it employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmark on real PacBio and ONT datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.
Long-read sequencing enables the comprehensive discovery of structural variations (SVs). However, it is still non-trivial to achieve high sensitivity and performance simultaneously due to the complex SV characteristics implied by noisy long reads. Therefore, we propose cuteSV, a sensitive, fast and scalable long-read-based SV detection approach. cuteSV uses tailored methods to collect the signatures of various types of SVs and employs a clustering-and-refinement method to analyze the signatures to implement sensitive SV detection. Benchmarks on real Pacific Biosciences (PacBio) and Oxford Nanopore Technology (ONT) datasets demonstrate that cuteSV has better yields and scalability than state-of-the-art tools.

The benchmark results of cuteSV on the HG002 human sample are below:

@@ -72,9 +72,13 @@ For more detailed implementation of SV benchmarks, we show an example [here](htt
|--sample|Sample name/id|NULL|
|--max_split_parts|Maximum number of split segments a read may be aligned in before it is ignored.|7|
|--min_mapq|Minimum mapping quality of an alignment for it to be taken into account.|20|
|--min_read_len|Ignores reads that only report alignments with not longer then bp.|500|
|--min_read_len|Ignores reads that only report alignments not longer than this length (bp).|500|
|--merge_threshold|Maximum distance between SV signals to be merged.|500|
|--min_support|Minimum number of reads that support an SV for it to be reported.|10|
|--min_length|Minimum length of SV to be reported.|30|
|--genotype|Enable the generation of genotypes.|False|
|--hom|Threshold on allele frequency for homozygous calls.|0.8|
|--het|Threshold on allele frequency for heterozygous calls.|0.2|
|--max_cluster_bias_INS|Maximum distance to cluster read together for insertion.|100|
|--diff_ratio_merging_INS|Do not merge breakpoints with basepair identity more than the ratio of *default* for insertion.|0.2|
|--diff_ratio_filtering_INS|Filter breakpoints with basepair identity less than the ratio of *default* for insertion.|0.6|
@@ -98,6 +102,12 @@ Please cite the manuscript of cuteSV before using these callsets.
---
### Changelog

cuteSV (v1.0.4):
1. Add a new option to specifically set the threshold for merging SV signals within the same read (a usage sketch follows this list). The default is 500 bp; you can reduce it for high-quality sequencing datasets such as PacBio HiFi (CCS).
2. Make the genotyping function optional.
3. Enable users to set the SV allele-frequency thresholds for homozygous/heterozygous calls.
4. Update the description of the recommended parameters for processing ONT data.
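
A minimal usage sketch for the options added in this release. Only the `input [BAM]` positional and the flags below appear in this diff; the file path and the threshold value are placeholders:

```python
# Hypothetical cuteSV v1.0.4 invocation via subprocess; adjust the path and
# any remaining positional arguments to your own setup.
import subprocess

subprocess.run([
    "cuteSV", "sample.bam",          # input alignments (the [BAM] positional)
    "--merge_threshold", "200",      # tighter within-read signal merging, e.g. for HiFi/CCS
    "--genotype",                    # genotyping is now opt-in (off by default)
    "--hom", "0.8", "--het", "0.2",  # allele-frequency cutoffs for 1/1 and 0/1 calls
], check=True)
```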

cuteSV (v1.0.3):
1. Refine the genotyping model.
2. Adjust the threshold value for heterozygous alleles.
4 changes: 2 additions & 2 deletions setup.py
@@ -7,8 +7,8 @@

setup(
name = "cuteSV",
version = "1.0.3",
description = "Long read based human genomic structural variation detection with cuteSV",
version = "1.0.4",
description = "Long-Read-based Human Genomic Structural Variation Detection with cuteSV",
author = "Jiang Tao",
author_email = "[email protected]",
url = "https://github.com/tjiangHIT/cuteSV",
24 changes: 13 additions & 11 deletions src/cuteSV/cuteSV
@@ -282,7 +282,7 @@ def generate_combine_sigs(sigs, Chr_name, read_name, svtype, candidate, merge_di
read_name])


def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength):
def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold):
if read.query_length < min_read_len:
return 0

@@ -327,8 +327,8 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
softclip_right = read.cigar[-1][1]

# ************Combine signals in same read********************
generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, 500)
generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, 500)
generate_combine_sigs(Combine_sig_in_same_read_ins, Chr_name, read.query_name, "INS", candidate, merge_threshold)
generate_combine_sigs(Combine_sig_in_same_read_del, Chr_name, read.query_name, "DEL", candidate, merge_threshold)

if process_signal == 1 or process_signal == 2:
Tags = read.get_tags()
@@ -348,7 +348,7 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
organize_split_signal(Chr_name, primary_info, Supplementary_info, read.query_length,
SV_size, min_mapq, max_split_parts, read.query_name, candidate)

def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength):
def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir, task, min_siglength, merge_threshold):
# args.input, args.min_length, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i
candidate = dict()
candidate["DEL"] = dict()
@@ -361,7 +361,7 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
Chr_length = samfile.get_reference_length(Chr_name)

for read in samfile.fetch(Chr_name, task[1], task[2]):
parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength)
parse_read(read, Chr_name, min_length, min_mapq, max_split_parts, min_read_len, candidate, min_siglength, merge_threshold)

logging.info("Finished %s:%d-%d."%(Chr_name, task[1], task[2]))
samfile.close()
@@ -420,7 +420,7 @@ def main_ctrl(args):
analysis_pools = Pool(processes=int(args.threads))
os.mkdir("%ssignatures"%temporary_dir)
for i in Task_list:
para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength)]
para = [(args.input, args.min_size, args.min_mapq, args.max_split_parts, args.min_read_len, temporary_dir, i, args.min_siglength, args.merge_threshold)]
analysis_pools.map_async(multi_run_wrapper, para)
analysis_pools.close()
analysis_pools.join()
@@ -452,28 +452,30 @@ def main_ctrl(args):
for svtype in ["DEL", "INS", "INV"]:
if svtype == "INV":
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.max_cluster_bias_INV, args.min_size, args.input)]
args.max_cluster_bias_INV, args.min_size, args.input, args.genotype,
args.hom, args.het)]
result.append(analysis_pools.map_async(run_inv, para))
# pass
if svtype == 'DEL':
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.diff_ratio_merging_DEL, args.max_cluster_bias_DEL, args.diff_ratio_filtering_DEL,
min(args.min_support, 5), args.input)]
min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_del, para))
if svtype == 'INS':
para = [("%s%s.sigs"%(temporary_dir, svtype), chr, svtype, args.min_support,
args.diff_ratio_merging_INS, args.max_cluster_bias_INS, args.diff_ratio_filtering_INS,
min(args.min_support, 5), args.input)]
min(args.min_support, 5), args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_ins, para))
# DUP
para = [("%s%s.sigs"%(temporary_dir, "DUP"), chr, args.min_support, args.max_cluster_bias_DUP,
args.min_size, args.input)]
args.min_size, args.input, args.genotype, args.hom, args.het)]
result.append(analysis_pools.map_async(run_dup, para))

for i in process_tra:
# TRA
para = [("%s%s.sigs"%(temporary_dir, "TRA"), i[0], i[1], args.min_support,
args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input)]
args.diff_ratio_filtering_TRA, args.max_cluster_bias_TRA, args.input, args.genotype
, args.hom, args.het)]
result.append(analysis_pools.map_async(run_tra, para))

analysis_pools.close()
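The merge_threshold value threaded through parse_read and single_pipe above replaces the hard-coded 500 bp previously passed to generate_combine_sigs. A minimal sketch of the underlying idea, not cuteSV's actual implementation: same-read signals of one SV type are fused when their positions lie within merge_threshold bp of each other.

```python
# Illustrative only: fuse nearby (position, length) SV signals from one read.
def combine_signals(signals, merge_threshold=500):
    """signals: (position, length) tuples from a single read, sorted by position."""
    combined = []
    for pos, length in signals:
        if combined and pos - combined[-1][0] <= merge_threshold:
            prev_pos, prev_len = combined[-1]
            combined[-1] = (prev_pos, prev_len + length)  # merge into one event
        else:
            combined.append((pos, length))
    return combined

# Two insertion signals 300 bp apart merge at the default threshold but stay
# separate at --merge_threshold 200 (e.g., for high-accuracy HiFi/CCS reads).
print(combine_signals([(1000, 60), (1300, 40)]))                       # [(1000, 100)]
print(combine_signals([(1000, 60), (1300, 40)], merge_threshold=200))  # [(1000, 60), (1300, 40)]
```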
33 changes: 26 additions & 7 deletions src/cuteSV/cuteSV_Description.py
@@ -7,7 +7,7 @@
'''
import argparse

VERSION = '1.0.3'
VERSION = '1.0.4'

class cuteSVdp(object):
'''
@@ -42,9 +42,14 @@ class cuteSVdp(object):
# MinSizeDel = 'For current version of cuteSV, it can detect deletions larger than this size.'

def parseArgs(argv):
parser = argparse.ArgumentParser(prog="cuteSV", description=cuteSVdp.USAGE,
parser = argparse.ArgumentParser(prog="cuteSV",
description=cuteSVdp.USAGE,
formatter_class=argparse.RawDescriptionHelpFormatter)

parser.add_argument('--version', '-v',
action = 'version',
version = '%(prog)s {version}'.format(version=VERSION))

# **************Parameters of input******************
parser.add_argument("input",
metavar="[BAM]",
@@ -71,10 +76,6 @@ def parseArgs(argv):
help = "Sample name/id",
default = "NULL",
type = str)
# parser.add_argument('-g', '--genotype',
# help = "Enable generate genotype (True/False).[%(default)s]",
# default = "False",
# type = str)

# **************Parameters in signatures collection******************
GroupSignaturesCollect = parser.add_argument_group('Collection of SV signatures')
@@ -87,7 +88,11 @@ def parseArgs(argv):
default = 20,
type = int)
GroupSignaturesCollect.add_argument('-r', '--min_read_len',
help = "Ignores reads that only report alignments with not longer then bp.[%(default)s]",
help = "Ignores reads that only report alignments with not longer than bp.[%(default)s]",
default = 500,
type = int)
GroupSignaturesCollect.add_argument('-m', '--merge_threshold',
help = "Maximum distance of SV signals to be merged.[%(default)s]",
default = 500,
type = int)
# The min_read_len in last version is 2000.
@@ -108,6 +113,20 @@ def parseArgs(argv):
default = 10,
type = int)

# **************Parameters in genotyping******************
GroupGenotype = parser.add_argument_group('Computing genotypes')
GroupGenotype.add_argument('--genotype',
help = "Enable to generate genotypes.",
action="store_true")
GroupGenotype.add_argument('--hom',
help = "Threshold on allele frequency for homozygous.[%(default)s]",
default = 0.8,
type = float)
GroupGenotype.add_argument('--het',
help = "Threshold on allele frequency for heterozygous.[%(default)s].",
default = 0.2,
type = float)

# Just a parameter for debug.
# Will be removed in future.
# GroupSVCluster.add_argument('--preset',
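The --version action added above relies on argparse's built-in version handling. A self-contained sketch of the same pattern; the prog name and format string mirror the parser definition shown in this diff:

```python
# Prints "cuteSV 1.0.4" and exits when invoked with -v/--version.
import argparse

VERSION = '1.0.4'
parser = argparse.ArgumentParser(prog="cuteSV")
parser.add_argument('--version', '-v',
                    action='version',
                    version='%(prog)s {version}'.format(version=VERSION))
parser.parse_args(['-v'])  # -> "cuteSV 1.0.4", then SystemExit
```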
36 changes: 24 additions & 12 deletions src/cuteSV/cuteSV_resolveDUP.py
@@ -13,7 +13,8 @@
*******************************************
'''

def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size,
bam_path, action, hom, het):
semi_dup_cluster = list()
semi_dup_cluster.append([0,0,''])
candidate_single_SV = list()
@@ -36,7 +37,10 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
max_cluster_bias,
sv_size,
candidate_single_SV,
bam_path)
bam_path,
action,
hom,
het)
semi_dup_cluster = []
semi_dup_cluster.append([pos_1, pos_2, read_id])
else:
@@ -48,12 +52,15 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size, bam_path):
max_cluster_bias,
sv_size,
candidate_single_SV,
bam_path)
bam_path,
action,
hom,
het)
file.close()
return candidate_single_SV

def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv_size,
candidate_single_SV, bam_path):
def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias,
sv_size, candidate_single_SV, bam_path, action, hom, het):
# calculate support reads
support_read = list(set([i[2] for i in semi_dup_cluster]))
# print(support_read)
@@ -122,7 +129,12 @@ def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias, sv
# candidate_single_SV.append('%s\t%s\t%d\t%d\t%d\t%s\n' % (chr, 'DUP', i[0][0], i[1][0] - i[0][0], i[2], reliability))
'''genotyping'''
if i[1][0] - i[0][0] <= 100000:
DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]), chr, support_read, max_cluster_bias)
if action:
DV, DR, GT = call_gt(bam_path, int(i[0][0]), int(i[1][0]),
chr, support_read, max_cluster_bias, hom, het)
else:
DR = '.'
GT = './.'
candidate_single_SV.append([chr,
'DUP',
str(i[0][0]),
Expand All @@ -144,19 +156,19 @@ def count_coverage(chr, s, e, f):
read_count.add(i.query_name)
return read_count

def assign_gt(a, b):
def assign_gt(a, b, hom, het):
if b == 0:
return "1/1"
if a*1.0/b < 0.2:
if a*1.0/b < het:
return "0/0"
elif a*1.0/b >= 0.2 and a*1.0/b < 0.8:
elif a*1.0/b >= het and a*1.0/b < hom:
return "0/1"
elif a*1.0/b >= 0.8 and a*1.0/b < 1.0:
elif a*1.0/b >= hom and a*1.0/b < 1.0:
return "1/1"
else:
return "1/1"

def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias, hom, het):
import pysam
bamfile = pysam.AlignmentFile(bam_path)
search_start = max(int(pos_1) - max_cluster_bias, 0)
@@ -167,4 +179,4 @@ def call_gt(bam_path, pos_1, pos_2, chr, read_id_list, max_cluster_bias):
for query in querydata:
if query not in read_id_list:
DR += 1
return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list))
return len(read_id_list), DR, assign_gt(len(read_id_list), DR+len(read_id_list), hom, het)
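
For reference, a standalone sketch of the thresholding that assign_gt above applies with the v1.0.4 defaults (hom=0.8, het=0.2). The function name genotype_from_af is hypothetical, but the branch logic mirrors the code in this diff; note that when --genotype is not set, the resolver skips call_gt entirely and reports DR as '.' and GT as './.'.

```python
def genotype_from_af(dv, total, hom=0.8, het=0.2):
    """dv: variant-supporting reads; total: dv plus reference-supporting reads."""
    if total == 0:
        return "1/1"  # mirrors assign_gt's b == 0 branch
    af = dv / total
    if af < het:
        return "0/0"
    elif af < hom:
        return "0/1"
    else:
        return "1/1"

assert genotype_from_af(1, 10) == "0/0"  # AF 0.1 < 0.2 -> homozygous reference
assert genotype_from_af(3, 10) == "0/1"  # 0.2 <= AF 0.3 < 0.8 -> heterozygous
assert genotype_from_af(9, 10) == "1/1"  # AF 0.9 >= 0.8 -> homozygous variant
```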