diff --git a/docs/caller_usage.md b/docs/caller_usage.md index 2883708..ec0d9aa 100644 --- a/docs/caller_usage.md +++ b/docs/caller_usage.md @@ -12,6 +12,8 @@ * `--min-avg-phred ##`: Minimum average PHRED score for relevant bases (flanking region + tandem repeat). Read segments with average PHRED scores below this (common with a threshold of ~13 and ONT Ultra Long reads, for example) will be skipped. **Default:** 13 +* `--min-read-align-score #.#`: Minimum normalized read alignment score (fractional; `0.0` to `1.0`) needed to include a + read in a call. A good value for pure tandem repeats is 0.9. A good value for much more lenient genotyping is ~0.4. * `--flank-size ##`: Size of the flanking region to use on either side of a region to properly anchor reads. **Default:** 70 * `--realign` or `-a`: Whether to perform local re-alignment to attempt recovery of soft-clipped reads. Some aligners diff --git a/strkit/call/call_locus.py b/strkit/call/call_locus.py index 9284d65..3d1b0b1 100644 --- a/strkit/call/call_locus.py +++ b/strkit/call/call_locus.py @@ -56,7 +56,6 @@ ref_max_iters_to_be_slow: int = 100 max_rc_iters = 50 -min_read_score = 0.9 # TODO: parametrize # TODO: Parametrize - if very low alignment or very slow, then "bad read" extremely_low_read_adj_score: float = 0.1 # fraction bad_read_alignment_time: int = 15 # seconds @@ -796,6 +795,7 @@ def call_locus( flank_size = params.flank_size realign = params.realign respect_ref = params.respect_ref + min_read_align_score = params.min_read_align_score snv_min_base_qual = params.snv_min_base_qual # ---------------------------------- @@ -1141,7 +1141,7 @@ def call_locus( if rc_time >= really_bad_read_alignment_time: logger_.debug( f"{locus_log_str} - not calling locus due to a pathologically-poorly-aligning read ({rn}; " - f"repeat count alignment scored {read_adj_score:.2f} < {min_read_score}; get_repeat_count time: " + f"repeat count alignment scored {read_adj_score:.2f} < {min_read_align_score}; get_repeat_count time: " f"{rc_time:.3f}s; # get_repeat_count iters: {n_read_cn_iters}; {motif=}; {ref_cn=})") logger_.debug(f"{locus_log_str} - ref left flank: {ref_left_flank_seq}") logger_.debug(f"{locus_log_str} - read left flank: {flank_left_seq}") @@ -1156,10 +1156,10 @@ def call_locus( "time": (datetime.now() - call_timer).total_seconds(), } - if read_adj_score < min_read_score: + if read_adj_score < min_read_align_score: logger_.debug( f"{locus_log_str} - skipping read {rn} (repeat count alignment scored {read_adj_score:.2f} < " - f"{min_read_score}; get_repeat_count time: {rc_time:.3f}s; # get_repeat_count iters: " + f"{min_read_align_score}; get_repeat_count time: {rc_time:.3f}s; # get_repeat_count iters: " f"{n_read_cn_iters})") if read_adj_score < extremely_low_read_adj_score or rc_time >= bad_read_alignment_time: diff --git a/strkit/call/params.py b/strkit/call/params.py index 2647eb5..b876076 100644 --- a/strkit/call/params.py +++ b/strkit/call/params.py @@ -23,6 +23,7 @@ def __init__( min_allele_reads: int = 2, max_reads: int = 250, min_avg_phred: int = 13, + min_read_align_score: float = 0.9, num_bootstrap: int = 100, flank_size: int = 70, sex_chroms: Optional[str] = None, @@ -47,6 +48,7 @@ def __init__( self.min_allele_reads: int = min_allele_reads self.max_reads: int = max_reads self.min_avg_phred: int = min_avg_phred + self.min_read_align_score: float = min_read_align_score self.num_bootstrap: int = num_bootstrap self.flank_size: int = flank_size self.sex_chroms: Optional[str] = sex_chroms @@ -97,6 +99,7 @@ def from_args(cls, logger: logging.Logger, p_args): min_allele_reads=p_args.min_allele_reads, max_reads=p_args.max_reads, min_avg_phred=p_args.min_avg_phred, + min_read_align_score=p_args.min_read_align_score, num_bootstrap=p_args.num_bootstrap, flank_size=p_args.flank_size, sex_chroms=p_args.sex_chr, @@ -123,6 +126,7 @@ def to_dict(self, as_inputted: bool = False): "min_allele_reads": self.min_allele_reads, "max_reads": self.max_reads, "min_avg_phred": self.min_avg_phred, + "min_read_align_score": self.min_read_align_score, "num_bootstrap": self.num_bootstrap, "flank_size": self.flank_size, "sample_id": self._sample_id_orig if as_inputted else self.sample_id, diff --git a/strkit/entry.py b/strkit/entry.py index abb1b86..7a4ce98 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -120,6 +120,16 @@ def add_call_parser_args(call_parser): default=13, help="Minimum average PHRED score for relevant (flanking + TR) bases required for a read to be used.") + # Minimum read alignment score (fractional) to include a read in calling. + call_parser.add_argument( + "--min-read-align-score", "--mras", + type=float, + default=0.9, + help=( + "Minimum read alignment score (fractional) to include a read in calling. A good value for pure tandem " + "repeats is 0.9. A good value for more lenient genotyping is ~0.4." + )) + call_parser.add_argument( "--flank-size", type=int,