Skip to content

Commit

Permalink
feat(call): parametrize minimum read alignment score
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Jun 17, 2024
1 parent 17d8de0 commit 80a0227
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 4 deletions.
2 changes: 2 additions & 0 deletions docs/caller_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
* `--min-avg-phred ##`: Minimum average PHRED score for relevant bases (flanking region + tandem repeat).
Read segments with average PHRED scores below this (common with a threshold of ~13 and ONT Ultra Long reads,
for example) will be skipped. **Default:** 13
* `--min-read-align-score #.#`: Minimum normalized read alignment score (fractional; `0.0` to `1.0`) needed to include a
read in a call. A good value for pure tandem repeats is 0.9. A good value for much more lenient genotyping is ~0.4.
* `--flank-size ##`: Size of the flanking region to use on either side of a region to properly anchor reads.
**Default:** 70
* `--realign` or `-a`: Whether to perform local re-alignment to attempt recovery of soft-clipped reads. Some aligners
Expand Down
8 changes: 4 additions & 4 deletions strkit/call/call_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
ref_max_iters_to_be_slow: int = 100

max_rc_iters = 50
min_read_score = 0.9 # TODO: parametrize
# TODO: Parametrize - if very low alignment or very slow, then "bad read"
extremely_low_read_adj_score: float = 0.1 # fraction
bad_read_alignment_time: int = 15 # seconds
Expand Down Expand Up @@ -796,6 +795,7 @@ def call_locus(
flank_size = params.flank_size
realign = params.realign
respect_ref = params.respect_ref
min_read_align_score = params.min_read_align_score
snv_min_base_qual = params.snv_min_base_qual
# ----------------------------------

Expand Down Expand Up @@ -1141,7 +1141,7 @@ def call_locus(
if rc_time >= really_bad_read_alignment_time:
logger_.debug(
f"{locus_log_str} - not calling locus due to a pathologically-poorly-aligning read ({rn}; "
f"repeat count alignment scored {read_adj_score:.2f} < {min_read_score}; get_repeat_count time: "
f"repeat count alignment scored {read_adj_score:.2f} < {min_read_align_score}; get_repeat_count time: "
f"{rc_time:.3f}s; # get_repeat_count iters: {n_read_cn_iters}; {motif=}; {ref_cn=})")
logger_.debug(f"{locus_log_str} - ref left flank: {ref_left_flank_seq}")
logger_.debug(f"{locus_log_str} - read left flank: {flank_left_seq}")
Expand All @@ -1156,10 +1156,10 @@ def call_locus(
"time": (datetime.now() - call_timer).total_seconds(),
}

if read_adj_score < min_read_score:
if read_adj_score < min_read_align_score:
logger_.debug(
f"{locus_log_str} - skipping read {rn} (repeat count alignment scored {read_adj_score:.2f} < "
f"{min_read_score}; get_repeat_count time: {rc_time:.3f}s; # get_repeat_count iters: "
f"{min_read_align_score}; get_repeat_count time: {rc_time:.3f}s; # get_repeat_count iters: "
f"{n_read_cn_iters})")

if read_adj_score < extremely_low_read_adj_score or rc_time >= bad_read_alignment_time:
Expand Down
4 changes: 4 additions & 0 deletions strkit/call/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(
min_allele_reads: int = 2,
max_reads: int = 250,
min_avg_phred: int = 13,
min_read_align_score: float = 0.9,
num_bootstrap: int = 100,
flank_size: int = 70,
sex_chroms: Optional[str] = None,
Expand All @@ -47,6 +48,7 @@ def __init__(
self.min_allele_reads: int = min_allele_reads
self.max_reads: int = max_reads
self.min_avg_phred: int = min_avg_phred
self.min_read_align_score: float = min_read_align_score
self.num_bootstrap: int = num_bootstrap
self.flank_size: int = flank_size
self.sex_chroms: Optional[str] = sex_chroms
Expand Down Expand Up @@ -97,6 +99,7 @@ def from_args(cls, logger: logging.Logger, p_args):
min_allele_reads=p_args.min_allele_reads,
max_reads=p_args.max_reads,
min_avg_phred=p_args.min_avg_phred,
min_read_align_score=p_args.min_read_align_score,
num_bootstrap=p_args.num_bootstrap,
flank_size=p_args.flank_size,
sex_chroms=p_args.sex_chr,
Expand All @@ -123,6 +126,7 @@ def to_dict(self, as_inputted: bool = False):
"min_allele_reads": self.min_allele_reads,
"max_reads": self.max_reads,
"min_avg_phred": self.min_avg_phred,
"min_read_align_score": self.min_read_align_score,
"num_bootstrap": self.num_bootstrap,
"flank_size": self.flank_size,
"sample_id": self._sample_id_orig if as_inputted else self.sample_id,
Expand Down
10 changes: 10 additions & 0 deletions strkit/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ def add_call_parser_args(call_parser):
default=13,
help="Minimum average PHRED score for relevant (flanking + TR) bases required for a read to be used.")

# Minimum read alignment score (fractional) to include a read in calling.
call_parser.add_argument(
"--min-read-align-score", "--mras",
type=float,
default=0.9,
help=(
"Minimum read alignment score (fractional) to include a read in calling. A good value for pure tandem "
"repeats is 0.9. A good value for more lenient genotyping is ~0.4."
))

call_parser.add_argument(
"--flank-size",
type=int,
Expand Down

0 comments on commit 80a0227

Please sign in to comment.