Skip to content

Commit

Permalink
feat(call)!: consensus method in JSON output + use best_rep for long TRs
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Jun 12, 2024
1 parent 21dee94 commit ca3f7d8
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 12 deletions.
17 changes: 13 additions & 4 deletions strkit/call/call_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

from .align_matrix import match_score
from .cigar import decode_cigar_np
from .consensus import consensus_seq
from .consensus import ConsensusMethod, consensus_seq
from .params import CallParams
from .realign import realign_read
from .repeats import get_repeat_count, get_ref_repeat_count
Expand Down Expand Up @@ -73,6 +73,8 @@
significant_clip_threshold = 100
significant_clip_snv_take_in = 250

max_poa_length = 5000 # maximum number of bases before we can't use POA for consensus anymore due to performance


# property getters & other partials
cn_getter = operator.itemgetter("cn")
Expand Down Expand Up @@ -1431,7 +1433,7 @@ def call_locus(
# don't know how re-sampling has occurred.
call_peak_n_reads: list[int] = []
peak_kmers: list[Counter] = [Counter() for _ in range(call_modal_n or 0)]
call_seqs: list[str] = []
call_seqs: list[tuple[str, ConsensusMethod]] = []
if read_peaks_called := call_modal_n and call_modal_n <= 2:
peaks: NDArray[np.float_] = call_peaks[:call_modal_n]
stdevs: NDArray[np.float_] = call_stdevs[:call_modal_n]
Expand Down Expand Up @@ -1495,8 +1497,15 @@ def call_locus(
call_99_cis = None

if call_data and consensus:
call_seqs = list(
map(lambda a: consensus_seq(map(lambda rr: read_dict_extra[rr]["_tr_seq"], a), logger_), allele_reads)
call_seqs.extend(
map(
lambda a: consensus_seq(
map(lambda rr: read_dict_extra[rr]["_tr_seq"], a),
logger_,
max_poa_length=max_poa_length,
),
allele_reads,
)
)

peak_data = {
Expand Down
38 changes: 30 additions & 8 deletions strkit/call/consensus.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import logging
from random import choice
from strkit_rust_ext import best_representatives as _best_representatives, consensus_seq as _consensus_seq
from typing import Iterable, Optional, Sequence
from typing import Iterable, Literal, Optional, Sequence

__all__ = [
"ConsensusMethod",
"best_representative",
"consensus_seq",
]


ConsensusMethod = Literal["single", "poa", "best_rep"]


def best_representative(seqs: Sequence[str]) -> Optional[str]:
"""
Slightly different from a true consensus - returns the string with the minimum Levenshtein distance to all other
Expand All @@ -27,19 +31,37 @@ def best_representative(seqs: Sequence[str]) -> Optional[str]:
return choice(tuple(res))


def consensus_seq(seqs: Iterable[str], logger: logging.Logger) -> Optional[str]:
def _run_best_representative(seqs: list[str], logger: logging.Logger) -> Optional[tuple[str, ConsensusMethod]]:
res = best_representative(seqs)
method: ConsensusMethod = "best_rep"
if res is None:
logger.debug(f"Got no best representative from sequences {seqs}")
return None
return res, method


def consensus_seq(
seqs: Iterable[str], logger: logging.Logger, max_poa_length: int
) -> Optional[tuple[str, ConsensusMethod]]:
# Return a stringified, gapless version of the column-wise mode for the MSA
# If the consensus fails, try a best-representative strategy instead. If that fails, something's gone wrong...

seqs_l = list(seqs)
if len(set(seqs_l)) == 1:
return seqs_l[0]

if len(seqs_l) == 0:
return None
elif len(set(seqs_l)) == 1:
return seqs_l[0], "single"

seqs_l.sort()
if any(s > max_poa_length for s in map(len, seqs_l)):
return _run_best_representative(seqs_l, logger)

res: Optional[str] = _consensus_seq(seqs_l)
method: ConsensusMethod = "poa"

if res is None:
logger.error(f"Got no consensus sequence from sequences {seqs_l}; trying best representative strategy")
res = best_representative(seqs_l)
if res is None:
logger.debug(f"Got no best representative from sequences {seqs_l}")
return res
return _run_best_representative(seqs_l, logger)

return res, method

0 comments on commit ca3f7d8

Please sign in to comment.