feat(call)!: consensus method in JSON output + use best_rep for long TRs

davidlougheed · Jun 12, 2024 · ca3f7d8 · ca3f7d8
1 parent 21dee94
commit ca3f7d8
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 12 deletions.
diff --git a/strkit/call/call_locus.py b/strkit/call/call_locus.py
@@ -28,7 +28,7 @@
 
 from .align_matrix import match_score
 from .cigar import decode_cigar_np
-from .consensus import consensus_seq
+from .consensus import ConsensusMethod, consensus_seq
 from .params import CallParams
 from .realign import realign_read
 from .repeats import get_repeat_count, get_ref_repeat_count
@@ -73,6 +73,8 @@
 significant_clip_threshold = 100
 significant_clip_snv_take_in = 250
 
+max_poa_length = 5000  # maximum number of bases before we can't use POA for consensus anymore due to performance
+
 
 # property getters & other partials
 cn_getter = operator.itemgetter("cn")
@@ -1431,7 +1433,7 @@ def call_locus(
     # don't know how re-sampling has occurred.
     call_peak_n_reads: list[int] = []
     peak_kmers: list[Counter] = [Counter() for _ in range(call_modal_n or 0)]
-    call_seqs: list[str] = []
+    call_seqs: list[tuple[str, ConsensusMethod]] = []
     if read_peaks_called := call_modal_n and call_modal_n <= 2:
         peaks: NDArray[np.float_] = call_peaks[:call_modal_n]
         stdevs: NDArray[np.float_] = call_stdevs[:call_modal_n]
@@ -1495,8 +1497,15 @@ def call_locus(
             call_99_cis = None
 
         if call_data and consensus:
-            call_seqs = list(
-                map(lambda a: consensus_seq(map(lambda rr: read_dict_extra[rr]["_tr_seq"], a), logger_), allele_reads)
+            call_seqs.extend(
+                map(
+                    lambda a: consensus_seq(
+                        map(lambda rr: read_dict_extra[rr]["_tr_seq"], a),
+                        logger_,
+                        max_poa_length=max_poa_length,
+                    ),
+                    allele_reads,
+                )
             )
 
     peak_data = {

diff --git a/strkit/call/consensus.py b/strkit/call/consensus.py
@@ -1,14 +1,18 @@
 import logging
 from random import choice
 from strkit_rust_ext import best_representatives as _best_representatives, consensus_seq as _consensus_seq
-from typing import Iterable, Optional, Sequence
+from typing import Iterable, Literal, Optional, Sequence
 
 __all__ = [
+    "ConsensusMethod",
     "best_representative",
     "consensus_seq",
 ]
 
 
+ConsensusMethod = Literal["single", "poa", "best_rep"]
+
+
 def best_representative(seqs: Sequence[str]) -> Optional[str]:
     """
     Slightly different from a true consensus - returns the string with the minimum Levenshtein distance to all other
@@ -27,19 +31,37 @@ def best_representative(seqs: Sequence[str]) -> Optional[str]:
         return choice(tuple(res))
 
 
-def consensus_seq(seqs: Iterable[str], logger: logging.Logger) -> Optional[str]:
+def _run_best_representative(seqs: list[str], logger: logging.Logger) -> Optional[tuple[str, ConsensusMethod]]:
+    res = best_representative(seqs)
+    method: ConsensusMethod = "best_rep"
+    if res is None:
+        logger.debug(f"Got no best representative from sequences {seqs}")
+        return None
+    return res, method
+
+
+def consensus_seq(
+    seqs: Iterable[str], logger: logging.Logger, max_poa_length: int
+) -> Optional[tuple[str, ConsensusMethod]]:
     # Return a stringified, gapless version of the column-wise mode for the MSA
     # If the consensus fails, try a best-representative strategy instead. If that fails, something's gone wrong...
 
     seqs_l = list(seqs)
-    if len(set(seqs_l)) == 1:
-        return seqs_l[0]
+
+    if len(seqs_l) == 0:
+        return None
+    elif len(set(seqs_l)) == 1:
+        return seqs_l[0], "single"
 
     seqs_l.sort()
+    if any(s > max_poa_length for s in map(len, seqs_l)):
+        return _run_best_representative(seqs_l, logger)
+
     res: Optional[str] = _consensus_seq(seqs_l)
+    method: ConsensusMethod = "poa"
+
     if res is None:
         logger.error(f"Got no consensus sequence from sequences {seqs_l}; trying best representative strategy")
-        res = best_representative(seqs_l)
-        if res is None:
-            logger.debug(f"Got no best representative from sequences {seqs_l}")
-    return res
+        return _run_best_representative(seqs_l, logger)
+
+    return res, method