From 44f8731e31236a945bcfea75f2f979526fe4a2a0 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 3 Apr 2024 15:02:53 +0200 Subject: [PATCH 01/27] tcrdist draft version implemented --- src/scirpy/ir_dist/__init__.py | 5 +- src/scirpy/ir_dist/metrics.py | 164 +++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 1 deletion(-) diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py index 6b13a760b..0aad959c5 100644 --- a/src/scirpy/ir_dist/__init__.py +++ b/src/scirpy/ir_dist/__init__.py @@ -94,6 +94,8 @@ def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_ dist_calc = metrics.LevenshteinDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) elif metric == "hamming": dist_calc = metrics.HammingDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) + elif metric == "tcrdist": + dist_calc = metrics.TCRdistDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) else: raise ValueError("Invalid distance metric.") @@ -117,6 +119,7 @@ def _ir_dist( airr_mod_ref: str = "airr", airr_key_ref: str = "airr", chain_idx_key_ref: str = "chain_indices", + **kwargs, ) -> Union[dict, None]: """\ Computes a sequence-distance metric between all unique :term:`VJ ` @@ -220,7 +223,7 @@ def _get_unique_seqs(tmp_adata, chain_type): result[chain_type][tmp_key] = unique_seqs # compute distance matrices - dist_calc = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs) + dist_calc = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs, **kwargs) for chain_type in ["VJ", "VDJ"]: logging.info(f"Computing sequence x sequence distance matrix for {chain_type} sequences.") # type: ignore result[chain_type]["distances"] = dist_calc.calc_dist_mat( diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 6f1a46d19..4adbb67e6 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -14,6 +14,9 @@ from scirpy.util import _doc_params, deprecated, tqdm +import numba as nb +import multiprocessing + _doc_params_parallel_distance_calculator = """\ n_jobs Number of jobs to use for the pairwise distance calculation. @@ -369,6 +372,167 @@ def _compute_block(self, seqs1, seqs2, origin): return result +class TCRdistDistanceCalculator: + + parasail_aa_alphabet = 'ARNDCQEGHILKMFPSTWYVBZX' + parasail_aa_alphabet_with_unknown = 'ARNDCQEGHILKMFPSTWYVBZX*' + tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} + + def __init__(self, dist_weight=3, gap_penalty=4, ntrim=3, ctrim=2, fixed_gappos=True, cutoff: Union[None, int] = None, n_jobs: Union[None, int] = None): + if cutoff is None: + cutoff = 20 + if n_jobs is None: + n_jobs = 1 + + self.dist_weight = dist_weight + self.gap_penalty = gap_penalty + self.ntrim = ntrim + self.ctrim = ctrim + self.fixed_gappos = fixed_gappos + self.cutoff = cutoff + self.n_jobs = n_jobs + + @staticmethod + def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unknown): + dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) + for (aa1, aa2), d in distance_matrix.items(): + dm[alphabet.index(aa1), alphabet.index(aa2)] = d + dm[alphabet.index(aa2), alphabet.index(aa1)] = d + return dm + + tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix) + + @staticmethod + def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): + if max_len is None: + max_len = np.max([len(s) for s in seqs]) + mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8) + L = np.zeros(len(seqs), dtype=np.int8) + for si, s in enumerate(seqs): + L[si] = len(s) + for aai in range(max_len): + if aai >= len(s): + break + try: + mat[si, aai] = alphabet.index(s[aai]) + except ValueError: + #Unknown symbols given value for last column/row of matrix + mat[si, aai] = len(alphabet) + return mat, L + + @staticmethod + @nb.jit(nopython=True, parallel=False, nogil=True) + def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_nb_distance_matrix, dist_weight=3, gap_penalty=4, ntrim=3, ctrim=2, fixed_gappos=True, cutoff=20): + + assert seqs_mat1.shape[0] == seqs_L1.shape[0] + assert seqs_mat2.shape[0] == seqs_L2.shape[0] + + data_rows = nb.typed.List() + indices_rows = nb.typed.List() + row_element_counts = np.zeros(seqs_mat1.shape[0]) + + empty_row = np.zeros(0) + for i in range(0,seqs_mat1.shape[0]): + data_rows.append(empty_row) + indices_rows.append(empty_row) + + for row_index in range(seqs_mat1.shape[0]): + data_row = np.zeros(seqs_mat2.shape[0]) + indices_row = np.zeros(seqs_mat2.shape[0]) + row_end_index = 0 + + for col_index in range(seqs_mat2.shape[0]): + q_L = seqs_L1[row_index] + s_L = seqs_L2[col_index] + if q_L == s_L: + distance = 1 + for i in range(ntrim, q_L - ctrim): + distance += distance_matrix[seqs_mat1[row_index, i], seqs_mat2[col_index, i]] * dist_weight + + if(distance <= cutoff + 1): + data_row[row_end_index] = distance + indices_row[row_end_index] = col_index + row_end_index += 1 + continue + + short_len = min(q_L, s_L) + len_diff = abs(q_L - s_L) + if fixed_gappos: + min_gappos = min(6, 3 + (short_len - 5) // 2) + max_gappos = min_gappos + else: + min_gappos = 5 + max_gappos = short_len - 1 - 4 + while min_gappos > max_gappos: + min_gappos -= 1 + max_gappos += 1 + min_dist = -1 + + for gappos in range(min_gappos, max_gappos + 1): + tmp_dist = 0 + + remainder = short_len - gappos + for n_i in range(ntrim, gappos): + tmp_dist += distance_matrix[seqs_mat1[row_index, n_i], seqs_mat2[col_index, n_i]] + + for c_i in range(ctrim, remainder): + tmp_dist += distance_matrix[seqs_mat1[row_index, q_L - 1 - c_i], seqs_mat2[col_index, s_L - 1 - c_i]] + + if tmp_dist < min_dist or min_dist == -1: + min_dist = tmp_dist + + if min_dist == 0: + break + + distance = min_dist * dist_weight + len_diff * gap_penalty + 1 + if(distance <= cutoff + 1): + data_row[row_end_index] = distance + indices_row[row_end_index] = col_index + row_end_index += 1 + + data_rows[row_index] = data_row[0:row_end_index] + indices_rows[row_index] = indices_row[0:row_end_index] + row_element_counts[row_index] = row_end_index + + return data_rows, indices_rows, row_element_counts + + def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + + seqs_mat1, seqs_L1 = self._seqs2mat(seqs) + seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) + + kwargs = { + 'dist_weight': self.dist_weight, + 'gap_penalty': self.gap_penalty, + 'ntrim': self.ntrim, + 'ctrim': self.ctrim, + 'fixed_gappos': self.fixed_gappos, + 'cutoff': self.cutoff + } + + data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, **kwargs) + + indptr = np.zeros(row_element_counts.shape[0] + 1) + indptr[1:] = np.cumsum(row_element_counts) + data, indices = np.concatenate(data_rows), np.concatenate(indices_rows) + sparse_distance_matrix = csr_matrix((data, indices, indptr), shape=(len(seqs), len(seqs2))) + return sparse_distance_matrix + + def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + if(seqs2 is None): + seqs2 = seqs + + if(self.n_jobs>1): + split_seqs = np.array_split(seqs, self.n_jobs) + arguments = [(split_seqs[x],seqs2) for x in range(self.n_jobs)] + with multiprocessing.Pool(self.n_jobs) as p: + results = p.starmap(self._calc_dist_mat_block, arguments) + distance_matrix_csr = scipy.sparse.vstack(results) + else: + distance_matrix_csr = self._calc_dist_mat_block(seqs, seqs2) + + return distance_matrix_csr + @_doc_params(params=_doc_params_parallel_distance_calculator) class AlignmentDistanceCalculator(ParallelDistanceCalculator): From 3081139c8492fec57cf69f03051bee35752258a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:46:13 +0000 Subject: [PATCH 02/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/ir_dist/metrics.py | 483 +++++++++++++++++++++++++++++++--- 1 file changed, 453 insertions(+), 30 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 8645ab557..d8e05fb44 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -1,10 +1,12 @@ import abc import itertools +import multiprocessing import warnings from collections.abc import Sequence from typing import Optional, Union import joblib +import numba as nb import numpy as np import scipy.sparse import scipy.spatial @@ -15,9 +17,6 @@ from scirpy.util import _doc_params, _parallelize_with_joblib, deprecated -import numba as nb -import multiprocessing - _doc_params_parallel_distance_calculator = """\ n_jobs Number of jobs to use for the pairwise distance calculation, passed to @@ -394,13 +393,423 @@ def _compute_block(self, seqs1, seqs2, origin): return result -class TCRdistDistanceCalculator: - parasail_aa_alphabet = 'ARNDCQEGHILKMFPSTWYVBZX' - parasail_aa_alphabet_with_unknown = 'ARNDCQEGHILKMFPSTWYVBZX*' - tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} +class TCRdistDistanceCalculator: + parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" + parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" + tcr_dict_distance_matrix = { + ("A", "A"): 0, + ("A", "C"): 4, + ("A", "D"): 4, + ("A", "E"): 4, + ("A", "F"): 4, + ("A", "G"): 4, + ("A", "H"): 4, + ("A", "I"): 4, + ("A", "K"): 4, + ("A", "L"): 4, + ("A", "M"): 4, + ("A", "N"): 4, + ("A", "P"): 4, + ("A", "Q"): 4, + ("A", "R"): 4, + ("A", "S"): 3, + ("A", "T"): 4, + ("A", "V"): 4, + ("A", "W"): 4, + ("A", "Y"): 4, + ("C", "A"): 4, + ("C", "C"): 0, + ("C", "D"): 4, + ("C", "E"): 4, + ("C", "F"): 4, + ("C", "G"): 4, + ("C", "H"): 4, + ("C", "I"): 4, + ("C", "K"): 4, + ("C", "L"): 4, + ("C", "M"): 4, + ("C", "N"): 4, + ("C", "P"): 4, + ("C", "Q"): 4, + ("C", "R"): 4, + ("C", "S"): 4, + ("C", "T"): 4, + ("C", "V"): 4, + ("C", "W"): 4, + ("C", "Y"): 4, + ("D", "A"): 4, + ("D", "C"): 4, + ("D", "D"): 0, + ("D", "E"): 2, + ("D", "F"): 4, + ("D", "G"): 4, + ("D", "H"): 4, + ("D", "I"): 4, + ("D", "K"): 4, + ("D", "L"): 4, + ("D", "M"): 4, + ("D", "N"): 3, + ("D", "P"): 4, + ("D", "Q"): 4, + ("D", "R"): 4, + ("D", "S"): 4, + ("D", "T"): 4, + ("D", "V"): 4, + ("D", "W"): 4, + ("D", "Y"): 4, + ("E", "A"): 4, + ("E", "C"): 4, + ("E", "D"): 2, + ("E", "E"): 0, + ("E", "F"): 4, + ("E", "G"): 4, + ("E", "H"): 4, + ("E", "I"): 4, + ("E", "K"): 3, + ("E", "L"): 4, + ("E", "M"): 4, + ("E", "N"): 4, + ("E", "P"): 4, + ("E", "Q"): 2, + ("E", "R"): 4, + ("E", "S"): 4, + ("E", "T"): 4, + ("E", "V"): 4, + ("E", "W"): 4, + ("E", "Y"): 4, + ("F", "A"): 4, + ("F", "C"): 4, + ("F", "D"): 4, + ("F", "E"): 4, + ("F", "F"): 0, + ("F", "G"): 4, + ("F", "H"): 4, + ("F", "I"): 4, + ("F", "K"): 4, + ("F", "L"): 4, + ("F", "M"): 4, + ("F", "N"): 4, + ("F", "P"): 4, + ("F", "Q"): 4, + ("F", "R"): 4, + ("F", "S"): 4, + ("F", "T"): 4, + ("F", "V"): 4, + ("F", "W"): 3, + ("F", "Y"): 1, + ("G", "A"): 4, + ("G", "C"): 4, + ("G", "D"): 4, + ("G", "E"): 4, + ("G", "F"): 4, + ("G", "G"): 0, + ("G", "H"): 4, + ("G", "I"): 4, + ("G", "K"): 4, + ("G", "L"): 4, + ("G", "M"): 4, + ("G", "N"): 4, + ("G", "P"): 4, + ("G", "Q"): 4, + ("G", "R"): 4, + ("G", "S"): 4, + ("G", "T"): 4, + ("G", "V"): 4, + ("G", "W"): 4, + ("G", "Y"): 4, + ("H", "A"): 4, + ("H", "C"): 4, + ("H", "D"): 4, + ("H", "E"): 4, + ("H", "F"): 4, + ("H", "G"): 4, + ("H", "H"): 0, + ("H", "I"): 4, + ("H", "K"): 4, + ("H", "L"): 4, + ("H", "M"): 4, + ("H", "N"): 3, + ("H", "P"): 4, + ("H", "Q"): 4, + ("H", "R"): 4, + ("H", "S"): 4, + ("H", "T"): 4, + ("H", "V"): 4, + ("H", "W"): 4, + ("H", "Y"): 2, + ("I", "A"): 4, + ("I", "C"): 4, + ("I", "D"): 4, + ("I", "E"): 4, + ("I", "F"): 4, + ("I", "G"): 4, + ("I", "H"): 4, + ("I", "I"): 0, + ("I", "K"): 4, + ("I", "L"): 2, + ("I", "M"): 3, + ("I", "N"): 4, + ("I", "P"): 4, + ("I", "Q"): 4, + ("I", "R"): 4, + ("I", "S"): 4, + ("I", "T"): 4, + ("I", "V"): 1, + ("I", "W"): 4, + ("I", "Y"): 4, + ("K", "A"): 4, + ("K", "C"): 4, + ("K", "D"): 4, + ("K", "E"): 3, + ("K", "F"): 4, + ("K", "G"): 4, + ("K", "H"): 4, + ("K", "I"): 4, + ("K", "K"): 0, + ("K", "L"): 4, + ("K", "M"): 4, + ("K", "N"): 4, + ("K", "P"): 4, + ("K", "Q"): 3, + ("K", "R"): 2, + ("K", "S"): 4, + ("K", "T"): 4, + ("K", "V"): 4, + ("K", "W"): 4, + ("K", "Y"): 4, + ("L", "A"): 4, + ("L", "C"): 4, + ("L", "D"): 4, + ("L", "E"): 4, + ("L", "F"): 4, + ("L", "G"): 4, + ("L", "H"): 4, + ("L", "I"): 2, + ("L", "K"): 4, + ("L", "L"): 0, + ("L", "M"): 2, + ("L", "N"): 4, + ("L", "P"): 4, + ("L", "Q"): 4, + ("L", "R"): 4, + ("L", "S"): 4, + ("L", "T"): 4, + ("L", "V"): 3, + ("L", "W"): 4, + ("L", "Y"): 4, + ("M", "A"): 4, + ("M", "C"): 4, + ("M", "D"): 4, + ("M", "E"): 4, + ("M", "F"): 4, + ("M", "G"): 4, + ("M", "H"): 4, + ("M", "I"): 3, + ("M", "K"): 4, + ("M", "L"): 2, + ("M", "M"): 0, + ("M", "N"): 4, + ("M", "P"): 4, + ("M", "Q"): 4, + ("M", "R"): 4, + ("M", "S"): 4, + ("M", "T"): 4, + ("M", "V"): 3, + ("M", "W"): 4, + ("M", "Y"): 4, + ("N", "A"): 4, + ("N", "C"): 4, + ("N", "D"): 3, + ("N", "E"): 4, + ("N", "F"): 4, + ("N", "G"): 4, + ("N", "H"): 3, + ("N", "I"): 4, + ("N", "K"): 4, + ("N", "L"): 4, + ("N", "M"): 4, + ("N", "N"): 0, + ("N", "P"): 4, + ("N", "Q"): 4, + ("N", "R"): 4, + ("N", "S"): 3, + ("N", "T"): 4, + ("N", "V"): 4, + ("N", "W"): 4, + ("N", "Y"): 4, + ("P", "A"): 4, + ("P", "C"): 4, + ("P", "D"): 4, + ("P", "E"): 4, + ("P", "F"): 4, + ("P", "G"): 4, + ("P", "H"): 4, + ("P", "I"): 4, + ("P", "K"): 4, + ("P", "L"): 4, + ("P", "M"): 4, + ("P", "N"): 4, + ("P", "P"): 0, + ("P", "Q"): 4, + ("P", "R"): 4, + ("P", "S"): 4, + ("P", "T"): 4, + ("P", "V"): 4, + ("P", "W"): 4, + ("P", "Y"): 4, + ("Q", "A"): 4, + ("Q", "C"): 4, + ("Q", "D"): 4, + ("Q", "E"): 2, + ("Q", "F"): 4, + ("Q", "G"): 4, + ("Q", "H"): 4, + ("Q", "I"): 4, + ("Q", "K"): 3, + ("Q", "L"): 4, + ("Q", "M"): 4, + ("Q", "N"): 4, + ("Q", "P"): 4, + ("Q", "Q"): 0, + ("Q", "R"): 3, + ("Q", "S"): 4, + ("Q", "T"): 4, + ("Q", "V"): 4, + ("Q", "W"): 4, + ("Q", "Y"): 4, + ("R", "A"): 4, + ("R", "C"): 4, + ("R", "D"): 4, + ("R", "E"): 4, + ("R", "F"): 4, + ("R", "G"): 4, + ("R", "H"): 4, + ("R", "I"): 4, + ("R", "K"): 2, + ("R", "L"): 4, + ("R", "M"): 4, + ("R", "N"): 4, + ("R", "P"): 4, + ("R", "Q"): 3, + ("R", "R"): 0, + ("R", "S"): 4, + ("R", "T"): 4, + ("R", "V"): 4, + ("R", "W"): 4, + ("R", "Y"): 4, + ("S", "A"): 3, + ("S", "C"): 4, + ("S", "D"): 4, + ("S", "E"): 4, + ("S", "F"): 4, + ("S", "G"): 4, + ("S", "H"): 4, + ("S", "I"): 4, + ("S", "K"): 4, + ("S", "L"): 4, + ("S", "M"): 4, + ("S", "N"): 3, + ("S", "P"): 4, + ("S", "Q"): 4, + ("S", "R"): 4, + ("S", "S"): 0, + ("S", "T"): 3, + ("S", "V"): 4, + ("S", "W"): 4, + ("S", "Y"): 4, + ("T", "A"): 4, + ("T", "C"): 4, + ("T", "D"): 4, + ("T", "E"): 4, + ("T", "F"): 4, + ("T", "G"): 4, + ("T", "H"): 4, + ("T", "I"): 4, + ("T", "K"): 4, + ("T", "L"): 4, + ("T", "M"): 4, + ("T", "N"): 4, + ("T", "P"): 4, + ("T", "Q"): 4, + ("T", "R"): 4, + ("T", "S"): 3, + ("T", "T"): 0, + ("T", "V"): 4, + ("T", "W"): 4, + ("T", "Y"): 4, + ("V", "A"): 4, + ("V", "C"): 4, + ("V", "D"): 4, + ("V", "E"): 4, + ("V", "F"): 4, + ("V", "G"): 4, + ("V", "H"): 4, + ("V", "I"): 1, + ("V", "K"): 4, + ("V", "L"): 3, + ("V", "M"): 3, + ("V", "N"): 4, + ("V", "P"): 4, + ("V", "Q"): 4, + ("V", "R"): 4, + ("V", "S"): 4, + ("V", "T"): 4, + ("V", "V"): 0, + ("V", "W"): 4, + ("V", "Y"): 4, + ("W", "A"): 4, + ("W", "C"): 4, + ("W", "D"): 4, + ("W", "E"): 4, + ("W", "F"): 3, + ("W", "G"): 4, + ("W", "H"): 4, + ("W", "I"): 4, + ("W", "K"): 4, + ("W", "L"): 4, + ("W", "M"): 4, + ("W", "N"): 4, + ("W", "P"): 4, + ("W", "Q"): 4, + ("W", "R"): 4, + ("W", "S"): 4, + ("W", "T"): 4, + ("W", "V"): 4, + ("W", "W"): 0, + ("W", "Y"): 2, + ("Y", "A"): 4, + ("Y", "C"): 4, + ("Y", "D"): 4, + ("Y", "E"): 4, + ("Y", "F"): 1, + ("Y", "G"): 4, + ("Y", "H"): 2, + ("Y", "I"): 4, + ("Y", "K"): 4, + ("Y", "L"): 4, + ("Y", "M"): 4, + ("Y", "N"): 4, + ("Y", "P"): 4, + ("Y", "Q"): 4, + ("Y", "R"): 4, + ("Y", "S"): 4, + ("Y", "T"): 4, + ("Y", "V"): 4, + ("Y", "W"): 2, + ("Y", "Y"): 0, + } - def __init__(self, dist_weight=3, gap_penalty=4, ntrim=3, ctrim=2, fixed_gappos=True, cutoff: Union[None, int] = None, n_jobs: Union[None, int] = None): + def __init__( + self, + dist_weight=3, + gap_penalty=4, + ntrim=3, + ctrim=2, + fixed_gappos=True, + cutoff: Union[None, int] = None, + n_jobs: Union[None, int] = None, + ): if cutoff is None: cutoff = 20 if n_jobs is None: @@ -421,7 +830,7 @@ def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unkno dm[alphabet.index(aa1), alphabet.index(aa2)] = d dm[alphabet.index(aa2), alphabet.index(aa1)] = d return dm - + tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix) @staticmethod @@ -438,14 +847,25 @@ def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): try: mat[si, aai] = alphabet.index(s[aai]) except ValueError: - #Unknown symbols given value for last column/row of matrix + # Unknown symbols given value for last column/row of matrix mat[si, aai] = len(alphabet) return mat, L - + @staticmethod @nb.jit(nopython=True, parallel=False, nogil=True) - def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_nb_distance_matrix, dist_weight=3, gap_penalty=4, ntrim=3, ctrim=2, fixed_gappos=True, cutoff=20): - + def _nb_tcrdist_mat( + seqs_mat1, + seqs_mat2, + seqs_L1, + seqs_L2, + distance_matrix=tcr_nb_distance_matrix, + dist_weight=3, + gap_penalty=4, + ntrim=3, + ctrim=2, + fixed_gappos=True, + cutoff=20, + ): assert seqs_mat1.shape[0] == seqs_L1.shape[0] assert seqs_mat2.shape[0] == seqs_L2.shape[0] @@ -454,7 +874,7 @@ def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_ row_element_counts = np.zeros(seqs_mat1.shape[0]) empty_row = np.zeros(0) - for i in range(0,seqs_mat1.shape[0]): + for i in range(0, seqs_mat1.shape[0]): data_rows.append(empty_row) indices_rows.append(empty_row) @@ -471,7 +891,7 @@ def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_ for i in range(ntrim, q_L - ctrim): distance += distance_matrix[seqs_mat1[row_index, i], seqs_mat2[col_index, i]] * dist_weight - if(distance <= cutoff + 1): + if distance <= cutoff + 1: data_row[row_end_index] = distance indices_row[row_end_index] = col_index row_end_index += 1 @@ -498,7 +918,9 @@ def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_ tmp_dist += distance_matrix[seqs_mat1[row_index, n_i], seqs_mat2[col_index, n_i]] for c_i in range(ctrim, remainder): - tmp_dist += distance_matrix[seqs_mat1[row_index, q_L - 1 - c_i], seqs_mat2[col_index, s_L - 1 - c_i]] + tmp_dist += distance_matrix[ + seqs_mat1[row_index, q_L - 1 - c_i], seqs_mat2[col_index, s_L - 1 - c_i] + ] if tmp_dist < min_dist or min_dist == -1: min_dist = tmp_dist @@ -507,7 +929,7 @@ def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_ break distance = min_dist * dist_weight + len_diff * gap_penalty + 1 - if(distance <= cutoff + 1): + if distance <= cutoff + 1: data_row[row_end_index] = distance indices_row[row_end_index] = col_index row_end_index += 1 @@ -519,20 +941,21 @@ def _nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, distance_matrix=tcr_ return data_rows, indices_rows, row_element_counts def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: - seqs_mat1, seqs_L1 = self._seqs2mat(seqs) seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) - + kwargs = { - 'dist_weight': self.dist_weight, - 'gap_penalty': self.gap_penalty, - 'ntrim': self.ntrim, - 'ctrim': self.ctrim, - 'fixed_gappos': self.fixed_gappos, - 'cutoff': self.cutoff + "dist_weight": self.dist_weight, + "gap_penalty": self.gap_penalty, + "ntrim": self.ntrim, + "ctrim": self.ctrim, + "fixed_gappos": self.fixed_gappos, + "cutoff": self.cutoff, } - data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat(seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, **kwargs) + data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat( + seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, **kwargs + ) indptr = np.zeros(row_element_counts.shape[0] + 1) indptr[1:] = np.cumsum(row_element_counts) @@ -540,13 +963,13 @@ def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str sparse_distance_matrix = csr_matrix((data, indices, indptr), shape=(len(seqs), len(seqs2))) return sparse_distance_matrix - def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: - if(seqs2 is None): + def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + if seqs2 is None: seqs2 = seqs - if(self.n_jobs>1): + if self.n_jobs > 1: split_seqs = np.array_split(seqs, self.n_jobs) - arguments = [(split_seqs[x],seqs2) for x in range(self.n_jobs)] + arguments = [(split_seqs[x], seqs2) for x in range(self.n_jobs)] with multiprocessing.Pool(self.n_jobs) as p: results = p.starmap(self._calc_dist_mat_block, arguments) distance_matrix_csr = scipy.sparse.vstack(results) From 3b96e8a90a2de8f1c7ae83502c74b2b95292666c Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Mon, 8 Apr 2024 19:18:51 +0200 Subject: [PATCH 03/27] tcrdist tests added --- src/scirpy/tests/test_ir_dist_metrics.py | 111 ++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index ae1f9b32b..df19f18f1 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -14,6 +14,7 @@ IdentityDistanceCalculator, LevenshteinDistanceCalculator, ParallelDistanceCalculator, + TCRdistDistanceCalculator, ) from .util import _squarify @@ -303,10 +304,10 @@ def test_fast_alignment_dist_with_two_seq_arrays(): npt.assert_almost_equal(res.toarray(), np.array([[0, 1, 5], [0, 5, 10], [0, 0, 0], [1, 0, 0]])) - +""" @pytest.mark.parametrize("metric", ["alignment", "fastalignment", "identity", "hamming", "levenshtein"]) def test_sequence_dist_all_metrics(metric): - """Smoke test, no assertions!""" + #Smoke test, no assertions! unique_seqs = np.array(["AAA", "ARA", "AFFFFFA", "FAFAFA", "FFF"]) seqs2 = np.array(["RRR", "FAFA", "WWWWWWW"]) dist_mat = ir.ir_dist.sequence_dist(unique_seqs, metric=metric, cutoff=8, n_jobs=2) @@ -314,3 +315,109 @@ def test_sequence_dist_all_metrics(metric): dist_mat = ir.ir_dist.sequence_dist(unique_seqs, seqs2, metric=metric, cutoff=8, n_jobs=2) assert dist_mat.shape == (5, 3) +""" + +@pytest.mark.parametrize("test_parameters,test_input,expected_result", [ + + #test more complex strings with unequal length and set high cutoff such that cutoff is neglected + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["WEKFAPIQCMNR", "RDAIYTCCNKSWEQ", "CWWMFGHTVRI", "GWSZNNHI"])), + np.array([[57, 74, 65, 33], + [66, 68, 65, 33], + [53, 58, 49, 25]])), + + #test very small input sequences + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 0, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + (np.array(["A"]), + np.array(["C"])), + np.array([[13]])), + + #test standard parameters with simple input sequences + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 21], + [0, 1, 21], + [21, 21, 1]])), + + #test standard parameters with simple input sequences with second sequences array set to None + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + None), + np.array([[1, 0, 21], + [0, 1, 21], + [21, 21, 1]])), + + #test with dist_weight set to 0 + ({"dist_weight": 0, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 1, 9], + [1, 1, 9], + [9, 9, 1]])), + + #test with dist_weight set high and cutoff set high to neglect it + ({"dist_weight": 30, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 241, 129], + [241, 1, 129], + [129, 129, 1]])), + + #test with gap_penalty set to 0 + ({"dist_weight": 3, "gap_penalty": 0, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 13], + [0, 1, 13], + [13, 13, 1]])), + + #test with gap_penalty set high and cutoff set high to neglect it + ({"dist_weight": 3, "gap_penalty": 40, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 93], + [25, 1, 93], + [93, 93, 1]])), + + #test with fixed_gappos = False and a high cutoff to neglect it + #AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": False, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), + np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"])), + np.array([[1, 25, 33], + [25, 1, 33], + [33, 33, 1]])), + + #test with cutoff set to 0 + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 0, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]])), + + #test with cutoff set high to neglect it + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 21], + [25, 1, 21], + [21, 21, 1]])), + + #test with multiple cores by setting n_jobs = 3 + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 3}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 21], + [0, 1, 21], + [21, 21, 1]])), +]) +def test_tcrdist(test_parameters, test_input, expected_result): + tcrdist_calculator = TCRdistDistanceCalculator(**test_parameters) + seq1, seq2 = test_input + res = tcrdist_calculator.calc_dist_mat(seq1, seq2) + assert isinstance(res, scipy.sparse.csr_matrix) + assert res.shape == expected_result.shape + assert np.array_equal(res.todense(), expected_result) \ No newline at end of file From d2c606dc860fe69aa2f8d8ab3c10de136924cabd Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Mon, 8 Apr 2024 19:38:24 +0200 Subject: [PATCH 04/27] fixed ir_dist _get_distance_calculator parameter handling --- src/scirpy/ir_dist/__init__.py | 14 +++++--------- src/scirpy/tests/test_ir_dist_metrics.py | 4 ++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py index 9f2f2b2c6..18831a321 100644 --- a/src/scirpy/ir_dist/__init__.py +++ b/src/scirpy/ir_dist/__init__.py @@ -73,7 +73,7 @@ def _get_metric_key(metric: MetricType) -> str: return "custom" if isinstance(metric, metrics.DistanceCalculator) else metric # type: ignore -def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_jobs=-1, **kwargs): +def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_jobs=None, **kwargs): """Returns an instance of :class:`~scirpy.ir_dist.metrics.DistanceCalculator` given a metric. @@ -83,20 +83,16 @@ def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_ metric = "identity" cutoff = 0 - # Let's rely on the default set by the class if cutoff is None - if cutoff is not None: - kwargs["cutoff"] = cutoff - if isinstance(metric, metrics.DistanceCalculator): dist_calc = metric elif metric == "alignment": - dist_calc = metrics.FastAlignmentDistanceCalculator(n_jobs=n_jobs, estimated_penalty=0, **kwargs) + dist_calc = metrics.FastAlignmentDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, estimated_penalty=0, **kwargs) elif metric == "fastalignment": - dist_calc = metrics.FastAlignmentDistanceCalculator(n_jobs=n_jobs, **kwargs) + dist_calc = metrics.FastAlignmentDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) elif metric == "identity": - dist_calc = metrics.IdentityDistanceCalculator(**kwargs) + dist_calc = metrics.IdentityDistanceCalculator(cutoff=cutoff, **kwargs) elif metric == "levenshtein": - dist_calc = metrics.LevenshteinDistanceCalculator(n_jobs=n_jobs, **kwargs) + dist_calc = metrics.LevenshteinDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) elif metric == "hamming": dist_calc = metrics.HammingDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) elif metric == "tcrdist": diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index df19f18f1..0902d7779 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -304,7 +304,7 @@ def test_fast_alignment_dist_with_two_seq_arrays(): npt.assert_almost_equal(res.toarray(), np.array([[0, 1, 5], [0, 5, 10], [0, 0, 0], [1, 0, 0]])) -""" + @pytest.mark.parametrize("metric", ["alignment", "fastalignment", "identity", "hamming", "levenshtein"]) def test_sequence_dist_all_metrics(metric): #Smoke test, no assertions! @@ -315,7 +315,7 @@ def test_sequence_dist_all_metrics(metric): dist_mat = ir.ir_dist.sequence_dist(unique_seqs, seqs2, metric=metric, cutoff=8, n_jobs=2) assert dist_mat.shape == (5, 3) -""" + @pytest.mark.parametrize("test_parameters,test_input,expected_result", [ From 1d223ec6fdd08104f40ae7c6d4770b0f234dc2f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:39:30 +0000 Subject: [PATCH 05/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/tests/test_ir_dist_metrics.py | 285 +++++++++++++++-------- 1 file changed, 184 insertions(+), 101 deletions(-) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index 0902d7779..90e611f0f 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -307,7 +307,7 @@ def test_fast_alignment_dist_with_two_seq_arrays(): @pytest.mark.parametrize("metric", ["alignment", "fastalignment", "identity", "hamming", "levenshtein"]) def test_sequence_dist_all_metrics(metric): - #Smoke test, no assertions! + # Smoke test, no assertions! unique_seqs = np.array(["AAA", "ARA", "AFFFFFA", "FAFAFA", "FFF"]) seqs2 = np.array(["RRR", "FAFA", "WWWWWWW"]) dist_mat = ir.ir_dist.sequence_dist(unique_seqs, metric=metric, cutoff=8, n_jobs=2) @@ -315,109 +315,192 @@ def test_sequence_dist_all_metrics(metric): dist_mat = ir.ir_dist.sequence_dist(unique_seqs, seqs2, metric=metric, cutoff=8, n_jobs=2) assert dist_mat.shape == (5, 3) - - -@pytest.mark.parametrize("test_parameters,test_input,expected_result", [ - - #test more complex strings with unequal length and set high cutoff such that cutoff is neglected - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["WEKFAPIQCMNR", "RDAIYTCCNKSWEQ", "CWWMFGHTVRI", "GWSZNNHI"])), - np.array([[57, 74, 65, 33], - [66, 68, 65, 33], - [53, 58, 49, 25]])), - - #test very small input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 0, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, - (np.array(["A"]), - np.array(["C"])), - np.array([[13]])), - - #test standard parameters with simple input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), - - #test standard parameters with simple input sequences with second sequences array set to None - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - None), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), - - #test with dist_weight set to 0 - ({"dist_weight": 0, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 1, 9], - [1, 1, 9], - [9, 9, 1]])), - - #test with dist_weight set high and cutoff set high to neglect it - ({"dist_weight": 30, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 241, 129], - [241, 1, 129], - [129, 129, 1]])), - - #test with gap_penalty set to 0 - ({"dist_weight": 3, "gap_penalty": 0, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 13], - [0, 1, 13], - [13, 13, 1]])), - - #test with gap_penalty set high and cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 40, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 93], - [25, 1, 93], - [93, 93, 1]])), - - #test with fixed_gappos = False and a high cutoff to neglect it - #AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": False, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), - np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"])), - np.array([[1, 25, 33], - [25, 1, 33], - [33, 33, 1]])), - - #test with cutoff set to 0 - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 0, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]])), - - #test with cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 21], - [25, 1, 21], - [21, 21, 1]])), - - #test with multiple cores by setting n_jobs = 3 - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 3}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), -]) + + +@pytest.mark.parametrize( + "test_parameters,test_input,expected_result", + [ + # test more complex strings with unequal length and set high cutoff such that cutoff is neglected + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["WEKFAPIQCMNR", "RDAIYTCCNKSWEQ", "CWWMFGHTVRI", "GWSZNNHI"]), + ), + np.array([[57, 74, 65, 33], [66, 68, 65, 33], [53, 58, 49, 25]]), + ), + # test very small input sequences + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 0, + "ctrim": 0, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 1, + }, + (np.array(["A"]), np.array(["C"])), + np.array([[13]]), + ), + # test standard parameters with simple input sequences + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + # test standard parameters with simple input sequences with second sequences array set to None + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), None), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + # test with dist_weight set to 0 + ( + { + "dist_weight": 0, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 1, 9], [1, 1, 9], [9, 9, 1]]), + ), + # test with dist_weight set high and cutoff set high to neglect it + ( + { + "dist_weight": 30, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 241, 129], [241, 1, 129], [129, 129, 1]]), + ), + # test with gap_penalty set to 0 + ( + { + "dist_weight": 3, + "gap_penalty": 0, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 13], [0, 1, 13], [13, 13, 1]]), + ), + # test with gap_penalty set high and cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 40, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 93], [25, 1, 93], [93, 93, 1]]), + ), + # test with fixed_gappos = False and a high cutoff to neglect it + # AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": False, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), + np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), + ), + np.array([[1, 25, 33], [25, 1, 33], [33, 33, 1]]), + ), + # test with cutoff set to 0 + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 0, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + ), + # test with cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 21], [25, 1, 21], [21, 21, 1]]), + ), + # test with multiple cores by setting n_jobs = 3 + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": None, + "n_jobs": 3, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + ], +) def test_tcrdist(test_parameters, test_input, expected_result): tcrdist_calculator = TCRdistDistanceCalculator(**test_parameters) seq1, seq2 = test_input res = tcrdist_calculator.calc_dist_mat(seq1, seq2) assert isinstance(res, scipy.sparse.csr_matrix) assert res.shape == expected_result.shape - assert np.array_equal(res.todense(), expected_result) \ No newline at end of file + assert np.array_equal(res.todense(), expected_result) From 4ba2c8d26615a478d551bc5dc330cfe180e4592a Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 10 Apr 2024 18:33:00 +0200 Subject: [PATCH 06/27] handling of empty input sequences fixed --- src/scirpy/ir_dist/metrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index d8e05fb44..77c915241 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -941,6 +941,10 @@ def _nb_tcrdist_mat( return data_rows, indices_rows, row_element_counts def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + + if(len(seqs) == 0 or len(seqs2) == 0): + return csr_matrix((len(seqs), len(seqs2))) + seqs_mat1, seqs_L1 = self._seqs2mat(seqs) seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) From cf9090496e9ce7ae85e234342fb18c4489b0bb7d Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 10 Apr 2024 18:40:19 +0200 Subject: [PATCH 07/27] additional tests for tcrdist added --- src/scirpy/tests/test_ir_dist_metrics.py | 54 ++++++++++++++++++++---- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index 0902d7779..8b9994b84 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -326,15 +326,21 @@ def test_sequence_dist_all_metrics(metric): np.array([[57, 74, 65, 33], [66, 68, 65, 33], [53, 58, 49, 25]])), + + #test empty input arrays + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, + (np.array([]), + np.array([])), + np.empty((0,0))), #test very small input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 0, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 0, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, (np.array(["A"]), np.array(["C"])), np.array([[13]])), #test standard parameters with simple input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), np.array([[1, 0, 21], @@ -342,7 +348,7 @@ def test_sequence_dist_all_metrics(metric): [21, 21, 1]])), #test standard parameters with simple input sequences with second sequences array set to None - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), None), np.array([[1, 0, 21], @@ -350,7 +356,7 @@ def test_sequence_dist_all_metrics(metric): [21, 21, 1]])), #test with dist_weight set to 0 - ({"dist_weight": 0, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + ({"dist_weight": 0, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), np.array([[1, 1, 9], @@ -366,7 +372,7 @@ def test_sequence_dist_all_metrics(metric): [129, 129, 1]])), #test with gap_penalty set to 0 - ({"dist_weight": 3, "gap_penalty": 0, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 1}, + ({"dist_weight": 3, "gap_penalty": 0, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), np.array([[1, 0, 13], @@ -380,7 +386,39 @@ def test_sequence_dist_all_metrics(metric): np.array([[1, 25, 93], [25, 1, 93], [93, 93, 1]])), - + + #test with ntrim = 0 and cutoff set high to neglect it + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 33], + [25, 1, 33], + [33, 33, 1]])), + + #test with high ntrim - trimmimg with ntrim is only possible until the beginning of the gap for sequences of unequal length + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 10, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 1, 9], + [1, 1, 9], + [9, 9, 1]])), + + #test with ctrim = 0 and cutoff set high to neglect it + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 0, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 25, 21], + [25, 1, 21], + [21, 21, 1]])), + + #test with high ctrim - trimmimg with ctrim is only possible until the end of the gap for sequences of unequal length + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 10, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), + np.array([[1, 1, 21], + [1, 1, 21], + [21, 21, 1]])), + #test with fixed_gappos = False and a high cutoff to neglect it #AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": False, "cutoff": 1000, "n_jobs": 1}, @@ -406,8 +444,8 @@ def test_sequence_dist_all_metrics(metric): [25, 1, 21], [21, 21, 1]])), - #test with multiple cores by setting n_jobs = 3 - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": None, "n_jobs": 3}, + #test with multiple cores by setting n_jobs = 4 + ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 4}, (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), np.array([[1, 0, 21], From 02a667a3a9206f60cafce9498a18519d77bd4542 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 10 Apr 2024 18:44:08 +0200 Subject: [PATCH 08/27] tcrdist test with comparison against reference implementation added --- .../tcrdist_WU3k_csr_result.npz | Bin 0 -> 6159 bytes .../tcrdist_test_data/tcrdist_WU3k_seqs.npy | Bin 0 -> 136528 bytes src/scirpy/tests/test_ir_dist_metrics.py | 15 ++++++++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz create mode 100644 src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_seqs.npy diff --git a/src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz b/src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz new file mode 100644 index 0000000000000000000000000000000000000000..75c88cc10ca3f06cb6d6014ccd5eefd805efaac6 GIT binary patch literal 6159 zcmb`L2Rv2(|HrT5+L_tw;^G$RUaN~_Tzij-h-|V)Mn=h&dF@?he9K6YQudz7S5)>z zsH}_|@w@km=;!;NzmLb^anC&;=bZQZ^LoFZuaB1MNqkxW06>H}K!7LD0~O>*000hV z0Du%g3$S&xur;@IJLl-^jR&B_oyJ@Tz+QqKnZvi9U6vmM+*q9M2ZbT;1toAODHjJl z%i{oD@8@{(4D&HCh(kvuS(pzD;uslu;@-IRg+hO$W-)urV8Ni4RmgfQb=aq8d!1Fg zL}i%d??&UfO_gLaa^~lDeWG$ZdW@tyqf9*cd*9X@pin3`D_2MGL8uP(YTgmE@XGc;a+XQ=wh~Nw#+4O}#>3 zwr>2jfr3-FiWL2r`eGG?XGqif60C%02~P?q(N!AuM+`ojQ2o#!z3%>iEAw;LRbO@b zm;F)SzCYl7y%{q4{h{E`&%IS-H#o+iVfDVflp(OliO3t=xeVPYBknQ|1xM?d^{Z! z`}Nji>yymrcy)U#QbY9pr&;$MCOtfs#3Y8B9oNf}{l3fs2;ohp(as6xEVKB$@MhZ{ z=Onj`*%S2DO_n9jiFQV_>^svyvFc`rZ_dd+pJwqWyqnA$o==5uCRpRs$m{E)s`(^^ zTo7T};R^0%l=1paJf~d3>AUEgcxCiaEXdgR-hl{i9%5x>1=mq-0*Lp0y*wUrTkmxD zJl>Ox?M)IEJfw@rMEe>=57*duBh-xF%F^rb^Q9><+q4A$0yM{?(h>Fu6Np%3j6FRG z#4sqwR}C7Wdk>kABW$^;JfhjaxtPA6k?kBK%W_;&Hs`$Zfp^4rtM!k~Od)}Qt-Rfb zC9s37qK|lAoQC_UA9qXVWUsz_o5}V)kAAE#UFv&2`%iQd9vMmk2)*!m_d(2a+pr4> zrexOmT#U^&ElYBJbe63Vxa6Rmrt}0S(dh7XUIM=!3;$Cki`$+xPEwg>ch0;^>4|(W zhE7cFG81)6>6DzQxEsuG2CXIdIV+JIrfSgQdZi3``LPvw?JA+2LF;YNGAM(tXnCpt z$i>dNwk+!F7ordC8Xo287r$7tZ^7$iPPt?>vP|%`%0_38a~EFdX6$i|UNsu$On%XI zy2LeVVY{*R(>PDni*D9$t}$DZbOhNmnC(MkhYxS^b#o%h;FM z#akr{R%MyLRYb=Iuiewpw{a=3rjp@GT=ofCvU%VtHM=%nUX(mvM=Nda1u&m7L`9b; zZ^RO!=zT^jPDMuIQI}-#6656}gZCrpX7{1`QaIZ8z(`=6`iGZ*iA~o2)9!5^INAJUoIFd*1Gg z3y+;!VS|)GryJV)fY_fbHI7yTgh3q=}atf~` zSW*Rb8=~~gxTTB_G#gG-?I@f>6PVJ(De4lYdXPUh@Kf?|B{ga!G7?pzOro2i#nM@!)Mopc^U2;_)_9m4nE9I%AE%om@&DhJI5x2t?E;Q& zp|3!*i>r4-B`2YpUL8XB=%J=TG{HnU)KQ;7P#`p1-hPrY*{epymwJimaBHC$6wBiP zJH2 zWN8bChr5*R%iFLK^15B^dR&^D-dk#G3TOQQuW@cGM+iT zX3_){&ZCOr%9;fWcE09+7gQje3m33aK6{PuJ<%ZSSw8F6IcY`lxA4i@)t>*V$Z^6p zSbHc#c6!&0vzYf3BICo@d7(uJ^qH3j2H}K_Wp?$dnch^&XmPhfAZBVRuyrjWOj0zo zy6e7I)>aQDK_*`WG3Bj1D4*XPe%3g_p#991ijXmM;Pc1k7nz3`mL*%WHYT@B2FHGj z<$so}=~%Dezl^5Dcb5eBll5szio&ani)}TiP?F0F!lH0NJB;JZ+b|6>GwS<8O~kIy z$>lT4%5`?!s5K5kE~;>4Ssi3thM2O9$|&dQB4xP`kK)kA;{Al)r}ui1>cDKTcdVx+ zn0YZSlLf_1U!5yu(n`1pScyHz89Gds_Eh+hQWm1grwZ@T4T#X2)IFmc-1|sHq*ynw zw@6vwrf%rVfm|ta8znBLRRVPz6`?f3#rw9Hx;dy+ztA9S3Bv`uY9w44dSiIou)7-TEhSUknHiNv%Bj)eqzsJ2lfi=$Q&}SqSU7cjJH?%O?6i~ z80wD9TaDR&tbEemtyWo(#-mYb2eY2{`$4*BynuFI7dPo0PPy`2BXOQw_UFKAp?`ot zpxiZgQho+X{TB}3>bmd(o``!cj0!x`eb+N`4Y=EuOg>LZ&ClM?^GLPK?FWnZ3aTvw z zi)7SqYNv}2|Gx*4Su1Y34Dswv8NBT3Ha6BvqIaHz!T- zBLtT?G)$S_j3)F&6PFaRk`gC%d{7=Y7aT&w0mg9va<4OCAmAx73P%9#a>h7F%cGQs zC=g@T;?sme*yTqGO!g9u6cCvu!#;@a1q#6fO@*jgGQCT!SF?hq!@DfmRt@T8>B>{U{FaQJ2K6de z%F{$1Te2<;*D3g$(xeYJT&XP2Wc`A=>Ob{QVLK{T6m@cS!utZ5 zAHoknQ8sE=zQm~ZUK0L0jG2D82bgbzOWe7Fa5nlET4Fpjrd1v?&bIJRj>ln#udbhY4FGaks znGjg?>Lrv~3GeltIJ>*TKDOBtldr`mOA}It;NXl4Q#mGr?MfOXI!Ii(V{B96fyHA> z9R}3Z_hlLkkv%ZL`_z#uF7oIP))2z7#yuMi(reUeSlFn1mV2ny1!jHE5%c@3dg+JI z-2gJP^KuLw=R<_<#YLJ4n=@dEFCbja45pjXAzUC7_=j~>)&IAs*_#O&4t0oDRs?2l zcPwIC#!#o!y4&4AT+3*2(QroUZr9sloNDcbUV?M3;hjbkPFS;goKuaQtr-XJ{!{!c z9dPm6t9lkWl!-P7-UBdi;>9CY{~9H`dOO@qVL53_5~m*nxR@FeRT;y=)S55;o#4?N0kbWfJYfKMRiz~YgWldFTN z`#v6F&teDm3hYS#LO;(;+1{qrqV*&Tc`d^z&oTdjF^&D>lH3PbNhW$a2`Tj1CJ9Iq zYERWJhz3Cfy)`#o-U&2QnmEaG5?Z_x)Fdb1D4!*>;ti`{{&g7UAjUj^n~kZn<^KJ# zC$R&20d^dFf5&*+od53r3gUZD05~Tl3ItLD|9FUiuX>UACjfvt5$v`lEx^Lm-E{vM zu;;J?`#pAiO?$1;c%HJgv!khQ2=tT^!Y3;)djiBQSstLzkWw60U?xoq4kK8mt>KH2 z#vAFQDXHyQmJ#gKbTMm{+Wji=(Yy9#bwxRd<`}~&P6mQi?@TE_K!i%%>SKsO~wBBBGOTWJCwiId@;#;&uT*bTY zBf2%@75n0~EK@;$eyRVBZRt00*8Y7TDHG#AzVZ8*8PUi7Y0~%Vov#b;-zOH$+7`PI zxu8oPO1@t^p>g%chI?PkUnWtkz51nn#`E6yiY4c7>CB_GsFHh|Yn#>s|H_(>*!0n_ z;(0YR!&tH^Qc?vQpqV`NZq-Pf{O>>q$sdD41HE^%?9oNDBHxx7t?V-fJ$I8r)jIWlr)i<SvaGnN#nrDijnL#1XGq1Zotx@$O#dmVoJ*l%v#zRofBfga{Hs6z3%~fIKl|^0{TKE5U;np1`=5VNKmVP- z|A&A0_kREH{{HX%-`{&U{`Noq&2N75kA5Bhy3YUX*Y&@u^S}G`v(NtI*U$g7&hG!C zUq_vWBV5mV#u?MIuKz;AWO?ZD(O-G0BK`ON*(dL};57+Xb>Q#>(B>-4hE?&>$*yu^~51dD!V)yo{?4Sm|9_`D-5a^7GxkP`4c8%JnP` zK0kKrRgWCw%*AxXdi1SsxaWiRtex)UVx^8(y|}bG zd!A-YIkwk1tAXFmM#rTy+IeFx%+(BExwOy~F5Wk|>MaYj@)wt{K3nm%Zac$Nv{KKny@E%WTu?#$(BdW**NoxH{ORj;pHdetZz{mu2H2S?oGKrZNe zlC>)rCw;-+c#KUqZ>)zevz{JtN}`zed`Nc>vUK@4*kyN+hJezhKrYM#G9Qx zV#MVG)>EzZ=no&_1+CUD2XwCHpl&_JRd4b|bFMM*0t#19{uS{X4hiRtMd1d`|+cD01|F}NEOdjGd?v7UAE7$Y!)OojFn8uZh z6V|S$2k_T_&Ro2_i+Rb@r*mAnc)Z}VEW{H(n?pVM?mT-RUAfjSyWUA*<7JOqt>KkR ztMy|#=>2p*qf6GX(_7fq@JW7gd3V;!k8kKrFQ^$lUsv{ny6tUT{l!ge&wov@{1qMna|JsHm|IAqvP_!`#8Ma_wt#Gb?z_PaD7JKPA<+|-qDp_ zP&2;zd}r^i+u!yx+GOjTU)Pws{IlM0v#0UlsrDIH-QqXBaExO|oY{$j88Lf!frBQ8&|SpaZcG*|G;qn(V`OQ9@eC+y*%SX)kf&C4=>1S*?!&fit zJDqjQF-AV%;^FM}!vU@TbX;07v0XdU{i$9o`%dab|IQxK>AmLQQ?BP}b;wk_@cB;d zu6o0LlC7HhHAdWPzeRU6tKD7wMqKO7zP`hHiLHwfmj_RC5O&9vRXzQr*RZc#Il#02 zZ}wZbyvXr{Khm45nZsTY{h#C)D@XR9CmxUV;wHmS>(_DREIsFXH^&{ak;7_2S~GTRfIy z^T?f=u6z$3aq&_+*E_tr{TNqn_Axvf9Z&YO5to@=5jXAT|bYwo_EmBgI#7l z@}rNjFw<}T9Vz?7C-0$fQ#V?1vq!AvZR7(_Y1SKhk7itRt!nPLVLPXGJG!r2+Vlk$ z&T1djt#9MX#d4_IxsUmLm#w9c+Zc4~Ik zyW&U3#p8#wb(`apoY9LR2lDHh54*V->BWl1r&$|uX-+1b&Eve|@ra8d2k`+{zg+V+ zt6#@eF9&8a@@d`hvY+uc*L<*!l|AjMSKVnmd+?+;I&Ss|bEIA?I_vghjJRhePrlK| zd`PCOU(I|*^Cxw~%-)ciVeL2I=xT(Y{sR}!Hej3 z{v;2c$8^>o=?S0t`Dq{H>4EU0{nI|ivwl2$XX|&~@qARbb9SS9$AgmaqmSjEr}SlS zpda^>y5VBrsu4ELKGSzAdvDz~)Qd6VnP+ue{q_GXiC%U3mcOa_NblV{;=Got-sCjd z%KqbXb6Jbu-T8dx%9q;61K!hKt4H7J7A{TxU5d9pbHtz3tc~=_n1lB8o}bP04Q~3F z{vBrBKJ+0yL9b)CW_MhASL=^;9-Y5>OHVt0Q!lM*Bn#1uzuFs}UQFIT$yhzs)=j>i zYMpVj-{=qDrRa}0E8o#NAHvQ4BYS*S$Gg3uZfSkMz?3gJ#?E}Re2n}Jj~6n#%BovmW5&g#b!UGJxZ~pSm3FzL@%ZuIfMXFKnW5!ae=9p|3-WS^9)v!~enPCi!W8tKK|t$#i8Hbz{Y#s{9t zUA_~q)^NrRKehAj;EA8rbT&p@&&N+Sr1=~DhMRqXA1nP;x4n&#UjHU#z1cH&-|;5s zxIE+qAM{_|%QF1LUu(v5c>c`vnf$HP@2WRkzT!k*KA*kY>-N{^xcqgb_i$IU3*K>M z%X+b}!%rXPo!aHN)h);9xX~AO^1SNBh>Ia#>gJq-2e>QyaK}A+ zx|6S(_823s+FQS~UQ<8wxTaan?);c>J&Utc*J14rJ-6dQ-QIF#DjDXl+Qs7)KRO@m z3D!^RZV%TZtzKLjX_g5;ul&1Y-FA#lZ|cS8^hSK(%Uv!yZuS})X~s`~-0i(}YwH6R z4sUmK*6qjW^nQ{B;EpQ?*3QRo?K^nkI)5v_GUW)Lzr5bzk9i#q0T04d;eoNUh)bKjjK90S zW`d4OYxK&;xAbD-(d`h!wBRPIbYZQI5jRY}-uc+gGvlUb)hF6{edmF>I&L)MB@fuZuwRo@Z{?0J7c6bd#;|f4?oe|acRww@1>vSEFL(^06ymG zo>R9D>xP%Uz|8lF9e(EnuAHTxztI=;>IFG-Sii$uxxUNENH1-k$K$GmQ-|gLcTT(EX#EAV?}G-HssPqV|sbl+Zk8A&i!^*C#>@r z`9LqOdO6fz?>Vpj+eXLbrA+W5ePTXduV^1}J@>@Zdh9hu+<0s4^wO@5&a3;xjEm`@ z{TunEH(uHwR{X5n-bSZ4EP7>v2j)`4{H`wB(Q#=$?-RWCnPa8*W?cH7=3>T;#&71q zdS%Kvd*-SK^~iAF@w|i1CmE~9T5jI+_A3nA@BBWdeRUt|xa##ckuyx+SD(J)I=z^DNG`I+C(n29qY;;< zI1ZW66Y7}$;Bnr4-M-Xu`H)Pg7bfj1dvDz~%!B^Iv}OnHZhwsQp7#dabAQlm4!UQf z<0h-Vx4Y)cYR$~oan+1&tlf8fJu>8CTBo|J*OTw;BfZI3*1hU!NqCKpD-YI>efA=A zJ?RVjtY593XsySF{$i#6_6K>;*EIV4E@O4ekQ)!u8&C9){5^ANl?UgYUY>FFggRkn zKhJgY-F<)WxU^yN*ZCX!Lwrr`%yE~iy5((jdaaL6yu35a6CdvMkGQ;)B{{(MGe7TG zb<1mXTwcUm8fneHcRkH7eYko-#-8+Erx*L|uj}OZ>AsE`uJt>1_u!dcKJao!XWf2` z5tqJrfGu;o>={?>_`y$^2-_O&GBM)fV9IPVmb$I>-IeS4s2Qj^~952tH*|YocDV)udW?0W8_16Aig?d@9aGzuKLKAee$Uv z9oHV5p3__ULmgK=>A1YW#!){%rH{9&q zyiiBnRd&ZczcZis(Q(robUw*i$4%X2jlbD*R&UVM8kKUcV zrfz?Yj{C`;4nE@Ym|kn+0nN@keLdoOULVjbTY7J2Z>ZacdTCXc)W}({BxkgnFYhJu zU&lxfTQyy*TmHbEzW^Px*A0 z>k&6P;|Cw?^H}DTjb!dx!}Q#hOOv|$efZW+BPNgi+^xTUb&QTn2bOcZrkg+JbIfuw z;?hJLW;Cy`>h{+daq;*Nt#qqbazXsj4#HA^FS=i}~yZt5;bX=Y~czO2r6E6RF%U5S?2koD7vBOOs=$7Zy z&>UU;Mm}I-(aP%`opt*$MqEsFayG}4o||#4+nTLc-#N?n-5y!D4EG&6)d`bUemL`E zg;}??T+h?1cJX&76P*t~{iI$oWyDR*qIbj zlfD~q`I@@Pt!vtM_MeWM_q%iQmh(#PMqJ-&3$7N$MA`larb5E7$vh z*6i!9&5`%V>V7%m$^tGXjnT`O)%|D0jb=Krd-fSW{lSA3{)}tSwQl~>j3*P(|0HMhhRG9pcRcCzX3x3XziYbTm~r)+HOs>8 z9?@~LH{{*%t(=9+2TZzoKC4yz`ZZ>Hvp?+AE8vc6{c`J!|E>JuS@#pI^;p}-@l|%S zKl0cbRy^)}2zU2;PEC7^J8o)6}Obx_$za?WBSbX%;U}n&+_)$K18RpI?Db5(u=+0!5x>s-`pqpz=L@4 zB*UYeg)3LG%LmshxvkrdG2`Zaxl*U_a=znczu9HaxcSaQ_ip`WTp3Gm@OSmy#*2PP>o=1#B7%CUN-Zv0+(XVz`4zu9kSezK>{^kQRX zU%&g?s&0La8JDMWv724+JFZ&MoSwdWuVES;7f(;@?z@=Aj2pIda<}U7jO(4rV;<1G z!|%8}#lduzTUz6b{_|Ov=Z>5H+TCwzs$EDrN4Llji;YO!y@MSd%Ch55LNIuwaciumB`!PChJb0oNc*fNS zyvVV$ug$pfMX$Qi&ku9>EH&tQZpMwbIC$UEOK0*c3oAP7wqwln(kK%=2siJKt{)C) z{b$CdPjB%9o2Ndr4xY@|Gp=XxobhIy`t{rBxO~mNgRM`#(Qg<$aOU$j-a~xEp_67l zIDEe1ZQa(^EnK`Onmd2xORf02nt{6YH99U&`QVT<`ggpTapT3#d$^`Q#*E90_>qj! zE`!~Eb3p4q9hax^K@L*CWa24TcEjEAwH`Z0rx!PDdrfj2Kje6(x#OO{o8_Iples%C zpLt5}Oke%_ZH%~{kGFDc?X;$5{C3Z5bX=LGRnD*-JDKRXJk=NS8qI3uGiUS8G|#y7 zI>L3$AAa2RM#rTK59eEXus*(GhP~r$JvQuP+^ku?^auV<-*rChFlx#lW5l(7Ivu=y zGRKUI&jX(9^l;ri^f!9rvAoKbpIy^G%WnPpmYe-+bw9^6Iv=c?zchZ+U+dB%pW$Ki zI<;HfD>@%=Q@2%CJ@OheE_V1~&)z*RXkE;>@pqNA%FW(@FT>yH3;O_8n7iLiIv>tl z9{c(3Z??MSHSV}Od*cgQ7c(wD&&N(CW?bH?odZ98jjvZ`IqA5V`F@7WW2{w=XIy!} z^S#V<-XAM_a&$zVGnarhzEA{KRv{|>~$vb1l-R-fG@ES8NAFMg~O4e5X zjeW-T-6mf0VkHN4+c7$BG^!h(T+pkA)^WFAtZ@As-R?ed$EA(-Z*pZ#@A6_d8#BG? z=D^?awr+3tNjyD~eF57wKlpN&jZSa4>Yfa{Ud`K#8&7lo7T3DtgY!4~jSpCZtYHD-FP)B8*Y_^_I>x^$aqYRuSu)J)?D2fLtLKbs&B@yCJFep@Zte>=V|@wHD+9OO5T#QmA|pTJ8pRV zNao@<&3F1oT>my17RG9=>ekojxU^;8*~vk`9oPDw=)B{~!`a(!?2~HG-<`ZhXU-!Z zaL@ZMe;qPr9kL=1=I(mBoiwMP)$A?^p91M>oIuVc+QbtbTQj85dLCFnNNx`oGK0xTCxmdTDgr z=;Q0%nnx3L+_Rs!`Zu-He|Mwf@<5%QW^Kf!E&G|e(Mi7}n(y*9ea>u3D zJi8g5apg=V^4(^KJ>#Z!>5bLDSxU+<58>G(8S^_-2JJ)-041>A4$54@D8 z_^Wo`$nTkJf2Ud2c4{~B;mqalj<0q5F=kv$b9|G_TOQ-?^l#mMj85;F`^4iJ7aNn_ zZ`Casi@)~XPx~0_%ylipKL2DtJ9F`#{D0hL{^~JVvuE;o^>5?4b-3?DZ@4t8+qe7; zpTB(d^Xgtd^TD1Loh$FOy8SioxICbjPI_10^)qgm_J>{F9XB<5qP1kSdB}GLTw3L9 z$Ky_K-fRAT=Lh{eUfgk?es90y^20v-Exj^@$Jco5v(=vK^kO`D|I}m0=(zbVjjvv` zerx-I^*$bj#s$$-EZXKtk*S-xI65b-t+{2({rm{cU&3M6V9uw zy7e_W?zvvc2EN1Z(u46y9@RHv#?@QVi^X5huk7tJE*~7>VDtH^tv>oOX57?jXMfBD z9hXM>ctLaapH>;P&eyX*(*D)?_`*?CR4aPU-6>j>NU?N zXMX0*>UXp|t~DoTuJN7k_#OA0f#i$_=bow`+cD$nS^o0Ed8J-;+c7#WCY^NeGJ$7Y zb>nCD1wXH5xZ~2e!>QZf#*C{Uu%CF`ar6JfOOCC7$LASW{ppQnY`wSgJz~U-Rvfud zx1FA-+lP5LbM*rLPIm9O)^FWB-tpj$o4sAVzRAt^k@&DuFItUR-J+B3-3)i!d{5vp ze|PKGap?=&-jjEodAyVE_PCCltUcLtYTBW{d51me2TWt67dKhpEkC|dH(Z|bGVf*m z_G!*=X;dpZ@nzT=R@t2o=PZO9-<{28j&5{ZdpeEy4lH$C?Y%2Ee$vR(8!JO0emGke5M2CRMeJrQ=t#p8i~<}v+xWc9n>m5YJr`Rq0M z&a+dy6>fSsKCkq~h#RfR82y-^^c$Av<8Rpc4*WC^IQ*n9OxL_#S%2MjjFAtx4qTay zub<>M8h2`6Q~wxOy)tUe4%x-0clG~YQ@0+Y z_xEeva*U2^-L&$V#&78jH~q&0`C8r6W_q!{&CUCGr^iahYmD@!pRj3ulH-nxXD>I0 z>(zYKt;guN=?m;9{oHY_-66~L+B0ch^}FNdZ+YirST-Mv=ocQd?eunG0C%k&> z7@c11c)6Xu{7xPKEqWS?B^+vv=>(`(JvpX}yZM)Gc&`Le=|_8qTRxXHp!*6z63 zA9gcw$9?ct(@T|15Y)N^!m=A zZmAWvpPl*qv~IZ3sb=bh_YE$dOxSzQIlzC~7s*)GkG0zyBY)3aTF-h{v(|Av|0G}a z*fBaT4;-o6?)Rw~7c=i-XEnV0`>Jj^M#q(1KIk`f+pS&4#pFZ!j?UGL)vd#QQXX*m z5VmuAZl#_hz1eH9yf5S|`sLWXcX=CelUZxW#{7oAxY=`L;L5$zT8|9<#pUhO>~>rl ztv}hNJ=gL2?j17nH(d4iURJyKe8;O>k1^w7(2DO`9ap@q+lKx|uk&sWFpZhs_`2%p z?})$q?+RDWR(%b7#>Mj;CEk9zKg3@-kS}#}{f)Ziy9uohIm7q4ez6umJCF9d&Ifx7 z7Us%6Q@6d1j>`)k#9QZ1+Z*uL4|iOeG;{{%F%V+Gn?A9$)Zt|8I%I&9I?*)44&LL~&FyCE2k8;Lex|}hwq8Tr~;CG(P zxazgSsM~&wj;mg@o;@~?&&)+XuR1?wTpS*UgSlF}88@DWojuLxPy5*V>5XPO@w+B#iz^e>j=9ncb=zyq^zs0k54-Fcmu5^}IIrZOZaYTD z&3hATXJ7b*9qwnz7H`uFv%Mg!^LeBfHyY{wR=?rWDZ^LIe7@_ox~262Hy+1})jfU2 zwO6SbUsvC0cp06)wBgeFO}+XdKGT~y^v8-HonAHL0j=lS#hbhOb=+vBEnJ@a+3y_K zU4M66x;&G2XE>|xyAc;tMzOG0*Iu{2M#trAdd_*a<`?XAG2T0&H{e4uIz0y?yjdb^=sU5v3GO^+;P=N4zhkf&%QT>mvhJU?`Cv<^SzmU zSGLk~__X`XeC|lp3xBfXx{Yo5Wy;WMst$Eshw#*7cKp z>#w@yZFF2r`r@^pvtQh4H9^OXx6$n9Pjkix983qz->6%-e85fJayI9kCnF!C>XJ zWT77W;jfy|B?EF#Z*sNzH$Xkoa`CQu^VyvTGj4kH>;taDnykURUFNR-#)rHk;`M6( z)>OynxU^*5e5LPB;{p8jpBXoK!{>*3+Jjd1p&8fOF);0uJYW58*>R&ad*&{C#LfFh z4SA3n@|?%~xXW_KwJ&-;^~2-C9j|UVM#sg#rPqPmWwn0wW6ZdC;qU66an;VAizyp4 z`)oyb$CWEQIdfjESKWGyj+-pVhKwc4@%gTP9apw6o%yUEb~MkpG^HO_JgwVbW5%UZ z)-Y-1^D2MF%^tJrX-)eY9oPEP7f&)ViR$lizpzj%)o6 z+A*`Ixz2hpce%7;IOJhBZ=K%!4HTx{@Q!v~`R>>k>&BIxq4#?m?Gp_nS^`hgZkDt_x{5vi`=$5q| zpI5jtp=L7eJj-qU`WQ1Vt?3JSPVa?1t7pfxb}ZU(?)s~4c^e%Uk6v|4PdnpxdszMI zHSW0BbjRD(f0w%BX5Zxn{>;<*^=sU5`N0c3b42%DCT3ju!sNw{w{?5l7y9>{E!-V` zrxzDT{dc{sM_RpkN38S!&UJKp=}f+M)*t?i8_mw~+I%Z}+>DE{^KYHe>fCXYFy)czyv5zkMW>gxr#)Pc4)uBm*(Z0_jddO)z4@-l1N#C$R`;>6@3_g8Id*zv z#N|Qub-7K}cKTz)4HHN0{G7Q~zDw5a!@7mb({JvRxOD1;Z|lWWCv`KA%&ojH>$YQz z{LP-8e&aJgKFve+bi5osb6AJZ?&cWj&9nN$^^?AvalgGkM6Yx7n&WQmb<1mXdU+rR z@{oC=Km9k$N5{?gK7Z3k=T6o-E=F=e=Qna@?e-i0ALwMt&(=L#d;R(~MtX6reRt1& zLF;0~_0FWvk(#;Yhu>x2pK0&7*3VP>;wT`RL@REznf$y_B{*|l8JQrr_ch)UuPq^}h7nXDOv(;W1>BWui-F=}R zJ;oh3y6ufL+dF;cJo_#&<0c1u*nKD8akHoKV~0QE>J4j^XWE^yXZub4`WPcFMsnu7 zs};E8%4l+&InswKUUyv2hL@hxYd%}?V})x!(+_wn-qvlc5A^B{8gW0}ck&K)##8@q z)lF}y8!zdUVZCM!KR?~i&Ro22yf3U@jcCK88QbU1t3HqN5D&0v&)&1+bH_dRg?`2k z-+XtyG2&XUta=tB^)uJqd!lYR#)wN7R(cM5r-y+%u9~H8@!?Z{>BYp84cd4880ocs z+2^xs^El7mJN4_gG2+UqHOFI_UCF>1+;Ow+)Xp4^l|2%F#7(`z_VYLLfX4^C^S-va z=hS0Ez0s+5bnfg6n8wJ5aQPYCuIXLvxjXJ^-`13GW5%_92cKn_*3}HpxctD|%|boW z`b%Tp@4Mc1T;J6(@pA0cY{ZRD90#t?cD=az8*bL_jLGZO?<95W4}Zgz1H5l@?KgUu z&+>)udNt1@AM}``?ms~m~UmD7;)*$y1&`S{)SIqXu`S^x~#g zGW@Nah06o{-CnClhJBpBiLZMw&-L_*{*e!79v1!SY1;k%$-d(`&!02l8e8VA8-Z{^hfIXy5t%*Kw_XXaB0(-$uvfW&Ss!=;gy|=0;pzroZB|&s_6^H+TGw>vUGxoNgszRzq>l?kz;iJCS%Sy z%%PsM930U4&y1_jQa}5}&U>Td@)gs&l167-x&A~iuIH_NXZ>~iF*?2HeIZ`puV@AC zxar;W$l2SK_fp5@Y1q!V>213CG|Tdc8(y?N`F?W8#r#&k(VINziLS78Sc1vdRC_ReXiY#A2Y7? zV|=S_$-n)K?r-f6xLNydz3S0pjC_dB)Qr#e$(_FIxN_?_=gmB;?2apAyhzq~!)NoX z^xTM>Uc+?A!?(DpowZ|UZ|D6=kBs!nY}Svv+Y|N3kM{+gYD1s8@fh3B{Gc~`5udwp z)f*r4e#2C+l|A7+$B2tXv-PJxcG(>_+5I;6{Qr|P$8PO9z5GBYUi<9LIK?|iqvjkq-O zlqMRTefFsjSvy@%{=bOz*l=H$vFvBq4)ycilas7vwuX@pxY7L7iy1eyI`0pw9(Po%_2m069!9j%$CH)bt=nGXs#k7hlh$Naoz%-*t9#Fk8!w*J?v9)NfquGI zHFwikm(Q&i(RX@YadBn9=9?%<4lDF0OQO8v;`#?OVo5w3zsN04)3yVg%2-YS!`WZ^#459Ho%%r6xi?nUk7;y1(C46;ujWugnpgIS5!Z7V zG9|~jwC;SDnsKq{bgbsKZaqfFnH4vE5mA!?Hui^d(Vt(?Yq9#Bgg2t)*r3T z-^j1JCBv?>N3QlnJ<{gkNv-fQMn0&KwP*j44Y}LN+llKysg`Z z{?eH&?CQicMtZ}=-tC72dc(y(d$F@ebb6Bsn!`5FDtpAu`;xb~d2ir68 zkgxoWEW7x6h2~GV)=n>tSTy6V@;klp^y!|~acQJ)b-%4!-^Pf`KY4KAKY4eKxI9c2 zzUf1_wBoO3wr+imPA|5D7xo-;?ELN0aqVYVjwki&xYqv7zlZdj_0t)jdFj@KH??|d_3=$_ImoC{ClLKo>3Y@Uj;r=(X7+_u z_KeHdyq8?ln)ExvxpL_b#y5WBF`vX06 z@<30-hh3jLA8^kZSj|B_c8rc|Z_gf)p4ee`Tz;fp=@CDNeV4rvS8wq+`wSNCcdWYQ zH9D^RCTBc}FRSd1t7h3FU7N#PtJ-H=ES}=6>a1IjG2*69bn;ksv7J}Glhkd)_XV!~ z0Egye+4U!V93OEU_}H%bYu-CwMn2$%AzzrQ^{QKsG2^B$u=&Dk*`}W#Gno-)b7f2p_<%20GQ3*D zPA{F9-_%R1nq_~CW^*LVce+P<<3Tj9?g=&RYjj*19jp3Yz+V?LuDw8S@WWoDFFx6S zv)|x7$wECg%r7o&-|k~v>kfB!@2y9NdaXS@an8n04M%mujmB^K8*ci|&vd7c@7`^7 zYwLq&)h=s~{uSnoo1WO+Z!nF{-}IY4a4j451UYfO>yM7B*D#XhygPi(6MM&O@9DVS zoAGw`eBCx29aqhw@w{ic=85mE?xB11Jz~c7951kTHFsQpWAT=r-Hg>EZ64^916t`$ zPuzWfs#}lI=?(YW`Ax>09lQLQUi*S0e((Zc9^y;a*Bv+gAj8<6cV6wg85c**9O-L% zyUf9O<@b6}{@q*9k1AIE~MyJ>FyR7qEBd&E? zbB8fs}C>C!54G#{LYV#t7hjs$U@lhXEzUcIMHd2-CiE)#n1!sfj)EXyyHgP z{M#(f6J|X&tXuv!7aDPQ`WMp}=~Zjl;zK-EvmN~%SFPy8a}M8m<$F-al?ObG#)wOoBR#iL zrx)v_4`}~cfPM|iT z*)x}Je`oA6>yfvIalad_;mWmNc{Y0K40mPyb^9@{-2D5j9P`*5be+9$eK*I5%SSxx_I$KI{rFV)vCq=`0GC-E}afusuw;s-7DVKZ9no5UwIt8 z=D_vYr`%*>mse9CW8?#!*027qt=~GF@7}|8%P~e=S%_X9SpQDOfJfZyGxXt_V^{Zx z8?Ex-T844$&F13KZoly{+)v(}Bfa@9$qxs8{BSnsoz}YLhrjVOzNRa&got z`?}286L`M5H`J{|E+4Ze(5_zUxVlHytu0sWWHmnUI(b;(k9?pPFU;^(d!im4>XrXw z_?$OVI*>;J@GJsPgF`kiwe|5x;%^(L=)^d-AHyn5^ySMD`m z{0RSP9x!RnzZKA_hH~eO%OB^vck@{V_d>o=w(LKiYlfyyEp$Z@6lf?^SZPn~M>bMw)pYPjP*=Iy$b}SugJD+Rt-# zTr7H>`G9ZFU#;DU>sdXG$?qq+N8G$Od76EK=YBTViq9Pv#}Umkm}}>|tc|!B4t~g6 z{Kca6sorQ)r_{AgV$nsPnw>pf@wFZ~#)z9Whv}?O_`J#< zaWTS-*LU@=N8ZMjiy|a%6KH~C$&gka3xpwMz<@!En&9pjb&G!nQn`6b}D_36e z&UpxbCwEtFvS7`4*)Q+Dm)5Osd_R9fFK^=o{_1~WB zGJ$EXc(T%Kb^F`*)Nqrn_^{f$cp1@~^=@`7d=!;!1Z zhi#73c9z4AOE0a-+TDIzx4w;4?p^)gp!LI*o8GYIWMY?Tjw@FW@F1Cs$2)t&mFwM3 zXKE*3w8m>|+1(%X1#a?7XWm8T*{$E%2l~!DIvslClRVJM2dwxitFBl3p>93KSucjQ z^8=fo=Gyh39(l_{xY3Lqzg_dmT=I99y^-E47ss4f(Z0(~#&FI#xZ~9AZ{w;rHA{bF z{kX23S8{yi^1$I)964}p{c5_Jw-Gl?e7yWEyUMIbe&pet;dnx)4D;Y=-Dt&5rkv?c z-tf(L=k?VG_0qTD#OqJ}#g#D!P53+OuiK9?(#wmy_rFy)Jawlte`ASzTpH~G4m?`TW!*d3s@q?K z-ZNJoXpS#o-__{_{B?2VrdH0_crsuuGNy*JeATayapuN@aIkS4D?N4qfBomo^?de0 zbI{8ZUPPzs6;IAw_j@ku=QW*}eDM1fX5Dt=H+4&9F?g6>OXhs`L@%x*{<=<{Kk)%4 zdt+)ypU<6n^3(^<@_{#4yE^NUW1Q<|zoe57_}H|nlc5-~?${4OZ@PQ}lwd-v?^44$RS}Se1shfH5 z`MuK5SG~#BZmw#&w{hmu!Ux*;={t&RnyrC;=as*Wu3XuQW^*`rlD=E&d=bj~(M$ zx9E(&_;m8fd8LQzwjch+SKO!Gc3hernM2*=?P(sYlg{)5pB=k3t4H2^z~!gf+1KdH zUbO4Uxo*injq>^=xAoXDMqJt)v}4=XKJho%rS{ULg3@Es?f(!X23kzU;F6XtTP)~jwk z#)!*<^TAVIVz2JQ9hXL$c@e#z`b%r}9p0|wwQhSGBfZ&o@SkWkLC3|U_31mKPeIinGG zH)|c&zL32Ee^>K}izll*pfld?>}?$v1E0sv`8SVMKW1Fs?r5yrU!&vFDN|40LEtm4 zdi_ms`KrgT`73{FxY8S)URq^>AC6qF_D9F1Ek4M=&fZYBxBh0|;e~7LPxoJ(WLG~w zdFPLOu$QGTqMKg6sM*uH<=vzo)-qO3eZ;rU0&-$H{t({Da zxN5~iT3zq-L*0Iij*A%&a;@j0{VsDOt{TyaXYD*)^|fw2#+4gw`EJH*`M~$CchG&8 zwGkJ~I(M^GkG#f+t4?&&BxAC-Te}fAdENbvQ`5gj$JJvzrTd8=GcKRM$<<@m9;V+M ztNYJL@0p7)%lg5!-PHc}Co1H)q+{$!^{L9Pj7=E%LZ`Qv0_tupgp0k4&>bUc+sN3K2 zODB$+Ia~kJOkDNi(V5!u0edIAb^A~+50fkWZ*XOaMw;>X`i(r`>QR1RCKJ25ul{Bq zk7n$=Gj?>3xO$9EKDfr+&Bm3Br%(Biy^TeEufhYxa_y1AyAMjn5n7fapn^atj*^d<)~?3&h3{q;N_FlqgU zUUiDM&NAjR-XtS+xirpsz~Ci+WebmPKf7M}-cYw4W5iW=T50yT9Dnn@a<%8q-1EQ3 zsTb|mK=W=dU%7n5!}b2$<<%p{2p20ka7}ysrkA(oSnZ87*K_KVOyEW{-m15C>#L7x zq$_pDG0$#4)Fa0@>y18hIP$+oV((;jsAJVVZ9Uq*Z)-PPz(qpd86aFXrI+jDuXg0_0o=}gxM)b;9>gKG!c#UtK z-S=|GjSuk#JAJ&;cO&j6`KsxUG2-$$dxbfkyo;_}`wV}1%VR#ccC4(wZac=6D-T#S z%bER>=1=P;7kUBH5l^h)sa`erEM9cG<}beAt&Xz~xO~Lot+Tv+(r?y(=Eh&^$4++5 zv7&pVSKWAQ9-5zceC1-&nBKrmmOrVRHG3W(^PG*H`d#&AUwBfhlJOd6uJ!U%#%Se- zgJ$#j+NRWr|H^B8Z}+j``OzsVD>EJS;L=IhE_XTALM zd@?2r{KfLQv$d>b;>`6d7EkdozL7KBtdZs?I_t4xT=il*`1~|uz#}dnlCP(luiVs( z-sFup*?-LQq;AoN8$W1Qzj(jGKI_#JdO^=cud~m0^LFL(A^ng{?C7k=k8$N@{n*(n znB>T+<1`pnKLf_?w)qmi<|dD{_e)jUa_P7%#E)2 z>daT3m@|F7^7qV{o9}2C@dOjk@6^xSyZz=_&+x$W(fw&2a2;XlS9 Date: Wed, 10 Apr 2024 20:28:02 +0200 Subject: [PATCH 09/27] formatting of tcrdist tests improved --- src/scirpy/tests/test_ir_dist_metrics.py | 439 +++++++++++++++-------- 1 file changed, 299 insertions(+), 140 deletions(-) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index 3bf5d1c2c..326d5b686 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -317,141 +317,292 @@ def test_sequence_dist_all_metrics(metric): assert dist_mat.shape == (5, 3) -@pytest.mark.parametrize("test_parameters,test_input,expected_result", [ - - #test more complex strings with unequal length and set high cutoff such that cutoff is neglected - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["WEKFAPIQCMNR", "RDAIYTCCNKSWEQ", "CWWMFGHTVRI", "GWSZNNHI"])), - np.array([[57, 74, 65, 33], - [66, 68, 65, 33], - [53, 58, 49, 25]])), - - #test empty input arrays - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array([]), - np.array([])), - np.empty((0,0))), - - #test very small input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 0, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["A"]), - np.array(["C"])), - np.array([[13]])), - - #test standard parameters with simple input sequences - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), - - #test standard parameters with simple input sequences with second sequences array set to None - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - None), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), - - #test with dist_weight set to 0 - ({"dist_weight": 0, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 1, 9], - [1, 1, 9], - [9, 9, 1]])), - - #test with dist_weight set high and cutoff set high to neglect it - ({"dist_weight": 30, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 241, 129], - [241, 1, 129], - [129, 129, 1]])), - - #test with gap_penalty set to 0 - ({"dist_weight": 3, "gap_penalty": 0, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 13], - [0, 1, 13], - [13, 13, 1]])), - - #test with gap_penalty set high and cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 40, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 93], - [25, 1, 93], - [93, 93, 1]])), - - #test with ntrim = 0 and cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 0, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 33], - [25, 1, 33], - [33, 33, 1]])), - - #test with high ntrim - trimmimg with ntrim is only possible until the beginning of the gap for sequences of unequal length - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 10, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 1, 9], - [1, 1, 9], - [9, 9, 1]])), - - #test with ctrim = 0 and cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 0, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 21], - [25, 1, 21], - [21, 21, 1]])), - - #test with high ctrim - trimmimg with ctrim is only possible until the end of the gap for sequences of unequal length - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 10, "fixed_gappos": True, "cutoff": 20, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 1, 21], - [1, 1, 21], - [21, 21, 1]])), - - #test with fixed_gappos = False and a high cutoff to neglect it - #AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": False, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), - np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"])), - np.array([[1, 25, 33], - [25, 1, 33], - [33, 33, 1]])), - - #test with cutoff set to 0 - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 0, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]])), - - #test with cutoff set high to neglect it - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 1000, "n_jobs": 1}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 25, 21], - [25, 1, 21], - [21, 21, 1]])), - - #test with multiple cores by setting n_jobs = 4 - ({"dist_weight": 3, "gap_penalty": 4, "ntrim": 3, "ctrim": 2, "fixed_gappos": True, "cutoff": 20, "n_jobs": 4}, - (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), - np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"])), - np.array([[1, 0, 21], - [0, 1, 21], - [21, 21, 1]])), -]) +@pytest.mark.parametrize( + "test_parameters,test_input,expected_result", + [ + # test more complex strings with unequal length and set high cutoff such that cutoff is neglected + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["WEKFAPIQCMNR", "RDAIYTCCNKSWEQ", "CWWMFGHTVRI", "GWSZNNHI"]), + ), + np.array([[57, 74, 65, 33], [66, 68, 65, 33], [53, 58, 49, 25]]), + ), + # test empty input arrays + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + (np.array([]), np.array([])), + np.empty((0, 0)), + ), + # test very small input sequences + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 0, + "ctrim": 0, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + (np.array(["A"]), np.array(["C"])), + np.array([[13]]), + ), + # test standard parameters with simple input sequences + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + # test standard parameters with simple input sequences with second sequences array set to None + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + (np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), None), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + # test with dist_weight set to 0 + ( + { + "dist_weight": 0, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 1, 9], [1, 1, 9], [9, 9, 1]]), + ), + # test with dist_weight set high and cutoff set high to neglect it + ( + { + "dist_weight": 30, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 241, 129], [241, 1, 129], [129, 129, 1]]), + ), + # test with gap_penalty set to 0 + ( + { + "dist_weight": 3, + "gap_penalty": 0, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 0, 13], [0, 1, 13], [13, 13, 1]]), + ), + # test with gap_penalty set high and cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 40, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 25, 93], [25, 1, 93], [93, 93, 1]]), + ), + # test with ntrim = 0 and cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 0, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 25, 33], [25, 1, 33], [33, 33, 1]]), + ), + # test with high ntrim - trimmimg with ntrim is only possible until the beginning of the gap for sequences of unequal length + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 10, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 1, 9], [1, 1, 9], [9, 9, 1]]), + ), + # test with ctrim = 0 and cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 0, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 25, 21], [25, 1, 21], [21, 21, 1]]), + ), + # test with high ctrim - trimmimg with ctrim is only possible until the end of the gap for sequences of unequal length + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 10, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 1, 21], [1, 1, 21], [21, 21, 1]]), + ), + # test with fixed_gappos = False and a high cutoff to neglect it + # AAAAA added at the beginning of the usual sequences to make to difference of min_gappos and max_gappos more significant + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": False, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), + np.array(["AAAAAAAAAAAAAAA", "AAAAAAAAARRAAAA", "AAAAAAANDAAAA"]), + ), + np.array([[1, 25, 33], [25, 1, 33], [33, 33, 1]]), + ), + # test with cutoff set to 0 + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 0, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + ), + # test with cutoff set high to neglect it + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 1000, + "n_jobs": 1, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 25, 21], [25, 1, 21], [21, 21, 1]]), + ), + # test with multiple cores by setting n_jobs = 4 + ( + { + "dist_weight": 3, + "gap_penalty": 4, + "ntrim": 3, + "ctrim": 2, + "fixed_gappos": True, + "cutoff": 20, + "n_jobs": 4, + }, + ( + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + np.array(["AAAAAAAAAA", "AAAARRAAAA", "AANDAAAA"]), + ), + np.array([[1, 0, 21], [0, 1, 21], [21, 21, 1]]), + ), + ], +) def test_tcrdist(test_parameters, test_input, expected_result): tcrdist_calculator = TCRdistDistanceCalculator(**test_parameters) seq1, seq2 = test_input @@ -462,11 +613,19 @@ def test_tcrdist(test_parameters, test_input, expected_result): def test_tcrdist_reference(): - #test tcrdist against reference implementation - seqs = np.load('data/tcrdist_test_data/tcrdist_WU3k_seqs.npy') - reference_result = scipy.sparse.load_npz('data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz') - - tcrdist_calculator = TCRdistDistanceCalculator(dist_weight=3, gap_penalty=4, ntrim=3, ctrim=2, fixed_gappos=True, cutoff=15, n_jobs= 1) + # test tcrdist against reference implementation + seqs = np.load("data/tcrdist_test_data/tcrdist_WU3k_seqs.npy") + reference_result = scipy.sparse.load_npz("data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz") + + tcrdist_calculator = TCRdistDistanceCalculator( + dist_weight=3, + gap_penalty=4, + ntrim=3, + ctrim=2, + fixed_gappos=True, + cutoff=15, + n_jobs=1, + ) res = tcrdist_calculator.calc_dist_mat(seqs, seqs) assert np.array_equal(res.data, reference_result.data) From a9e46b4878db69073c8181dc39bd7ee02bcad121 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Apr 2024 18:29:41 +0000 Subject: [PATCH 10/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/ir_dist/metrics.py | 5 ++--- src/scirpy/tests/test_ir_dist_metrics.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 77c915241..855d6e2ff 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -941,10 +941,9 @@ def _nb_tcrdist_mat( return data_rows, indices_rows, row_element_counts def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: - - if(len(seqs) == 0 or len(seqs2) == 0): + if len(seqs) == 0 or len(seqs2) == 0: return csr_matrix((len(seqs), len(seqs2))) - + seqs_mat1, seqs_L1 = self._seqs2mat(seqs) seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index 326d5b686..03a993d14 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -630,4 +630,4 @@ def test_tcrdist_reference(): assert np.array_equal(res.data, reference_result.data) assert np.array_equal(res.indices, reference_result.indices) - assert np.array_equal(res.indptr, reference_result.indptr) \ No newline at end of file + assert np.array_equal(res.indptr, reference_result.indptr) From ba72d4122d6671b6301e056909049a76c00b3227 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Tue, 16 Apr 2024 20:33:50 +0200 Subject: [PATCH 11/27] comments for TCRdist added --- src/scirpy/ir_dist/metrics.py | 156 ++++++++++++++++++++++++++++------ 1 file changed, 132 insertions(+), 24 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 77c915241..8a00bd427 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -395,6 +395,57 @@ def _compute_block(self, seqs1, seqs2, origin): class TCRdistDistanceCalculator: + + """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. + + The code of this class is heavily based on: https://github.com/agartland/pwseqdist/blob/master/pwseqdist + + Using default weight, gap penalty, ntrim and ctrim is equivalent to the + original distance published in Dash et al, (2017). + + Parameters + ---------- + dist_weight : int + Weight applied to the mismatch distances before summing with the gap penalties + gap_penalty : int + Distance penalty for the difference in the length of the two sequences + ntrim/ctrim : int + Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored + in the distance calculation. + fixed_gappos : bool + If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). + If False, find the "optimal" position for inserting the gaps to make up the difference in length + cutoff: + Will eleminate distances > cutoff to make efficient + use of sparse matrices. + n_jobs: + Number of jobs (processes) to use for the pairwise distance calculation + """ + + def __init__( + self, + dist_weight=3, + gap_penalty=4, + ntrim=3, + ctrim=2, + fixed_gappos=True, + cutoff: Union[None, int] = None, + n_jobs: Union[None, int] = None, + ): + if cutoff is None: + cutoff = 20 + if n_jobs is None: + n_jobs = 1 + + self.dist_weight = dist_weight + self.gap_penalty = gap_penalty + self.ntrim = ntrim + self.ctrim = ctrim + self.fixed_gappos = fixed_gappos + self.cutoff = cutoff + self.n_jobs = n_jobs + + parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" tcr_dict_distance_matrix = { @@ -800,31 +851,20 @@ class TCRdistDistanceCalculator: ("Y", "Y"): 0, } - def __init__( - self, - dist_weight=3, - gap_penalty=4, - ntrim=3, - ctrim=2, - fixed_gappos=True, - cutoff: Union[None, int] = None, - n_jobs: Union[None, int] = None, - ): - if cutoff is None: - cutoff = 20 - if n_jobs is None: - n_jobs = 1 - - self.dist_weight = dist_weight - self.gap_penalty = gap_penalty - self.ntrim = ntrim - self.ctrim = ctrim - self.fixed_gappos = fixed_gappos - self.cutoff = cutoff - self.n_jobs = n_jobs - @staticmethod def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unknown): + """Creates a numba compatible distance matrix from a dict of tuples. + + Parameters + ---------- + distance_matrix : dict + Keys are tuples like ('A', 'C') with values containing an integer. + alphabet : str + + Returns + ------- + distance_matrix : np.ndarray, dtype=np.int32""" + dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) for (aa1, aa2), d in distance_matrix.items(): dm[alphabet.index(aa1), alphabet.index(aa2)] = d @@ -835,6 +875,30 @@ def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unkno @staticmethod def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): + """Convert a collection of gene sequences into a + numpy matrix of integers for fast comparison. + + Parameters + ---------- + seqs : list + List of strings + + Returns + ------- + mat : np.array + + Examples + -------- + >>> seqs2mat(["CAT","HAT"]) + array([[ 4, 0, 16], + [ 8, 0, 16]], dtype=int8) + + Notes + ----- + Requires all seqs to have the same length, therefore shorter sequences + are filled up with -1 entries at the end. + """ + if max_len is None: max_len = np.max([len(s) for s in seqs]) mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8) @@ -866,6 +930,46 @@ def _nb_tcrdist_mat( fixed_gappos=True, cutoff=20, ): + """Computes the pairwise TCRdist distances for sequences in seqs_mat1 and seqs_mat2. + + Note: to use with non-CDR3 sequences set ntrim and ctrim to 0. + + Parameters + ---------- + seqs_mat1/2 : np.ndarray + Matrix containing sequences created by seqs2mat with padding to accomodate + sequences of different lengths (-1 padding) + seqs_L1/2 : np.ndarray + A vector containing the length of each sequence in the respective seqs_mat matrix, + without the padding in seqs_mat + distance_matrix : np.ndarray [alphabet, alphabet] dtype=int32 + A square distance matrix (NOT a similarity matrix). + Matrix must match the alphabet that was used to create + seqs_mat, where each AA is represented by an index into the alphabet. + dist_weight : int + Weight applied to the mismatch distances before summing with the gap penalties + gap_penalty : int + Distance penalty for the difference in the length of the two sequences + ntrim/ctrim : int + Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored + in the distance calculation. + fixed_gappos : bool + If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). + If False, find the "optimal" position for inserting the gaps to make up the difference in length + + Returns + ------- + data_rows : + List with arrays containing the non-zero data values of the result matrix per row, + needed to create the final scipy CSR result matrix later + indices_rows : + List with arrays containing the non-zero entry column indeces of the result matrix per row, + needed to create the final scipy CSR result matrix later + row_element_counts : + Arraay with integers that indicate the amount of non-zero values of the result matrix per row, + needed to create the final scipy CSR result matrix later + """ + assert seqs_mat1.shape[0] == seqs_L1.shape[0] assert seqs_mat2.shape[0] == seqs_L2.shape[0] @@ -941,7 +1045,8 @@ def _nb_tcrdist_mat( return data_rows, indices_rows, row_element_counts def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: - + "Computes a block of the final TCRdist distance matrix and returns it as CSR matrix" + if(len(seqs) == 0 or len(seqs2) == 0): return csr_matrix((len(seqs), len(seqs2))) @@ -968,6 +1073,9 @@ def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str return sparse_distance_matrix def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + """Calculates the pairwise distances between two vectors of gene sequences based on the TCRdist distance metric + and returns a CSR distance matrix""" + if seqs2 is None: seqs2 = seqs From 8e8dfe888e09e822ad552c2ed6212005fc03c237 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Tue, 16 Apr 2024 20:45:01 +0200 Subject: [PATCH 12/27] code formatting --- src/scirpy/ir_dist/metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index c9450a9cb..2e43b42b6 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -395,7 +395,6 @@ def _compute_block(self, seqs1, seqs2, origin): class TCRdistDistanceCalculator: - """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. The code of this class is heavily based on: https://github.com/agartland/pwseqdist/blob/master/pwseqdist From 21702cb318f430c09f71a46903d652dec32e54ed Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:46:05 +0000 Subject: [PATCH 13/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/ir_dist/metrics.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 2e43b42b6..9027570d9 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -444,7 +444,6 @@ def __init__( self.cutoff = cutoff self.n_jobs = n_jobs - parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" tcr_dict_distance_matrix = { @@ -862,8 +861,8 @@ def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unkno Returns ------- - distance_matrix : np.ndarray, dtype=np.int32""" - + distance_matrix : np.ndarray, dtype=np.int32 + """ dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) for (aa1, aa2), d in distance_matrix.items(): dm[alphabet.index(aa1), alphabet.index(aa2)] = d @@ -879,16 +878,16 @@ def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): Parameters ---------- - seqs : list + seqs : list List of strings Returns ------- - mat : np.array + mat : np.array Examples -------- - >>> seqs2mat(["CAT","HAT"]) + >>> seqs2mat(["CAT", "HAT"]) array([[ 4, 0, 16], [ 8, 0, 16]], dtype=int8) @@ -897,7 +896,6 @@ def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): Requires all seqs to have the same length, therefore shorter sequences are filled up with -1 entries at the end. """ - if max_len is None: max_len = np.max([len(s) for s in seqs]) mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8) @@ -958,7 +956,7 @@ def _nb_tcrdist_mat( Returns ------- - data_rows : + data_rows : List with arrays containing the non-zero data values of the result matrix per row, needed to create the final scipy CSR result matrix later indices_rows : @@ -968,7 +966,6 @@ def _nb_tcrdist_mat( Arraay with integers that indicate the amount of non-zero values of the result matrix per row, needed to create the final scipy CSR result matrix later """ - assert seqs_mat1.shape[0] == seqs_L1.shape[0] assert seqs_mat2.shape[0] == seqs_L2.shape[0] @@ -1044,9 +1041,8 @@ def _nb_tcrdist_mat( return data_rows, indices_rows, row_element_counts def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: - "Computes a block of the final TCRdist distance matrix and returns it as CSR matrix" - - if(len(seqs) == 0 or len(seqs2) == 0): + """Computes a block of the final TCRdist distance matrix and returns it as CSR matrix""" + if len(seqs) == 0 or len(seqs2) == 0: return csr_matrix((len(seqs), len(seqs2))) seqs_mat1, seqs_L1 = self._seqs2mat(seqs) @@ -1073,8 +1069,8 @@ def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: """Calculates the pairwise distances between two vectors of gene sequences based on the TCRdist distance metric - and returns a CSR distance matrix""" - + and returns a CSR distance matrix + """ if seqs2 is None: seqs2 = seqs From 51112d80aaf680d4cf413beddc7206905b26a383 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 13:09:13 +0200 Subject: [PATCH 14/27] handling of default values for cutoff and n_jobs in _get_distance_calculator adapted --- src/scirpy/ir_dist/__init__.py | 18 +++++++++++------- src/scirpy/ir_dist/metrics.py | 9 ++------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py index 18831a321..05820f0b6 100644 --- a/src/scirpy/ir_dist/__init__.py +++ b/src/scirpy/ir_dist/__init__.py @@ -73,7 +73,7 @@ def _get_metric_key(metric: MetricType) -> str: return "custom" if isinstance(metric, metrics.DistanceCalculator) else metric # type: ignore -def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_jobs=None, **kwargs): +def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_jobs=-1, **kwargs): """Returns an instance of :class:`~scirpy.ir_dist.metrics.DistanceCalculator` given a metric. @@ -83,20 +83,24 @@ def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_ metric = "identity" cutoff = 0 + # Let's rely on the default set by the class if cutoff is None + if cutoff is not None: + kwargs["cutoff"] = cutoff + if isinstance(metric, metrics.DistanceCalculator): dist_calc = metric elif metric == "alignment": - dist_calc = metrics.FastAlignmentDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, estimated_penalty=0, **kwargs) + dist_calc = metrics.FastAlignmentDistanceCalculator(n_jobs=n_jobs, estimated_penalty=0, **kwargs) elif metric == "fastalignment": - dist_calc = metrics.FastAlignmentDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) + dist_calc = metrics.FastAlignmentDistanceCalculator(n_jobs=n_jobs, **kwargs) elif metric == "identity": - dist_calc = metrics.IdentityDistanceCalculator(cutoff=cutoff, **kwargs) + dist_calc = metrics.IdentityDistanceCalculator(**kwargs) elif metric == "levenshtein": - dist_calc = metrics.LevenshteinDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) + dist_calc = metrics.LevenshteinDistanceCalculator(n_jobs=n_jobs, **kwargs) elif metric == "hamming": - dist_calc = metrics.HammingDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) + dist_calc = metrics.HammingDistanceCalculator(n_jobs=n_jobs, **kwargs) elif metric == "tcrdist": - dist_calc = metrics.TCRdistDistanceCalculator(cutoff=cutoff, n_jobs=n_jobs, **kwargs) + dist_calc = metrics.TCRdistDistanceCalculator(n_jobs=n_jobs, **kwargs) else: raise ValueError("Invalid distance metric.") diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 2e43b42b6..31571bd64 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -428,14 +428,9 @@ def __init__( ntrim=3, ctrim=2, fixed_gappos=True, - cutoff: Union[None, int] = None, - n_jobs: Union[None, int] = None, + cutoff: int = 20, + n_jobs: int = 1, ): - if cutoff is None: - cutoff = 20 - if n_jobs is None: - n_jobs = 1 - self.dist_weight = dist_weight self.gap_penalty = gap_penalty self.ntrim = ntrim From 101b4115e97d005968f5aef0d93cb1a01adaaa8f Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 19:19:58 +0200 Subject: [PATCH 15/27] auto formatting disabled for tcr_dict_distance_matrix --- src/scirpy/ir_dist/metrics.py | 406 +--------------------------------- 1 file changed, 4 insertions(+), 402 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 31571bd64..e3b7008e8 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -442,409 +442,11 @@ def __init__( parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" - tcr_dict_distance_matrix = { - ("A", "A"): 0, - ("A", "C"): 4, - ("A", "D"): 4, - ("A", "E"): 4, - ("A", "F"): 4, - ("A", "G"): 4, - ("A", "H"): 4, - ("A", "I"): 4, - ("A", "K"): 4, - ("A", "L"): 4, - ("A", "M"): 4, - ("A", "N"): 4, - ("A", "P"): 4, - ("A", "Q"): 4, - ("A", "R"): 4, - ("A", "S"): 3, - ("A", "T"): 4, - ("A", "V"): 4, - ("A", "W"): 4, - ("A", "Y"): 4, - ("C", "A"): 4, - ("C", "C"): 0, - ("C", "D"): 4, - ("C", "E"): 4, - ("C", "F"): 4, - ("C", "G"): 4, - ("C", "H"): 4, - ("C", "I"): 4, - ("C", "K"): 4, - ("C", "L"): 4, - ("C", "M"): 4, - ("C", "N"): 4, - ("C", "P"): 4, - ("C", "Q"): 4, - ("C", "R"): 4, - ("C", "S"): 4, - ("C", "T"): 4, - ("C", "V"): 4, - ("C", "W"): 4, - ("C", "Y"): 4, - ("D", "A"): 4, - ("D", "C"): 4, - ("D", "D"): 0, - ("D", "E"): 2, - ("D", "F"): 4, - ("D", "G"): 4, - ("D", "H"): 4, - ("D", "I"): 4, - ("D", "K"): 4, - ("D", "L"): 4, - ("D", "M"): 4, - ("D", "N"): 3, - ("D", "P"): 4, - ("D", "Q"): 4, - ("D", "R"): 4, - ("D", "S"): 4, - ("D", "T"): 4, - ("D", "V"): 4, - ("D", "W"): 4, - ("D", "Y"): 4, - ("E", "A"): 4, - ("E", "C"): 4, - ("E", "D"): 2, - ("E", "E"): 0, - ("E", "F"): 4, - ("E", "G"): 4, - ("E", "H"): 4, - ("E", "I"): 4, - ("E", "K"): 3, - ("E", "L"): 4, - ("E", "M"): 4, - ("E", "N"): 4, - ("E", "P"): 4, - ("E", "Q"): 2, - ("E", "R"): 4, - ("E", "S"): 4, - ("E", "T"): 4, - ("E", "V"): 4, - ("E", "W"): 4, - ("E", "Y"): 4, - ("F", "A"): 4, - ("F", "C"): 4, - ("F", "D"): 4, - ("F", "E"): 4, - ("F", "F"): 0, - ("F", "G"): 4, - ("F", "H"): 4, - ("F", "I"): 4, - ("F", "K"): 4, - ("F", "L"): 4, - ("F", "M"): 4, - ("F", "N"): 4, - ("F", "P"): 4, - ("F", "Q"): 4, - ("F", "R"): 4, - ("F", "S"): 4, - ("F", "T"): 4, - ("F", "V"): 4, - ("F", "W"): 3, - ("F", "Y"): 1, - ("G", "A"): 4, - ("G", "C"): 4, - ("G", "D"): 4, - ("G", "E"): 4, - ("G", "F"): 4, - ("G", "G"): 0, - ("G", "H"): 4, - ("G", "I"): 4, - ("G", "K"): 4, - ("G", "L"): 4, - ("G", "M"): 4, - ("G", "N"): 4, - ("G", "P"): 4, - ("G", "Q"): 4, - ("G", "R"): 4, - ("G", "S"): 4, - ("G", "T"): 4, - ("G", "V"): 4, - ("G", "W"): 4, - ("G", "Y"): 4, - ("H", "A"): 4, - ("H", "C"): 4, - ("H", "D"): 4, - ("H", "E"): 4, - ("H", "F"): 4, - ("H", "G"): 4, - ("H", "H"): 0, - ("H", "I"): 4, - ("H", "K"): 4, - ("H", "L"): 4, - ("H", "M"): 4, - ("H", "N"): 3, - ("H", "P"): 4, - ("H", "Q"): 4, - ("H", "R"): 4, - ("H", "S"): 4, - ("H", "T"): 4, - ("H", "V"): 4, - ("H", "W"): 4, - ("H", "Y"): 2, - ("I", "A"): 4, - ("I", "C"): 4, - ("I", "D"): 4, - ("I", "E"): 4, - ("I", "F"): 4, - ("I", "G"): 4, - ("I", "H"): 4, - ("I", "I"): 0, - ("I", "K"): 4, - ("I", "L"): 2, - ("I", "M"): 3, - ("I", "N"): 4, - ("I", "P"): 4, - ("I", "Q"): 4, - ("I", "R"): 4, - ("I", "S"): 4, - ("I", "T"): 4, - ("I", "V"): 1, - ("I", "W"): 4, - ("I", "Y"): 4, - ("K", "A"): 4, - ("K", "C"): 4, - ("K", "D"): 4, - ("K", "E"): 3, - ("K", "F"): 4, - ("K", "G"): 4, - ("K", "H"): 4, - ("K", "I"): 4, - ("K", "K"): 0, - ("K", "L"): 4, - ("K", "M"): 4, - ("K", "N"): 4, - ("K", "P"): 4, - ("K", "Q"): 3, - ("K", "R"): 2, - ("K", "S"): 4, - ("K", "T"): 4, - ("K", "V"): 4, - ("K", "W"): 4, - ("K", "Y"): 4, - ("L", "A"): 4, - ("L", "C"): 4, - ("L", "D"): 4, - ("L", "E"): 4, - ("L", "F"): 4, - ("L", "G"): 4, - ("L", "H"): 4, - ("L", "I"): 2, - ("L", "K"): 4, - ("L", "L"): 0, - ("L", "M"): 2, - ("L", "N"): 4, - ("L", "P"): 4, - ("L", "Q"): 4, - ("L", "R"): 4, - ("L", "S"): 4, - ("L", "T"): 4, - ("L", "V"): 3, - ("L", "W"): 4, - ("L", "Y"): 4, - ("M", "A"): 4, - ("M", "C"): 4, - ("M", "D"): 4, - ("M", "E"): 4, - ("M", "F"): 4, - ("M", "G"): 4, - ("M", "H"): 4, - ("M", "I"): 3, - ("M", "K"): 4, - ("M", "L"): 2, - ("M", "M"): 0, - ("M", "N"): 4, - ("M", "P"): 4, - ("M", "Q"): 4, - ("M", "R"): 4, - ("M", "S"): 4, - ("M", "T"): 4, - ("M", "V"): 3, - ("M", "W"): 4, - ("M", "Y"): 4, - ("N", "A"): 4, - ("N", "C"): 4, - ("N", "D"): 3, - ("N", "E"): 4, - ("N", "F"): 4, - ("N", "G"): 4, - ("N", "H"): 3, - ("N", "I"): 4, - ("N", "K"): 4, - ("N", "L"): 4, - ("N", "M"): 4, - ("N", "N"): 0, - ("N", "P"): 4, - ("N", "Q"): 4, - ("N", "R"): 4, - ("N", "S"): 3, - ("N", "T"): 4, - ("N", "V"): 4, - ("N", "W"): 4, - ("N", "Y"): 4, - ("P", "A"): 4, - ("P", "C"): 4, - ("P", "D"): 4, - ("P", "E"): 4, - ("P", "F"): 4, - ("P", "G"): 4, - ("P", "H"): 4, - ("P", "I"): 4, - ("P", "K"): 4, - ("P", "L"): 4, - ("P", "M"): 4, - ("P", "N"): 4, - ("P", "P"): 0, - ("P", "Q"): 4, - ("P", "R"): 4, - ("P", "S"): 4, - ("P", "T"): 4, - ("P", "V"): 4, - ("P", "W"): 4, - ("P", "Y"): 4, - ("Q", "A"): 4, - ("Q", "C"): 4, - ("Q", "D"): 4, - ("Q", "E"): 2, - ("Q", "F"): 4, - ("Q", "G"): 4, - ("Q", "H"): 4, - ("Q", "I"): 4, - ("Q", "K"): 3, - ("Q", "L"): 4, - ("Q", "M"): 4, - ("Q", "N"): 4, - ("Q", "P"): 4, - ("Q", "Q"): 0, - ("Q", "R"): 3, - ("Q", "S"): 4, - ("Q", "T"): 4, - ("Q", "V"): 4, - ("Q", "W"): 4, - ("Q", "Y"): 4, - ("R", "A"): 4, - ("R", "C"): 4, - ("R", "D"): 4, - ("R", "E"): 4, - ("R", "F"): 4, - ("R", "G"): 4, - ("R", "H"): 4, - ("R", "I"): 4, - ("R", "K"): 2, - ("R", "L"): 4, - ("R", "M"): 4, - ("R", "N"): 4, - ("R", "P"): 4, - ("R", "Q"): 3, - ("R", "R"): 0, - ("R", "S"): 4, - ("R", "T"): 4, - ("R", "V"): 4, - ("R", "W"): 4, - ("R", "Y"): 4, - ("S", "A"): 3, - ("S", "C"): 4, - ("S", "D"): 4, - ("S", "E"): 4, - ("S", "F"): 4, - ("S", "G"): 4, - ("S", "H"): 4, - ("S", "I"): 4, - ("S", "K"): 4, - ("S", "L"): 4, - ("S", "M"): 4, - ("S", "N"): 3, - ("S", "P"): 4, - ("S", "Q"): 4, - ("S", "R"): 4, - ("S", "S"): 0, - ("S", "T"): 3, - ("S", "V"): 4, - ("S", "W"): 4, - ("S", "Y"): 4, - ("T", "A"): 4, - ("T", "C"): 4, - ("T", "D"): 4, - ("T", "E"): 4, - ("T", "F"): 4, - ("T", "G"): 4, - ("T", "H"): 4, - ("T", "I"): 4, - ("T", "K"): 4, - ("T", "L"): 4, - ("T", "M"): 4, - ("T", "N"): 4, - ("T", "P"): 4, - ("T", "Q"): 4, - ("T", "R"): 4, - ("T", "S"): 3, - ("T", "T"): 0, - ("T", "V"): 4, - ("T", "W"): 4, - ("T", "Y"): 4, - ("V", "A"): 4, - ("V", "C"): 4, - ("V", "D"): 4, - ("V", "E"): 4, - ("V", "F"): 4, - ("V", "G"): 4, - ("V", "H"): 4, - ("V", "I"): 1, - ("V", "K"): 4, - ("V", "L"): 3, - ("V", "M"): 3, - ("V", "N"): 4, - ("V", "P"): 4, - ("V", "Q"): 4, - ("V", "R"): 4, - ("V", "S"): 4, - ("V", "T"): 4, - ("V", "V"): 0, - ("V", "W"): 4, - ("V", "Y"): 4, - ("W", "A"): 4, - ("W", "C"): 4, - ("W", "D"): 4, - ("W", "E"): 4, - ("W", "F"): 3, - ("W", "G"): 4, - ("W", "H"): 4, - ("W", "I"): 4, - ("W", "K"): 4, - ("W", "L"): 4, - ("W", "M"): 4, - ("W", "N"): 4, - ("W", "P"): 4, - ("W", "Q"): 4, - ("W", "R"): 4, - ("W", "S"): 4, - ("W", "T"): 4, - ("W", "V"): 4, - ("W", "W"): 0, - ("W", "Y"): 2, - ("Y", "A"): 4, - ("Y", "C"): 4, - ("Y", "D"): 4, - ("Y", "E"): 4, - ("Y", "F"): 1, - ("Y", "G"): 4, - ("Y", "H"): 2, - ("Y", "I"): 4, - ("Y", "K"): 4, - ("Y", "L"): 4, - ("Y", "M"): 4, - ("Y", "N"): 4, - ("Y", "P"): 4, - ("Y", "Q"): 4, - ("Y", "R"): 4, - ("Y", "S"): 4, - ("Y", "T"): 4, - ("Y", "V"): 4, - ("Y", "W"): 2, - ("Y", "Y"): 0, - } + # fmt: off + tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} + # fmt: on + @staticmethod def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unknown): """Creates a numba compatible distance matrix from a dict of tuples. From 7ac26ebfcfc8abe9e21994e73a3b24e9d1c53d6e Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 19:48:56 +0200 Subject: [PATCH 16/27] added data type hints to functions and adapted function comments --- src/scirpy/ir_dist/metrics.py | 89 ++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index e3b7008e8..faa10fea0 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -400,18 +400,18 @@ class TCRdistDistanceCalculator: The code of this class is heavily based on: https://github.com/agartland/pwseqdist/blob/master/pwseqdist Using default weight, gap penalty, ntrim and ctrim is equivalent to the - original distance published in Dash et al, (2017). + original distance published in :cite:`TCRdist`. Parameters ---------- - dist_weight : int + dist_weight: Weight applied to the mismatch distances before summing with the gap penalties - gap_penalty : int + gap_penalty: Distance penalty for the difference in the length of the two sequences - ntrim/ctrim : int + ntrim/ctrim: Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored in the distance calculation. - fixed_gappos : bool + fixed_gappos: If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). If False, find the "optimal" position for inserting the gaps to make up the difference in length cutoff: @@ -423,11 +423,11 @@ class TCRdistDistanceCalculator: def __init__( self, - dist_weight=3, - gap_penalty=4, - ntrim=3, - ctrim=2, - fixed_gappos=True, + dist_weight: int = 3, + gap_penalty: int = 4, + ntrim: int = 3, + ctrim: int = 2, + fixed_gappos: bool = True, cutoff: int = 20, n_jobs: int = 1, ): @@ -446,20 +446,22 @@ def __init__( tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} # fmt: on - + @staticmethod - def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unknown): + def _make_numba_matrix(distance_matrix: dict, alphabet: str = parasail_aa_alphabet_with_unknown) -> np.ndarray: """Creates a numba compatible distance matrix from a dict of tuples. Parameters ---------- - distance_matrix : dict + distance_matrix: Keys are tuples like ('A', 'C') with values containing an integer. - alphabet : str + alphabet: Returns ------- - distance_matrix : np.ndarray, dtype=np.int32""" + distance_matrix: + distance lookup matrix + """ dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) for (aa1, aa2), d in distance_matrix.items(): @@ -470,18 +472,21 @@ def _make_numba_matrix(distance_matrix, alphabet=parasail_aa_alphabet_with_unkno tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix) @staticmethod - def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): + def _seqs2mat(seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len: Union[None, int] = None) -> tuple[np.ndarray, np.ndarray]: """Convert a collection of gene sequences into a numpy matrix of integers for fast comparison. Parameters ---------- - seqs : list - List of strings + seqs: + Sequence of strings Returns ------- - mat : np.array + mat: + matrix with gene sequences encoded as integers + L: + vector with length values of the gene sequences in the matrix Examples -------- @@ -514,55 +519,55 @@ def _seqs2mat(seqs, alphabet=parasail_aa_alphabet, max_len=None): @staticmethod @nb.jit(nopython=True, parallel=False, nogil=True) def _nb_tcrdist_mat( - seqs_mat1, - seqs_mat2, - seqs_L1, - seqs_L2, - distance_matrix=tcr_nb_distance_matrix, - dist_weight=3, - gap_penalty=4, - ntrim=3, - ctrim=2, - fixed_gappos=True, - cutoff=20, - ): + seqs_mat1: np.ndarray, + seqs_mat2: np.ndarray, + seqs_L1: np.ndarray, + seqs_L2: np.ndarray, + distance_matrix: np.ndarray = tcr_nb_distance_matrix, + dist_weight: int = 3, + gap_penalty: int = 4, + ntrim: int = 3, + ctrim: int = 2, + fixed_gappos: bool = True, + cutoff: int = 20, + ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray]: """Computes the pairwise TCRdist distances for sequences in seqs_mat1 and seqs_mat2. Note: to use with non-CDR3 sequences set ntrim and ctrim to 0. Parameters ---------- - seqs_mat1/2 : np.ndarray + seqs_mat1/2: Matrix containing sequences created by seqs2mat with padding to accomodate sequences of different lengths (-1 padding) - seqs_L1/2 : np.ndarray + seqs_L1/2: A vector containing the length of each sequence in the respective seqs_mat matrix, without the padding in seqs_mat - distance_matrix : np.ndarray [alphabet, alphabet] dtype=int32 + distance_matrix: A square distance matrix (NOT a similarity matrix). Matrix must match the alphabet that was used to create seqs_mat, where each AA is represented by an index into the alphabet. - dist_weight : int + dist_weight: Weight applied to the mismatch distances before summing with the gap penalties - gap_penalty : int + gap_penalty: Distance penalty for the difference in the length of the two sequences - ntrim/ctrim : int + ntrim/ctrim: Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored in the distance calculation. - fixed_gappos : bool + fixed_gappos: If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). If False, find the "optimal" position for inserting the gaps to make up the difference in length Returns ------- - data_rows : + data_rows: List with arrays containing the non-zero data values of the result matrix per row, needed to create the final scipy CSR result matrix later - indices_rows : + indices_rows: List with arrays containing the non-zero entry column indeces of the result matrix per row, needed to create the final scipy CSR result matrix later - row_element_counts : - Arraay with integers that indicate the amount of non-zero values of the result matrix per row, + row_element_counts: + Array with integers that indicate the amount of non-zero values of the result matrix per row, needed to create the final scipy CSR result matrix later """ From cc173648abbfead367b4c0dc5c2c3f7cbcf40dae Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 19:56:25 +0200 Subject: [PATCH 17/27] changed testdata import for test cases --- src/scirpy/tests/test_ir_dist_metrics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index 03a993d14..ea4a48f0b 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -614,8 +614,9 @@ def test_tcrdist(test_parameters, test_input, expected_result): def test_tcrdist_reference(): # test tcrdist against reference implementation - seqs = np.load("data/tcrdist_test_data/tcrdist_WU3k_seqs.npy") - reference_result = scipy.sparse.load_npz("data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz") + from . import TESTDATA + seqs = np.load(TESTDATA / "tcrdist_test_data/tcrdist_WU3k_seqs.npy") + reference_result = scipy.sparse.load_npz(TESTDATA / "tcrdist_test_data/tcrdist_WU3k_csr_result.npz") tcrdist_calculator = TCRdistDistanceCalculator( dist_weight=3, From e7fe4eb347ba83afa3fce42f008a966cc9b4e307 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 20:00:07 +0200 Subject: [PATCH 18/27] changed __init__ and _nb_tcrdist_mat in TCRdistDistanceCalculator to keywords only --- src/scirpy/ir_dist/metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index faa10fea0..d38757e1a 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -423,6 +423,7 @@ class TCRdistDistanceCalculator: def __init__( self, + *, dist_weight: int = 3, gap_penalty: int = 4, ntrim: int = 3, @@ -519,6 +520,7 @@ def _seqs2mat(seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len @staticmethod @nb.jit(nopython=True, parallel=False, nogil=True) def _nb_tcrdist_mat( + *, seqs_mat1: np.ndarray, seqs_mat2: np.ndarray, seqs_L1: np.ndarray, From ac803e7904a4e8792d78a7e3b68e6832a3feb9d6 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 20:22:54 +0200 Subject: [PATCH 19/27] keywords only for _nb_tcrdist_mat removed, because it doesn't work with numba --- src/scirpy/ir_dist/metrics.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index d38757e1a..7eefc2c4b 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -520,7 +520,6 @@ def _seqs2mat(seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len @staticmethod @nb.jit(nopython=True, parallel=False, nogil=True) def _nb_tcrdist_mat( - *, seqs_mat1: np.ndarray, seqs_mat2: np.ndarray, seqs_L1: np.ndarray, @@ -656,17 +655,17 @@ def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str seqs_mat1, seqs_L1 = self._seqs2mat(seqs) seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) - kwargs = { - "dist_weight": self.dist_weight, - "gap_penalty": self.gap_penalty, - "ntrim": self.ntrim, - "ctrim": self.ctrim, - "fixed_gappos": self.fixed_gappos, - "cutoff": self.cutoff, - } - data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat( - seqs_mat1, seqs_mat2, seqs_L1, seqs_L2, **kwargs + seqs_mat1 = seqs_mat1, + seqs_mat2 = seqs_mat2, + seqs_L1 = seqs_L1, + seqs_L2 = seqs_L2, + dist_weight = self.dist_weight, + gap_penalty = self.gap_penalty, + ntrim = self.ntrim, + ctrim = self.ctrim, + fixed_gappos = self.fixed_gappos, + cutoff = self.cutoff, ) indptr = np.zeros(row_element_counts.shape[0] + 1) From 3a3d53cc275e7f142753329191cb8a9e913e78c9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:30:20 +0000 Subject: [PATCH 20/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/ir_dist/metrics.py | 32 ++++++++++++------------ src/scirpy/tests/test_ir_dist_metrics.py | 1 + 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index e95de0743..9248e1df1 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -446,7 +446,6 @@ def __init__( tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} # fmt: on - @staticmethod def _make_numba_matrix(distance_matrix: dict, alphabet: str = parasail_aa_alphabet_with_unknown) -> np.ndarray: """Creates a numba compatible distance matrix from a dict of tuples. @@ -462,7 +461,6 @@ def _make_numba_matrix(distance_matrix: dict, alphabet: str = parasail_aa_alphab distance_matrix: distance lookup matrix """ - dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) for (aa1, aa2), d in distance_matrix.items(): dm[alphabet.index(aa1), alphabet.index(aa2)] = d @@ -472,13 +470,15 @@ def _make_numba_matrix(distance_matrix: dict, alphabet: str = parasail_aa_alphab tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix) @staticmethod - def _seqs2mat(seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len: Union[None, int] = None) -> tuple[np.ndarray, np.ndarray]: + def _seqs2mat( + seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len: Union[None, int] = None + ) -> tuple[np.ndarray, np.ndarray]: """Convert a collection of gene sequences into a numpy matrix of integers for fast comparison. Parameters ---------- - seqs: + seqs: Sequence of strings Returns @@ -522,7 +522,7 @@ def _nb_tcrdist_mat( seqs_mat2: np.ndarray, seqs_L1: np.ndarray, seqs_L2: np.ndarray, - distance_matrix: np.ndarray = tcr_nb_distance_matrix, + distance_matrix: np.ndarray = tcr_nb_distance_matrix, dist_weight: int = 3, gap_penalty: int = 4, ntrim: int = 3, @@ -559,7 +559,7 @@ def _nb_tcrdist_mat( Returns ------- - data_rows: + data_rows: List with arrays containing the non-zero data values of the result matrix per row, needed to create the final scipy CSR result matrix later indices_rows: @@ -652,16 +652,16 @@ def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat( - seqs_mat1 = seqs_mat1, - seqs_mat2 = seqs_mat2, - seqs_L1 = seqs_L1, - seqs_L2 = seqs_L2, - dist_weight = self.dist_weight, - gap_penalty = self.gap_penalty, - ntrim = self.ntrim, - ctrim = self.ctrim, - fixed_gappos = self.fixed_gappos, - cutoff = self.cutoff, + seqs_mat1=seqs_mat1, + seqs_mat2=seqs_mat2, + seqs_L1=seqs_L1, + seqs_L2=seqs_L2, + dist_weight=self.dist_weight, + gap_penalty=self.gap_penalty, + ntrim=self.ntrim, + ctrim=self.ctrim, + fixed_gappos=self.fixed_gappos, + cutoff=self.cutoff, ) indptr = np.zeros(row_element_counts.shape[0] + 1) diff --git a/src/scirpy/tests/test_ir_dist_metrics.py b/src/scirpy/tests/test_ir_dist_metrics.py index ea4a48f0b..8db93e43b 100644 --- a/src/scirpy/tests/test_ir_dist_metrics.py +++ b/src/scirpy/tests/test_ir_dist_metrics.py @@ -615,6 +615,7 @@ def test_tcrdist(test_parameters, test_input, expected_result): def test_tcrdist_reference(): # test tcrdist against reference implementation from . import TESTDATA + seqs = np.load(TESTDATA / "tcrdist_test_data/tcrdist_WU3k_seqs.npy") reference_result = scipy.sparse.load_npz(TESTDATA / "tcrdist_test_data/tcrdist_WU3k_csr_result.npz") From 05817d39031632463bb5e3077af7f84148ed6094 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Thu, 18 Apr 2024 20:40:58 +0200 Subject: [PATCH 21/27] unused control variable changed to _ --- src/scirpy/ir_dist/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index e95de0743..de0982813 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -577,7 +577,7 @@ def _nb_tcrdist_mat( row_element_counts = np.zeros(seqs_mat1.shape[0]) empty_row = np.zeros(0) - for i in range(0, seqs_mat1.shape[0]): + for _ in range(0, seqs_mat1.shape[0]): data_rows.append(empty_row) indices_rows.append(empty_row) From f5b9857847bea6d39a77cb3cfefe1bc8f0b7b90c Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Fri, 19 Apr 2024 10:12:08 +0200 Subject: [PATCH 22/27] creation of numba lookup matrix changed --- src/scirpy/ir_dist/metrics.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index de0982813..9fdc8a532 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -394,6 +394,27 @@ def _compute_block(self, seqs1, seqs2, origin): return result +def _make_numba_matrix(distance_matrix: dict, alphabet: str = "ARNDCQEGHILKMFPSTWYVBZX*") -> np.ndarray: + """Creates a numba compatible distance matrix from a dict of tuples. + + Parameters + ---------- + distance_matrix: + Keys are tuples like ('A', 'C') with values containing an integer. + alphabet: + + Returns + ------- + distance_matrix: + distance lookup matrix + """ + + dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) + for (aa1, aa2), d in distance_matrix.items(): + dm[alphabet.index(aa1), alphabet.index(aa2)] = d + dm[alphabet.index(aa2), alphabet.index(aa1)] = d + return dm + class TCRdistDistanceCalculator: """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. @@ -420,6 +441,12 @@ class TCRdistDistanceCalculator: n_jobs: Number of jobs (processes) to use for the pairwise distance calculation """ + parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" + parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" + # fmt: off + tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} + # fmt: on + tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix, parasail_aa_alphabet_with_unknown) def __init__( self, @@ -440,37 +467,6 @@ def __init__( self.cutoff = cutoff self.n_jobs = n_jobs - parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" - parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" - # fmt: off - tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} - # fmt: on - - - @staticmethod - def _make_numba_matrix(distance_matrix: dict, alphabet: str = parasail_aa_alphabet_with_unknown) -> np.ndarray: - """Creates a numba compatible distance matrix from a dict of tuples. - - Parameters - ---------- - distance_matrix: - Keys are tuples like ('A', 'C') with values containing an integer. - alphabet: - - Returns - ------- - distance_matrix: - distance lookup matrix - """ - - dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) - for (aa1, aa2), d in distance_matrix.items(): - dm[alphabet.index(aa1), alphabet.index(aa2)] = d - dm[alphabet.index(aa2), alphabet.index(aa1)] = d - return dm - - tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix) - @staticmethod def _seqs2mat(seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len: Union[None, int] = None) -> tuple[np.ndarray, np.ndarray]: """Convert a collection of gene sequences into a From 5d12de52c76bd9a373500e18b5459c5bdab9f0f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Apr 2024 08:14:36 +0000 Subject: [PATCH 23/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/ir_dist/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 615d3ae8f..08dd78b37 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -408,13 +408,13 @@ def _make_numba_matrix(distance_matrix: dict, alphabet: str = "ARNDCQEGHILKMFPST distance_matrix: distance lookup matrix """ - dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) for (aa1, aa2), d in distance_matrix.items(): dm[alphabet.index(aa1), alphabet.index(aa2)] = d dm[alphabet.index(aa2), alphabet.index(aa1)] = d return dm + class TCRdistDistanceCalculator: """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. @@ -441,6 +441,7 @@ class TCRdistDistanceCalculator: n_jobs: Number of jobs (processes) to use for the pairwise distance calculation """ + parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" # fmt: off From fcce8e7a3ab525110e529f92db61f472978db36a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 21 Apr 2024 21:15:44 +0200 Subject: [PATCH 24/27] Update CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 775981963..96ba991df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][]. [keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## Unreleased + +- Add "TCRdist" as new metric ([#502](https://github.com/scverse/scirpy/pull/502)) + ## v0.16.1 ### Fixes From d6e7191d3fce8bd45873cd3346fa6f77dbd4e48f Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 21 Apr 2024 21:20:37 +0200 Subject: [PATCH 25/27] Update docstring --- docs/api.rst | 1 + src/scirpy/ir_dist/metrics.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 958ee9293..1c4a41282 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -302,3 +302,4 @@ distance metrics ir_dist.metrics.HammingDistanceCalculator ir_dist.metrics.AlignmentDistanceCalculator ir_dist.metrics.FastAlignmentDistanceCalculator + ir_dist.metrics.TCRdistDistanceCalculator diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 08dd78b37..157bb04b3 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -418,7 +418,8 @@ def _make_numba_matrix(distance_matrix: dict, alphabet: str = "ARNDCQEGHILKMFPST class TCRdistDistanceCalculator: """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. - The code of this class is heavily based on: https://github.com/agartland/pwseqdist/blob/master/pwseqdist + The code of this class is heavily based on `pwseqdist `_. + Reused under MIT license, Copyright (c) 2020 Andrew Fiore-Gartland. Using default weight, gap penalty, ntrim and ctrim is equivalent to the original distance published in :cite:`TCRdist`. @@ -451,13 +452,13 @@ class TCRdistDistanceCalculator: def __init__( self, + cutoff: int = 20, *, dist_weight: int = 3, gap_penalty: int = 4, ntrim: int = 3, ctrim: int = 2, fixed_gappos: bool = True, - cutoff: int = 20, n_jobs: int = 1, ): self.dist_weight = dist_weight From 1d83f8bd5350f1e634be2d7bcd8d06280e433db9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 21 Apr 2024 21:40:54 +0200 Subject: [PATCH 26/27] Update ir_dist docstring --- src/scirpy/ir_dist/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py index 05820f0b6..223fec74b 100644 --- a/src/scirpy/ir_dist/__init__.py +++ b/src/scirpy/ir_dist/__init__.py @@ -174,6 +174,8 @@ def _ir_dist( Like `airr_key`, but for `reference`. chain_idx_key_ref Like `chain_idx_key`, but for `reference`. + **kwargs + Arguments are passed to the respective :class:`~scirpy.ir_dist.metrics.DistanceCalculator` class. Returns ------- From 74bfce28edcf8d883b4d88cbd4d3f37fabf039bc Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 21 Apr 2024 21:54:06 +0200 Subject: [PATCH 27/27] Update description in tutorial --- docs/tutorials/tutorial_3k_tcr.ipynb | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/tutorials/tutorial_3k_tcr.ipynb b/docs/tutorials/tutorial_3k_tcr.ipynb index f40e8e404..dd1e91a29 100644 --- a/docs/tutorials/tutorial_3k_tcr.ipynb +++ b/docs/tutorials/tutorial_3k_tcr.ipynb @@ -1049,13 +1049,19 @@ "For instance, a distance of `10` is equivalent to 2 Rs mutating into N.\n", "This appoach was initially proposed as *TCRdist* by Dash et al. {cite}`TCRdist`.\n", "\n", - ":::{tip}\n", - "You can use `metric=\"fastalignment\"` for a faster calculation at the cost of a few false-negatives (i.e. sequence pairs\n", - "that are actually below the distance cutoff, but are removed during a pre-filtering step). With default parameters, \n", - "the false-negative rate (of all sequence pairs actually below the cutoff) was ~2% on the {func}`scirpy.datasets.wu2020`\n", - "dataset. \n", - "\n", - "See also {class}`scirpy.ir_dist.metrics.FastAlignmentDistanceCalculator`. \n", + ":::{admonition} Speeding up TCR distance calculation\n", + ":class: tip\n", + "\n", + "Scirpy provides alternative distance metrics that are similar to `\"alignment\"`, but a lot faster: \n", + "\n", + "* `metric=\"tcrdist\"` is an implementation of [tcrdist3](https://github.com/kmayerb/tcrdist3) within scirpy. The scores\n", + " are calculated differently, but it gives very similar results compared to `metric=\"alignment\"`.\n", + " See also {class}`scirpy.ir_dist.metrics.TCRdistDistanceCalculator`.\n", + "* `metric=\"fastalignment\"` uses a heuristic to speed up the `\"alignment\"` metric at the cost of a few false-negatives (i.e. sequence pairs\n", + " that are actually below the distance cutoff, but are removed during a pre-filtering step). With default parameters, \n", + " the false-negative rate (of all sequence pairs actually below the cutoff) was ~2% on the {func}`scirpy.datasets.wu2020`\n", + " dataset. See also {class}`scirpy.ir_dist.metrics.FastAlignmentDistanceCalculator`.\n", + " \n", ":::\n", "\n", "All cells with a distance between their CDR3 sequences lower than `cutoff` will be connected in the network.\n"