From d1db84813629e3a6f03829610a1170486b7e13ec Mon Sep 17 00:00:00 2001 From: felixpetschko <48593591+felixpetschko@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:11:54 +0200 Subject: [PATCH] tcrdist draft version implemented (#502) * tcrdist draft version implemented * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tcrdist tests added * fixed ir_dist _get_distance_calculator parameter handling * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * handling of empty input sequences fixed * additional tests for tcrdist added * tcrdist test with comparison against reference implementation added * formatting of tcrdist tests improved * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * comments for TCRdist added * code formatting * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * handling of default values for cutoff and n_jobs in _get_distance_calculator adapted * auto formatting disabled for tcr_dict_distance_matrix * added data type hints to functions and adapted function comments * changed testdata import for test cases * changed __init__ and _nb_tcrdist_mat in TCRdistDistanceCalculator to keywords only * keywords only for _nb_tcrdist_mat removed, because it doesn't work with numba * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unused control variable changed to _ * creation of numba lookup matrix changed * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update CHANGELOG * Update docstring * Update ir_dist docstring * Update description in tutorial --------- Co-authored-by: Gregor Sturm Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 4 + docs/api.rst | 1 + docs/tutorials/tutorial_3k_tcr.ipynb | 20 +- src/scirpy/ir_dist/__init__.py | 7 +- src/scirpy/ir_dist/metrics.py | 297 ++++++++++++++++ .../tcrdist_WU3k_csr_result.npz | Bin 0 -> 6159 bytes .../tcrdist_test_data/tcrdist_WU3k_seqs.npy | Bin 0 -> 136528 bytes src/scirpy/tests/test_ir_dist_metrics.py | 321 +++++++++++++++++- 8 files changed, 641 insertions(+), 9 deletions(-) create mode 100644 src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz create mode 100644 src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_seqs.npy diff --git a/CHANGELOG.md b/CHANGELOG.md index 775981963..96ba991df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][]. [keep a changelog]: https://keepachangelog.com/en/1.0.0/ [semantic versioning]: https://semver.org/spec/v2.0.0.html +## Unreleased + +- Add "TCRdist" as new metric ([#502](https://github.com/scverse/scirpy/pull/502)) + ## v0.16.1 ### Fixes diff --git a/docs/api.rst b/docs/api.rst index 958ee9293..1c4a41282 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -302,3 +302,4 @@ distance metrics ir_dist.metrics.HammingDistanceCalculator ir_dist.metrics.AlignmentDistanceCalculator ir_dist.metrics.FastAlignmentDistanceCalculator + ir_dist.metrics.TCRdistDistanceCalculator diff --git a/docs/tutorials/tutorial_3k_tcr.ipynb b/docs/tutorials/tutorial_3k_tcr.ipynb index f40e8e404..dd1e91a29 100644 --- a/docs/tutorials/tutorial_3k_tcr.ipynb +++ b/docs/tutorials/tutorial_3k_tcr.ipynb @@ -1049,13 +1049,19 @@ "For instance, a distance of `10` is equivalent to 2 Rs mutating into N.\n", "This appoach was initially proposed as *TCRdist* by Dash et al. {cite}`TCRdist`.\n", "\n", - ":::{tip}\n", - "You can use `metric=\"fastalignment\"` for a faster calculation at the cost of a few false-negatives (i.e. sequence pairs\n", - "that are actually below the distance cutoff, but are removed during a pre-filtering step). With default parameters, \n", - "the false-negative rate (of all sequence pairs actually below the cutoff) was ~2% on the {func}`scirpy.datasets.wu2020`\n", - "dataset. \n", - "\n", - "See also {class}`scirpy.ir_dist.metrics.FastAlignmentDistanceCalculator`. \n", + ":::{admonition} Speeding up TCR distance calculation\n", + ":class: tip\n", + "\n", + "Scirpy provides alternative distance metrics that are similar to `\"alignment\"`, but a lot faster: \n", + "\n", + "* `metric=\"tcrdist\"` is an implementation of [tcrdist3](https://github.com/kmayerb/tcrdist3) within scirpy. The scores\n", + " are calculated differently, but it gives very similar results compared to `metric=\"alignment\"`.\n", + " See also {class}`scirpy.ir_dist.metrics.TCRdistDistanceCalculator`.\n", + "* `metric=\"fastalignment\"` uses a heuristic to speed up the `\"alignment\"` metric at the cost of a few false-negatives (i.e. sequence pairs\n", + " that are actually below the distance cutoff, but are removed during a pre-filtering step). With default parameters, \n", + " the false-negative rate (of all sequence pairs actually below the cutoff) was ~2% on the {func}`scirpy.datasets.wu2020`\n", + " dataset. See also {class}`scirpy.ir_dist.metrics.FastAlignmentDistanceCalculator`.\n", + " \n", ":::\n", "\n", "All cells with a distance between their CDR3 sequences lower than `cutoff` will be connected in the network.\n" diff --git a/src/scirpy/ir_dist/__init__.py b/src/scirpy/ir_dist/__init__.py index 4afdf0c8b..223fec74b 100644 --- a/src/scirpy/ir_dist/__init__.py +++ b/src/scirpy/ir_dist/__init__.py @@ -99,6 +99,8 @@ def _get_distance_calculator(metric: MetricType, cutoff: Union[int, None], *, n_ dist_calc = metrics.LevenshteinDistanceCalculator(n_jobs=n_jobs, **kwargs) elif metric == "hamming": dist_calc = metrics.HammingDistanceCalculator(n_jobs=n_jobs, **kwargs) + elif metric == "tcrdist": + dist_calc = metrics.TCRdistDistanceCalculator(n_jobs=n_jobs, **kwargs) else: raise ValueError("Invalid distance metric.") @@ -122,6 +124,7 @@ def _ir_dist( airr_mod_ref: str = "airr", airr_key_ref: str = "airr", chain_idx_key_ref: str = "chain_indices", + **kwargs, ) -> Union[dict, None]: """\ Computes a sequence-distance metric between all unique :term:`VJ ` @@ -171,6 +174,8 @@ def _ir_dist( Like `airr_key`, but for `reference`. chain_idx_key_ref Like `chain_idx_key`, but for `reference`. + **kwargs + Arguments are passed to the respective :class:`~scirpy.ir_dist.metrics.DistanceCalculator` class. Returns ------- @@ -227,7 +232,7 @@ def _get_unique_seqs(tmp_adata, chain_type): result[chain_type][tmp_key] = unique_seqs # compute distance matrices - dist_calc = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs) + dist_calc = _get_distance_calculator(metric, cutoff, n_jobs=n_jobs, **kwargs) for chain_type in ["VJ", "VDJ"]: logging.info(f"Computing sequence x sequence distance matrix for {chain_type} sequences.") # type: ignore result[chain_type]["distances"] = dist_calc.calc_dist_mat( diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py index 2acf46ad6..157bb04b3 100644 --- a/src/scirpy/ir_dist/metrics.py +++ b/src/scirpy/ir_dist/metrics.py @@ -1,10 +1,12 @@ import abc import itertools +import multiprocessing import warnings from collections.abc import Sequence from typing import Optional, Union import joblib +import numba as nb import numpy as np import scipy.sparse import scipy.spatial @@ -392,6 +394,301 @@ def _compute_block(self, seqs1, seqs2, origin): return result +def _make_numba_matrix(distance_matrix: dict, alphabet: str = "ARNDCQEGHILKMFPSTWYVBZX*") -> np.ndarray: + """Creates a numba compatible distance matrix from a dict of tuples. + + Parameters + ---------- + distance_matrix: + Keys are tuples like ('A', 'C') with values containing an integer. + alphabet: + + Returns + ------- + distance_matrix: + distance lookup matrix + """ + dm = np.zeros((len(alphabet), len(alphabet)), dtype=np.int32) + for (aa1, aa2), d in distance_matrix.items(): + dm[alphabet.index(aa1), alphabet.index(aa2)] = d + dm[alphabet.index(aa2), alphabet.index(aa1)] = d + return dm + + +class TCRdistDistanceCalculator: + """Computes pairwise distances between TCR CDR3 sequences based on the "tcrdist" distance metric. + + The code of this class is heavily based on `pwseqdist `_. + Reused under MIT license, Copyright (c) 2020 Andrew Fiore-Gartland. + + Using default weight, gap penalty, ntrim and ctrim is equivalent to the + original distance published in :cite:`TCRdist`. + + Parameters + ---------- + dist_weight: + Weight applied to the mismatch distances before summing with the gap penalties + gap_penalty: + Distance penalty for the difference in the length of the two sequences + ntrim/ctrim: + Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored + in the distance calculation. + fixed_gappos: + If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). + If False, find the "optimal" position for inserting the gaps to make up the difference in length + cutoff: + Will eleminate distances > cutoff to make efficient + use of sparse matrices. + n_jobs: + Number of jobs (processes) to use for the pairwise distance calculation + """ + + parasail_aa_alphabet = "ARNDCQEGHILKMFPSTWYVBZX" + parasail_aa_alphabet_with_unknown = "ARNDCQEGHILKMFPSTWYVBZX*" + # fmt: off + tcr_dict_distance_matrix = {('A', 'A'): 0, ('A', 'C'): 4, ('A', 'D'): 4, ('A', 'E'): 4, ('A', 'F'): 4, ('A', 'G'): 4, ('A', 'H'): 4, ('A', 'I'): 4, ('A', 'K'): 4, ('A', 'L'): 4, ('A', 'M'): 4, ('A', 'N'): 4, ('A', 'P'): 4, ('A', 'Q'): 4, ('A', 'R'): 4, ('A', 'S'): 3, ('A', 'T'): 4, ('A', 'V'): 4, ('A', 'W'): 4, ('A', 'Y'): 4, ('C', 'A'): 4, ('C', 'C'): 0, ('C', 'D'): 4, ('C', 'E'): 4, ('C', 'F'): 4, ('C', 'G'): 4, ('C', 'H'): 4, ('C', 'I'): 4, ('C', 'K'): 4, ('C', 'L'): 4, ('C', 'M'): 4, ('C', 'N'): 4, ('C', 'P'): 4, ('C', 'Q'): 4, ('C', 'R'): 4, ('C', 'S'): 4, ('C', 'T'): 4, ('C', 'V'): 4, ('C', 'W'): 4, ('C', 'Y'): 4, ('D', 'A'): 4, ('D', 'C'): 4, ('D', 'D'): 0, ('D', 'E'): 2, ('D', 'F'): 4, ('D', 'G'): 4, ('D', 'H'): 4, ('D', 'I'): 4, ('D', 'K'): 4, ('D', 'L'): 4, ('D', 'M'): 4, ('D', 'N'): 3, ('D', 'P'): 4, ('D', 'Q'): 4, ('D', 'R'): 4, ('D', 'S'): 4, ('D', 'T'): 4, ('D', 'V'): 4, ('D', 'W'): 4, ('D', 'Y'): 4, ('E', 'A'): 4, ('E', 'C'): 4, ('E', 'D'): 2, ('E', 'E'): 0, ('E', 'F'): 4, ('E', 'G'): 4, ('E', 'H'): 4, ('E', 'I'): 4, ('E', 'K'): 3, ('E', 'L'): 4, ('E', 'M'): 4, ('E', 'N'): 4, ('E', 'P'): 4, ('E', 'Q'): 2, ('E', 'R'): 4, ('E', 'S'): 4, ('E', 'T'): 4, ('E', 'V'): 4, ('E', 'W'): 4, ('E', 'Y'): 4, ('F', 'A'): 4, ('F', 'C'): 4, ('F', 'D'): 4, ('F', 'E'): 4, ('F', 'F'): 0, ('F', 'G'): 4, ('F', 'H'): 4, ('F', 'I'): 4, ('F', 'K'): 4, ('F', 'L'): 4, ('F', 'M'): 4, ('F', 'N'): 4, ('F', 'P'): 4, ('F', 'Q'): 4, ('F', 'R'): 4, ('F', 'S'): 4, ('F', 'T'): 4, ('F', 'V'): 4, ('F', 'W'): 3, ('F', 'Y'): 1, ('G', 'A'): 4, ('G', 'C'): 4, ('G', 'D'): 4, ('G', 'E'): 4, ('G', 'F'): 4, ('G', 'G'): 0, ('G', 'H'): 4, ('G', 'I'): 4, ('G', 'K'): 4, ('G', 'L'): 4, ('G', 'M'): 4, ('G', 'N'): 4, ('G', 'P'): 4, ('G', 'Q'): 4, ('G', 'R'): 4, ('G', 'S'): 4, ('G', 'T'): 4, ('G', 'V'): 4, ('G', 'W'): 4, ('G', 'Y'): 4, ('H', 'A'): 4, ('H', 'C'): 4, ('H', 'D'): 4, ('H', 'E'): 4, ('H', 'F'): 4, ('H', 'G'): 4, ('H', 'H'): 0, ('H', 'I'): 4, ('H', 'K'): 4, ('H', 'L'): 4, ('H', 'M'): 4, ('H', 'N'): 3, ('H', 'P'): 4, ('H', 'Q'): 4, ('H', 'R'): 4, ('H', 'S'): 4, ('H', 'T'): 4, ('H', 'V'): 4, ('H', 'W'): 4, ('H', 'Y'): 2, ('I', 'A'): 4, ('I', 'C'): 4, ('I', 'D'): 4, ('I', 'E'): 4, ('I', 'F'): 4, ('I', 'G'): 4, ('I', 'H'): 4, ('I', 'I'): 0, ('I', 'K'): 4, ('I', 'L'): 2, ('I', 'M'): 3, ('I', 'N'): 4, ('I', 'P'): 4, ('I', 'Q'): 4, ('I', 'R'): 4, ('I', 'S'): 4, ('I', 'T'): 4, ('I', 'V'): 1, ('I', 'W'): 4, ('I', 'Y'): 4, ('K', 'A'): 4, ('K', 'C'): 4, ('K', 'D'): 4, ('K', 'E'): 3, ('K', 'F'): 4, ('K', 'G'): 4, ('K', 'H'): 4, ('K', 'I'): 4, ('K', 'K'): 0, ('K', 'L'): 4, ('K', 'M'): 4, ('K', 'N'): 4, ('K', 'P'): 4, ('K', 'Q'): 3, ('K', 'R'): 2, ('K', 'S'): 4, ('K', 'T'): 4, ('K', 'V'): 4, ('K', 'W'): 4, ('K', 'Y'): 4, ('L', 'A'): 4, ('L', 'C'): 4, ('L', 'D'): 4, ('L', 'E'): 4, ('L', 'F'): 4, ('L', 'G'): 4, ('L', 'H'): 4, ('L', 'I'): 2, ('L', 'K'): 4, ('L', 'L'): 0, ('L', 'M'): 2, ('L', 'N'): 4, ('L', 'P'): 4, ('L', 'Q'): 4, ('L', 'R'): 4, ('L', 'S'): 4, ('L', 'T'): 4, ('L', 'V'): 3, ('L', 'W'): 4, ('L', 'Y'): 4, ('M', 'A'): 4, ('M', 'C'): 4, ('M', 'D'): 4, ('M', 'E'): 4, ('M', 'F'): 4, ('M', 'G'): 4, ('M', 'H'): 4, ('M', 'I'): 3, ('M', 'K'): 4, ('M', 'L'): 2, ('M', 'M'): 0, ('M', 'N'): 4, ('M', 'P'): 4, ('M', 'Q'): 4, ('M', 'R'): 4, ('M', 'S'): 4, ('M', 'T'): 4, ('M', 'V'): 3, ('M', 'W'): 4, ('M', 'Y'): 4, ('N', 'A'): 4, ('N', 'C'): 4, ('N', 'D'): 3, ('N', 'E'): 4, ('N', 'F'): 4, ('N', 'G'): 4, ('N', 'H'): 3, ('N', 'I'): 4, ('N', 'K'): 4, ('N', 'L'): 4, ('N', 'M'): 4, ('N', 'N'): 0, ('N', 'P'): 4, ('N', 'Q'): 4, ('N', 'R'): 4, ('N', 'S'): 3, ('N', 'T'): 4, ('N', 'V'): 4, ('N', 'W'): 4, ('N', 'Y'): 4, ('P', 'A'): 4, ('P', 'C'): 4, ('P', 'D'): 4, ('P', 'E'): 4, ('P', 'F'): 4, ('P', 'G'): 4, ('P', 'H'): 4, ('P', 'I'): 4, ('P', 'K'): 4, ('P', 'L'): 4, ('P', 'M'): 4, ('P', 'N'): 4, ('P', 'P'): 0, ('P', 'Q'): 4, ('P', 'R'): 4, ('P', 'S'): 4, ('P', 'T'): 4, ('P', 'V'): 4, ('P', 'W'): 4, ('P', 'Y'): 4, ('Q', 'A'): 4, ('Q', 'C'): 4, ('Q', 'D'): 4, ('Q', 'E'): 2, ('Q', 'F'): 4, ('Q', 'G'): 4, ('Q', 'H'): 4, ('Q', 'I'): 4, ('Q', 'K'): 3, ('Q', 'L'): 4, ('Q', 'M'): 4, ('Q', 'N'): 4, ('Q', 'P'): 4, ('Q', 'Q'): 0, ('Q', 'R'): 3, ('Q', 'S'): 4, ('Q', 'T'): 4, ('Q', 'V'): 4, ('Q', 'W'): 4, ('Q', 'Y'): 4, ('R', 'A'): 4, ('R', 'C'): 4, ('R', 'D'): 4, ('R', 'E'): 4, ('R', 'F'): 4, ('R', 'G'): 4, ('R', 'H'): 4, ('R', 'I'): 4, ('R', 'K'): 2, ('R', 'L'): 4, ('R', 'M'): 4, ('R', 'N'): 4, ('R', 'P'): 4, ('R', 'Q'): 3, ('R', 'R'): 0, ('R', 'S'): 4, ('R', 'T'): 4, ('R', 'V'): 4, ('R', 'W'): 4, ('R', 'Y'): 4, ('S', 'A'): 3, ('S', 'C'): 4, ('S', 'D'): 4, ('S', 'E'): 4, ('S', 'F'): 4, ('S', 'G'): 4, ('S', 'H'): 4, ('S', 'I'): 4, ('S', 'K'): 4, ('S', 'L'): 4, ('S', 'M'): 4, ('S', 'N'): 3, ('S', 'P'): 4, ('S', 'Q'): 4, ('S', 'R'): 4, ('S', 'S'): 0, ('S', 'T'): 3, ('S', 'V'): 4, ('S', 'W'): 4, ('S', 'Y'): 4, ('T', 'A'): 4, ('T', 'C'): 4, ('T', 'D'): 4, ('T', 'E'): 4, ('T', 'F'): 4, ('T', 'G'): 4, ('T', 'H'): 4, ('T', 'I'): 4, ('T', 'K'): 4, ('T', 'L'): 4, ('T', 'M'): 4, ('T', 'N'): 4, ('T', 'P'): 4, ('T', 'Q'): 4, ('T', 'R'): 4, ('T', 'S'): 3, ('T', 'T'): 0, ('T', 'V'): 4, ('T', 'W'): 4, ('T', 'Y'): 4, ('V', 'A'): 4, ('V', 'C'): 4, ('V', 'D'): 4, ('V', 'E'): 4, ('V', 'F'): 4, ('V', 'G'): 4, ('V', 'H'): 4, ('V', 'I'): 1, ('V', 'K'): 4, ('V', 'L'): 3, ('V', 'M'): 3, ('V', 'N'): 4, ('V', 'P'): 4, ('V', 'Q'): 4, ('V', 'R'): 4, ('V', 'S'): 4, ('V', 'T'): 4, ('V', 'V'): 0, ('V', 'W'): 4, ('V', 'Y'): 4, ('W', 'A'): 4, ('W', 'C'): 4, ('W', 'D'): 4, ('W', 'E'): 4, ('W', 'F'): 3, ('W', 'G'): 4, ('W', 'H'): 4, ('W', 'I'): 4, ('W', 'K'): 4, ('W', 'L'): 4, ('W', 'M'): 4, ('W', 'N'): 4, ('W', 'P'): 4, ('W', 'Q'): 4, ('W', 'R'): 4, ('W', 'S'): 4, ('W', 'T'): 4, ('W', 'V'): 4, ('W', 'W'): 0, ('W', 'Y'): 2, ('Y', 'A'): 4, ('Y', 'C'): 4, ('Y', 'D'): 4, ('Y', 'E'): 4, ('Y', 'F'): 1, ('Y', 'G'): 4, ('Y', 'H'): 2, ('Y', 'I'): 4, ('Y', 'K'): 4, ('Y', 'L'): 4, ('Y', 'M'): 4, ('Y', 'N'): 4, ('Y', 'P'): 4, ('Y', 'Q'): 4, ('Y', 'R'): 4, ('Y', 'S'): 4, ('Y', 'T'): 4, ('Y', 'V'): 4, ('Y', 'W'): 2, ('Y', 'Y'): 0} + # fmt: on + tcr_nb_distance_matrix = _make_numba_matrix(tcr_dict_distance_matrix, parasail_aa_alphabet_with_unknown) + + def __init__( + self, + cutoff: int = 20, + *, + dist_weight: int = 3, + gap_penalty: int = 4, + ntrim: int = 3, + ctrim: int = 2, + fixed_gappos: bool = True, + n_jobs: int = 1, + ): + self.dist_weight = dist_weight + self.gap_penalty = gap_penalty + self.ntrim = ntrim + self.ctrim = ctrim + self.fixed_gappos = fixed_gappos + self.cutoff = cutoff + self.n_jobs = n_jobs + + @staticmethod + def _seqs2mat( + seqs: Sequence[str], alphabet: str = parasail_aa_alphabet, max_len: Union[None, int] = None + ) -> tuple[np.ndarray, np.ndarray]: + """Convert a collection of gene sequences into a + numpy matrix of integers for fast comparison. + + Parameters + ---------- + seqs: + Sequence of strings + + Returns + ------- + mat: + matrix with gene sequences encoded as integers + L: + vector with length values of the gene sequences in the matrix + + Examples + -------- + >>> seqs2mat(["CAT", "HAT"]) + array([[ 4, 0, 16], + [ 8, 0, 16]], dtype=int8) + + Notes + ----- + Requires all seqs to have the same length, therefore shorter sequences + are filled up with -1 entries at the end. + """ + if max_len is None: + max_len = np.max([len(s) for s in seqs]) + mat = -1 * np.ones((len(seqs), max_len), dtype=np.int8) + L = np.zeros(len(seqs), dtype=np.int8) + for si, s in enumerate(seqs): + L[si] = len(s) + for aai in range(max_len): + if aai >= len(s): + break + try: + mat[si, aai] = alphabet.index(s[aai]) + except ValueError: + # Unknown symbols given value for last column/row of matrix + mat[si, aai] = len(alphabet) + return mat, L + + @staticmethod + @nb.jit(nopython=True, parallel=False, nogil=True) + def _nb_tcrdist_mat( + seqs_mat1: np.ndarray, + seqs_mat2: np.ndarray, + seqs_L1: np.ndarray, + seqs_L2: np.ndarray, + distance_matrix: np.ndarray = tcr_nb_distance_matrix, + dist_weight: int = 3, + gap_penalty: int = 4, + ntrim: int = 3, + ctrim: int = 2, + fixed_gappos: bool = True, + cutoff: int = 20, + ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray]: + """Computes the pairwise TCRdist distances for sequences in seqs_mat1 and seqs_mat2. + + Note: to use with non-CDR3 sequences set ntrim and ctrim to 0. + + Parameters + ---------- + seqs_mat1/2: + Matrix containing sequences created by seqs2mat with padding to accomodate + sequences of different lengths (-1 padding) + seqs_L1/2: + A vector containing the length of each sequence in the respective seqs_mat matrix, + without the padding in seqs_mat + distance_matrix: + A square distance matrix (NOT a similarity matrix). + Matrix must match the alphabet that was used to create + seqs_mat, where each AA is represented by an index into the alphabet. + dist_weight: + Weight applied to the mismatch distances before summing with the gap penalties + gap_penalty: + Distance penalty for the difference in the length of the two sequences + ntrim/ctrim: + Positions trimmed off the N-terminus (0) and C-terminus (L-1) ends of the peptide sequence. These symbols will be ignored + in the distance calculation. + fixed_gappos: + If True, insert gaps at a fixed position after the cysteine residue statring the CDR3 (typically position 6). + If False, find the "optimal" position for inserting the gaps to make up the difference in length + + Returns + ------- + data_rows: + List with arrays containing the non-zero data values of the result matrix per row, + needed to create the final scipy CSR result matrix later + indices_rows: + List with arrays containing the non-zero entry column indeces of the result matrix per row, + needed to create the final scipy CSR result matrix later + row_element_counts: + Array with integers that indicate the amount of non-zero values of the result matrix per row, + needed to create the final scipy CSR result matrix later + """ + assert seqs_mat1.shape[0] == seqs_L1.shape[0] + assert seqs_mat2.shape[0] == seqs_L2.shape[0] + + data_rows = nb.typed.List() + indices_rows = nb.typed.List() + row_element_counts = np.zeros(seqs_mat1.shape[0]) + + empty_row = np.zeros(0) + for _ in range(0, seqs_mat1.shape[0]): + data_rows.append(empty_row) + indices_rows.append(empty_row) + + for row_index in range(seqs_mat1.shape[0]): + data_row = np.zeros(seqs_mat2.shape[0]) + indices_row = np.zeros(seqs_mat2.shape[0]) + row_end_index = 0 + + for col_index in range(seqs_mat2.shape[0]): + q_L = seqs_L1[row_index] + s_L = seqs_L2[col_index] + if q_L == s_L: + distance = 1 + for i in range(ntrim, q_L - ctrim): + distance += distance_matrix[seqs_mat1[row_index, i], seqs_mat2[col_index, i]] * dist_weight + + if distance <= cutoff + 1: + data_row[row_end_index] = distance + indices_row[row_end_index] = col_index + row_end_index += 1 + continue + + short_len = min(q_L, s_L) + len_diff = abs(q_L - s_L) + if fixed_gappos: + min_gappos = min(6, 3 + (short_len - 5) // 2) + max_gappos = min_gappos + else: + min_gappos = 5 + max_gappos = short_len - 1 - 4 + while min_gappos > max_gappos: + min_gappos -= 1 + max_gappos += 1 + min_dist = -1 + + for gappos in range(min_gappos, max_gappos + 1): + tmp_dist = 0 + + remainder = short_len - gappos + for n_i in range(ntrim, gappos): + tmp_dist += distance_matrix[seqs_mat1[row_index, n_i], seqs_mat2[col_index, n_i]] + + for c_i in range(ctrim, remainder): + tmp_dist += distance_matrix[ + seqs_mat1[row_index, q_L - 1 - c_i], seqs_mat2[col_index, s_L - 1 - c_i] + ] + + if tmp_dist < min_dist or min_dist == -1: + min_dist = tmp_dist + + if min_dist == 0: + break + + distance = min_dist * dist_weight + len_diff * gap_penalty + 1 + if distance <= cutoff + 1: + data_row[row_end_index] = distance + indices_row[row_end_index] = col_index + row_end_index += 1 + + data_rows[row_index] = data_row[0:row_end_index] + indices_rows[row_index] = indices_row[0:row_end_index] + row_element_counts[row_index] = row_end_index + + return data_rows, indices_rows, row_element_counts + + def _calc_dist_mat_block(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + """Computes a block of the final TCRdist distance matrix and returns it as CSR matrix""" + if len(seqs) == 0 or len(seqs2) == 0: + return csr_matrix((len(seqs), len(seqs2))) + + seqs_mat1, seqs_L1 = self._seqs2mat(seqs) + seqs_mat2, seqs_L2 = self._seqs2mat(seqs2) + + data_rows, indices_rows, row_element_counts = self._nb_tcrdist_mat( + seqs_mat1=seqs_mat1, + seqs_mat2=seqs_mat2, + seqs_L1=seqs_L1, + seqs_L2=seqs_L2, + dist_weight=self.dist_weight, + gap_penalty=self.gap_penalty, + ntrim=self.ntrim, + ctrim=self.ctrim, + fixed_gappos=self.fixed_gappos, + cutoff=self.cutoff, + ) + + indptr = np.zeros(row_element_counts.shape[0] + 1) + indptr[1:] = np.cumsum(row_element_counts) + data, indices = np.concatenate(data_rows), np.concatenate(indices_rows) + sparse_distance_matrix = csr_matrix((data, indices, indptr), shape=(len(seqs), len(seqs2))) + return sparse_distance_matrix + + def calc_dist_mat(self, seqs: Sequence[str], seqs2: Optional[Sequence[str]] = None) -> csr_matrix: + """Calculates the pairwise distances between two vectors of gene sequences based on the TCRdist distance metric + and returns a CSR distance matrix + """ + if seqs2 is None: + seqs2 = seqs + + if self.n_jobs > 1: + split_seqs = np.array_split(seqs, self.n_jobs) + arguments = [(split_seqs[x], seqs2) for x in range(self.n_jobs)] + with multiprocessing.Pool(self.n_jobs) as p: + results = p.starmap(self._calc_dist_mat_block, arguments) + distance_matrix_csr = scipy.sparse.vstack(results) + else: + distance_matrix_csr = self._calc_dist_mat_block(seqs, seqs2) + + return distance_matrix_csr + + @_doc_params(params=_doc_params_parallel_distance_calculator) class AlignmentDistanceCalculator(ParallelDistanceCalculator): """\ diff --git a/src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz b/src/scirpy/tests/data/tcrdist_test_data/tcrdist_WU3k_csr_result.npz new file mode 100644 index 0000000000000000000000000000000000000000..75c88cc10ca3f06cb6d6014ccd5eefd805efaac6 GIT binary patch literal 6159 zcmb`L2Rv2(|HrT5+L_tw;^G$RUaN~_Tzij-h-|V)Mn=h&dF@?he9K6YQudz7S5)>z zsH}_|@w@km=;!;NzmLb^anC&;=bZQZ^LoFZuaB1MNqkxW06>H}K!7LD0~O>*000hV z0Du%g3$S&xur;@IJLl-^jR&B_oyJ@Tz+QqKnZvi9U6vmM+*q9M2ZbT;1toAODHjJl z%i{oD@8@{(4D&HCh(kvuS(pzD;uslu;@-IRg+hO$W-)urV8Ni4RmgfQb=aq8d!1Fg zL}i%d??&UfO_gLaa^~lDeWG$ZdW@tyqf9*cd*9X@pin3`D_2MGL8uP(YTgmE@XGc;a+XQ=wh~Nw#+4O}#>3 zwr>2jfr3-FiWL2r`eGG?XGqif60C%02~P?q(N!AuM+`ojQ2o#!z3%>iEAw;LRbO@b zm;F)SzCYl7y%{q4{h{E`&%IS-H#o+iVfDVflp(OliO3t=xeVPYBknQ|1xM?d^{Z! z`}Nji>yymrcy)U#QbY9pr&;$MCOtfs#3Y8B9oNf}{l3fs2;ohp(as6xEVKB$@MhZ{ z=Onj`*%S2DO_n9jiFQV_>^svyvFc`rZ_dd+pJwqWyqnA$o==5uCRpRs$m{E)s`(^^ zTo7T};R^0%l=1paJf~d3>AUEgcxCiaEXdgR-hl{i9%5x>1=mq-0*Lp0y*wUrTkmxD zJl>Ox?M)IEJfw@rMEe>=57*duBh-xF%F^rb^Q9><+q4A$0yM{?(h>Fu6Np%3j6FRG z#4sqwR}C7Wdk>kABW$^;JfhjaxtPA6k?kBK%W_;&Hs`$Zfp^4rtM!k~Od)}Qt-Rfb zC9s37qK|lAoQC_UA9qXVWUsz_o5}V)kAAE#UFv&2`%iQd9vMmk2)*!m_d(2a+pr4> zrexOmT#U^&ElYBJbe63Vxa6Rmrt}0S(dh7XUIM=!3;$Cki`$+xPEwg>ch0;^>4|(W zhE7cFG81)6>6DzQxEsuG2CXIdIV+JIrfSgQdZi3``LPvw?JA+2LF;YNGAM(tXnCpt z$i>dNwk+!F7ordC8Xo287r$7tZ^7$iPPt?>vP|%`%0_38a~EFdX6$i|UNsu$On%XI zy2LeVVY{*R(>PDni*D9$t}$DZbOhNmnC(MkhYxS^b#o%h;FM z#akr{R%MyLRYb=Iuiewpw{a=3rjp@GT=ofCvU%VtHM=%nUX(mvM=Nda1u&m7L`9b; zZ^RO!=zT^jPDMuIQI}-#6656}gZCrpX7{1`QaIZ8z(`=6`iGZ*iA~o2)9!5^INAJUoIFd*1Gg z3y+;!VS|)GryJV)fY_fbHI7yTgh3q=}atf~` zSW*Rb8=~~gxTTB_G#gG-?I@f>6PVJ(De4lYdXPUh@Kf?|B{ga!G7?pzOro2i#nM@!)Mopc^U2;_)_9m4nE9I%AE%om@&DhJI5x2t?E;Q& zp|3!*i>r4-B`2YpUL8XB=%J=TG{HnU)KQ;7P#`p1-hPrY*{epymwJimaBHC$6wBiP zJH2 zWN8bChr5*R%iFLK^15B^dR&^D-dk#G3TOQQuW@cGM+iT zX3_){&ZCOr%9;fWcE09+7gQje3m33aK6{PuJ<%ZSSw8F6IcY`lxA4i@)t>*V$Z^6p zSbHc#c6!&0vzYf3BICo@d7(uJ^qH3j2H}K_Wp?$dnch^&XmPhfAZBVRuyrjWOj0zo zy6e7I)>aQDK_*`WG3Bj1D4*XPe%3g_p#991ijXmM;Pc1k7nz3`mL*%WHYT@B2FHGj z<$so}=~%Dezl^5Dcb5eBll5szio&ani)}TiP?F0F!lH0NJB;JZ+b|6>GwS<8O~kIy z$>lT4%5`?!s5K5kE~;>4Ssi3thM2O9$|&dQB4xP`kK)kA;{Al)r}ui1>cDKTcdVx+ zn0YZSlLf_1U!5yu(n`1pScyHz89Gds_Eh+hQWm1grwZ@T4T#X2)IFmc-1|sHq*ynw zw@6vwrf%rVfm|ta8znBLRRVPz6`?f3#rw9Hx;dy+ztA9S3Bv`uY9w44dSiIou)7-TEhSUknHiNv%Bj)eqzsJ2lfi=$Q&}SqSU7cjJH?%O?6i~ z80wD9TaDR&tbEemtyWo(#-mYb2eY2{`$4*BynuFI7dPo0PPy`2BXOQw_UFKAp?`ot zpxiZgQho+X{TB}3>bmd(o``!cj0!x`eb+N`4Y=EuOg>LZ&ClM?^GLPK?FWnZ3aTvw z zi)7SqYNv}2|Gx*4Su1Y34Dswv8NBT3Ha6BvqIaHz!T- zBLtT?G)$S_j3)F&6PFaRk`gC%d{7=Y7aT&w0mg9va<4OCAmAx73P%9#a>h7F%cGQs zC=g@T;?sme*yTqGO!g9u6cCvu!#;@a1q#6fO@*jgGQCT!SF?hq!@DfmRt@T8>B>{U{FaQJ2K6de z%F{$1Te2<;*D3g$(xeYJT&XP2Wc`A=>Ob{QVLK{T6m@cS!utZ5 zAHoknQ8sE=zQm~ZUK0L0jG2D82bgbzOWe7Fa5nlET4Fpjrd1v?&bIJRj>ln#udbhY4FGaks znGjg?>Lrv~3GeltIJ>*TKDOBtldr`mOA}It;NXl4Q#mGr?MfOXI!Ii(V{B96fyHA> z9R}3Z_hlLkkv%ZL`_z#uF7oIP))2z7#yuMi(reUeSlFn1mV2ny1!jHE5%c@3dg+JI z-2gJP^KuLw=R<_<#YLJ4n=@dEFCbja45pjXAzUC7_=j~>)&IAs*_#O&4t0oDRs?2l zcPwIC#!#o!y4&4AT+3*2(QroUZr9sloNDcbUV?M3;hjbkPFS;goKuaQtr-XJ{!{!c z9dPm6t9lkWl!-P7-UBdi;>9CY{~9H`dOO@qVL53_5~m*nxR@FeRT;y=)S55;o#4?N0kbWfJYfKMRiz~YgWldFTN z`#v6F&teDm3hYS#LO;(;+1{qrqV*&Tc`d^z&oTdjF^&D>lH3PbNhW$a2`Tj1CJ9Iq zYERWJhz3Cfy)`#o-U&2QnmEaG5?Z_x)Fdb1D4!*>;ti`{{&g7UAjUj^n~kZn<^KJ# zC$R&20d^dFf5&*+od53r3gUZD05~Tl3ItLD|9FUiuX>UACjfvt5$v`lEx^Lm-E{vM zu;;J?`#pAiO?$1;c%HJgv!khQ2=tT^!Y3;)djiBQSstLzkWw60U?xoq4kK8mt>KH2 z#vAFQDXHyQmJ#gKbTMm{+Wji=(Yy9#bwxRd<`}~&P6mQi?@TE_K!i%%>SKsO~wBBBGOTWJCwiId@;#;&uT*bTY zBf2%@75n0~EK@;$eyRVBZRt00*8Y7TDHG#AzVZ8*8PUi7Y0~%Vov#b;-zOH$+7`PI zxu8oPO1@t^p>g%chI?PkUnWtkz51nn#`E6yiY4c7>CB_GsFHh|Yn#>s|H_(>*!0n_ z;(0YR!&tH^Qc?vQpqV`NZq-Pf{O>>q$sdD41HE^%?9oNDBHxx7t?V-fJ$I8r)jIWlr)i<SvaGnN#nrDijnL#1XGq1Zotx@$O#dmVoJ*l%v#zRofBfga{Hs6z3%~fIKl|^0{TKE5U;np1`=5VNKmVP- z|A&A0_kREH{{HX%-`{&U{`Noq&2N75kA5Bhy3YUX*Y&@u^S}G`v(NtI*U$g7&hG!C zUq_vWBV5mV#u?MIuKz;AWO?ZD(O-G0BK`ON*(dL};57+Xb>Q#>(B>-4hE?&>$*yu^~51dD!V)yo{?4Sm|9_`D-5a^7GxkP`4c8%JnP` zK0kKrRgWCw%*AxXdi1SsxaWiRtex)UVx^8(y|}bG zd!A-YIkwk1tAXFmM#rTy+IeFx%+(BExwOy~F5Wk|>MaYj@)wt{K3nm%Zac$Nv{KKny@E%WTu?#$(BdW**NoxH{ORj;pHdetZz{mu2H2S?oGKrZNe zlC>)rCw;-+c#KUqZ>)zevz{JtN}`zed`Nc>vUK@4*kyN+hJezhKrYM#G9Qx zV#MVG)>EzZ=no&_1+CUD2XwCHpl&_JRd4b|bFMM*0t#19{uS{X4hiRtMd1d`|+cD01|F}NEOdjGd?v7UAE7$Y!)OojFn8uZh z6V|S$2k_T_&Ro2_i+Rb@r*mAnc)Z}VEW{H(n?pVM?mT-RUAfjSyWUA*<7JOqt>KkR ztMy|#=>2p*qf6GX(_7fq@JW7gd3V;!k8kKrFQ^$lUsv{ny6tUT{l!ge&wov@{1qMna|JsHm|IAqvP_!`#8Ma_wt#Gb?z_PaD7JKPA<+|-qDp_ zP&2;zd}r^i+u!yx+GOjTU)Pws{IlM0v#0UlsrDIH-QqXBaExO|oY{$j88Lf!frBQ8&|SpaZcG*|G;qn(V`OQ9@eC+y*%SX)kf&C4=>1S*?!&fit zJDqjQF-AV%;^FM}!vU@TbX;07v0XdU{i$9o`%dab|IQxK>AmLQQ?BP}b;wk_@cB;d zu6o0LlC7HhHAdWPzeRU6tKD7wMqKO7zP`hHiLHwfmj_RC5O&9vRXzQr*RZc#Il#02 zZ}wZbyvXr{Khm45nZsTY{h#C)D@XR9CmxUV;wHmS>(_DREIsFXH^&{ak;7_2S~GTRfIy z^T?f=u6z$3aq&_+*E_tr{TNqn_Axvf9Z&YO5to@=5jXAT|bYwo_EmBgI#7l z@}rNjFw<}T9Vz?7C-0$fQ#V?1vq!AvZR7(_Y1SKhk7itRt!nPLVLPXGJG!r2+Vlk$ z&T1djt#9MX#d4_IxsUmLm#w9c+Zc4~Ik zyW&U3#p8#wb(`apoY9LR2lDHh54*V->BWl1r&$|uX-+1b&Eve|@ra8d2k`+{zg+V+ zt6#@eF9&8a@@d`hvY+uc*L<*!l|AjMSKVnmd+?+;I&Ss|bEIA?I_vghjJRhePrlK| zd`PCOU(I|*^Cxw~%-)ciVeL2I=xT(Y{sR}!Hej3 z{v;2c$8^>o=?S0t`Dq{H>4EU0{nI|ivwl2$XX|&~@qARbb9SS9$AgmaqmSjEr}SlS zpda^>y5VBrsu4ELKGSzAdvDz~)Qd6VnP+ue{q_GXiC%U3mcOa_NblV{;=Got-sCjd z%KqbXb6Jbu-T8dx%9q;61K!hKt4H7J7A{TxU5d9pbHtz3tc~=_n1lB8o}bP04Q~3F z{vBrBKJ+0yL9b)CW_MhASL=^;9-Y5>OHVt0Q!lM*Bn#1uzuFs}UQFIT$yhzs)=j>i zYMpVj-{=qDrRa}0E8o#NAHvQ4BYS*S$Gg3uZfSkMz?3gJ#?E}Re2n}Jj~6n#%BovmW5&g#b!UGJxZ~pSm3FzL@%ZuIfMXFKnW5!ae=9p|3-WS^9)v!~enPCi!W8tKK|t$#i8Hbz{Y#s{9t zUA_~q)^NrRKehAj;EA8rbT&p@&&N+Sr1=~DhMRqXA1nP;x4n&#UjHU#z1cH&-|;5s zxIE+qAM{_|%QF1LUu(v5c>c`vnf$HP@2WRkzT!k*KA*kY>-N{^xcqgb_i$IU3*K>M z%X+b}!%rXPo!aHN)h);9xX~AO^1SNBh>Ia#>gJq-2e>QyaK}A+ zx|6S(_823s+FQS~UQ<8wxTaan?);c>J&Utc*J14rJ-6dQ-QIF#DjDXl+Qs7)KRO@m z3D!^RZV%TZtzKLjX_g5;ul&1Y-FA#lZ|cS8^hSK(%Uv!yZuS})X~s`~-0i(}YwH6R z4sUmK*6qjW^nQ{B;EpQ?*3QRo?K^nkI)5v_GUW)Lzr5bzk9i#q0T04d;eoNUh)bKjjK90S zW`d4OYxK&;xAbD-(d`h!wBRPIbYZQI5jRY}-uc+gGvlUb)hF6{edmF>I&L)MB@fuZuwRo@Z{?0J7c6bd#;|f4?oe|acRww@1>vSEFL(^06ymG zo>R9D>xP%Uz|8lF9e(EnuAHTxztI=;>IFG-Sii$uxxUNENH1-k$K$GmQ-|gLcTT(EX#EAV?}G-HssPqV|sbl+Zk8A&i!^*C#>@r z`9LqOdO6fz?>Vpj+eXLbrA+W5ePTXduV^1}J@>@Zdh9hu+<0s4^wO@5&a3;xjEm`@ z{TunEH(uHwR{X5n-bSZ4EP7>v2j)`4{H`wB(Q#=$?-RWCnPa8*W?cH7=3>T;#&71q zdS%Kvd*-SK^~iAF@w|i1CmE~9T5jI+_A3nA@BBWdeRUt|xa##ckuyx+SD(J)I=z^DNG`I+C(n29qY;;< zI1ZW66Y7}$;Bnr4-M-Xu`H)Pg7bfj1dvDz~%!B^Iv}OnHZhwsQp7#dabAQlm4!UQf z<0h-Vx4Y)cYR$~oan+1&tlf8fJu>8CTBo|J*OTw;BfZI3*1hU!NqCKpD-YI>efA=A zJ?RVjtY593XsySF{$i#6_6K>;*EIV4E@O4ekQ)!u8&C9){5^ANl?UgYUY>FFggRkn zKhJgY-F<)WxU^yN*ZCX!Lwrr`%yE~iy5((jdaaL6yu35a6CdvMkGQ;)B{{(MGe7TG zb<1mXTwcUm8fneHcRkH7eYko-#-8+Erx*L|uj}OZ>AsE`uJt>1_u!dcKJao!XWf2` z5tqJrfGu;o>={?>_`y$^2-_O&GBM)fV9IPVmb$I>-IeS4s2Qj^~952tH*|YocDV)udW?0W8_16Aig?d@9aGzuKLKAee$Uv z9oHV5p3__ULmgK=>A1YW#!){%rH{9&q zyiiBnRd&ZczcZis(Q(robUw*i$4%X2jlbD*R&UVM8kKUcV zrfz?Yj{C`;4nE@Ym|kn+0nN@keLdoOULVjbTY7J2Z>ZacdTCXc)W}({BxkgnFYhJu zU&lxfTQyy*TmHbEzW^Px*A0 z>k&6P;|Cw?^H}DTjb!dx!}Q#hOOv|$efZW+BPNgi+^xTUb&QTn2bOcZrkg+JbIfuw z;?hJLW;Cy`>h{+daq;*Nt#qqbazXsj4#HA^FS=i}~yZt5;bX=Y~czO2r6E6RF%U5S?2koD7vBOOs=$7Zy z&>UU;Mm}I-(aP%`opt*$MqEsFayG}4o||#4+nTLc-#N?n-5y!D4EG&6)d`bUemL`E zg;}??T+h?1cJX&76P*t~{iI$oWyDR*qIbj zlfD~q`I@@Pt!vtM_MeWM_q%iQmh(#PMqJ-&3$7N$MA`larb5E7$vh z*6i!9&5`%V>V7%m$^tGXjnT`O)%|D0jb=Krd-fSW{lSA3{)}tSwQl~>j3*P(|0HMhhRG9pcRcCzX3x3XziYbTm~r)+HOs>8 z9?@~LH{{*%t(=9+2TZzoKC4yz`ZZ>Hvp?+AE8vc6{c`J!|E>JuS@#pI^;p}-@l|%S zKl0cbRy^)}2zU2;PEC7^J8o)6}Obx_$za?WBSbX%;U}n&+_)$K18RpI?Db5(u=+0!5x>s-`pqpz=L@4 zB*UYeg)3LG%LmshxvkrdG2`Zaxl*U_a=znczu9HaxcSaQ_ip`WTp3Gm@OSmy#*2PP>o=1#B7%CUN-Zv0+(XVz`4zu9kSezK>{^kQRX zU%&g?s&0La8JDMWv724+JFZ&MoSwdWuVES;7f(;@?z@=Aj2pIda<}U7jO(4rV;<1G z!|%8}#lduzTUz6b{_|Ov=Z>5H+TCwzs$EDrN4Llji;YO!y@MSd%Ch55LNIuwaciumB`!PChJb0oNc*fNS zyvVV$ug$pfMX$Qi&ku9>EH&tQZpMwbIC$UEOK0*c3oAP7wqwln(kK%=2siJKt{)C) z{b$CdPjB%9o2Ndr4xY@|Gp=XxobhIy`t{rBxO~mNgRM`#(Qg<$aOU$j-a~xEp_67l zIDEe1ZQa(^EnK`Onmd2xORf02nt{6YH99U&`QVT<`ggpTapT3#d$^`Q#*E90_>qj! zE`!~Eb3p4q9hax^K@L*CWa24TcEjEAwH`Z0rx!PDdrfj2Kje6(x#OO{o8_Iples%C zpLt5}Oke%_ZH%~{kGFDc?X;$5{C3Z5bX=LGRnD*-JDKRXJk=NS8qI3uGiUS8G|#y7 zI>L3$AAa2RM#rTK59eEXus*(GhP~r$JvQuP+^ku?^auV<-*rChFlx#lW5l(7Ivu=y zGRKUI&jX(9^l;ri^f!9rvAoKbpIy^G%WnPpmYe-+bw9^6Iv=c?zchZ+U+dB%pW$Ki zI<;HfD>@%=Q@2%CJ@OheE_V1~&)z*RXkE;>@pqNA%FW(@FT>yH3;O_8n7iLiIv>tl z9{c(3Z??MSHSV}Od*cgQ7c(wD&&N(CW?bH?odZ98jjvZ`IqA5V`F@7WW2{w=XIy!} z^S#V<-XAM_a&$zVGnarhzEA{KRv{|>~$vb1l-R-fG@ES8NAFMg~O4e5X zjeW-T-6mf0VkHN4+c7$BG^!h(T+pkA)^WFAtZ@As-R?ed$EA(-Z*pZ#@A6_d8#BG? z=D^?awr+3tNjyD~eF57wKlpN&jZSa4>Yfa{Ud`K#8&7lo7T3DtgY!4~jSpCZtYHD-FP)B8*Y_^_I>x^$aqYRuSu)J)?D2fLtLKbs&B@yCJFep@Zte>=V|@wHD+9OO5T#QmA|pTJ8pRV zNao@<&3F1oT>my17RG9=>ekojxU^;8*~vk`9oPDw=)B{~!`a(!?2~HG-<`ZhXU-!Z zaL@ZMe;qPr9kL=1=I(mBoiwMP)$A?^p91M>oIuVc+QbtbTQj85dLCFnNNx`oGK0xTCxmdTDgr z=;Q0%nnx3L+_Rs!`Zu-He|Mwf@<5%QW^Kf!E&G|e(Mi7}n(y*9ea>u3D zJi8g5apg=V^4(^KJ>#Z!>5bLDSxU+<58>G(8S^_-2JJ)-041>A4$54@D8 z_^Wo`$nTkJf2Ud2c4{~B;mqalj<0q5F=kv$b9|G_TOQ-?^l#mMj85;F`^4iJ7aNn_ zZ`Casi@)~XPx~0_%ylipKL2DtJ9F`#{D0hL{^~JVvuE;o^>5?4b-3?DZ@4t8+qe7; zpTB(d^Xgtd^TD1Loh$FOy8SioxICbjPI_10^)qgm_J>{F9XB<5qP1kSdB}GLTw3L9 z$Ky_K-fRAT=Lh{eUfgk?es90y^20v-Exj^@$Jco5v(=vK^kO`D|I}m0=(zbVjjvv` zerx-I^*$bj#s$$-EZXKtk*S-xI65b-t+{2({rm{cU&3M6V9uw zy7e_W?zvvc2EN1Z(u46y9@RHv#?@QVi^X5huk7tJE*~7>VDtH^tv>oOX57?jXMfBD z9hXM>ctLaapH>;P&eyX*(*D)?_`*?CR4aPU-6>j>NU?N zXMX0*>UXp|t~DoTuJN7k_#OA0f#i$_=bow`+cD$nS^o0Ed8J-;+c7#WCY^NeGJ$7Y zb>nCD1wXH5xZ~2e!>QZf#*C{Uu%CF`ar6JfOOCC7$LASW{ppQnY`wSgJz~U-Rvfud zx1FA-+lP5LbM*rLPIm9O)^FWB-tpj$o4sAVzRAt^k@&DuFItUR-J+B3-3)i!d{5vp ze|PKGap?=&-jjEodAyVE_PCCltUcLtYTBW{d51me2TWt67dKhpEkC|dH(Z|bGVf*m z_G!*=X;dpZ@nzT=R@t2o=PZO9-<{28j&5{ZdpeEy4lH$C?Y%2Ee$vR(8!JO0emGke5M2CRMeJrQ=t#p8i~<}v+xWc9n>m5YJr`Rq0M z&a+dy6>fSsKCkq~h#RfR82y-^^c$Av<8Rpc4*WC^IQ*n9OxL_#S%2MjjFAtx4qTay zub<>M8h2`6Q~wxOy)tUe4%x-0clG~YQ@0+Y z_xEeva*U2^-L&$V#&78jH~q&0`C8r6W_q!{&CUCGr^iahYmD@!pRj3ulH-nxXD>I0 z>(zYKt;guN=?m;9{oHY_-66~L+B0ch^}FNdZ+YirST-Mv=ocQd?eunG0C%k&> z7@c11c)6Xu{7xPKEqWS?B^+vv=>(`(JvpX}yZM)Gc&`Le=|_8qTRxXHp!*6z63 zA9gcw$9?ct(@T|15Y)N^!m=A zZmAWvpPl*qv~IZ3sb=bh_YE$dOxSzQIlzC~7s*)GkG0zyBY)3aTF-h{v(|Av|0G}a z*fBaT4;-o6?)Rw~7c=i-XEnV0`>Jj^M#q(1KIk`f+pS&4#pFZ!j?UGL)vd#QQXX*m z5VmuAZl#_hz1eH9yf5S|`sLWXcX=CelUZxW#{7oAxY=`L;L5$zT8|9<#pUhO>~>rl ztv}hNJ=gL2?j17nH(d4iURJyKe8;O>k1^w7(2DO`9ap@q+lKx|uk&sWFpZhs_`2%p z?})$q?+RDWR(%b7#>Mj;CEk9zKg3@-kS}#}{f)Ziy9uohIm7q4ez6umJCF9d&Ifx7 z7Us%6Q@6d1j>`)k#9QZ1+Z*uL4|iOeG;{{%F%V+Gn?A9$)Zt|8I%I&9I?*)44&LL~&FyCE2k8;Lex|}hwq8Tr~;CG(P zxazgSsM~&wj;mg@o;@~?&&)+XuR1?wTpS*UgSlF}88@DWojuLxPy5*V>5XPO@w+B#iz^e>j=9ncb=zyq^zs0k54-Fcmu5^}IIrZOZaYTD z&3hATXJ7b*9qwnz7H`uFv%Mg!^LeBfHyY{wR=?rWDZ^LIe7@_ox~262Hy+1})jfU2 zwO6SbUsvC0cp06)wBgeFO}+XdKGT~y^v8-HonAHL0j=lS#hbhOb=+vBEnJ@a+3y_K zU4M66x;&G2XE>|xyAc;tMzOG0*Iu{2M#trAdd_*a<`?XAG2T0&H{e4uIz0y?yjdb^=sU5v3GO^+;P=N4zhkf&%QT>mvhJU?`Cv<^SzmU zSGLk~__X`XeC|lp3xBfXx{Yo5Wy;WMst$Eshw#*7cKp z>#w@yZFF2r`r@^pvtQh4H9^OXx6$n9Pjkix983qz->6%-e85fJayI9kCnF!C>XJ zWT77W;jfy|B?EF#Z*sNzH$Xkoa`CQu^VyvTGj4kH>;taDnykURUFNR-#)rHk;`M6( z)>OynxU^*5e5LPB;{p8jpBXoK!{>*3+Jjd1p&8fOF);0uJYW58*>R&ad*&{C#LfFh z4SA3n@|?%~xXW_KwJ&-;^~2-C9j|UVM#sg#rPqPmWwn0wW6ZdC;qU66an;VAizyp4 z`)oyb$CWEQIdfjESKWGyj+-pVhKwc4@%gTP9apw6o%yUEb~MkpG^HO_JgwVbW5%UZ z)-Y-1^D2MF%^tJrX-)eY9oPEP7f&)ViR$lizpzj%)o6 z+A*`Ixz2hpce%7;IOJhBZ=K%!4HTx{@Q!v~`R>>k>&BIxq4#?m?Gp_nS^`hgZkDt_x{5vi`=$5q| zpI5jtp=L7eJj-qU`WQ1Vt?3JSPVa?1t7pfxb}ZU(?)s~4c^e%Uk6v|4PdnpxdszMI zHSW0BbjRD(f0w%BX5Zxn{>;<*^=sU5`N0c3b42%DCT3ju!sNw{w{?5l7y9>{E!-V` zrxzDT{dc{sM_RpkN38S!&UJKp=}f+M)*t?i8_mw~+I%Z}+>DE{^KYHe>fCXYFy)czyv5zkMW>gxr#)Pc4)uBm*(Z0_jddO)z4@-l1N#C$R`;>6@3_g8Id*zv z#N|Qub-7K}cKTz)4HHN0{G7Q~zDw5a!@7mb({JvRxOD1;Z|lWWCv`KA%&ojH>$YQz z{LP-8e&aJgKFve+bi5osb6AJZ?&cWj&9nN$^^?AvalgGkM6Yx7n&WQmb<1mXdU+rR z@{oC=Km9k$N5{?gK7Z3k=T6o-E=F=e=Qna@?e-i0ALwMt&(=L#d;R(~MtX6reRt1& zLF;0~_0FWvk(#;Yhu>x2pK0&7*3VP>;wT`RL@REznf$y_B{*|l8JQrr_ch)UuPq^}h7nXDOv(;W1>BWui-F=}R zJ;oh3y6ufL+dF;cJo_#&<0c1u*nKD8akHoKV~0QE>J4j^XWE^yXZub4`WPcFMsnu7 zs};E8%4l+&InswKUUyv2hL@hxYd%}?V})x!(+_wn-qvlc5A^B{8gW0}ck&K)##8@q z)lF}y8!zdUVZCM!KR?~i&Ro22yf3U@jcCK88QbU1t3HqN5D&0v&)&1+bH_dRg?`2k z-+XtyG2&XUta=tB^)uJqd!lYR#)wN7R(cM5r-y+%u9~H8@!?Z{>BYp84cd4880ocs z+2^xs^El7mJN4_gG2+UqHOFI_UCF>1+;Ow+)Xp4^l|2%F#7(`z_VYLLfX4^C^S-va z=hS0Ez0s+5bnfg6n8wJ5aQPYCuIXLvxjXJ^-`13GW5%_92cKn_*3}HpxctD|%|boW z`b%Tp@4Mc1T;J6(@pA0cY{ZRD90#t?cD=az8*bL_jLGZO?<95W4}Zgz1H5l@?KgUu z&+>)udNt1@AM}``?ms~m~UmD7;)*$y1&`S{)SIqXu`S^x~#g zGW@Nah06o{-CnClhJBpBiLZMw&-L_*{*e!79v1!SY1;k%$-d(`&!02l8e8VA8-Z{^hfIXy5t%*Kw_XXaB0(-$uvfW&Ss!=;gy|=0;pzroZB|&s_6^H+TGw>vUGxoNgszRzq>l?kz;iJCS%Sy z%%PsM930U4&y1_jQa}5}&U>Td@)gs&l167-x&A~iuIH_NXZ>~iF*?2HeIZ`puV@AC zxar;W$l2SK_fp5@Y1q!V>213CG|Tdc8(y?N`F?W8#r#&k(VINziLS78Sc1vdRC_ReXiY#A2Y7? zV|=S_$-n)K?r-f6xLNydz3S0pjC_dB)Qr#e$(_FIxN_?_=gmB;?2apAyhzq~!)NoX z^xTM>Uc+?A!?(DpowZ|UZ|D6=kBs!nY}Svv+Y|N3kM{+gYD1s8@fh3B{Gc~`5udwp z)f*r4e#2C+l|A7+$B2tXv-PJxcG(>_+5I;6{Qr|P$8PO9z5GBYUi<9LIK?|iqvjkq-O zlqMRTefFsjSvy@%{=bOz*l=H$vFvBq4)ycilas7vwuX@pxY7L7iy1eyI`0pw9(Po%_2m069!9j%$CH)bt=nGXs#k7hlh$Naoz%-*t9#Fk8!w*J?v9)NfquGI zHFwikm(Q&i(RX@YadBn9=9?%<4lDF0OQO8v;`#?OVo5w3zsN04)3yVg%2-YS!`WZ^#459Ho%%r6xi?nUk7;y1(C46;ujWugnpgIS5!Z7V zG9|~jwC;SDnsKq{bgbsKZaqfFnH4vE5mA!?Hui^d(Vt(?Yq9#Bgg2t)*r3T z-^j1JCBv?>N3QlnJ<{gkNv-fQMn0&KwP*j44Y}LN+llKysg`Z z{?eH&?CQicMtZ}=-tC72dc(y(d$F@ebb6Bsn!`5FDtpAu`;xb~d2ir68 zkgxoWEW7x6h2~GV)=n>tSTy6V@;klp^y!|~acQJ)b-%4!-^Pf`KY4KAKY4eKxI9c2 zzUf1_wBoO3wr+imPA|5D7xo-;?ELN0aqVYVjwki&xYqv7zlZdj_0t)jdFj@KH??|d_3=$_ImoC{ClLKo>3Y@Uj;r=(X7+_u z_KeHdyq8?ln)ExvxpL_b#y5WBF`vX06 z@<30-hh3jLA8^kZSj|B_c8rc|Z_gf)p4ee`Tz;fp=@CDNeV4rvS8wq+`wSNCcdWYQ zH9D^RCTBc}FRSd1t7h3FU7N#PtJ-H=ES}=6>a1IjG2*69bn;ksv7J}Glhkd)_XV!~ z0Egye+4U!V93OEU_}H%bYu-CwMn2$%AzzrQ^{QKsG2^B$u=&Dk*`}W#Gno-)b7f2p_<%20GQ3*D zPA{F9-_%R1nq_~CW^*LVce+P<<3Tj9?g=&RYjj*19jp3Yz+V?LuDw8S@WWoDFFx6S zv)|x7$wECg%r7o&-|k~v>kfB!@2y9NdaXS@an8n04M%mujmB^K8*ci|&vd7c@7`^7 zYwLq&)h=s~{uSnoo1WO+Z!nF{-}IY4a4j451UYfO>yM7B*D#XhygPi(6MM&O@9DVS zoAGw`eBCx29aqhw@w{ic=85mE?xB11Jz~c7951kTHFsQpWAT=r-Hg>EZ64^916t`$ zPuzWfs#}lI=?(YW`Ax>09lQLQUi*S0e((Zc9^y;a*Bv+gAj8<6cV6wg85c**9O-L% zyUf9O<@b6}{@q*9k1AIE~MyJ>FyR7qEBd&E? zbB8fs}C>C!54G#{LYV#t7hjs$U@lhXEzUcIMHd2-CiE)#n1!sfj)EXyyHgP z{M#(f6J|X&tXuv!7aDPQ`WMp}=~Zjl;zK-EvmN~%SFPy8a}M8m<$F-al?ObG#)wOoBR#iL zrx)v_4`}~cfPM|iT z*)x}Je`oA6>yfvIalad_;mWmNc{Y0K40mPyb^9@{-2D5j9P`*5be+9$eK*I5%SSxx_I$KI{rFV)vCq=`0GC-E}afusuw;s-7DVKZ9no5UwIt8 z=D_vYr`%*>mse9CW8?#!*027qt=~GF@7}|8%P~e=S%_X9SpQDOfJfZyGxXt_V^{Zx z8?Ex-T844$&F13KZoly{+)v(}Bfa@9$qxs8{BSnsoz}YLhrjVOzNRa&got z`?}286L`M5H`J{|E+4Ze(5_zUxVlHytu0sWWHmnUI(b;(k9?pPFU;^(d!im4>XrXw z_?$OVI*>;J@GJsPgF`kiwe|5x;%^(L=)^d-AHyn5^ySMD`m z{0RSP9x!RnzZKA_hH~eO%OB^vck@{V_d>o=w(LKiYlfyyEp$Z@6lf?^SZPn~M>bMw)pYPjP*=Iy$b}SugJD+Rt-# zTr7H>`G9ZFU#;DU>sdXG$?qq+N8G$Od76EK=YBTViq9Pv#}Umkm}}>|tc|!B4t~g6 z{Kca6sorQ)r_{AgV$nsPnw>pf@wFZ~#)z9Whv}?O_`J#< zaWTS-*LU@=N8ZMjiy|a%6KH~C$&gka3xpwMz<@!En&9pjb&G!nQn`6b}D_36e z&UpxbCwEtFvS7`4*)Q+Dm)5Osd_R9fFK^=o{_1~WB zGJ$EXc(T%Kb^F`*)Nqrn_^{f$cp1@~^=@`7d=!;!1Z zhi#73c9z4AOE0a-+TDIzx4w;4?p^)gp!LI*o8GYIWMY?Tjw@FW@F1Cs$2)t&mFwM3 zXKE*3w8m>|+1(%X1#a?7XWm8T*{$E%2l~!DIvslClRVJM2dwxitFBl3p>93KSucjQ z^8=fo=Gyh39(l_{xY3Lqzg_dmT=I99y^-E47ss4f(Z0(~#&FI#xZ~9AZ{w;rHA{bF z{kX23S8{yi^1$I)964}p{c5_Jw-Gl?e7yWEyUMIbe&pet;dnx)4D;Y=-Dt&5rkv?c z-tf(L=k?VG_0qTD#OqJ}#g#D!P53+OuiK9?(#wmy_rFy)Jawlte`ASzTpH~G4m?`TW!*d3s@q?K z-ZNJoXpS#o-__{_{B?2VrdH0_crsuuGNy*JeATayapuN@aIkS4D?N4qfBomo^?de0 zbI{8ZUPPzs6;IAw_j@ku=QW*}eDM1fX5Dt=H+4&9F?g6>OXhs`L@%x*{<=<{Kk)%4 zdt+)ypU<6n^3(^<@_{#4yE^NUW1Q<|zoe57_}H|nlc5-~?${4OZ@PQ}lwd-v?^44$RS}Se1shfH5 z`MuK5SG~#BZmw#&w{hmu!Ux*;={t&RnyrC;=as*Wu3XuQW^*`rlD=E&d=bj~(M$ zx9E(&_;m8fd8LQzwjch+SKO!Gc3hernM2*=?P(sYlg{)5pB=k3t4H2^z~!gf+1KdH zUbO4Uxo*injq>^=xAoXDMqJt)v}4=XKJho%rS{ULg3@Es?f(!X23kzU;F6XtTP)~jwk z#)!*<^TAVIVz2JQ9hXL$c@e#z`b%r}9p0|wwQhSGBfZ&o@SkWkLC3|U_31mKPeIinGG zH)|c&zL32Ee^>K}izll*pfld?>}?$v1E0sv`8SVMKW1Fs?r5yrU!&vFDN|40LEtm4 zdi_ms`KrgT`73{FxY8S)URq^>AC6qF_D9F1Ek4M=&fZYBxBh0|;e~7LPxoJ(WLG~w zdFPLOu$QGTqMKg6sM*uH<=vzo)-qO3eZ;rU0&-$H{t({Da zxN5~iT3zq-L*0Iij*A%&a;@j0{VsDOt{TyaXYD*)^|fw2#+4gw`EJH*`M~$CchG&8 zwGkJ~I(M^GkG#f+t4?&&BxAC-Te}fAdENbvQ`5gj$JJvzrTd8=GcKRM$<<@m9;V+M ztNYJL@0p7)%lg5!-PHc}Co1H)q+{$!^{L9Pj7=E%LZ`Qv0_tupgp0k4&>bUc+sN3K2 zODB$+Ia~kJOkDNi(V5!u0edIAb^A~+50fkWZ*XOaMw;>X`i(r`>QR1RCKJ25ul{Bq zk7n$=Gj?>3xO$9EKDfr+&Bm3Br%(Biy^TeEufhYxa_y1AyAMjn5n7fapn^atj*^d<)~?3&h3{q;N_FlqgU zUUiDM&NAjR-XtS+xirpsz~Ci+WebmPKf7M}-cYw4W5iW=T50yT9Dnn@a<%8q-1EQ3 zsTb|mK=W=dU%7n5!}b2$<<%p{2p20ka7}ysrkA(oSnZ87*K_KVOyEW{-m15C>#L7x zq$_pDG0$#4)Fa0@>y18hIP$+oV((;jsAJVVZ9Uq*Z)-PPz(qpd86aFXrI+jDuXg0_0o=}gxM)b;9>gKG!c#UtK z-S=|GjSuk#JAJ&;cO&j6`KsxUG2-$$dxbfkyo;_}`wV}1%VR#ccC4(wZac=6D-T#S z%bER>=1=P;7kUBH5l^h)sa`erEM9cG<}beAt&Xz~xO~Lot+Tv+(r?y(=Eh&^$4++5 zv7&pVSKWAQ9-5zceC1-&nBKrmmOrVRHG3W(^PG*H`d#&AUwBfhlJOd6uJ!U%#%Se- zgJ$#j+NRWr|H^B8Z}+j``OzsVD>EJS;L=IhE_XTALM zd@?2r{KfLQv$d>b;>`6d7EkdozL7KBtdZs?I_t4xT=il*`1~|uz#}dnlCP(luiVs( z-sFup*?-LQq;AoN8$W1Qzj(jGKI_#JdO^=cud~m0^LFL(A^ng{?C7k=k8$N@{n*(n znB>T+<1`pnKLf_?w)qmi<|dD{_e)jUa_P7%#E)2 z>daT3m@|F7^7qV{o9}2C@dOjk@6^xSyZz=_&+x$W(fw&2a2;XlS9