Make hamming distance calculation different for i7 and i5 (#2867)(patch)

diitaz93 · web-flow · commit 1593a3403a14 · 2024-01-25T14:37:48.000+01:00
## Description Addresses #2857, partial rollback of #2683. Changes the way in which the Hamming distance is calculated for index 2 (i5). If the override cycles of the sample sheet are reverse complemented, it uses the tails of the indexes to calculate the Hamming distance. If the override cycles value is not reverse complemented, the function uses the heads of the indexes. If, for example, we had two i5 indexes on the same lane with sequences CGGAACTG and CAGAACAGAA, the heads Hamming distance would be calculated as ``` CGGAACTG |X||||X| ---> Hamming distance = 2 CAGAACAGAA ``` While the tails comparison would be ``` CGGAACTG XXXX|XXX ---> Hamming distance = 7 CAGAACAGAA ``` This table summarises the new rule | **Reverse complement** | **Override cycles example** | **String comparison** | **Barcode mismatch value<br> for the sequences above** | |------------------------ |----------------------------- |----------------------- |---------------------------- | | True | `Y151;I8N2;N2I8;Y151` | Tails | 1 | | False | `Y151;I8N2;I8N2;Y151` | Heads | 0 | ### Added - Function to calculate barcode mismatches for index 2 - Test for new function ### Changed - Renamed the previous Hamming distance function for indexes so that it applies only to index 1 - Parametrised tests
diff --git a/cg/apps/demultiplex/sample_sheet/index.py b/cg/apps/demultiplex/sample_sheet/index.py
@@ -75,11 +75,34 @@ def pad_index_two(index_string: str, reverse_complement: bool) -> str:
     return index_string + INDEX_TWO_PAD_SEQUENCE
 
 
-def get_hamming_distance_for_indexes(sequence_1: str, sequence_2: str) -> int:
-    """Get the hamming distance between two index sequences.
+def get_hamming_distance_index_1(sequence_1: str, sequence_2: str) -> int:
+    """
+    Get the hamming distance between two index 1 sequences.
     In the case that one sequence is longer than the other, the distance is calculated between
-    the shortest sequence and the first segment of equal length of the longest sequence."""
+    the shortest sequence and the first segment of equal length of the longest sequence.
+    """
     shortest_index_length: int = min(len(sequence_1), len(sequence_2))
     return get_hamming_distance(
         str_1=sequence_1[:shortest_index_length], str_2=sequence_2[:shortest_index_length]
     )
+
+
+def get_hamming_distance_index_2(
+    sequence_1: str, sequence_2: str, is_reverse_complement: bool
+) -> int:
+    """
+    Get the hamming distance between two index 2 sequences.
+    In the case that one sequence is longer than the other, the distance is calculated between
+    the shortest sequence and the last segment of equal length of the longest sequence.
+    If it does not require reverse complement, the calculation is the same as for index 1.
+    """
+    shortest_index_length: int = min(len(sequence_1), len(sequence_2))
+    return (
+        get_hamming_distance(
+            str_1=sequence_1[-shortest_index_length:], str_2=sequence_2[-shortest_index_length:]
+        )
+        if is_reverse_complement
+        else get_hamming_distance(
+            str_1=sequence_1[:shortest_index_length], str_2=sequence_2[:shortest_index_length]
+        )
+    )
diff --git a/cg/apps/demultiplex/sample_sheet/sample_models.py b/cg/apps/demultiplex/sample_sheet/sample_models.py
@@ -5,7 +5,8 @@
 
 from cg.apps.demultiplex.sample_sheet.index import (
     MINIMUM_HAMMING_DISTANCE,
-    get_hamming_distance_for_indexes,
+    get_hamming_distance_index_1,
+    get_hamming_distance_index_2,
     get_reverse_complement_dna_seq,
     is_dual_index,
     is_padding_needed,
@@ -49,7 +50,7 @@ def process_indexes(self, run_parameters: RunParameters):
 
     @abstractmethod
     def update_barcode_mismatches(
-        self, samples_to_compare: list, is_run_single_index: bool
+        self, samples_to_compare: list, is_run_single_index: bool, is_reverse_complement: bool
     ) -> None:
         """Update the barcode_mismatches_1 and barcode_mismatches_2 attributes."""
         pass
@@ -100,7 +101,7 @@ def process_indexes(self, run_parameters: RunParameters):
             self.index2 = get_reverse_complement_dna_seq(self.index2)
 
     def update_barcode_mismatches(
-        self, samples_to_compare: list, is_run_single_index: bool
+        self, samples_to_compare: list, is_run_single_index: bool, is_reverse_complement: bool
     ) -> None:
         """No updating of barcode mismatch values for Bcl2Fastq samples."""
         LOG.debug(f"No updating of barcode mismatch values for Bcl2Fastq sample {self.sample_id}")
@@ -176,15 +177,17 @@ def _update_barcode_mismatches_1(
             if self.sample_id == sample.sample_id:
                 continue
             if (
-                get_hamming_distance_for_indexes(sequence_1=self.index, sequence_2=sample.index)
+                get_hamming_distance_index_1(sequence_1=self.index, sequence_2=sample.index)
                 < MINIMUM_HAMMING_DISTANCE
             ):
                 LOG.info(f"Turning barcode mismatch for index 1 to 0 for sample {self.sample_id}")
                 self.barcode_mismatches_1 = 0
                 break
 
     def _update_barcode_mismatches_2(
-        self, samples_to_compare: list["FlowCellSampleBCLConvert"]
+        self,
+        samples_to_compare: list["FlowCellSampleBCLConvert"],
+        is_reverse_complement: bool,
     ) -> None:
         """Assign zero to barcode_mismatches_2 if the hamming distance between self.index2
         and the index2 of any sample in the lane is below the minimum threshold.
@@ -197,7 +200,11 @@ def _update_barcode_mismatches_2(
             if self.sample_id == sample.sample_id:
                 continue
             if (
-                get_hamming_distance_for_indexes(sequence_1=self.index2, sequence_2=sample.index2)
+                get_hamming_distance_index_2(
+                    sequence_1=self.index2,
+                    sequence_2=sample.index2,
+                    is_reverse_complement=is_reverse_complement,
+                )
                 < MINIMUM_HAMMING_DISTANCE
             ):
                 LOG.info(f"Turning barcode mismatch for index 2 to 0 for sample {self.sample_id}")
@@ -212,7 +219,10 @@ def process_indexes(self, run_parameters: RunParameters):
         self.update_override_cycles(run_parameters=run_parameters)
 
     def update_barcode_mismatches(
-        self, samples_to_compare: list["FlowCellSampleBCLConvert"], is_run_single_index: bool
+        self,
+        samples_to_compare: list["FlowCellSampleBCLConvert"],
+        is_run_single_index: bool,
+        is_reverse_complement: bool,
     ) -> None:
         """Update barcode mismatch attributes comparing to the rest of the samples in the lane."""
         if not samples_to_compare:
@@ -221,4 +231,6 @@ def update_barcode_mismatches(
         if is_run_single_index:
             LOG.debug("Run is single-indexed, skipping barcode mismatch update for index 2")
             return
-        self._update_barcode_mismatches_2(samples_to_compare=samples_to_compare)
+        self._update_barcode_mismatches_2(
+            samples_to_compare=samples_to_compare, is_reverse_complement=is_reverse_complement
+        )
diff --git a/cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py b/cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py
@@ -109,6 +109,7 @@ def process_samples_for_sample_sheet(self) -> None:
                 lims_sample.update_barcode_mismatches(
                     samples_to_compare=samples_in_lane,
                     is_run_single_index=self.run_parameters.is_single_index,
+                    is_reverse_complement=self.index_settings.are_i5_override_cycles_reverse_complemented,
                 )
 
     def construct_sample_sheet(self) -> list[list[str]]:
diff --git a/tests/apps/demultiplex/test_index.py b/tests/apps/demultiplex/test_index.py
@@ -3,7 +3,8 @@
 
 from cg.apps.demultiplex.sample_sheet.index import (
     Index,
-    get_hamming_distance_for_indexes,
+    get_hamming_distance_index_1,
+    get_hamming_distance_index_2,
     get_reverse_complement_dna_seq,
     get_valid_indexes,
     is_padding_needed,
@@ -76,34 +77,120 @@ def test_get_reverse_complement_not_dna(caplog):
         get_reverse_complement_dna_seq(dna=strain)
 
 
-def test_get_hamming_distance_index_1_different_lengths():
-    """Test that hamming distance between indexes with same prefix but different lengths is zero."""
-    # GIVEN two index_1 sequences with the same prefixes but different lengths
-    sequence_1: str = "GATTACA"
-    sequence_2: str = "GATTACAXX"
+@pytest.mark.parametrize(
+    "sequence_1, sequence_2, expected_distance",
+    [
+        ("GATTACA", "GATTACA", 0),
+        ("GATTACA", "GATTACAXX", 0),
+        ("XXXACA", "GATTACA", 6),
+        ("XXXXXXX", "GATTACA", 7),
+    ],
+    ids=[
+        "Identical sequences",
+        "Same initial part, different lengths",
+        "Same final part, different lengths",
+        "Different sequences, same length",
+    ],
+)
+def test_get_hamming_distance_index_1(sequence_1: str, sequence_2: str, expected_distance: int):
+    """
+    Test that Hamming distances are calculated correctly for different sets of index 1 sequences.
+    That is, the operation is commutative and aligns sequences from the left.
+    """
+    # GIVEN two index_1 sequences
 
     # WHEN getting the hamming distance between them in any order
 
     # THEN the distance is zero
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_2) == 0
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_1) == 0
+    assert (
+        get_hamming_distance_index_1(sequence_1=sequence_1, sequence_2=sequence_2)
+        == expected_distance
+    )
+    assert (
+        get_hamming_distance_index_1(sequence_1=sequence_2, sequence_2=sequence_1)
+        == expected_distance
+    )
+
+
+@pytest.mark.parametrize(
+    "sequence_1, sequence_2, expected_distance",
+    [
+        ("GATTACA", "GATTACA", 0),
+        ("GATTACA", "XXGATTACA", 0),
+        ("GATXX", "GATTACA", 5),
+        ("XXXXXXX", "GATTACA", 7),
+    ],
+    ids=[
+        "Identical sequences",
+        "Same final part, different lengths",
+        "Same initial part, different lengths",
+        "Different sequences, same length",
+    ],
+)
+def test_get_hamming_distance_index_2_reverse_complement(
+    sequence_1: str, sequence_2: str, expected_distance: int
+):
+    """
+    Test that Hamming distances are calculated correctly for different sets of index 2 sequences
+    with reverse complement. That is, the operation is commutative and aligns sequences from
+    the right.
+    """
+    # GIVEN two index_2 sequences
 
-    # WHEN getting the hamming distance between themselves
+    # WHEN getting the hamming distance between them in any order
 
     # THEN the distance is zero
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_1) == 0
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_2) == 0
+    assert (
+        get_hamming_distance_index_2(
+            sequence_1=sequence_1, sequence_2=sequence_2, is_reverse_complement=True
+        )
+        == expected_distance
+    )
+    assert (
+        get_hamming_distance_index_2(
+            sequence_1=sequence_2, sequence_2=sequence_1, is_reverse_complement=True
+        )
+        == expected_distance
+    )
 
 
-def test_get_hamming_distance_index_1_different_prefixes():
-    """Test that hamming distance for index 1 counts different characters from the left."""
-    # GIVEN two index_1 sequences with different lengths differing by two characters
-    # when aligned to the left
-    sequence_1: str = "GATXX"
-    sequence_2: str = "GATTACA"
+@pytest.mark.parametrize(
+    "sequence_1, sequence_2, expected_distance",
+    [
+        ("GATTACA", "GATTACA", 0),
+        ("GATTACA", "GATTACAXX", 0),
+        ("XXXACA", "GATTACA", 6),
+        ("XXXXXXX", "GATTACA", 7),
+    ],
+    ids=[
+        "Identical sequences",
+        "Same initial part, different lengths",
+        "Same final part, different lengths",
+        "Different sequences, same length",
+    ],
+)
+def test_get_hamming_distance_index_2_no_reverse_complement(
+    sequence_1: str, sequence_2: str, expected_distance: int
+):
+    """
+    Test that Hamming distances are calculated correctly for different sets of index 2 sequences
+    without reverse complement. That is, the operation is commutative and aligns sequences
+    from the left.
+    """
+    # GIVEN two index_2 sequences
 
-    # WHEN getting the hamming distance between them in any order
+    # WHEN getting the hamming distance between them in any order without reverse complement
 
-    # THEN the distance is equal to the number of different characters
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_2) == 2
-    assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_1) == 2
+    # THEN the distance is the expected one
+    assert (
+        get_hamming_distance_index_2(
+            sequence_1=sequence_1, sequence_2=sequence_2, is_reverse_complement=False
+        )
+        == expected_distance
+    )
+    assert (
+        get_hamming_distance_index_2(
+            sequence_1=sequence_2, sequence_2=sequence_1, is_reverse_complement=False
+        )
+        == expected_distance
+    )
diff --git a/tests/apps/demultiplex/test_sample_models.py b/tests/apps/demultiplex/test_sample_models.py
@@ -311,7 +311,9 @@ def test_update_barcode_mismatches_2(
     sample_to_update: FlowCellSampleBCLConvert = sample_list[0]
 
     # WHEN updating the value for index 2 barcode mismatches
-    sample_to_update._update_barcode_mismatches_2(samples_to_compare=sample_list)
+    sample_to_update._update_barcode_mismatches_2(
+        samples_to_compare=sample_list, is_reverse_complement=False
+    )
 
     # THEN the value for index 2 barcode mismatches is updated with the expected value
     assert sample_to_update.barcode_mismatches_2 == expected_barcode_mismatch

Original file line number	Diff line number	Diff line change
`@@ -109,6 +109,7 @@ def process_samples_for_sample_sheet(self) -> None:`
`109`	`109`	`lims_sample.update_barcode_mismatches(`
`110`	`110`	`samples_to_compare=samples_in_lane,`
`111`	`111`	`is_run_single_index=self.run_parameters.is_single_index,`
	`112`	`+ is_reverse_complement=self.index_settings.are_i5_override_cycles_reverse_complemented,`
`112`	`113`	`)`
`113`	`114`
`114`	`115`	`def construct_sample_sheet(self) -> list[list[str]]:`