Skip to content

Commit 1593a34

Browse files
authored
Make hamming distance calculation different for i7 and i5 (#2867)(patch)
## Description Addresses #2857, partial rollback of #2683. Changes the way in which Hamming distance is calculated for index 2 (i5). If the override cycles of the sample sheet are reverse complemented, it uses the tails of the indexes to calculate the Hamming distance. It the override cycles value is not reverse complemented, the function uses the head of the indexes. If, for example, we had two i5 on the same lane with sequences CGGAACTG and CAGAACAGAA, the heads hamming distance would be calculated ``` CGGAACTG |X||||X| ---> Hamming distance = 2 CAGAACAGAA ``` While the tail comparison would be ``` CGGAACTG XXXX|XXX ---> Hamming distance = 7 CAGAACAGAA ``` This table summarises the new rule | **Reverse complement** | **Override cycles example** | **String comparison** | **Barcode mismatch value<br> for the sequences above** | |------------------------ |----------------------------- |----------------------- |---------------------------- | | True | `Y151;I8N2;N2I8;Y151` | Tails | 1 | | False | `Y151;I8N2;I8N2;Y151` | Heads | 0 | ### Added - Function to calculate barcode mismatches for index 2 - Test for new function ### Changed - Renamed previous hamming distance for indexes to only index - Parametrised tests
1 parent 2a5be9f commit 1593a34

File tree

5 files changed

+158
-33
lines changed

5 files changed

+158
-33
lines changed

cg/apps/demultiplex/sample_sheet/index.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,34 @@ def pad_index_two(index_string: str, reverse_complement: bool) -> str:
7575
return index_string + INDEX_TWO_PAD_SEQUENCE
7676

7777

78-
def get_hamming_distance_for_indexes(sequence_1: str, sequence_2: str) -> int:
79-
"""Get the hamming distance between two index sequences.
78+
def get_hamming_distance_index_1(sequence_1: str, sequence_2: str) -> int:
79+
"""
80+
Get the hamming distance between two index 1 sequences.
8081
In the case that one sequence is longer than the other, the distance is calculated between
81-
the shortest sequence and the first segment of equal length of the longest sequence."""
82+
the shortest sequence and the first segment of equal length of the longest sequence.
83+
"""
8284
shortest_index_length: int = min(len(sequence_1), len(sequence_2))
8385
return get_hamming_distance(
8486
str_1=sequence_1[:shortest_index_length], str_2=sequence_2[:shortest_index_length]
8587
)
88+
89+
90+
def get_hamming_distance_index_2(
91+
sequence_1: str, sequence_2: str, is_reverse_complement: bool
92+
) -> int:
93+
"""
94+
Get the hamming distance between two index 2 sequences.
95+
In the case that one sequence is longer than the other, the distance is calculated between
96+
the shortest sequence and the last segment of equal length of the longest sequence.
97+
If it does not require reverse complement, the calculation is the same as for index 1.
98+
"""
99+
shortest_index_length: int = min(len(sequence_1), len(sequence_2))
100+
return (
101+
get_hamming_distance(
102+
str_1=sequence_1[-shortest_index_length:], str_2=sequence_2[-shortest_index_length:]
103+
)
104+
if is_reverse_complement
105+
else get_hamming_distance(
106+
str_1=sequence_1[:shortest_index_length], str_2=sequence_2[:shortest_index_length]
107+
)
108+
)

cg/apps/demultiplex/sample_sheet/sample_models.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
from cg.apps.demultiplex.sample_sheet.index import (
77
MINIMUM_HAMMING_DISTANCE,
8-
get_hamming_distance_for_indexes,
8+
get_hamming_distance_index_1,
9+
get_hamming_distance_index_2,
910
get_reverse_complement_dna_seq,
1011
is_dual_index,
1112
is_padding_needed,
@@ -49,7 +50,7 @@ def process_indexes(self, run_parameters: RunParameters):
4950

5051
@abstractmethod
5152
def update_barcode_mismatches(
52-
self, samples_to_compare: list, is_run_single_index: bool
53+
self, samples_to_compare: list, is_run_single_index: bool, is_reverse_complement: bool
5354
) -> None:
5455
"""Update the barcode_mismatches_1 and barcode_mismatches_2 attributes."""
5556
pass
@@ -100,7 +101,7 @@ def process_indexes(self, run_parameters: RunParameters):
100101
self.index2 = get_reverse_complement_dna_seq(self.index2)
101102

102103
def update_barcode_mismatches(
103-
self, samples_to_compare: list, is_run_single_index: bool
104+
self, samples_to_compare: list, is_run_single_index: bool, is_reverse_complement: bool
104105
) -> None:
105106
"""No updating of barcode mismatch values for Bcl2Fastq samples."""
106107
LOG.debug(f"No updating of barcode mismatch values for Bcl2Fastq sample {self.sample_id}")
@@ -176,15 +177,17 @@ def _update_barcode_mismatches_1(
176177
if self.sample_id == sample.sample_id:
177178
continue
178179
if (
179-
get_hamming_distance_for_indexes(sequence_1=self.index, sequence_2=sample.index)
180+
get_hamming_distance_index_1(sequence_1=self.index, sequence_2=sample.index)
180181
< MINIMUM_HAMMING_DISTANCE
181182
):
182183
LOG.info(f"Turning barcode mismatch for index 1 to 0 for sample {self.sample_id}")
183184
self.barcode_mismatches_1 = 0
184185
break
185186

186187
def _update_barcode_mismatches_2(
187-
self, samples_to_compare: list["FlowCellSampleBCLConvert"]
188+
self,
189+
samples_to_compare: list["FlowCellSampleBCLConvert"],
190+
is_reverse_complement: bool,
188191
) -> None:
189192
"""Assign zero to barcode_mismatches_2 if the hamming distance between self.index2
190193
and the index2 of any sample in the lane is below the minimum threshold.
@@ -197,7 +200,11 @@ def _update_barcode_mismatches_2(
197200
if self.sample_id == sample.sample_id:
198201
continue
199202
if (
200-
get_hamming_distance_for_indexes(sequence_1=self.index2, sequence_2=sample.index2)
203+
get_hamming_distance_index_2(
204+
sequence_1=self.index2,
205+
sequence_2=sample.index2,
206+
is_reverse_complement=is_reverse_complement,
207+
)
201208
< MINIMUM_HAMMING_DISTANCE
202209
):
203210
LOG.info(f"Turning barcode mismatch for index 2 to 0 for sample {self.sample_id}")
@@ -212,7 +219,10 @@ def process_indexes(self, run_parameters: RunParameters):
212219
self.update_override_cycles(run_parameters=run_parameters)
213220

214221
def update_barcode_mismatches(
215-
self, samples_to_compare: list["FlowCellSampleBCLConvert"], is_run_single_index: bool
222+
self,
223+
samples_to_compare: list["FlowCellSampleBCLConvert"],
224+
is_run_single_index: bool,
225+
is_reverse_complement: bool,
216226
) -> None:
217227
"""Update barcode mismatch attributes comparing to the rest of the samples in the lane."""
218228
if not samples_to_compare:
@@ -221,4 +231,6 @@ def update_barcode_mismatches(
221231
if is_run_single_index:
222232
LOG.debug("Run is single-indexed, skipping barcode mismatch update for index 2")
223233
return
224-
self._update_barcode_mismatches_2(samples_to_compare=samples_to_compare)
234+
self._update_barcode_mismatches_2(
235+
samples_to_compare=samples_to_compare, is_reverse_complement=is_reverse_complement
236+
)

cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def process_samples_for_sample_sheet(self) -> None:
109109
lims_sample.update_barcode_mismatches(
110110
samples_to_compare=samples_in_lane,
111111
is_run_single_index=self.run_parameters.is_single_index,
112+
is_reverse_complement=self.index_settings.are_i5_override_cycles_reverse_complemented,
112113
)
113114

114115
def construct_sample_sheet(self) -> list[list[str]]:

tests/apps/demultiplex/test_index.py

Lines changed: 108 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
from cg.apps.demultiplex.sample_sheet.index import (
55
Index,
6-
get_hamming_distance_for_indexes,
6+
get_hamming_distance_index_1,
7+
get_hamming_distance_index_2,
78
get_reverse_complement_dna_seq,
89
get_valid_indexes,
910
is_padding_needed,
@@ -76,34 +77,120 @@ def test_get_reverse_complement_not_dna(caplog):
7677
get_reverse_complement_dna_seq(dna=strain)
7778

7879

79-
def test_get_hamming_distance_index_1_different_lengths():
80-
"""Test that hamming distance between indexes with same prefix but different lengths is zero."""
81-
# GIVEN two index_1 sequences with the same prefixes but different lengths
82-
sequence_1: str = "GATTACA"
83-
sequence_2: str = "GATTACAXX"
80+
@pytest.mark.parametrize(
81+
"sequence_1, sequence_2, expected_distance",
82+
[
83+
("GATTACA", "GATTACA", 0),
84+
("GATTACA", "GATTACAXX", 0),
85+
("XXXACA", "GATTACA", 6),
86+
("XXXXXXX", "GATTACA", 7),
87+
],
88+
ids=[
89+
"Identical sequences",
90+
"Same initial part, different lengths",
91+
"Same final part, different lengths",
92+
"Different sequences, same length",
93+
],
94+
)
95+
def test_get_hamming_distance_index_1(sequence_1: str, sequence_2: str, expected_distance: int):
96+
"""
97+
Test that Hamming distances are calculated correctly for different sets of index 1 sequences.
98+
This is, that the operation is commutative and aligns sequences from the left.
99+
"""
100+
# GIVEN two index_1 sequences
84101

85102
# WHEN getting the hamming distance between them in any order
86103

87104
# THEN the distance is zero
88-
assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_2) == 0
89-
assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_1) == 0
105+
assert (
106+
get_hamming_distance_index_1(sequence_1=sequence_1, sequence_2=sequence_2)
107+
== expected_distance
108+
)
109+
assert (
110+
get_hamming_distance_index_1(sequence_1=sequence_2, sequence_2=sequence_1)
111+
== expected_distance
112+
)
113+
114+
115+
@pytest.mark.parametrize(
116+
"sequence_1, sequence_2, expected_distance",
117+
[
118+
("GATTACA", "GATTACA", 0),
119+
("GATTACA", "XXGATTACA", 0),
120+
("GATXX", "GATTACA", 5),
121+
("XXXXXXX", "GATTACA", 7),
122+
],
123+
ids=[
124+
"Identical sequences",
125+
"Same final part, different lengths",
126+
"Same initial part, different lengths",
127+
"Different sequences, same length",
128+
],
129+
)
130+
def test_get_hamming_distance_index_2_reverse_complement(
131+
sequence_1: str, sequence_2: str, expected_distance: int
132+
):
133+
"""
134+
Test that Hamming distances are calculated correctly for different sets of index 2 sequences
135+
with reverse complement. This is, that the operation is commutative and aligns sequences from
136+
the right.
137+
"""
138+
# GIVEN two index_2 sequences
90139

91-
# WHEN getting the hamming distance between themselves
140+
# WHEN getting the hamming distance between them in any order
92141

93142
# THEN the distance is zero
94-
assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_1) == 0
95-
assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_2) == 0
143+
assert (
144+
get_hamming_distance_index_2(
145+
sequence_1=sequence_1, sequence_2=sequence_2, is_reverse_complement=True
146+
)
147+
== expected_distance
148+
)
149+
assert (
150+
get_hamming_distance_index_2(
151+
sequence_1=sequence_2, sequence_2=sequence_1, is_reverse_complement=True
152+
)
153+
== expected_distance
154+
)
96155

97156

98-
def test_get_hamming_distance_index_1_different_prefixes():
99-
"""Test that hamming distance for index 1 counts different characters from the left."""
100-
# GIVEN two index_1 sequences with different lengths differing by two characters
101-
# when aligned to the left
102-
sequence_1: str = "GATXX"
103-
sequence_2: str = "GATTACA"
157+
@pytest.mark.parametrize(
158+
"sequence_1, sequence_2, expected_distance",
159+
[
160+
("GATTACA", "GATTACA", 0),
161+
("GATTACA", "GATTACAXX", 0),
162+
("XXXACA", "GATTACA", 6),
163+
("XXXXXXX", "GATTACA", 7),
164+
],
165+
ids=[
166+
"Identical sequences",
167+
"Same initial part, different lengths",
168+
"Same final part, different lengths",
169+
"Different sequences, same length",
170+
],
171+
)
172+
def test_get_hamming_distance_index_2_no_reverse_complement(
173+
sequence_1: str, sequence_2: str, expected_distance: int
174+
):
175+
"""
176+
Test that Hamming distances are calculated correctly for different sets of index 2 sequences
177+
without reverse complement. This is, that the operation is commutative and aligns sequences
178+
from the left.
179+
"""
180+
# GIVEN two index_2 sequences
104181

105-
# WHEN getting the hamming distance between them in any order
182+
# WHEN getting the hamming distance between them in any order with reverse complement
106183

107-
# THEN the distance is equal to the number of different characters
108-
assert get_hamming_distance_for_indexes(sequence_1=sequence_1, sequence_2=sequence_2) == 2
109-
assert get_hamming_distance_for_indexes(sequence_1=sequence_2, sequence_2=sequence_1) == 2
184+
# THEN the distance is zero
185+
assert (
186+
get_hamming_distance_index_2(
187+
sequence_1=sequence_1, sequence_2=sequence_2, is_reverse_complement=False
188+
)
189+
== expected_distance
190+
)
191+
assert (
192+
get_hamming_distance_index_2(
193+
sequence_1=sequence_2, sequence_2=sequence_1, is_reverse_complement=False
194+
)
195+
== expected_distance
196+
)

tests/apps/demultiplex/test_sample_models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,9 @@ def test_update_barcode_mismatches_2(
311311
sample_to_update: FlowCellSampleBCLConvert = sample_list[0]
312312

313313
# WHEN updating the value for index 2 barcode mismatches
314-
sample_to_update._update_barcode_mismatches_2(samples_to_compare=sample_list)
314+
sample_to_update._update_barcode_mismatches_2(
315+
samples_to_compare=sample_list, is_reverse_complement=False
316+
)
315317

316318
# THEN the value for index 2 barcode mismatches is updated with the expected value
317319
assert sample_to_update.barcode_mismatches_2 == expected_barcode_mismatch

0 commit comments

Comments
 (0)