Skip to content

Commit

Permalink
Fix(Sample sheet) Granulate logic for indices (#2683)(patch)
Browse files Browse the repository at this point in the history
### Added

- Class `IndexSettings` to hold settings
- Function: `_get_index_settings` to return the correct settings.
- Function: `_is_novaseq6000_post_1_5_kit` to check the version of novaseq6000 flow cells
- Tests for the three types of NovaSeq flow cells
- Fixtures for three new flow cell directories

### Changed
- Removed `is_reverse_complement`
- Removed `get_hamming_distance_index_2`
- Removed `is_reverse_complement_needed`
- Removed `get_reagent_kit_version`
  • Loading branch information
Vince-janv authored Dec 6, 2023
1 parent b0d6a57 commit 841c138
Show file tree
Hide file tree
Showing 35 changed files with 22,707 additions and 277 deletions.
84 changes: 11 additions & 73 deletions cg/apps/demultiplex/sample_sheet/index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Functions that deal with modifications of the indexes."""
import logging

from packaging import version
from pydantic import BaseModel

from cg.apps.demultiplex.sample_sheet.models import (
Expand All @@ -12,7 +11,6 @@
from cg.constants.constants import FileFormat
from cg.constants.sequencing import Sequencers
from cg.io.controller import ReadFile
from cg.models.demultiplex.run_parameters import RunParameters
from cg.resources import VALID_INDEXES_PATH
from cg.utils.utils import get_hamming_distance

Expand All @@ -22,9 +20,6 @@
INDEX_TWO_PAD_SEQUENCE: str = "AC"
LONG_INDEX_CYCLE_NR: int = 10
MINIMUM_HAMMING_DISTANCE: int = 3
NEW_CONTROL_SOFTWARE_VERSION: str = "1.7.0"
NEW_REAGENT_KIT_VERSION: str = "1.5"
REAGENT_KIT_PARAMETER_TO_VERSION: dict[str, str] = {"1": "1.0", "3": "1.5"}
SHORT_SAMPLE_INDEX_LENGTH: int = 8


Expand Down Expand Up @@ -56,15 +51,6 @@ def get_valid_indexes(dual_indexes_only: bool = True) -> list[Index]:
return indexes


def get_reagent_kit_version(reagent_kit_version: str) -> str:
"""Derives the reagent kit version from the run parameters."""
LOG.debug(f"Converting reagent kit parameter {reagent_kit_version} to version")
if reagent_kit_version not in REAGENT_KIT_PARAMETER_TO_VERSION:
raise SyntaxError(f"Unknown reagent kit version {reagent_kit_version}")

return REAGENT_KIT_PARAMETER_TO_VERSION[reagent_kit_version]


def get_index_pair(sample: FlowCellSample) -> tuple[str, str]:
"""Returns a sample index separated into index 1 and index 2."""
if is_dual_index(sample.index):
Expand All @@ -73,35 +59,6 @@ def get_index_pair(sample: FlowCellSample) -> tuple[str, str]:
return sample.index.replace("NNNNNNNNN", ""), sample.index2


def is_reverse_complement_needed(run_parameters: RunParameters) -> bool:
"""Return True if the second index requires reverse complement.
If the run used the new NovaSeq control software version (NEW_CONTROL_SOFTWARE_VERSION)
and the new reagent kit version (NEW_REAGENT_KIT_VERSION), then it requires reverse complement.
If the run is NovaSeqX, HiSeqX or HiSeq2500, does not require reverse complement.
"""
if run_parameters.sequencer != Sequencers.NOVASEQ:
return False
control_software_version: str = run_parameters.control_software_version
reagent_kit_version: str = run_parameters.reagent_kit_version
LOG.debug("Check if run is reverse complement")
if version.parse(version=control_software_version) < version.parse(
version=NEW_CONTROL_SOFTWARE_VERSION
):
LOG.warning(
f"Old software version {control_software_version}, no need for reverse complement"
)
return False
reagent_kit_version: str = get_reagent_kit_version(reagent_kit_version=reagent_kit_version)
if version.parse(reagent_kit_version) < version.parse(NEW_REAGENT_KIT_VERSION):
LOG.warning(
f"Reagent kit version {reagent_kit_version} does not does not need reverse complement"
)
return False
LOG.debug("Run is reverse complement")
return True


def is_padding_needed(index_cycles: int, sample_index_length: int) -> bool:
"""Returns whether a sample needs padding or not given the sample index length.
A sample needs padding if its adapted index length is shorter than the number of index cycles
Expand Down Expand Up @@ -130,8 +87,8 @@ def pad_index_two(index_string: str, reverse_complement: bool) -> str:
return index_string + INDEX_TWO_PAD_SEQUENCE


def get_hamming_distance_index_1(sequence_1: str, sequence_2: str) -> int:
"""Get the hamming distance between two index 1 sequences.
def get_hamming_distance_for_indexes(sequence_1: str, sequence_2: str) -> int:
"""Get the hamming distance between two index sequences.
In the case that one sequence is longer than the other, the distance is calculated between
the shortest sequence and the first segment of equal length of the longest sequence."""
shortest_index_length: int = min(len(sequence_1), len(sequence_2))
Expand All @@ -140,29 +97,9 @@ def get_hamming_distance_index_1(sequence_1: str, sequence_2: str) -> int:
)


def get_hamming_distance_index_2(
sequence_1: str, sequence_2: str, is_reverse_complement: bool
) -> int:
"""Get the hamming distance between two index 2 sequences.
In the case that one sequence is longer than the other, the distance is calculated between
the shortest sequence and the last segment of equal length of the longest sequence.
If the sample requires reverse complement, the calculation is the same as for index 1."""
shortest_index_length: int = min(len(sequence_1), len(sequence_2))
return (
get_hamming_distance(
str_1=sequence_1[:shortest_index_length], str_2=sequence_2[:shortest_index_length]
)
if is_reverse_complement
else get_hamming_distance(
str_1=sequence_1[-shortest_index_length:], str_2=sequence_2[-shortest_index_length:]
)
)


def update_barcode_mismatch_values_for_sample(
sample_to_update: FlowCellSampleBCLConvert,
samples_to_compare_to: list[FlowCellSampleBCLConvert],
is_reverse_complement: bool,
) -> None:
"""Updates the sample's barcode mismatch values.
If a sample index has a hamming distance to any other sample lower than the threshold
Expand All @@ -173,18 +110,19 @@ def update_barcode_mismatch_values_for_sample(
continue
index_1, index_2 = get_index_pair(sample=sample_to_compare_to)
if (
get_hamming_distance_index_1(sequence_1=index_1_sample_to_update, sequence_2=index_1)
get_hamming_distance_for_indexes(
sequence_1=index_1_sample_to_update, sequence_2=index_1
)
< MINIMUM_HAMMING_DISTANCE
):
LOG.debug(
f"Turning barcode mismatch for index 1 to 0 for sample {sample_to_update.sample_id}"
)
sample_to_update.barcode_mismatches_1 = 0
if (
get_hamming_distance_index_2(
get_hamming_distance_for_indexes(
sequence_1=index_2_sample_to_update,
sequence_2=index_2,
is_reverse_complement=is_reverse_complement,
)
< MINIMUM_HAMMING_DISTANCE
):
Expand All @@ -195,7 +133,7 @@ def update_barcode_mismatch_values_for_sample(


def pad_and_reverse_complement_sample_indexes(
sample: FlowCellSample, index_cycles: int, is_reverse_complement: bool
sample: FlowCellSample, index_cycles: int, perform_reverse_complement: bool
) -> None:
"""Adapts the indexes of sample.
1. Pad indexes if needed so that all indexes have a length equal to the number of index reads
Expand All @@ -209,9 +147,9 @@ def pad_and_reverse_complement_sample_indexes(
):
LOG.debug("Padding indexes")
index1 = pad_index_one(index_string=index1)
index2 = pad_index_two(index_string=index2, reverse_complement=is_reverse_complement)
index2 = pad_index_two(index_string=index2, reverse_complement=perform_reverse_complement)
LOG.debug(f"Padding not necessary for sample {sample.sample_id}")
if is_reverse_complement:
if perform_reverse_complement:
index2 = get_reverse_complement_dna_seq(index2)
sample.index = index1
sample.index2 = index2
Expand All @@ -220,7 +158,7 @@ def pad_and_reverse_complement_sample_indexes(
def update_indexes_for_samples(
samples: list[FlowCellSampleBCLConvert | FlowCellSampleBcl2Fastq],
index_cycles: int,
is_reverse_complement: bool,
perform_reverse_complement: bool,
sequencer: str,
) -> None:
"""Updates the values to the fields index1 and index 2 of samples."""
Expand All @@ -233,5 +171,5 @@ def update_indexes_for_samples(
pad_and_reverse_complement_sample_indexes(
sample=sample,
index_cycles=index_cycles,
is_reverse_complement=is_reverse_complement,
perform_reverse_complement=perform_reverse_complement,
)
55 changes: 42 additions & 13 deletions cg/apps/demultiplex/sample_sheet/sample_sheet_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import logging
from typing import Type

from packaging.version import parse

from cg.apps.demultiplex.sample_sheet.index import (
Index,
get_index_pair,
get_valid_indexes,
is_dual_index,
is_reverse_complement_needed,
update_barcode_mismatch_values_for_sample,
update_indexes_for_samples,
)
Expand All @@ -20,10 +21,17 @@
get_validated_sample_sheet,
)
from cg.constants.demultiplexing import (
NEW_NOVASEQ_CONTROL_SOFTWARE_VERSION,
NEW_NOVASEQ_REAGENT_KIT_VERSION,
NO_REVERSE_COMPLEMENTS,
NOVASEQ_6000_POST_1_5_KITS,
NOVASEQ_X_INDEX_SETTINGS,
BclConverter,
IndexSettings,
SampleSheetBcl2FastqSections,
SampleSheetBCLConvertSections,
)
from cg.constants.sequencing import Sequencers
from cg.exc import SampleSheetError
from cg.models.demultiplex.run_parameters import RunParameters
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
Expand All @@ -48,6 +56,34 @@ def __init__(
FlowCellSampleBCLConvert | FlowCellSampleBcl2Fastq
] = flow_cell.sample_type
self.force: bool = force
self.index_settings: IndexSettings = self._get_index_settings()

def _get_index_settings(self) -> IndexSettings:
"""Returns the correct index-related settings for the run in question"""
if self.run_parameters.sequencer == Sequencers.NOVASEQX:
LOG.debug("Using NovaSeqX index settings")
return NOVASEQ_X_INDEX_SETTINGS
if self._is_novaseq6000_post_1_5_kit:
LOG.debug("Using NovaSeq 6000 post 1.5 kits index settings")
return NOVASEQ_6000_POST_1_5_KITS
return NO_REVERSE_COMPLEMENTS

@property
def _is_novaseq6000_post_1_5_kit(self) -> bool:
"""
Returns whether sequencing was performed after the 1.5 consumables kits where introduced.
This is indicated by the software version and the reagent kit fields in the run parameters.
"""
if self.run_parameters.sequencer != Sequencers.NOVASEQ:
return False

if parse(self.run_parameters.control_software_version) < parse(
NEW_NOVASEQ_CONTROL_SOFTWARE_VERSION
):
return False
if parse(self.run_parameters.reagent_kit_version) < parse(NEW_NOVASEQ_REAGENT_KIT_VERSION):
return False
return True

@property
def bcl_converter(self) -> str:
Expand All @@ -58,11 +94,6 @@ def bcl_converter(self) -> str:
def valid_indexes(self) -> list[Index]:
return get_valid_indexes(dual_indexes_only=True)

@property
def is_reverse_complement(self) -> bool:
"""Return whether the samples require reverse complement."""
return is_reverse_complement_needed(run_parameters=self.run_parameters)

def update_barcode_mismatch_values_for_samples(self, *args) -> None:
"""Updates barcode mismatch values for samples if applicable."""
raise NotImplementedError(
Expand Down Expand Up @@ -129,7 +160,7 @@ def process_samples_for_sample_sheet(self) -> None:
update_indexes_for_samples(
samples=samples_in_lane,
index_cycles=self.run_parameters.index_length,
is_reverse_complement=self.is_reverse_complement,
perform_reverse_complement=self.index_settings.should_i5_be_reverse_complimented,
sequencer=self.run_parameters.sequencer,
)
self.update_barcode_mismatch_values_for_samples(samples_in_lane)
Expand Down Expand Up @@ -196,9 +227,7 @@ def update_barcode_mismatch_values_for_samples(
"""Update barcode mismatch values for both indexes of given samples."""
for sample in samples:
update_barcode_mismatch_values_for_sample(
sample_to_update=sample,
samples_to_compare_to=samples,
is_reverse_complement=self.is_reverse_complement,
sample_to_update=sample, samples_to_compare_to=samples
)

def add_override_cycles_to_samples(self) -> None:
Expand All @@ -216,9 +245,9 @@ def add_override_cycles_to_samples(self) -> None:
index1_cycles = f"I{sample_index1_len}N{length_index1 - sample_index1_len};"
if sample_index2_len < length_index2:
index2_cycles = (
f"I{sample_index2_len}N{length_index2 - sample_index2_len};"
if self.is_reverse_complement
else f"N{length_index2 - sample_index2_len}I{sample_index2_len};"
f"N{length_index2-sample_index2_len}I{sample_index2_len};"
if self.index_settings.are_i5_override_cycles_reverse_complemented
else f"I{sample_index2_len}N{length_index2 - sample_index2_len};"
)
sample.override_cycles = read1_cycles + index1_cycles + index2_cycles + read2_cycles

Expand Down
34 changes: 34 additions & 0 deletions cg/constants/demultiplexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path

import click
from pydantic import BaseModel

from cg.constants.sequencing import Sequencers

Expand Down Expand Up @@ -221,3 +222,36 @@ def column_names(cls) -> list[str]:
FASTQ_FILE_SUFFIXES: list[str] = [".fastq", ".gz"]
INDEX_CHECK: str = "indexcheck"
UNDETERMINED: str = "Undetermined"

NEW_NOVASEQ_CONTROL_SOFTWARE_VERSION: str = "1.7.0"
NEW_NOVASEQ_REAGENT_KIT_VERSION: str = "1.5"


class IndexSettings(BaseModel):
"""
Holds the settings defining how the sample indexes should be handled in the sample sheet.
These vary between machines and versions.
Attributes:
should_i5_be_reverse_complimented (bool): Whether the i5 index should be reverse complemented.
are_i5_override_cycles_reverse_complemented (bool): Whether the override cycles for i5 should be written in as NXIX.
"""

should_i5_be_reverse_complimented: bool
are_i5_override_cycles_reverse_complemented: bool


# The logic for the settings below are acquired empirically, any changes should be well motivated
# and rigorously tested.

NOVASEQ_X_INDEX_SETTINGS = IndexSettings(
should_i5_be_reverse_complimented=False, are_i5_override_cycles_reverse_complemented=True
)
NOVASEQ_6000_POST_1_5_KITS = IndexSettings(
should_i5_be_reverse_complimented=True, are_i5_override_cycles_reverse_complemented=False
)
NO_REVERSE_COMPLEMENTS = IndexSettings(
should_i5_be_reverse_complimented=False,
are_i5_override_cycles_reverse_complemented=False,
)
6 changes: 3 additions & 3 deletions tests/apps/demultiplex/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def bcl_convert_samples_with_updated_indexes() -> list[FlowCellSampleBCLConvert]
@pytest.fixture
def override_cycles_for_samples_with_updated_indexes() -> list[str]:
"""Return the correspondent Override Cycles values for three samples."""
return ["Y151;I8N2;N2I8;Y151", "Y151;I8N2;N2I8;Y151", "Y151;I10;I10;Y151"]
return ["Y151;I8N2;I8N2;Y151", "Y151;I8N2;I8N2;Y151", "Y151;I10;I10;Y151"]


@pytest.fixture
def override_cycles_for_samples_with_updated_indexes_reverse_complement() -> list[str]:
def override_cycles_for_novaseq_x_samples() -> list[str]:
"""Return the correspondent Override Cycles values for three samples."""
return ["Y151;I8N2;I8N2;Y151", "Y151;I8N2;I8N2;Y151", "Y151;I10;I10;Y151"]
return ["Y151;I8N2;N2I8;Y151", "Y151;I8N2;N2I8;Y151", "Y151;I10;I10;Y151"]


@pytest.fixture
Expand Down
Loading

0 comments on commit 841c138

Please sign in to comment.