From f8a6ac8cbd086e06aa12cfb8e979f92532c1dcfc Mon Sep 17 00:00:00 2001 From: Sebastian Diaz Date: Thu, 17 Oct 2024 09:58:08 +0200 Subject: [PATCH] new post process all (#3845)(minor) ## Description Closes https://github.com/Clinical-Genomics/add-new-tech/issues/118 Create a command `post-process all` that will post-process all instrument runs regardless of the instrument. Currently only works for Pacbio. The instrument can be added as a flag so that only runs of a given instrument are post-processed. ### Added - Cli command function `post_process_all_runs` in `cg/cli/post_process/post_process.py`. - Utils functions and a model to get and filter the runs to post-process in `cg/cli/post_process/utils.py` - Tests and fixtures for these utils functions - Abstract service family ` RunNamesService` with the child `PacbioRunNamesService` in charge of generating sequencing run names - Tests and fixtures for this new service - New property `run_names_services` in the `CGConfig` class to access the run names service from the config - New check function `is_run_processed` defined abstractly in the post-processing class and in implemented in the Pacbio child class that checks if a run has been post-processed --- cg/cli/post_process/post_process.py | 38 ++++++++++- cg/cli/post_process/utils.py | 58 +++++++++++++++++ cg/models/cg_config.py | 40 +++++++----- cg/services/run_devices/abstract_classes.py | 14 ++++- .../pacbio_store_service.py | 3 + .../data_transfer_service.py | 11 +++- .../pacbio_houskeeper_service.py | 9 +-- .../pacbio/metrics_parser/metrics_parser.py | 7 ++- .../pacbio/post_processing_service.py | 11 +++- .../run_validator/pacbio_run_validator.py | 12 ++-- cg/services/run_devices/run_names/__init__.py | 0 cg/services/run_devices/run_names/pacbio.py | 20 ++++++ cg/services/run_devices/run_names/service.py | 12 ++++ .../validate_file_transfer_service.py | 4 ++ cg/store/crud/create.py | 2 + tests/cli/post_process/test_utils.py | 35 ++++++++++- tests/conftest.py | 1 + tests/fixture_plugins/device_fixtures.py | 13 ++-- .../pacbio_fixtures/context_fixtures.py | 7 ++- .../pacbio_fixtures/name_fixtures.py | 27 ++++++++ .../pacbio_fixtures/service_fixtures.py | 26 +++++++- .../unprocessed_runs_fixtures.py | 35 +++++++++++ .../1_B01/post_processing_completed | 0 .../test_pac_bio_run_validator.py | 49 +++++++-------- .../test_pacbio_run_names_service.py | 63 +++++++++++++++++++ 25 files changed, 422 insertions(+), 75 deletions(-) create mode 100644 cg/services/run_devices/run_names/__init__.py create mode 100644 cg/services/run_devices/run_names/pacbio.py create mode 100644 cg/services/run_devices/run_names/service.py create mode 100644 tests/fixture_plugins/pacbio_fixtures/unprocessed_runs_fixtures.py create mode 100644 tests/fixtures/devices/pacbio/SMRTcells/r84202_20240913_121403/1_B01/post_processing_completed create mode 100644 tests/services/run_names/test_pacbio_run_names_service.py diff --git a/cg/cli/post_process/post_process.py b/cg/cli/post_process/post_process.py index 93d43b540a..f0fc1ccd8f 100644 --- a/cg/cli/post_process/post_process.py +++ b/cg/cli/post_process/post_process.py @@ -4,7 +4,11 @@ import click -from cg.cli.post_process.utils import get_post_processing_service_from_run_name +from cg.cli.post_process.utils import ( + UnprocessedRunInfo, + get_post_processing_service_from_run_name, + get_unprocessed_runs_info, +) from cg.cli.utils import CLICK_CONTEXT_SETTINGS from cg.constants.cli_options import DRY_RUN from cg.models.cg_config import CGConfig @@ -23,7 +27,7 @@ def post_process_group(): @DRY_RUN @click.argument("run-name") @click.pass_obj -def post_process_sequencing_run(context: CGConfig, run_name: str, dry_run: bool): +def post_process_run(context: CGConfig, run_name: str, dry_run: bool) -> None: """Post-process a sequencing run from the PacBio instrument. run-name is the full name of the sequencing unit of run. For example: @@ -35,4 +39,32 @@ def post_process_sequencing_run(context: CGConfig, run_name: str, dry_run: bool) post_processing_service.post_process(run_name=run_name, dry_run=dry_run) -post_process_group.add_command(post_process_sequencing_run) +@post_process_group.command(name="all") +@DRY_RUN +@click.option( + "--instrument", + type=click.Choice(["pacbio", "all"]), + default="all", + show_default=True, + help="The instrument for which the runs will be post-processed. Choose 'all' to post-process runs from all instruments.", + required=False, +) +@click.pass_obj +def post_process_all_runs(context: CGConfig, instrument: str, dry_run: bool) -> None: + """Post-process all available runs.""" + exit_success: bool = True + unprocessed_runs: list[UnprocessedRunInfo] = get_unprocessed_runs_info( + context=context, instrument=instrument + ) + for run in unprocessed_runs: + try: + run.post_processing_service.post_process(run_name=run.name, dry_run=dry_run) + except Exception as error: + LOG.error(f"Could not post-process {run.instrument} run {run.name}: {error}") + exit_success = False + if not exit_success: + raise click.Abort + + +post_process_group.add_command(post_process_run) +post_process_group.add_command(post_process_all_runs) diff --git a/cg/cli/post_process/utils.py b/cg/cli/post_process/utils.py index 7ddddbc4e1..2896fb94f9 100644 --- a/cg/cli/post_process/utils.py +++ b/cg/cli/post_process/utils.py @@ -1,13 +1,30 @@ +"""Utility functions for the post-process command.""" + +import logging + +from pydantic import BaseModel, ConfigDict + from cg.exc import CgError from cg.models.cg_config import CGConfig +from cg.services.run_devices.abstract_classes import PostProcessingService from cg.services.run_devices.pacbio.post_processing_service import PacBioPostProcessingService +from cg.services.run_devices.run_names.service import RunNamesService from cg.utils.mapping import get_item_by_pattern_in_source +LOG = logging.getLogger(__name__) + PATTERN_TO_DEVICE_MAP: dict[str, str] = { r"^r\d+_\d+_\d+/(1|2)_[^/]+$": "pacbio", } +class UnprocessedRunInfo(BaseModel): + name: str + post_processing_service: PostProcessingService + instrument: str + model_config = ConfigDict(arbitrary_types_allowed=True) + + def get_post_processing_service_from_run_name( context: CGConfig, run_name: str ) -> PacBioPostProcessingService: @@ -21,3 +38,44 @@ def get_post_processing_service_from_run_name( f"Run name {run_name} does not match with any known sequencing run name pattern" ) from error return getattr(context.post_processing_services, device) + + +def get_unprocessed_runs_info(context: CGConfig, instrument: str) -> list[UnprocessedRunInfo]: + """Return a list of unprocessed runs for a given instrument or for all instruments.""" + runs: list[UnprocessedRunInfo] = [] + instruments_to_check: list[str] = _instruments_to_check(instrument) + for instrument_name in instruments_to_check: + run_names_service: RunNamesService = getattr(context.run_names_services, instrument_name) + runs.extend( + _get_unprocessed_runs_from_run_names( + run_names=run_names_service.get_run_names(), + post_processing_service=getattr(context.post_processing_services, instrument_name), + instrument_name=instrument_name, + ) + ) + return runs + + +def _instruments_to_check(instrument: str) -> list[str]: + """Return a list of instruments to check for unprocessed runs.""" + possible_instruments: list[str] = ["pacbio"] # Add more instruments here + return [instrument] if instrument != "all" else possible_instruments + + +def _get_unprocessed_runs_from_run_names( + run_names: list[str], post_processing_service: PostProcessingService, instrument_name +) -> list[UnprocessedRunInfo]: + LOG.debug(f"Adding {instrument_name} run names to the post-processing list") + runs: list[UnprocessedRunInfo] = [] + for name in run_names: + if post_processing_service.is_run_processed(name): + LOG.debug(f"Run {name} has already been post-processed. Skipping") + continue + runs.append( + UnprocessedRunInfo( + name=name, + post_processing_service=post_processing_service, + instrument=instrument_name, + ) + ) + return runs diff --git a/cg/models/cg_config.py b/cg/models/cg_config.py index 6d96aaf27c..911ab9b32c 100644 --- a/cg/models/cg_config.py +++ b/cg/models/cg_config.py @@ -2,7 +2,7 @@ from pathlib import Path from typing import Any -from pydantic import BaseModel, EmailStr, Field, ConfigDict +from pydantic import BaseModel, ConfigDict, EmailStr, Field from typing_extensions import Literal from cg.apps.coverage import ChanjoAPI @@ -47,21 +47,14 @@ from cg.services.run_devices.pacbio.housekeeper_service.pacbio_houskeeper_service import ( PacBioHousekeeperService, ) -from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import ( - PacBioMetricsParser, -) -from cg.services.run_devices.pacbio.post_processing_service import ( - PacBioPostProcessingService, -) +from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import PacBioMetricsParser +from cg.services.run_devices.pacbio.post_processing_service import PacBioPostProcessingService from cg.services.run_devices.pacbio.run_data_generator.pacbio_run_data_generator import ( PacBioRunDataGenerator, ) -from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import ( - PacBioRunFileManager, -) -from cg.services.run_devices.pacbio.run_validator.pacbio_run_validator import ( - PacBioRunValidator, -) +from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager +from cg.services.run_devices.pacbio.run_validator.pacbio_run_validator import PacBioRunValidator +from cg.services.run_devices.run_names.pacbio import PacbioRunNamesService from cg.services.sequencing_qc_service.sequencing_qc_service import SequencingQCService from cg.services.slurm_service.slurm_cli_service import SlurmCLIService from cg.services.slurm_service.slurm_service import SlurmService @@ -360,11 +353,14 @@ class RunInstruments(BaseModel): illumina: IlluminaConfig +class RunNamesServices(BaseModel): + pacbio: PacbioRunNamesService + model_config = ConfigDict(arbitrary_types_allowed=True) + + class PostProcessingServices(BaseModel): pacbio: PacBioPostProcessingService - - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) class CGConfig(BaseModel): @@ -427,6 +423,7 @@ class CGConfig(BaseModel): pdc_service_: PdcService | None = None post_processing_services_: PostProcessingServices | None = None pigz: CommonAppConfig | None = None + run_names_services_: RunNamesServices | None = None sample_sheet_api_: IlluminaSampleSheetService | None = None scout: CommonAppConfig = None scout_api_: ScoutAPI = None @@ -633,6 +630,17 @@ def pdc_service(self) -> PdcService: self.pdc_service_ = service return service + @property + def run_names_services(self) -> RunNamesServices: + services = self.run_names_services_ + if services is None: + LOG.debug("Instantiating run directory names services") + services = RunNamesServices( + pacbio=PacbioRunNamesService(self.run_instruments.pacbio.data_dir) + ) + self.run_names_services_ = services + return services + @property def sample_sheet_api(self) -> IlluminaSampleSheetService: sample_sheet_api = self.__dict__.get("sample_sheet_api_") diff --git a/cg/services/run_devices/abstract_classes.py b/cg/services/run_devices/abstract_classes.py index 07a1f10dc0..086d4b5d44 100644 --- a/cg/services/run_devices/abstract_classes.py +++ b/cg/services/run_devices/abstract_classes.py @@ -1,11 +1,14 @@ """Post-processing service abstract classes.""" +import logging from abc import ABC, abstractmethod from pathlib import Path from cg.services.run_devices.abstract_models import PostProcessingDTOs, RunData, RunMetrics from cg.services.run_devices.constants import POST_PROCESSING_COMPLETED +LOG = logging.getLogger(__name__) + class RunDataGenerator(ABC): """Abstract class that holds functionality to create a run data object.""" @@ -83,8 +86,15 @@ def post_process(self, run_name: str, dry_run: bool = False): """Store sequencing metrics in StatusDB and relevant files in Housekeeper.""" pass + @abstractmethod + def is_run_processed(self, run_name: str) -> bool: + """Check if a run has been post-processed.""" + pass + @staticmethod - def _touch_post_processing_complete(run_data: RunData) -> None: + def _touch_post_processing_complete(run_data: RunData, dry_run: bool = False) -> None: """Touch the post-processing complete file.""" processing_complete_file = Path(run_data.full_path, POST_PROCESSING_COMPLETED) - processing_complete_file.touch() + if not dry_run: + processing_complete_file.touch() + LOG.debug(f"Post-processing complete for {run_data.full_path}") diff --git a/cg/services/run_devices/pacbio/data_storage_service/pacbio_store_service.py b/cg/services/run_devices/pacbio/data_storage_service/pacbio_store_service.py index 2fe34dcdc5..5efd5fd0a8 100644 --- a/cg/services/run_devices/pacbio/data_storage_service/pacbio_store_service.py +++ b/cg/services/run_devices/pacbio/data_storage_service/pacbio_store_service.py @@ -1,3 +1,5 @@ +"""Module for the Pacbio database store service.""" + import logging from datetime import datetime @@ -85,4 +87,5 @@ def store_post_processing_data(self, run_data: PacBioRunData, dry_run: bool = Fa f"Dry run, no entries will be added to database for SMRT cell {run_data.full_path}." ) return + LOG.debug(f"Data stored in statusDB for run {run_data.sequencing_run_name}") self.store.commit_to_store() diff --git a/cg/services/run_devices/pacbio/data_transfer_service/data_transfer_service.py b/cg/services/run_devices/pacbio/data_transfer_service/data_transfer_service.py index 730db2fbf8..6d953df06a 100644 --- a/cg/services/run_devices/pacbio/data_transfer_service/data_transfer_service.py +++ b/cg/services/run_devices/pacbio/data_transfer_service/data_transfer_service.py @@ -1,3 +1,7 @@ +"""Module for the Pacbio data transfer object creation service.""" + +import logging + from pydantic import ValidationError from cg.services.run_devices.abstract_classes import PostProcessingDataTransferService @@ -17,12 +21,12 @@ get_sequencing_run_dto, get_smrt_cell_dto, ) -from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import ( - PacBioMetricsParser, -) +from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import PacBioMetricsParser from cg.services.run_devices.pacbio.metrics_parser.models import PacBioMetrics from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData +LOG = logging.getLogger(__name__) + class PacBioDataTransferService(PostProcessingDataTransferService): def __init__(self, metrics_service: PacBioMetricsParser): @@ -41,6 +45,7 @@ def get_post_processing_dtos(self, run_data: PacBioRunData) -> PacBioDTOs: sample_sequencing_metrics_dtos: list[PacBioSampleSequencingMetricsDTO] = ( get_sample_sequencing_metrics_dtos(metrics.samples) ) + LOG.debug("DTOs created") return PacBioDTOs( run_device=smrt_cell_dto, sequencing_run=sequencing_run_dto, diff --git a/cg/services/run_devices/pacbio/housekeeper_service/pacbio_houskeeper_service.py b/cg/services/run_devices/pacbio/housekeeper_service/pacbio_houskeeper_service.py index e873ec87bc..6116cc017d 100644 --- a/cg/services/run_devices/pacbio/housekeeper_service/pacbio_houskeeper_service.py +++ b/cg/services/run_devices/pacbio/housekeeper_service/pacbio_houskeeper_service.py @@ -20,14 +20,10 @@ PostProcessingStoreFileError, ) from cg.services.run_devices.pacbio.housekeeper_service.models import PacBioFileData -from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import ( - PacBioMetricsParser, -) +from cg.services.run_devices.pacbio.metrics_parser.metrics_parser import PacBioMetricsParser from cg.services.run_devices.pacbio.metrics_parser.models import PacBioMetrics from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData -from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import ( - PacBioRunFileManager, -) +from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager from cg.utils.mapping import get_item_by_pattern_in_source LOG = logging.getLogger(__name__) @@ -64,6 +60,7 @@ def store_files_in_housekeeper(self, run_data: PacBioRunData, dry_run: bool = Fa file_path=bundle_info.file_path, tags=bundle_info.tags, ) + LOG.debug(f"Files stored in Housekeeper for run {run_data.sequencing_run_name}") @staticmethod def _get_bundle_type_for_file(file_path: Path) -> str: diff --git a/cg/services/run_devices/pacbio/metrics_parser/metrics_parser.py b/cg/services/run_devices/pacbio/metrics_parser/metrics_parser.py index 2352a2f2d0..f5db9c09e0 100644 --- a/cg/services/run_devices/pacbio/metrics_parser/metrics_parser.py +++ b/cg/services/run_devices/pacbio/metrics_parser/metrics_parser.py @@ -1,3 +1,6 @@ +"""Module for teh Pacbio sequenicng metrics parsing service.""" + +import logging from pathlib import Path from pydantic import ValidationError @@ -26,6 +29,8 @@ from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager +LOG = logging.getLogger(__name__) + class PacBioMetricsParser(PostProcessingMetricsParser): """Class for parsing PacBio sequencing metrics.""" @@ -59,7 +64,7 @@ def parse_metrics(self, run_data: PacBioRunData) -> PacBioMetrics: metrics_files=metrics_files, file_name=PacBioDirsAndFiles.BARCODES_REPORT ) sample_metrics: list[SampleMetrics] = get_parsed_sample_metrics(metrics_files) - + LOG.debug(f"All metrics parsed for run {run_data.sequencing_run_name}") return PacBioMetrics( read=read_metrics, control=control_metrics, diff --git a/cg/services/run_devices/pacbio/post_processing_service.py b/cg/services/run_devices/pacbio/post_processing_service.py index 53fcb55e3b..35ebda2cd3 100644 --- a/cg/services/run_devices/pacbio/post_processing_service.py +++ b/cg/services/run_devices/pacbio/post_processing_service.py @@ -1,6 +1,8 @@ import logging +from pathlib import Path from cg.services.run_devices.abstract_classes import PostProcessingService +from cg.services.run_devices.constants import POST_PROCESSING_COMPLETED from cg.services.run_devices.error_handler import handle_post_processing_errors from cg.services.run_devices.exc import ( PostProcessingError, @@ -49,11 +51,16 @@ def __init__( to_raise=PostProcessingError, ) def post_process(self, run_name: str, dry_run: bool = False) -> None: - LOG.info(f"Starting PacBio post-processing for run: {run_name}") + LOG.info(f"Starting Pacbio post-processing for run: {run_name}") run_data: PacBioRunData = self.run_data_generator.get_run_data( run_name=run_name, sequencing_dir=self.sequencing_dir ) self.run_validator.ensure_post_processing_can_start(run_data) self.store_service.store_post_processing_data(run_data=run_data, dry_run=dry_run) self.hk_service.store_files_in_housekeeper(run_data=run_data, dry_run=dry_run) - self._touch_post_processing_complete(run_data) + self._touch_post_processing_complete(run_data=run_data, dry_run=dry_run) + + def is_run_processed(self, run_name: str) -> bool: + """Check if a run has been post-processed.""" + processing_complete_file = Path(self.sequencing_dir, run_name, POST_PROCESSING_COMPLETED) + return processing_complete_file.exists() diff --git a/cg/services/run_devices/pacbio/run_validator/pacbio_run_validator.py b/cg/services/run_devices/pacbio/run_validator/pacbio_run_validator.py index 22aa4a3a35..06a266af99 100644 --- a/cg/services/run_devices/pacbio/run_validator/pacbio_run_validator.py +++ b/cg/services/run_devices/pacbio/run_validator/pacbio_run_validator.py @@ -1,5 +1,6 @@ -from pathlib import Path import logging +from pathlib import Path + from cg.constants.constants import FileFormat from cg.constants.pacbio import PacBioDirsAndFiles from cg.services.decompression_service.decompressor import Decompressor @@ -49,7 +50,6 @@ def ensure_post_processing_can_start(self, run_data: PacBioRunData) -> None: source_dir=run_data.full_path, manifest_file_format=FileFormat.TXT, ) - self.decompressor.decompress( source_path=paths_information.decompression_target, destination_path=paths_information.decompression_destination, @@ -57,10 +57,10 @@ def ensure_post_processing_can_start(self, run_data: PacBioRunData) -> None: self._touch_is_validated(run_data.full_path) LOG.debug(f"Run for {run_data.full_path} is validated.") - @staticmethod - def _touch_is_validated(run_path: Path): - Path(run_path, PacBioDirsAndFiles.RUN_IS_VALID).touch() - @staticmethod def _is_validated(run_path: Path) -> bool: return Path(run_path, PacBioDirsAndFiles.RUN_IS_VALID).exists() + + @staticmethod + def _touch_is_validated(run_path: Path) -> None: + Path(run_path, PacBioDirsAndFiles.RUN_IS_VALID).touch() diff --git a/cg/services/run_devices/run_names/__init__.py b/cg/services/run_devices/run_names/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cg/services/run_devices/run_names/pacbio.py b/cg/services/run_devices/run_names/pacbio.py new file mode 100644 index 0000000000..14b24b52eb --- /dev/null +++ b/cg/services/run_devices/run_names/pacbio.py @@ -0,0 +1,20 @@ +from pathlib import Path + +from cg.services.run_devices.run_names.service import RunNamesService + + +class PacbioRunNamesService(RunNamesService): + + def get_run_names(self) -> list[str]: + """ + Get all the run names from the PacBio sequencing directory in the form + /, for example: + r84202_20240913_121403/1_C01 + """ + + run_names = [] + for run_folder in Path(self.run_directory).iterdir(): + if run_folder.is_dir(): + for cell_folder in run_folder.iterdir(): + run_names.append(f"{run_folder.name}/{cell_folder.name}") + return run_names diff --git a/cg/services/run_devices/run_names/service.py b/cg/services/run_devices/run_names/service.py new file mode 100644 index 0000000000..8122345f11 --- /dev/null +++ b/cg/services/run_devices/run_names/service.py @@ -0,0 +1,12 @@ +from abc import ABC, abstractmethod + + +class RunNamesService(ABC): + + def __init__(self, run_directory: str): + self.run_directory = run_directory + + @abstractmethod + def get_run_names(self) -> list[str]: + """Get all the run names from a run directory.""" + pass diff --git a/cg/services/validate_file_transfer_service/validate_file_transfer_service.py b/cg/services/validate_file_transfer_service/validate_file_transfer_service.py index a0046ca1e5..61e7e54dfa 100644 --- a/cg/services/validate_file_transfer_service/validate_file_transfer_service.py +++ b/cg/services/validate_file_transfer_service/validate_file_transfer_service.py @@ -1,8 +1,11 @@ +import logging from pathlib import Path from cg.io.controller import ReadFile from cg.utils.files import get_file_in_directory, get_files_in_directory_with_pattern +LOG = logging.getLogger(__name__) + class ValidateFileTransferService: """Service to validate file transfers via a manifest file.""" @@ -20,6 +23,7 @@ def validate_file_transfer( raise FileNotFoundError( f"Not all files listed in the manifest file {manifest_file} are present in the directory tree {source_dir}" ) + LOG.debug("File transfer validated") return @staticmethod diff --git a/cg/store/crud/create.py b/cg/store/crud/create.py index 91c0d040c7..83d19c3cb9 100644 --- a/cg/store/crud/create.py +++ b/cg/store/crud/create.py @@ -479,6 +479,7 @@ def add_illumina_sample_metrics_entry( return new_metric def create_pac_bio_smrt_cell(self, run_device_dto: PacBioSMRTCellDTO) -> PacbioSMRTCell: + LOG.debug(f"Creating Pacbio SMRT cell for {run_device_dto.internal_id}") if self.get_pac_bio_smrt_cell_by_internal_id(run_device_dto.internal_id): raise ValueError(f"SMRT cell with {run_device_dto.internal_id} already exists.") new_smrt_cell = PacbioSMRTCell( @@ -490,6 +491,7 @@ def create_pac_bio_smrt_cell(self, run_device_dto: PacBioSMRTCellDTO) -> PacbioS def create_pac_bio_sequencing_run( self, sequencing_run_dto: PacBioSequencingRunDTO, smrt_cell: PacbioSMRTCell ) -> PacbioSequencingRun: + LOG.debug(f"Creating Pacbio sequencing run for SMRT cell {smrt_cell.internal_id}") new_sequencing_run = PacbioSequencingRun( type=sequencing_run_dto.type, well=sequencing_run_dto.well, diff --git a/tests/cli/post_process/test_utils.py b/tests/cli/post_process/test_utils.py index a505936495..9db3d83d99 100644 --- a/tests/cli/post_process/test_utils.py +++ b/tests/cli/post_process/test_utils.py @@ -1,6 +1,10 @@ import pytest -from cg.cli.post_process.utils import get_post_processing_service_from_run_name +from cg.cli.post_process.utils import ( + UnprocessedRunInfo, + get_post_processing_service_from_run_name, + get_unprocessed_runs_info, +) from cg.models.cg_config import CGConfig from cg.services.run_devices.pacbio.post_processing_service import PacBioPostProcessingService @@ -43,3 +47,32 @@ def test_get_post_processing_service_from_wrong_run_name( get_post_processing_service_from_run_name(context=pac_bio_context, run_name=wrong_run_name) # THEN an error is raised + + +def test_get_unprocessed_runs_info_pacbio( + pac_bio_context: CGConfig, + pacbio_barcoded_sequencing_run_name: str, + pacbio_sequencing_run_name: str, +): + """Test that a list of unprocessed runs is returned for Pacbio.""" + # GIVEN a context with a post-processing service for Pacbio + assert pac_bio_context.post_processing_services.pacbio + instrument: str = "pacbio" + + # GIVEN that there are unprocessed run in the directory + expected_run_names: set[str] = {pacbio_sequencing_run_name, pacbio_barcoded_sequencing_run_name} + + # GIVEN that there is an already processed run in the directory + number_of_runs: int = len(pac_bio_context.run_names_services.pacbio.get_run_names()) + number_unprocessed: int = len(expected_run_names) + assert number_of_runs > number_unprocessed + + # WHEN getting the unprocessed runs + unprocessed_runs: list[UnprocessedRunInfo] = get_unprocessed_runs_info( + context=pac_bio_context, instrument=instrument + ) + + # THEN the expected unprocessed runs are returned + assert unprocessed_runs + unprocessed_run_names: set[str] = {run.name for run in unprocessed_runs} + assert unprocessed_run_names == expected_run_names diff --git a/tests/conftest.py b/tests/conftest.py index 780fcea443..4fc03bfeed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,6 +130,7 @@ "tests.fixture_plugins.pacbio_fixtures.path_fixtures", "tests.fixture_plugins.pacbio_fixtures.run_data_fixtures", "tests.fixture_plugins.pacbio_fixtures.service_fixtures", + "tests.fixture_plugins.pacbio_fixtures.unprocessed_runs_fixtures", "tests.fixture_plugins.quality_controller_fixtures.sequencing_qc_check_scenario", "tests.fixture_plugins.quality_controller_fixtures.sequencing_qc_fixtures", "tests.fixture_plugins.timestamp_fixtures", diff --git a/tests/fixture_plugins/device_fixtures.py b/tests/fixture_plugins/device_fixtures.py index 44977fc244..08dd87ce0c 100644 --- a/tests/fixture_plugins/device_fixtures.py +++ b/tests/fixture_plugins/device_fixtures.py @@ -2,8 +2,9 @@ import pytest -from cg.models.cg_config import PostProcessingServices +from cg.models.cg_config import PostProcessingServices, RunNamesServices from cg.services.run_devices.pacbio.post_processing_service import PacBioPostProcessingService +from cg.services.run_devices.run_names.pacbio import PacbioRunNamesService @pytest.fixture @@ -11,6 +12,10 @@ def post_processing_services( pac_bio_post_processing_service: PacBioPostProcessingService, ) -> PostProcessingServices: """Return the post processing services.""" - return PostProcessingServices( - pacbio=pac_bio_post_processing_service, - ) + return PostProcessingServices(pacbio=pac_bio_post_processing_service) + + +@pytest.fixture +def run_names_services(pacbio_run_names_service: PacbioRunNamesService) -> RunNamesServices: + """Return the run names services.""" + return RunNamesServices(pacbio=pacbio_run_names_service) diff --git a/tests/fixture_plugins/pacbio_fixtures/context_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/context_fixtures.py index f7216386f5..f816537466 100644 --- a/tests/fixture_plugins/pacbio_fixtures/context_fixtures.py +++ b/tests/fixture_plugins/pacbio_fixtures/context_fixtures.py @@ -1,11 +1,14 @@ import pytest -from cg.models.cg_config import CGConfig, PostProcessingServices +from cg.models.cg_config import CGConfig, PostProcessingServices, RunNamesServices @pytest.fixture def pac_bio_context( - cg_context: CGConfig, post_processing_services: PostProcessingServices + cg_context: CGConfig, + post_processing_services: PostProcessingServices, + run_names_services: RunNamesServices, ) -> CGConfig: cg_context.post_processing_services_ = post_processing_services + cg_context.run_names_services_ = run_names_services return cg_context diff --git a/tests/fixture_plugins/pacbio_fixtures/name_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/name_fixtures.py index c11e019312..4317f19b48 100644 --- a/tests/fixture_plugins/pacbio_fixtures/name_fixtures.py +++ b/tests/fixture_plugins/pacbio_fixtures/name_fixtures.py @@ -24,6 +24,12 @@ def pacbio_barcoded_run_name() -> str: return "r84202_20240913_121403" +@pytest.fixture +def pacbio_sequencing_run_name(pac_bio_test_run_name: str, pac_bio_smrt_cell_name: str) -> str: + """Return the name of a PacBio SMRT cell.""" + return f"{pac_bio_test_run_name}/{pac_bio_smrt_cell_name}" + + @pytest.fixture def pacbio_barcoded_sequencing_run_name( pacbio_barcoded_run_name: str, pacbio_barcoded_smrt_cell_name: str @@ -32,6 +38,27 @@ def pacbio_barcoded_sequencing_run_name( return f"{pacbio_barcoded_run_name}/{pacbio_barcoded_smrt_cell_name}" +@pytest.fixture +def pacbio_processed_sequencing_run_name( + pacbio_barcoded_run_name: str, pac_bio_smrt_cell_name: str +) -> str: + """Return the name of a PacBio SMRT cell.""" + return f"{pacbio_barcoded_run_name}/{pac_bio_smrt_cell_name}" + + +@pytest.fixture +def pacbio_run_names( + pacbio_sequencing_run_name: str, + pacbio_barcoded_sequencing_run_name: str, + pacbio_processed_sequencing_run_name: str, +) -> set[str]: + return { + pacbio_sequencing_run_name, + pacbio_barcoded_sequencing_run_name, + pacbio_processed_sequencing_run_name, + } + + @pytest.fixture def pacbio_barcoded_1_c01_cell_full_name() -> str: """Return the full name of a PacBio SMRT cell.""" diff --git a/tests/fixture_plugins/pacbio_fixtures/service_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/service_fixtures.py index 43da23757b..6449fbe1e0 100644 --- a/tests/fixture_plugins/pacbio_fixtures/service_fixtures.py +++ b/tests/fixture_plugins/pacbio_fixtures/service_fixtures.py @@ -1,5 +1,6 @@ """Module for PacBio fixtures returning service objects.""" +from pathlib import Path from unittest.mock import Mock import pytest @@ -20,8 +21,24 @@ PacBioRunDataGenerator, ) from cg.services.run_devices.pacbio.run_file_manager.run_file_manager import PacBioRunFileManager +from cg.services.run_devices.pacbio.run_validator.pacbio_run_validator import PacBioRunValidator +from cg.services.run_devices.run_names.pacbio import PacbioRunNamesService from cg.store.store import Store +# Mocked services + + +@pytest.fixture +def mock_pacbio_run_validator() -> PacBioRunValidator: + return PacBioRunValidator( + decompressor=Mock(), + file_transfer_validator=Mock(), + file_manager=Mock(), + ) + + +# Real services + @pytest.fixture def pac_bio_run_data_generator() -> PacBioRunDataGenerator: @@ -33,6 +50,11 @@ def pac_bio_run_file_manager() -> PacBioRunFileManager: return PacBioRunFileManager() +@pytest.fixture +def pacbio_run_names_service(pac_bio_runs_dir: Path) -> PacbioRunNamesService: + return PacbioRunNamesService(run_directory=pac_bio_runs_dir.as_posix()) + + @pytest.fixture def pac_bio_metrics_parser(pac_bio_run_file_manager: PacBioRunFileManager) -> PacBioMetricsParser: return PacBioMetricsParser(file_manager=pac_bio_run_file_manager) @@ -58,14 +80,14 @@ def pac_bio_post_processing_service( pac_bio_run_data_generator: PacBioRunDataGenerator, pac_bio_housekeeper_service: PacBioHousekeeperService, pac_bio_store_service: PacBioStoreService, - pacbio_barcoded_sequencing_run_name: str, + pac_bio_runs_dir: Path, ) -> PacBioPostProcessingService: return PacBioPostProcessingService( run_validator=Mock(), run_data_generator=pac_bio_run_data_generator, hk_service=pac_bio_housekeeper_service, store_service=pac_bio_store_service, - sequencing_dir=pacbio_barcoded_sequencing_run_name, + sequencing_dir=pac_bio_runs_dir.as_posix(), ) diff --git a/tests/fixture_plugins/pacbio_fixtures/unprocessed_runs_fixtures.py b/tests/fixture_plugins/pacbio_fixtures/unprocessed_runs_fixtures.py new file mode 100644 index 0000000000..880a0d4e75 --- /dev/null +++ b/tests/fixture_plugins/pacbio_fixtures/unprocessed_runs_fixtures.py @@ -0,0 +1,35 @@ +import pytest + +from cg.cli.post_process.utils import UnprocessedRunInfo +from cg.services.run_devices.pacbio.post_processing_service import PacBioPostProcessingService + + +@pytest.fixture +def pacbio_unprocessed_run_info( + pacbio_sequencing_run_name: str, pac_bio_post_processing_service: PacBioPostProcessingService +) -> UnprocessedRunInfo: + return UnprocessedRunInfo( + name=pacbio_sequencing_run_name, + post_processing_service=pac_bio_post_processing_service, + instrument="pacbio", + ) + + +@pytest.fixture +def pacbio_barcoded_unprocessed_run_info( + pacbio_barcoded_sequencing_run_name: str, + pac_bio_post_processing_service: PacBioPostProcessingService, +) -> UnprocessedRunInfo: + return UnprocessedRunInfo( + name=pacbio_barcoded_sequencing_run_name, + post_processing_service=pac_bio_post_processing_service, + instrument="pacbio", + ) + + +@pytest.fixture +def pacbio_unprocessed_runs( + pacbio_unprocessed_run_info: UnprocessedRunInfo, + pacbio_barcoded_unprocessed_run_info: UnprocessedRunInfo, +) -> list[UnprocessedRunInfo]: + return [pacbio_unprocessed_run_info, pacbio_barcoded_unprocessed_run_info] diff --git a/tests/fixtures/devices/pacbio/SMRTcells/r84202_20240913_121403/1_B01/post_processing_completed b/tests/fixtures/devices/pacbio/SMRTcells/r84202_20240913_121403/1_B01/post_processing_completed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/services/run_devices/pacbio/run_validator/test_pac_bio_run_validator.py b/tests/services/run_devices/pacbio/run_validator/test_pac_bio_run_validator.py index 4b9aa29036..6707c2c6f5 100644 --- a/tests/services/run_devices/pacbio/run_validator/test_pac_bio_run_validator.py +++ b/tests/services/run_devices/pacbio/run_validator/test_pac_bio_run_validator.py @@ -1,55 +1,50 @@ from pathlib import Path -from unittest.mock import Mock from cg.constants.pacbio import PacBioDirsAndFiles from cg.services.run_devices.pacbio.run_data_generator.run_data import PacBioRunData from cg.services.run_devices.pacbio.run_validator.pacbio_run_validator import PacBioRunValidator -def test_pac_bio_run_validator(tmp_path: Path): +def test_pacbio_run_validator(tmp_path: Path, mock_pacbio_run_validator: PacBioRunValidator): + """Test that the run validator flow works as expected.""" # GIVEN run data and a run validator run_data = PacBioRunData( - full_path=Path(tmp_path), + full_path=tmp_path, sequencing_run_name="not_relevant", well_name="not_relevant", plate=1, ) - run_validator = PacBioRunValidator( - decompressor=Mock(), - file_transfer_validator=Mock(), - file_manager=Mock(), - ) - # WHEN ensuring post processing can start - run_validator.ensure_post_processing_can_start(run_data) + # WHEN ensuring post-processing can start + mock_pacbio_run_validator.ensure_post_processing_can_start(run_data) # THEN the run is validated - assert run_validator.file_manager.get_run_validation_files.call_count == 1 - assert run_validator.file_transfer_validator.validate_file_transfer.call_count == 1 - assert run_validator.decompressor.decompress.call_count == 1 + assert mock_pacbio_run_validator.file_manager.get_run_validation_files.call_count == 1 + assert mock_pacbio_run_validator.file_transfer_validator.validate_file_transfer.call_count == 1 + assert mock_pacbio_run_validator.decompressor.decompress.call_count == 1 assert Path(run_data.full_path, PacBioDirsAndFiles.RUN_IS_VALID).exists() -def test_pac_bio_run_validator_skip_if_validated(tmp_path: Path): +def test_pacbio_run_validator_skip_if_validated( + tmp_path: Path, mock_pacbio_run_validator: PacBioRunValidator +): + """Test that the run validator skips validation if the run is already validated.""" # GIVEN run data and a run validator run_data = PacBioRunData( - full_path=Path(tmp_path), + full_path=tmp_path, sequencing_run_name="not_relevant", well_name="not_relevant", plate=1, ) - run_validator = PacBioRunValidator( - decompressor=Mock(), - file_transfer_validator=Mock(), - file_manager=Mock(), - ) - run_validator._touch_is_validated(run_data.full_path) - # WHEN ensuring post processing can start - run_validator.ensure_post_processing_can_start(run_data) + # GIVEN that the run is already validated + mock_pacbio_run_validator._touch_is_validated(run_data.full_path) - # THEN the run is validated + # WHEN ensuring post-processing can start + mock_pacbio_run_validator.ensure_post_processing_can_start(run_data) + + # THEN the validation flow is skipped as the run is already validated assert Path(run_data.full_path, PacBioDirsAndFiles.RUN_IS_VALID).exists() - assert run_validator.file_manager.get_run_validation_files.call_count == 0 - assert run_validator.file_transfer_validator.validate_file_transfer.call_count == 0 - assert run_validator.decompressor.decompress.call_count == 0 + assert mock_pacbio_run_validator.file_manager.get_run_validation_files.call_count == 0 + assert mock_pacbio_run_validator.file_transfer_validator.validate_file_transfer.call_count == 0 + assert mock_pacbio_run_validator.decompressor.decompress.call_count == 0 diff --git a/tests/services/run_names/test_pacbio_run_names_service.py b/tests/services/run_names/test_pacbio_run_names_service.py new file mode 100644 index 0000000000..ce7136e97d --- /dev/null +++ b/tests/services/run_names/test_pacbio_run_names_service.py @@ -0,0 +1,63 @@ +from pathlib import Path + +from cg.services.run_devices.run_names.pacbio import PacbioRunNamesService + + +def test_pacbio_get_run_names( + pacbio_run_names_service: PacbioRunNamesService, pacbio_run_names: set +): + """Test that getting PacBio run names works.""" + # GIVEN a PacBio run names service + + # WHEN getting the run names + run_names: list[str] = pacbio_run_names_service.get_run_names() + + # THEN the run names are returned + assert set(run_names) == pacbio_run_names + + +def test_pacbio_get_run_names_empty(tmp_path: Path): + # GIVEN a run directory with no run folders and a PacBio run names service + pacbio_run_names_service: PacbioRunNamesService = PacbioRunNamesService( + run_directory=tmp_path.as_posix() + ) + + # WHEN getting the run names + run_names: list[str] = pacbio_run_names_service.get_run_names() + + # THEN no run names are returned + assert run_names == [] + + +def test_pacbio_get_run_names_with_files_return_emty(tmp_path: Path): + # GIVEN a run directory with files and a PacBio run names service + run_dir = Path(tmp_path, "run") + run_dir.mkdir() + Path(run_dir, "file1").touch() + Path(run_dir, "file2").touch() + pacbio_run_names_service: PacbioRunNamesService = PacbioRunNamesService( + run_directory=run_dir.as_posix() + ) + + # WHEN getting the run names + run_names: list[str] = pacbio_run_names_service.get_run_names() + + # THEN no run names are returned + assert run_names == [] + + +def test_pacbio_get_run_names_with_subfolders_return_empty(tmp_path: Path): + # GIVEN a run directory with an empty subfolder and a PacBio run names service + run_dir = Path(tmp_path, "run") + run_dir.mkdir() + subfolder = Path(run_dir, "subfolder") + subfolder.mkdir() + pacbio_run_names_service: PacbioRunNamesService = PacbioRunNamesService( + run_directory=run_dir.as_posix() + ) + + # WHEN getting the run names + run_names: list[str] = pacbio_run_names_service.get_run_names() + + # THEN no run names are returned + assert run_names == []