Commit

Merge branch 'master' into feat-remove-bcl2fastq

diitaz93 authored Dec 14, 2023
2 parents 172b804 + 92f2c04 commit 0a47874
Showing 47 changed files with 1,227 additions and 684 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 54.2.0
+current_version = 54.4.6
 commit = True
 tag = True
 tag_name = v{new_version}
2 changes: 1 addition & 1 deletion cg/__init__.py
@@ -1,2 +1,2 @@
 __title__ = "cg"
-__version__ = "54.2.0"
+__version__ = "54.4.6"
18 changes: 18 additions & 0 deletions cg/cli/archive.py
@@ -2,6 +2,7 @@
 from click.core import ParameterSource

 from cg.constants.archiving import DEFAULT_SPRING_ARCHIVE_COUNT
+from cg.constants.constants import DRY_RUN
 from cg.meta.archive.archive import SpringArchiveAPI
 from cg.models.cg_config import CGConfig

@@ -64,3 +65,20 @@ def update_job_statuses(context: CGConfig):
         data_flow_config=context.data_flow,
     )
     spring_archive_api.update_statuses_for_ongoing_tasks()
+
+
+@archive.command("delete-file")
+@DRY_RUN
+@click.pass_obj
+@click.argument("file_path", required=True)
+def delete_file(context: CGConfig, dry_run: bool, file_path: str):
+    """Delete an archived file and remove it from Housekeeper.
+    The file will not be deleted if it is not confirmed archived.
+    The file will not be deleted if its archive location can not be determined from the file tags.
+    """
+    spring_archive_api = SpringArchiveAPI(
+        status_db=context.status_db,
+        housekeeper_api=context.housekeeper_api,
+        data_flow_config=context.data_flow,
+    )
+    spring_archive_api.delete_file(file_path=file_path, dry_run=dry_run)
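
The new command can be exercised without touching a real archive by combining click's test runner with the --dry-run flag (assuming the DRY_RUN constant imported above wires in such a flag, as its decorator usage suggests). A minimal sketch, where `config` and the file path are placeholders:

from click.testing import CliRunner

from cg.cli.archive import delete_file

runner = CliRunner()
# `config` stands in for a fully populated CGConfig; a live run needs real
# status_db, housekeeper_api and data_flow settings.
result = runner.invoke(delete_file, ["--dry-run", "/archive/sample.spring"], obj=config)
print(result.output)  # e.g. "Would have deleted file /archive/sample.spring from ..."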
2 changes: 1 addition & 1 deletion cg/cli/workflow/rnafusion/base.py
@@ -119,7 +119,7 @@ def run(
                 case_id=case_id, params_file=params_file
             ),
             "name": case_id,
-            "compute_env": compute_env or analysis_api.compute_env,
+            "compute_env": compute_env or analysis_api.get_compute_env(case_id=case_id),
             "revision": revision or analysis_api.revision,
             "wait": "SUBMITTED",
             "id": nf_tower_id,
2 changes: 1 addition & 1 deletion cg/cli/workflow/taxprofiler/base.py
@@ -116,7 +116,7 @@ def run(
                 case_id=case_id, params_file=params_file
             ),
             "name": case_id,
-            "compute_env": compute_env or analysis_api.compute_env,
+            "compute_env": compute_env or analysis_api.get_compute_env(case_id=case_id),
             "revision": revision or analysis_api.revision,
             "wait": NfTowerStatus.SUBMITTED,
             "id": nf_tower_id,
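
The same one-line change appears in both the rnafusion and taxprofiler entry points: the compute environment is no longer a single pipeline-wide attribute but is resolved per case. The body of get_compute_env is not part of this diff; conceptually it enables a lookup along these lines (a hypothetical sketch, with the attribute names invented for illustration):

# Hypothetical sketch only - the real method body is not shown in this commit.
def get_compute_env(self, case_id: str) -> str:
    """Return a compute environment appropriate for the given case."""
    case = self.status_db.get_case_by_internal_id(internal_id=case_id)
    # Route e.g. high-priority cases to a dedicated environment and fall
    # back to the pipeline-wide default otherwise.
    if case and getattr(case, "slurm_priority", None) == "high":
        return f"{self.compute_env}-high"
    return self.compute_env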
11 changes: 0 additions & 11 deletions cg/constants/cgstats.py

This file was deleted.

53 changes: 27 additions & 26 deletions cg/constants/delivery.py
@@ -1,6 +1,7 @@
 """Constants for delivery."""

 from cg.constants.constants import Pipeline
+from cg.constants.housekeeper_tags import AlignmentFileTag, AnalysisTag, HK_DELIVERY_REPORT_TAG

 ONLY_ONE_CASE_PER_TICKET: list[Pipeline] = [
     Pipeline.FASTQ,
@@ -37,8 +38,8 @@
 ]

 BALSAMIC_ANALYSIS_SAMPLE_TAGS: list[set[str]] = [
-    {"cram"},
-    {"cram-index"},
+    {AlignmentFileTag.CRAM},
+    {AlignmentFileTag.CRAM_INDEX},
 ]

 BALSAMIC_QC_ANALYSIS_CASE_TAGS: list[set[str]] = [
@@ -95,10 +96,10 @@
 ]

 MIP_DNA_ANALYSIS_SAMPLE_TAGS: list[set[str]] = [
-    {"bam"},
-    {"bam-index"},
-    {"cram"},
-    {"cram-index"},
+    {AlignmentFileTag.BAM},
+    {AlignmentFileTag.BAM_BAI},
+    {AlignmentFileTag.CRAM},
+    {AlignmentFileTag.CRAM_INDEX},
 ]

 MIP_RNA_ANALYSIS_CASE_TAGS: list[set[str]] = [
@@ -114,12 +115,12 @@
 ]

 MIP_RNA_ANALYSIS_SAMPLE_TAGS: list[set[str]] = [
-    {"fusion", "star-fusion"},
-    {"fusion", "arriba"},
-    {"cram"},
-    {"cram-index"},
-    {"fusion", "vcf"},
-    {"fusion", "vcf-index"},
+    {AnalysisTag.FUSION, AnalysisTag.STARFUSION},
+    {AnalysisTag.FUSION, AnalysisTag.ARRIBA},
+    {AlignmentFileTag.CRAM},
+    {AlignmentFileTag.CRAM_INDEX},
+    {AnalysisTag.FUSION, "vcf"},
+    {AnalysisTag.FUSION, "vcf-index"},
     {"salmon-quant"},
 ]
@@ -150,23 +151,23 @@
 ]

 RNAFUSION_ANALYSIS_CASE_TAGS: list[set[str]] = [
-    {"fusion", "arriba"},
-    {"fusion", "star-fusion"},
-    {"fusion", "fusioncatcher"},
-    {"fusioncatcher-summary"},
-    {"fusioninspector"},
-    {"fusionreport", "research"},
-    {"fusioninspector-html", "research"},
-    {"arriba-visualisation", "research"},
-    {"multiqc-html", "rna"},
-    {"delivery-report"},
-    {"vcf-fusion"},
-    {"gene-counts"},
+    {AnalysisTag.FUSION, AnalysisTag.ARRIBA},
+    {AnalysisTag.FUSION, AnalysisTag.STARFUSION},
+    {AnalysisTag.FUSION, AnalysisTag.FUSIONCATCHER},
+    {AnalysisTag.FUSIONCATCHER_SUMMARY},
+    {AnalysisTag.FUSIONINSPECTOR},
+    {AnalysisTag.FUSIONREPORT, AnalysisTag.RESEARCH},
+    {AnalysisTag.FUSIONINSPECTOR_HTML, AnalysisTag.RESEARCH},
+    {AnalysisTag.ARRIBA_VISUALIZATION, AnalysisTag.RESEARCH},
+    {AnalysisTag.MULTIQC_HTML, AnalysisTag.RNA},
+    {HK_DELIVERY_REPORT_TAG},
+    {AnalysisTag.VCF_FUSION},
+    {AnalysisTag.GENE_COUNTS},
 ]

 RNAFUSION_ANALYSIS_SAMPLE_TAGS: list[set[str]] = [
-    {"cram"},
-    {"cram-index"},
+    {AlignmentFileTag.CRAM},
+    {AlignmentFileTag.CRAM_INDEX},
 ]


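This block is a pure refactor: AlignmentFileTag and AnalysisTag are StrEnum classes (see cg/constants/housekeeper_tags.py below), so each member compares and hashes equal to its string value, and the tag sets keep matching the same Housekeeper tags as before. A self-contained check of that property, using a stand-in enum mirroring the real one:

from enum import StrEnum

class AlignmentFileTag(StrEnum):  # stand-in mirroring cg.constants.housekeeper_tags
    CRAM = "cram"
    CRAM_INDEX = "cram-index"

# StrEnum members are strings: equality and set membership behave as before.
assert AlignmentFileTag.CRAM == "cram"
assert {"cram"} == {AlignmentFileTag.CRAM}
assert "cram-index" in {AlignmentFileTag.CRAM, AlignmentFileTag.CRAM_INDEX}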
1 change: 1 addition & 0 deletions cg/constants/gene_panel.py
@@ -41,6 +41,7 @@ class GenePanelMasterList(StrEnum):
     SOVM: str = "SOVM"
     STROKE: str = "STROKE"
     AID: str = "AID"
+    INHERITED_CANCER: str = "Inherited cancer"

     @classmethod
     def get_panel_names(cls, panels=None) -> list[str]:
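
Unlike the surrounding members, the new panel's value ("Inherited cancer") differs from its attribute name, so name-based and value-based lookups are no longer interchangeable for it. Standard StrEnum semantics illustrate the distinction:

from cg.constants.gene_panel import GenePanelMasterList

assert GenePanelMasterList.INHERITED_CANCER == "Inherited cancer"
# Lookup by attribute name versus lookup by value:
assert GenePanelMasterList["INHERITED_CANCER"] is GenePanelMasterList.INHERITED_CANCER
assert GenePanelMasterList("Inherited cancer") is GenePanelMasterList.INHERITED_CANCER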
19 changes: 19 additions & 0 deletions cg/constants/housekeeper_tags.py
@@ -52,6 +52,25 @@ class SequencingFileTag(StrEnum):
 HK_DELIVERY_REPORT_TAG = "delivery-report"


+class AnalysisTag(StrEnum):
+    """Tags for analysis files."""
+
+    ARRIBA: str = "arriba"
+    ARRIBA_VISUALIZATION: str = "arriba-visualisation"
+    FUSION: str = "fusion"
+    FUSIONCATCHER: str = "fusioncatcher"
+    FUSIONCATCHER_SUMMARY: str = "fusioncatcher-summary"
+    FUSIONINSPECTOR: str = "fusioninspector"
+    FUSIONINSPECTOR_HTML: str = "fusioninspector-html"
+    FUSIONREPORT: str = "fusionreport"
+    GENE_COUNTS: str = "gene-counts"
+    MULTIQC_HTML: str = "multiqc-html"
+    RESEARCH: str = "research"
+    RNA: str = "rna"
+    STARFUSION: str = "star-fusion"
+    VCF_FUSION: str = "vcf-fusion"
+
+
 class HkMipAnalysisTag:
     CONFIG: list[str] = ["mip-config"]
     QC_METRICS: list[str] = ["qc-metrics", "deliverable"]
8 changes: 8 additions & 0 deletions cg/io/gzip.py
@@ -0,0 +1,8 @@
+import gzip
+from pathlib import Path
+
+
+def read_gzip_first_line(file_path: Path) -> str:
+    """Return first line of gzip file."""
+    with gzip.open(file_path) as file:
+        return file.readline().decode()
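
In the context of this PR (removing bcl2fastq), a helper like this is useful for peeking at a gzipped FASTQ header without decompressing the whole file. A usage sketch with generated example data (the read header is made up):

import gzip
from pathlib import Path

from cg.io.gzip import read_gzip_first_line

# Write a tiny gzipped FASTQ so the example is self-contained.
fastq = Path("example.fastq.gz")
with gzip.open(fastq, "wt") as handle:
    handle.write("@A00689:123:HNFJDSXX:1:1101:2000:1000 1:N:0:ACGTACGT\nACGT\n+\nFFFF\n")

header: str = read_gzip_first_line(fastq)
print(header)  # "@A00689:..." - note that the trailing newline is kept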
80 changes: 55 additions & 25 deletions cg/meta/archive/archive.py
@@ -1,14 +1,15 @@
 import logging
 from typing import Callable, Type

+import click
 from housekeeper.store.models import Archive, File
 from pydantic import BaseModel, ConfigDict

 from cg.apps.housekeeper.hk import HousekeeperAPI
 from cg.constants import SequencingFileTag
 from cg.constants.archiving import ArchiveLocations
 from cg.exc import ArchiveJobFailedError
-from cg.meta.archive.ddn_dataflow import DDNDataFlowClient
+from cg.meta.archive.ddn.ddn_data_flow_client import DDNDataFlowClient
 from cg.meta.archive.models import ArchiveHandler, FileAndSample, SampleAndDestination
 from cg.models.cg_config import DataFlowConfig
 from cg.store import Store
@@ -252,7 +253,7 @@ def sort_archival_ids_on_archive_location(
         jobs_per_location: dict[ArchiveLocations, list[int]] = {}
         jobs_and_locations: set[
             tuple[int, ArchiveLocations]
-        ] = self.get_unique_archival_ids_and_their_archive_location(archive_entries)
+        ] = self.get_unique_archival_ids_and_archive_locations(archive_entries)

         for archive_location in ArchiveLocations:
             jobs_per_location[ArchiveLocations(archive_location)] = [
@@ -262,18 +263,14 @@ def sort_archival_ids_on_archive_location(
             ]
         return jobs_per_location

-    def get_unique_archival_ids_and_their_archive_location(
+    def get_unique_archival_ids_and_archive_locations(
         self, archive_entries: list[Archive]
     ) -> set[tuple[int, ArchiveLocations]]:
-        return set(
-            [
-                (
-                    archive.archiving_task_id,
-                    ArchiveLocations(self.get_archive_location_from_file(archive.file)),
-                )
-                for archive in archive_entries
-            ]
-        )
+        ids_and_locations: set[tuple[int, ArchiveLocations]] = set()
+        for archive in archive_entries:
+            if location := self.get_archive_location_from_file(archive.file):
+                ids_and_locations.add((archive.archiving_task_id, location))
+        return ids_and_locations

     def sort_retrieval_ids_on_archive_location(
         self, archive_entries: list[Archive]
@@ -283,7 +280,7 @@ def sort_retrieval_ids_on_archive_location(
         jobs_per_location: dict[ArchiveLocations, list[int]] = {}
         jobs_and_locations: set[
             tuple[int, ArchiveLocations]
-        ] = self.get_unique_retrieval_ids_and_their_archive_location(archive_entries)
+        ] = self.get_unique_retrieval_ids_and_archive_locations(archive_entries)
         for archive_location in ArchiveLocations:
             jobs_per_location[ArchiveLocations(archive_location)] = [
                 job_and_location[0]
@@ -292,18 +289,51 @@ def sort_retrieval_ids_on_archive_location(
             ]
         return jobs_per_location

-    def get_unique_retrieval_ids_and_their_archive_location(
+    def get_unique_retrieval_ids_and_archive_locations(
         self, archive_entries: list[Archive]
     ) -> set[tuple[int, ArchiveLocations]]:
-        return set(
-            [
-                (
-                    archive.retrieval_task_id,
-                    ArchiveLocations(self.get_archive_location_from_file(archive.file)),
-                )
-                for archive in archive_entries
-            ]
-        )
+        ids_and_locations: set[tuple[int, ArchiveLocations]] = set()
+        for archive in archive_entries:
+            if location := self.get_archive_location_from_file(archive.file):
+                ids_and_locations.add((archive.retrieval_task_id, location))
+        return ids_and_locations
+
+    @staticmethod
+    def is_file_archived(file: File) -> bool:
+        return file.archive and file.archive.archived_at
+
+    @staticmethod
+    def get_archive_location_from_file(file: File) -> ArchiveLocations | None:
+        for tag_name in [tag.name for tag in file.tags]:
+            if tag_name in iter(ArchiveLocations):
+                LOG.info(f"Found archive location {tag_name}")
+                return tag_name
+        LOG.warning("No archive location in the file tags")
+        return None
+
+    def delete_file_from_archive_location(
+        self, file_and_sample: FileAndSample, archive_location: ArchiveLocations
+    ) -> None:
+        archive_handler: ArchiveHandler = ARCHIVE_HANDLERS[archive_location](self.data_flow_config)
+        archive_handler.delete_file(file_and_sample)
+
+    def delete_file(self, file_path: str, dry_run: bool = False) -> None:
+        """Deletes the specified file where it is archived and deletes the Housekeeper record.
+        Raises:
+        Click.Abort if yes is not specified or the user does not confirm the deletion."""
+        file: File = self.housekeeper_api.files(path=file_path).first()
+        if not self.is_file_archived(file):
+            LOG.warning(f"No archived file found for file {file_path} - exiting")
+            return
+        archive_location: ArchiveLocations | None = self.get_archive_location_from_file(file)
+        if not archive_location:
+            LOG.warning("No archive location could be determined - exiting")
+            return
+        if dry_run:
+            click.echo(f"Would have deleted file {file_path} from {archive_location}.")
+            return
+        file_and_sample: FileAndSample = self.add_samples_to_files([file])[0]
+        self.delete_file_from_archive_location(
+            file_and_sample=file_and_sample, archive_location=archive_location
+        )
-    def get_archive_location_from_file(self, file: File) -> str:
-        return self.status_db.get_sample_by_internal_id(file.version.bundle.name).archive_location
+        self.housekeeper_api.delete_file(file.id)
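
Beyond the renames, this rewrite changes behaviour in two ways: the archive location is now read from the file's own Housekeeper tags instead of from the sample's archive_location in StatusDB, and a file whose location cannot be resolved is skipped with a warning rather than crashing the batch, since the old code passed the result straight into ArchiveLocations(...). A stand-alone illustration of the second point, using a stand-in enum:

from enum import StrEnum

class ArchiveLocations(StrEnum):  # stand-in for cg.constants.archiving
    PDC = "PDC"

entries = [("task-1", "PDC"), ("task-2", None)]  # (task id, resolved location)

# Old behaviour: one unresolved location fails the whole batch.
try:
    {(task, ArchiveLocations(loc)) for task, loc in entries}
except ValueError as error:
    print(f"old: {error}")  # old: None is not a valid ArchiveLocations

# New behaviour: unresolved entries are filtered out by the walrus expression.
ids_and_locations = set()
for task, loc in entries:
    if location := loc:
        ids_and_locations.add((task, ArchiveLocations(location)))
print(f"new: {ids_and_locations}")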
Empty file added cg/meta/archive/ddn/__init__.py
51 changes: 51 additions & 0 deletions cg/meta/archive/ddn/constants.py
@@ -0,0 +1,51 @@
+from enum import StrEnum
+
+OSTYPE: str = "Unix/MacOS"
+ROOT_TO_TRIM: str = "/home"
+
+DESTINATION_ATTRIBUTE: str = "destination"
+SOURCE_ATTRIBUTE: str = "source"
+
+
+class DataflowEndpoints(StrEnum):
+    """Enum containing all DDN dataflow endpoints used."""
+
+    ARCHIVE_FILES = "files/archive"
+    DELETE_FILE = "files/delete"
+    GET_AUTH_TOKEN = "auth/token"
+    REFRESH_AUTH_TOKEN = "auth/token/refresh"
+    RETRIEVE_FILES = "files/retrieve"
+    GET_JOB_STATUS = "activity/jobs/"
+
+
+class JobStatus(StrEnum):
+    """Enum for the different job statuses which can be returned via Miria."""
+
+    CANCELED = "Canceled"
+    COMPLETED = "Completed"
+    DENIED = "Denied"
+    CREATION_IN_PROGRESS = "Creation in progress"
+    IN_QUEUE = "In Queue"
+    INVALID_LICENSE = "Invalid license"
+    ON_VALIDATION = "On validation"
+    REFUSED = "Refused"
+    RUNNING = "Running"
+    SUSPENDED = "Suspended"
+    TERMINATED_ON_ERROR = "Terminated on error"
+    TERMINATED_ON_WARNING = "Terminated on warning"
+
+
+FAILED_JOB_STATUSES: list[str] = [
+    JobStatus.CANCELED,
+    JobStatus.DENIED,
+    JobStatus.INVALID_LICENSE,
+    JobStatus.REFUSED,
+    JobStatus.TERMINATED_ON_ERROR,
+    JobStatus.TERMINATED_ON_WARNING,
+]
+ONGOING_JOB_STATUSES: list[str] = [
+    JobStatus.CREATION_IN_PROGRESS,
+    JobStatus.IN_QUEUE,
+    JobStatus.ON_VALIDATION,
+    JobStatus.RUNNING,
+]
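
The two lists bucket Miria's job statuses for polling; note that Completed and Suspended fall in neither bucket. A consumer would use them roughly like this (the classify helper is illustrative, not part of the diff):

from cg.meta.archive.ddn.constants import (
    FAILED_JOB_STATUSES,
    ONGOING_JOB_STATUSES,
    JobStatus,
)

def classify(status: str) -> str:
    """Illustrative helper: map a raw Miria status string onto an action."""
    if status in FAILED_JOB_STATUSES:
        return "failed"
    if status in ONGOING_JOB_STATUSES:
        return "ongoing"
    if status == JobStatus.COMPLETED:
        return "completed"
    return "unhandled"

print(classify("Running"))                      # ongoing
print(classify(JobStatus.TERMINATED_ON_ERROR))  # failed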