Commit f470409

(Archiving) get job status (#2319) (minor)

### Added

- Support for querying ongoing archivals/retrievals and updating their statuses.

1 parent a318683 · commit f470409

File tree

7 files changed: +438 -55 lines changed


cg/apps/housekeeper/hk.py

Lines changed: 56 additions & 3 deletions
```diff
@@ -17,7 +17,11 @@
 from sqlalchemy.orm import Query
 
 from cg.constants import SequencingFileTag
-from cg.exc import HousekeeperBundleVersionMissingError, HousekeeperFileMissingError
+from cg.exc import (
+    HousekeeperArchiveMissingError,
+    HousekeeperBundleVersionMissingError,
+    HousekeeperFileMissingError,
+)
 
 LOG = logging.getLogger(__name__)
 
@@ -490,7 +494,6 @@ def set_archive_archived_at(self, file_id: int, archiving_task_id: int):
             f"while archiving task id in Housekeeper is {archive.archiving_task_id}."
         )
         self._store.update_archiving_time_stamp(archive=archive)
-        self.commit()
 
     def set_archive_retrieval_task_id(self, file_id: int, retrieval_task_id: int) -> None:
         """Sets the retrieval_task_id for an Archive entry. Raises a ValueError if the given retrieval task id
@@ -499,7 +502,6 @@ def set_archive_retrieval_task_id(self, file_id: int, retrieval_task_id: int) -> None:
         if not archive:
             raise ValueError(f"No Archive entry found for file with id {file_id}.")
         self._store.update_retrieval_task_id(archive=archive, retrieval_task_id=retrieval_task_id)
-        self.commit()
 
     def get_sample_sheets_from_latest_version(self, flow_cell_id: str) -> list[File]:
         """Returns the files tagged with 'samplesheet' for the given bundle."""
@@ -573,3 +575,54 @@ def store_fastq_path_in_housekeeper(
             bundle_name=sample_internal_id,
             tag_names=[SequencingFileTag.FASTQ, flow_cell_id, sample_internal_id],
         )
+
+    def get_archive_entries(
+        self, archival_task_id: int = None, retrieval_task_id: int = None
+    ) -> list[Archive]:
+        """Returns all archives matching the provided task ids. If no task ids are provided, all archive entries are
+        returned. If only an archival_task_id is provided, filtering is only done on that parameter, and vice versa
+        with retrieval_task_id.
+        """
+        return self._store.get_archives(
+            archival_task_id=archival_task_id, retrieval_task_id=retrieval_task_id
+        )
+
+    def set_archived_at(self, archival_task_id: int) -> None:
+        """Sets archived_at to the current time for archive entries with matching archival task id.
+        Raises:
+            HousekeeperArchiveMissingError if no Archive entries match the given archival task id.
+        """
+        archive_entries: list[Archive] = self.get_archive_entries(archival_task_id=archival_task_id)
+        if not archive_entries:
+            raise HousekeeperArchiveMissingError(
+                f"Could not find any archives with archival_task_id {archival_task_id}"
+            )
+        for archive in archive_entries:
+            self.set_archive_archived_at(
+                archiving_task_id=archival_task_id, file_id=archive.file_id
+            )
+        self.commit()
+
+    def set_retrieved_at(self, retrieval_task_id: int) -> None:
+        """Sets retrieved_at to the current time for archive entries with matching retrieval task id.
+        Raises:
+            HousekeeperArchiveMissingError if no Archive entries match the given retrieval task id.
+        """
+        archive_entries: list[Archive] = self.get_archive_entries(
+            retrieval_task_id=retrieval_task_id
+        )
+        if not archive_entries:
+            raise HousekeeperArchiveMissingError(
+                f"Could not find any archives with retrieval_task_id {retrieval_task_id}"
+            )
+        for archive in archive_entries:
+            self.set_archive_retrieved_at(
+                retrieval_task_id=retrieval_task_id, file_id=archive.file_id
+            )
+        self.commit()
+
+    def get_ongoing_archivals(self) -> list[Archive]:
+        return self._store.get_ongoing_archivals()
+
+    def get_ongoing_retrievals(self) -> list[Archive]:
+        return self._store.get_ongoing_retrievals()
```
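The new `HousekeeperAPI` methods form a small query-then-stamp interface: look up `Archive` rows by task id, then mark them archived or retrieved. A minimal sketch of how a caller might drive it — `hk_api` is assumed to be an already-configured `HousekeeperAPI` instance and the task id 1234 is made up for illustration:

```python
# Hypothetical driver for the new HousekeeperAPI methods.
from housekeeper.store.models import Archive

from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.exc import HousekeeperArchiveMissingError


def mark_archival_done(hk_api: HousekeeperAPI, archival_task_id: int) -> None:
    # Fetch every Archive entry created for this archival task.
    entries: list[Archive] = hk_api.get_archive_entries(archival_task_id=archival_task_id)
    print(f"Task {archival_task_id} covers {len(entries)} file(s)")
    try:
        # Stamps archived_at on each matching entry, then commits once.
        hk_api.set_archived_at(archival_task_id)
    except HousekeeperArchiveMissingError:
        # Raised when no Archive entries reference the given task id.
        print(f"No archive entries found for task {archival_task_id}")
```

Note the design change visible in the diff: the per-file setters no longer call `self.commit()`; `set_archived_at`/`set_retrieved_at` now commit once after updating all matching entries, so the batch is persisted atomically.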

cg/exc.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -120,6 +120,12 @@ class HousekeeperBundleVersionMissingError(CgError):
     """
 
 
+class HousekeeperArchiveMissingError(CgError):
+    """
+    Exception raised when an archive is missing in Housekeeper.
+    """
+
+
 class LimsDataError(CgError):
     """
     Error related to missing/incomplete data in LIMS.
```
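Because `HousekeeperArchiveMissingError` subclasses `CgError`, existing broad `except CgError` handlers keep catching it; a quick sketch:

```python
from cg.exc import CgError, HousekeeperArchiveMissingError

try:
    raise HousekeeperArchiveMissingError("Could not find any archives with archival_task_id 1234")
except CgError as error:
    # The broad handler still works; the subclass just allows narrower catches.
    print(type(error).__name__, error)
```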

cg/meta/archive/archive.py

Lines changed: 124 additions & 8 deletions
```diff
@@ -2,7 +2,7 @@
 from pathlib import Path
 from typing import Callable, Optional, Type
 
-from housekeeper.store.models import File
+from housekeeper.store.models import Archive, File
 from pydantic import BaseModel, ConfigDict
 
 from cg.apps.housekeeper.hk import HousekeeperAPI
@@ -16,6 +16,9 @@
 
 LOG = logging.getLogger(__name__)
 DEFAULT_SPRING_ARCHIVE_COUNT = 200
+ARCHIVE_HANDLERS: dict[str, Type[ArchiveHandler]] = {
+    ArchiveLocations.KAROLINSKA_BUCKET: DDNDataFlowClient
+}
 
 
 class ArchiveModels(BaseModel):
@@ -53,11 +56,6 @@ def filter_samples_on_archive_location(
     ]
 
 
-ARCHIVE_HANDLERS: dict[str, Type[ArchiveHandler]] = {
-    ArchiveLocations.KAROLINSKA_BUCKET: DDNDataFlowClient
-}
-
-
 class SpringArchiveAPI:
     """Class handling the archiving of sample SPRING files to an off-premise location for long
     term storage."""
@@ -188,7 +186,125 @@ def add_samples_to_files(self, files_to_archive: list[File]) -> list[FileAndSample]:
         adds it to the list which is returned."""
         files_and_samples: list[FileAndSample] = []
         for file in files_to_archive:
-            sample: Optional[Sample] = self.get_sample(file)
-            if sample:
+            if sample := self.get_sample(file):
                 files_and_samples.append(FileAndSample(file=file, sample=sample))
         return files_and_samples
+
+    def update_status_for_ongoing_tasks(self) -> None:
+        """Updates any completed jobs with a finished timestamp."""
+        self.update_ongoing_archivals()
+        self.update_ongoing_retrievals()
+
+    def update_ongoing_archivals(self) -> None:
+        ongoing_archivals: list[Archive] = self.housekeeper_api.get_ongoing_archivals()
+        archival_ids_per_location: dict[
+            ArchiveLocations, list[int]
+        ] = self.sort_archival_ids_on_archive_location(ongoing_archivals)
+        for archive_location in ArchiveLocations:
+            self.update_archival_jobs_for_archive_location(
+                archive_location=archive_location,
+                job_ids=archival_ids_per_location.get(archive_location),
+            )
+
+    def update_ongoing_retrievals(self) -> None:
+        ongoing_retrievals: list[Archive] = self.housekeeper_api.get_ongoing_retrievals()
+        retrieval_ids_per_location: dict[
+            ArchiveLocations, list[int]
+        ] = self.sort_retrieval_ids_on_archive_location(ongoing_retrievals)
+        for archive_location in ArchiveLocations:
+            self.update_retrieval_jobs_for_archive_location(
+                archive_location=archive_location,
+                job_ids=retrieval_ids_per_location.get(archive_location),
+            )
+
+    def update_archival_jobs_for_archive_location(
+        self, archive_location: ArchiveLocations, job_ids: list[int]
+    ) -> None:
+        for job_id in job_ids:
+            self.update_ongoing_task(
+                task_id=job_id, archive_location=archive_location, is_archival=True
+            )
+
+    def update_retrieval_jobs_for_archive_location(
+        self, archive_location: ArchiveLocations, job_ids: list[int]
+    ) -> None:
+        for job_id in job_ids:
+            self.update_ongoing_task(
+                task_id=job_id, archive_location=archive_location, is_archival=False
+            )
+
+    def update_ongoing_task(
+        self, task_id: int, archive_location: ArchiveLocations, is_archival: bool
+    ) -> None:
+        """Fetches info on an ongoing job and updates the Archive entry in Housekeeper."""
+        archive_handler: ArchiveHandler = ARCHIVE_HANDLERS[archive_location](self.data_flow_config)
+        is_job_done: bool = archive_handler.is_job_done(task_id)
+        if is_job_done:
+            LOG.info(f"Job with id {task_id} has finished, updating Archive entries.")
+            if is_archival:
+                self.housekeeper_api.set_archived_at(task_id)
+            else:
+                self.housekeeper_api.set_retrieved_at(task_id)
+        else:
+            LOG.info(f"Job with id {task_id} has not yet finished.")
+
+    def sort_archival_ids_on_archive_location(
+        self, archive_entries: list[Archive]
+    ) -> dict[ArchiveLocations, list[int]]:
+        """Returns a dictionary with keys being ArchiveLocations and the values being the subset of the given
+        archival jobs which archive files there."""
+
+        jobs_per_location: dict[ArchiveLocations, list[int]] = {}
+        jobs_and_locations: set[
+            tuple[int, ArchiveLocations]
+        ] = self.get_unique_archival_ids_and_their_archive_location(archive_entries)
+
+        for archive_location in ArchiveLocations:
+            jobs_per_location[ArchiveLocations(archive_location)] = [
+                job_and_location[0]
+                for job_and_location in jobs_and_locations
+                if job_and_location[1] == archive_location
+            ]
+        return jobs_per_location
+
+    def get_unique_archival_ids_and_their_archive_location(
+        self, archive_entries: list[Archive]
+    ) -> set[tuple[int, ArchiveLocations]]:
+        return set(
+            [
+                (archive.archiving_task_id, self.get_archive_location_from_file(archive.file))
+                for archive in archive_entries
+            ]
+        )
+
+    def sort_retrieval_ids_on_archive_location(
+        self, archive_entries: list[Archive]
+    ) -> dict[ArchiveLocations, list[int]]:
+        """Returns a dictionary with keys being ArchiveLocations and the values being the subset of the given
+        retrieval jobs which retrieve files archived there."""
+        jobs_per_location: dict[ArchiveLocations, list[int]] = {}
+        jobs_and_locations: set[
+            tuple[int, ArchiveLocations]
+        ] = self.get_unique_retrieval_ids_and_their_archive_location(archive_entries)
+        for archive_location in ArchiveLocations:
+            jobs_per_location[ArchiveLocations(archive_location)] = [
+                job_and_location[0]
+                for job_and_location in jobs_and_locations
+                if job_and_location[1] == archive_location
+            ]
+        return jobs_per_location
+
+    def get_unique_retrieval_ids_and_their_archive_location(
+        self, archive_entries: list[Archive]
+    ) -> set[tuple[int, ArchiveLocations]]:
+        return set(
+            [
+                (archive.retrieval_task_id, self.get_archive_location_from_file(archive.file))
+                for archive in archive_entries
+            ]
+        )
+
+    def get_archive_location_from_file(self, file: File) -> ArchiveLocations:
+        return ArchiveLocations(
+            self.status_db.get_sample_by_internal_id(file.version.bundle.name).archive_location
+        )
```
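The status-update flow groups ongoing `Archive` rows by archive location and polls each job id once per location. The deduplication step matters because several files can share a single archival task. A self-contained sketch of that grouping logic, using stand-in types rather than the real cg models:

```python
# Sketch of the per-location grouping that sort_archival_ids_on_archive_location
# performs; the enum value and the Archive stand-in below are illustrative only.
from enum import StrEnum
from typing import NamedTuple


class ArchiveLocations(StrEnum):
    KAROLINSKA_BUCKET = "karolinska_bucket"


class FakeArchive(NamedTuple):
    archiving_task_id: int
    location: ArchiveLocations


def sort_archival_ids_on_archive_location(
    archives: list[FakeArchive],
) -> dict[ArchiveLocations, list[int]]:
    # Deduplicate (task id, location) pairs first: several files can share
    # one archival task, but each task should be polled only once.
    unique_pairs: set[tuple[int, ArchiveLocations]] = {
        (archive.archiving_task_id, archive.location) for archive in archives
    }
    return {
        location: [task_id for task_id, loc in unique_pairs if loc == location]
        for location in ArchiveLocations
    }


archives = [
    FakeArchive(1, ArchiveLocations.KAROLINSKA_BUCKET),
    FakeArchive(1, ArchiveLocations.KAROLINSKA_BUCKET),  # same task, second file
]
# Task 1 appears once for its location, despite two Archive rows sharing it.
print(sort_archival_ids_on_archive_location(archives))
```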

cg/meta/archive/ddn_dataflow.py

Lines changed: 87 additions & 0 deletions
```diff
@@ -1,4 +1,5 @@
 """Module for archiving and retrieving folders via DDN Dataflow."""
+import logging
 from datetime import datetime
 from enum import StrEnum
 from pathlib import Path
@@ -21,6 +22,8 @@
 from cg.models.cg_config import DataFlowConfig
 from cg.store.models import Sample
 
+LOG = logging.getLogger(__name__)
+
 OSTYPE: str = "Unix/MacOS"
 ROOT_TO_TRIM: str = "/home"
 
@@ -35,6 +38,20 @@ class DataflowEndpoints(StrEnum):
     GET_AUTH_TOKEN = "auth/token"
     REFRESH_AUTH_TOKEN = "auth/token/refresh"
     RETRIEVE_FILES = "files/retrieve"
+    GET_JOB_STATUS = "getJobStatus"
+
+
+class JobDescription(StrEnum):
+    """Enum for the different job statuses which can be returned via Miria."""
+
+    CANCELED = "Canceled"
+    COMPLETED = "Completed"
+    CREATION = "Creation"
+    IN_QUEUE = "In Queue"
+    REFUSED = "Refused"
+    RUNNING = "Running"
+    SUSPENDED = "Suspended"
+    TERMINATED_ON_ERROR = "Terminated on Error"
 
 
 class MiriaObject(FileTransferData):
@@ -158,6 +175,62 @@ class TransferJob(BaseModel):
     job_id: int
 
 
+class SubJob(BaseModel):
+    """Model representing the response fields in a subjob returned in a get_job_status post."""
+
+    subjob_id: int
+    subjob_type: str
+    status: int
+    description: str
+    progress: float
+    total_rate: int
+    throughput: int
+    estimated_end: datetime
+    estimated_left: int
+
+
+class GetJobStatusResponse(BaseModel):
+    """Model representing the response fields from a get_job_status post."""
+
+    request_date: Optional[datetime] = None
+    operation: Optional[str] = None
+    job_id: int
+    type: Optional[str] = None
+    status: Optional[int] = None
+    description: str
+    start_date: Optional[datetime] = None
+    end_date: Optional[datetime] = None
+    durationTime: Optional[int] = None
+    priority: Optional[int] = None
+    progress: Optional[float] = None
+    subjobs: Optional[list[SubJob]] = None
+
+
+class GetJobStatusPayload(BaseModel):
+    """Model representing the payload for a get_job_status request."""
+
+    job_id: int
+    subjob_id: Optional[int] = None
+    related_jobs: Optional[bool] = None
+    main_subjob: Optional[bool] = None
+    debug: Optional[bool] = None
+
+    def post_request(self, url: str, headers: dict) -> GetJobStatusResponse:
+        """Sends a request to the given url with the given headers, and its own content as
+        payload. Returns the parsed job status response.
+        Raises:
+            HTTPError if the response code is not ok.
+        """
+        response: Response = APIRequest.api_request_from_content(
+            api_method=APIMethods.POST,
+            url=url,
+            headers=headers,
+            json=self.model_dump(),
+        )
+        response.raise_for_status()
+        return GetJobStatusResponse.model_validate(response.json())
+
+
 class DDNDataFlowClient(ArchiveHandler):
     """Class for archiving and retrieving folders via DDN Dataflow."""
 
@@ -281,3 +354,17 @@ def convert_into_transfer_data(
             )
             for file_and_sample in files_and_samples
         ]
+
+    def is_job_done(self, job_id: int) -> bool:
+        get_job_status_payload = GetJobStatusPayload(job_id=job_id)
+        get_job_status_response: GetJobStatusResponse = get_job_status_payload.post_request(
+            url=urljoin(self.url, DataflowEndpoints.GET_JOB_STATUS),
+            headers=dict(self.headers, **self.auth_header),
+        )
+        if get_job_status_response.description == JobDescription.COMPLETED:
+            return True
+        LOG.info(
+            f"Job with id {job_id} has not been completed. "
+            f"Current job description is {get_job_status_response.description}"
+        )
+        return False
```
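Since only `job_id` and `description` are required on `GetJobStatusResponse` (everything else is `Optional` with a `None` default), a sparse response still validates. A sketch of parsing one — the JSON below is fabricated for illustration, not a captured Miria response:

```python
from cg.meta.archive.ddn_dataflow import GetJobStatusResponse, JobDescription

fake_response: dict = {
    "job_id": 1234,
    "description": "Running",
    "progress": 0.42,
}

# Only job_id and description are required; all other fields default to None.
status = GetJobStatusResponse.model_validate(fake_response)
print(status.description == JobDescription.COMPLETED)  # False: the job is still running
```

Because `JobDescription` is a `StrEnum`, the plain string `"Running"` from the API compares directly against the enum members, which is what `is_job_done` relies on.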

cg/meta/archive/models.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -53,14 +53,14 @@ def retrieve_samples(self, samples_and_destinations: list[SampleAndDestination]):
         """Retrieves all files for all samples for the given flowcell."""
         pass
 
-    @abstractmethod
-    def retrieve_file(self, file_and_sample: FileAndSample):
-        """Retrieves the specified archived file."""
-        pass
-
     @abstractmethod
     def convert_into_transfer_data(
         self, files_and_samples: list[FileAndSample], is_archiving: bool = True
     ) -> list[FileTransferData]:
         """Converts the provided files_and_samples into a list of objects formatted for the specific archiving flow."""
         pass
+
+    @abstractmethod
+    def is_job_done(self, job_id: int) -> bool:
+        """Returns true if the job has been completed, false otherwise."""
+        pass
```
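Swapping the abstract `retrieve_file` for `is_job_done` changes the contract for every `ArchiveHandler` subclass: concrete handlers such as `DDNDataFlowClient` must now implement `is_job_done` or fail at instantiation. A generic illustration of that enforcement, using a stand-in ABC rather than the real cg class:

```python
# Stand-in ABC demonstrating why adding an @abstractmethod breaks
# incomplete subclasses at instantiation time, not at call time.
from abc import ABC, abstractmethod


class Handler(ABC):
    @abstractmethod
    def is_job_done(self, job_id: int) -> bool:
        """Returns True if the job has been completed, False otherwise."""


class IncompleteHandler(Handler):
    """Deliberately omits is_job_done."""


try:
    IncompleteHandler()
except TypeError as error:
    # TypeError: Can't instantiate abstract class IncompleteHandler ...
    print(error)
```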
