Skip to content

Commit 95bad81

Browse files
Improved after feedback
1 parent 23ab294 commit 95bad81

File tree

7 files changed

+169
-105
lines changed

7 files changed

+169
-105
lines changed

cg/services/illumina/backup/backup_service.py

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import subprocess
2+
from datetime import datetime
23
from pathlib import Path
34

45
from cg.apps.slurm.slurm_api import SlurmAPI
@@ -11,24 +12,21 @@
1112
IlluminaRunEncryptionError,
1213
PdcError,
1314
PdcNoFilesMatchingSearchError,
14-
ValidationError,
1515
)
1616
from cg.meta.backup.backup import LOG
1717
from cg.meta.encryption.encryption import EncryptionAPI
1818
from cg.meta.tar.tar import TarAPI
1919
from cg.models.cg_config import PDCArchivingDirectory
2020
from cg.models.run_devices.illumina_run_directory_data import IlluminaRunDirectoryData
2121
from cg.services.illumina.backup.encrypt_service import IlluminaRunEncryptionService
22+
from cg.services.illumina.backup.models import DsmcEncryptionKey, DsmcSequencingFile
2223
from cg.services.illumina.backup.utils import (
2324
DsmcOutput,
25+
convert_string_to_datetime_object,
2426
get_latest_dsmc_archived_sequencing_run,
2527
get_latest_dsmc_encryption_key,
26-
contains_dsmc_key,
27-
contains_dsmc_sequencing_path,
28-
)
29-
from cg.services.illumina.file_parsing.models import (
30-
DsmcEncryptionKey,
31-
DsmcSequencingFile,
28+
is_dsmc_encryption_key,
29+
is_dsmc_sequencing_path,
3230
)
3331
from cg.services.pdc_service.pdc_service import PdcService
3432
from cg.store.models import IlluminaSequencingRun
@@ -293,58 +291,71 @@ def parse_dsmc_output_sequencing_path(dsmc_output: list[str]) -> list[DsmcSequen
293291
"""Parses the DSMC command output to extract validated sequencing paths."""
294292
validated_responses = []
295293
for line in dsmc_output:
296-
if contains_dsmc_sequencing_path(line):
294+
if is_dsmc_sequencing_path(line):
297295
parts = line.split()
298-
try:
299-
query_response = DsmcSequencingFile(
300-
date=f"{parts[DsmcOutput.DATE_COLUMN_INDEX]} {parts[DsmcOutput.TIME_COLUMN_INDEX]}",
301-
sequencing_path=parts[DsmcOutput.PATH_COLUMN_INDEX],
302-
)
303-
validated_responses.append(query_response)
304-
except ValidationError as e:
305-
LOG.error(f"Validation error for line: {line}\nError: {e}")
296+
297+
fileDateTime: datetime = convert_string_to_datetime_object(
298+
f"{parts[DsmcOutput.DATE_COLUMN_INDEX]} {parts[DsmcOutput.TIME_COLUMN_INDEX]}"
299+
)
300+
301+
query_response = DsmcSequencingFile(
302+
dateTime=fileDateTime,
303+
path=Path(parts[DsmcOutput.PATH_COLUMN_INDEX]),
304+
)
305+
validated_responses.append(query_response)
306306

307307
return validated_responses
308308

309309
@classmethod
310310
def get_latest_archived_sequencing_run_path(cls, dsmc_output: list[str]) -> Path | None:
311311
"""Get the path of the archived sequencing run from a PDC query."""
312-
validated_sequencing_paths = cls.parse_dsmc_output_sequencing_path(dsmc_output)
312+
validated_sequencing_paths: list[DsmcSequencingFile] = (
313+
cls.parse_dsmc_output_sequencing_path(dsmc_output)
314+
)
313315

314-
archived_run = get_latest_dsmc_archived_sequencing_run(validated_sequencing_paths)
316+
archived_run: DsmcSequencingFile = get_latest_dsmc_archived_sequencing_run(
317+
validated_sequencing_paths
318+
)
315319

316320
if archived_run:
317321
LOG.info(f"Sequencing run found: {archived_run}")
318-
return archived_run
322+
return archived_run.path
319323

320324
@staticmethod
321325
def parse_dsmc_output_key_path(dsmc_output: list[str]) -> list[DsmcEncryptionKey]:
322326
"""Parses the DSMC command output to extract validated encryption keys."""
323327
validated_responses = []
324328
for line in dsmc_output:
325-
if contains_dsmc_key(line):
326-
parts = line.split()
327-
try:
328-
query_response = DsmcEncryptionKey(
329-
date=f"{parts[DsmcOutput.DATE_COLUMN_INDEX]} {parts[DsmcOutput.TIME_COLUMN_INDEX]}",
330-
key_path=parts[DsmcOutput.PATH_COLUMN_INDEX],
331-
)
332-
validated_responses.append(query_response)
333-
except ValidationError as e:
334-
LOG.error(f"Validation error for line: {line}\nError: {e}")
329+
if is_dsmc_encryption_key(line):
330+
parts: list[str] = line.split()
331+
332+
fileDateTime: datetime = convert_string_to_datetime_object(
333+
f"{parts[DsmcOutput.DATE_COLUMN_INDEX]} {parts[DsmcOutput.TIME_COLUMN_INDEX]}"
334+
)
335+
336+
query_response = DsmcEncryptionKey(
337+
dateTime=fileDateTime,
338+
path=Path(parts[DsmcOutput.PATH_COLUMN_INDEX]),
339+
)
340+
341+
validated_responses.append(query_response)
335342

336343
return validated_responses
337344

338345
@classmethod
339346
def get_archived_encryption_key_path(cls, dsmc_output: list[str]) -> Path | None:
340347
"""Get the encryption key for the archived sequencing run from a PDC query."""
341-
validated_encryption_keys = cls.parse_dsmc_output_key_path(dsmc_output)
348+
validated_encryption_keys: list[DsmcEncryptionKey] = cls.parse_dsmc_output_key_path(
349+
dsmc_output
350+
)
342351

343-
archived_encryption_key = get_latest_dsmc_encryption_key(validated_encryption_keys)
352+
archived_encryption_key: DsmcEncryptionKey = get_latest_dsmc_encryption_key(
353+
validated_encryption_keys
354+
)
344355

345356
if archived_encryption_key:
346357
LOG.info(f"Encryption key found: {archived_encryption_key}")
347-
return archived_encryption_key
358+
return archived_encryption_key.path
348359

349360
def validate_is_run_backup_possible(
350361
self,

cg/services/illumina/backup/exc.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from cg.exc import CgError
2+
3+
4+
class DsmcMissingSequenceFileError(CgError):
5+
"""Exception raised when a Dsmc sequence file is not found."""
6+
7+
8+
class DsmcMissingEncryptionKeyError(CgError):
9+
"""Exception raised when a Dsmc encryption key is not found."""

cg/services/illumina/backup/models.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from datetime import datetime
2+
from pathlib import Path
3+
from typing import Annotated
4+
5+
from pydantic import BaseModel, BeforeValidator
6+
7+
from cg.services.illumina.backup.validators import (
8+
is_valid_dsmc_encryption_key_path,
9+
is_valid_dsmc_sequencing_file_path,
10+
)
11+
12+
13+
class DsmcEncryptionKey(BaseModel):
14+
"""Model representing the response from a PDC query."""
15+
16+
dateTime: datetime
17+
path: Annotated[Path, BeforeValidator(is_valid_dsmc_encryption_key_path)]
18+
19+
20+
class DsmcSequencingFile(BaseModel):
21+
"""Model representing the response from a PDC query."""
22+
23+
dateTime: datetime
24+
path: Annotated[Path, BeforeValidator(is_valid_dsmc_sequencing_file_path)]

cg/services/illumina/backup/utils.py

Lines changed: 40 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
"""Helper functions."""
22

3-
from enum import IntEnum
4-
from pathlib import Path
3+
from datetime import datetime
4+
from operator import attrgetter
55

66
from cg.constants import FileExtensions
7-
from cg.exc import ValidationError
8-
from cg.services.illumina.file_parsing.models import (
9-
DsmcEncryptionKey,
10-
DsmcSequencingFile,
7+
from cg.services.illumina.backup.exc import (
8+
DsmcMissingEncryptionKeyError,
9+
DsmcMissingSequenceFileError,
1110
)
12-
from cg.constants import FileExtensions
11+
from cg.services.illumina.backup.models import DsmcEncryptionKey, DsmcSequencingFile
1312

1413

1514
class DsmcOutput:
@@ -18,41 +17,54 @@ class DsmcOutput:
1817
PATH_COLUMN_INDEX = 4
1918

2019

21-
def contains_dsmc_key(line: str) -> bool:
22-
if (
20+
def is_dsmc_encryption_key(line: str) -> bool:
21+
return (
2322
FileExtensions.KEY in line
2423
and FileExtensions.GPG in line
2524
and FileExtensions.GZIP not in line
26-
):
27-
return True
28-
return False
25+
)
2926

3027

31-
def contains_dsmc_sequencing_path(line: str) -> bool:
32-
if FileExtensions.TAR in line and FileExtensions.GZIP in line and FileExtensions.GPG in line:
33-
return True
34-
return False
28+
def is_dsmc_sequencing_path(line: str) -> bool:
29+
return FileExtensions.TAR in line and FileExtensions.GZIP in line and FileExtensions.GPG in line
3530

3631

37-
def get_latest_dsmc_archived_sequencing_run(dsmc_files: list[DsmcSequencingFile]) -> Path:
32+
def get_latest_dsmc_archived_sequencing_run(
33+
dsmc_files: list[DsmcSequencingFile],
34+
) -> DsmcSequencingFile:
3835
"""Return the latest file path based on the date attribute."""
36+
3937
if not dsmc_files:
40-
return None # Return None if the list is empty
38+
raise DsmcMissingSequenceFileError("No archived sequencing in DSMC output.")
4139

42-
# Get the file with the latest date
43-
latest_file = max(dsmc_files, key=lambda file: file.date)
40+
latest_file = max(dsmc_files, key=attrgetter("dateTime"))
4441

45-
# Return the sequencing_path as a Path object
46-
return Path(latest_file.sequencing_path)
42+
return latest_file
4743

4844

49-
def get_latest_dsmc_encryption_key(dsmc_files: list[DsmcEncryptionKey]) -> Path:
45+
def get_latest_dsmc_encryption_key(dsmc_files: list[DsmcEncryptionKey]) -> DsmcEncryptionKey:
5046
"""Return the latest file path based on the date attribute."""
47+
5148
if not dsmc_files:
52-
return None # Return None if the list is empty
49+
raise DsmcMissingEncryptionKeyError("No Encryption Key in DSMC output.")
50+
51+
latest_file = max(dsmc_files, key=attrgetter("dateTime"))
52+
53+
return latest_file
54+
55+
56+
def convert_string_to_datetime_object(strDateTime: str) -> datetime:
57+
date_formats = [
58+
"%Y-%m-%d %H:%M:%S",
59+
"%Y/%m/%d %H:%M:%S",
60+
"%d/%m/%Y %H:%M:%S",
61+
"%m/%d/%Y %H:%M:%S",
62+
]
5363

54-
# Get the file with the latest date
55-
latest_file = max(dsmc_files, key=lambda file: file.date)
64+
for fmt in date_formats:
65+
try:
66+
return datetime.strptime(strDateTime, fmt)
67+
except ValueError:
68+
continue
5669

57-
# Return the sequencing_path as a Path object
58-
return Path(latest_file.key_path)
70+
raise ValueError(f"Could not convert '{strDateTime}' to a datetime object.")

cg/services/illumina/backup/validators.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from pathlib import Path
2+
3+
from cg.constants import FileExtensions
4+
from cg.exc import MissingFilesError
5+
6+
7+
def is_valid_dsmc_encryption_key_path(value: Path) -> Path:
8+
if not value.name.endswith(f"{FileExtensions.KEY}{FileExtensions.GPG}"):
9+
raise MissingFilesError("Missing a valid encryption key.")
10+
return value
11+
12+
13+
def is_valid_dsmc_sequencing_file_path(value: Path) -> Path:
14+
if not value.name.endswith(f"{FileExtensions.TAR}{FileExtensions.GZIP}{FileExtensions.GPG}"):
15+
raise MissingFilesError("Missing a valid sequence file.")
16+
return value

cg/services/illumina/file_parsing/models.py

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
1-
from datetime import datetime
2-
from pathlib import Path
1+
from pydantic import BaseModel, Field
32

4-
from pydantic import BaseModel, Field, field_validator
5-
6-
from cg.constants import FileExtensions, SequencingRunDataAvailability
7-
from cg.constants.devices import DeviceType
83
from cg.constants.metrics import DemuxMetricsColumnNames, QualityMetricsColumnNames
9-
from cg.constants.sequencing import Sequencers
104

115

126
class SequencingQualityMetrics(BaseModel):
@@ -29,41 +23,3 @@ class DemuxMetrics(BaseModel):
2923
lane: int = Field(..., alias=DemuxMetricsColumnNames.LANE)
3024
sample_internal_id: str = Field(..., alias=DemuxMetricsColumnNames.SAMPLE_INTERNAL_ID)
3125
read_pair_count: int = Field(..., alias=DemuxMetricsColumnNames.READ_PAIR_COUNT)
32-
33-
34-
class DsmcEncryptionKey(BaseModel):
35-
"""Model representing the response from a PDC query."""
36-
37-
date: str
38-
key_path: str
39-
40-
@field_validator("date")
41-
def parse_date(cls, value: str) -> datetime:
42-
return datetime.strptime(value, "%m/%d/%Y %H:%M:%S")
43-
44-
@field_validator("key_path")
45-
def validate_sequencing_path(cls, value: str) -> str:
46-
if not value.endswith(f"{FileExtensions.KEY}{FileExtensions.GPG}"):
47-
raise ValueError(f'"{value}" - is not the path to the Encryption key')
48-
if not Path(value):
49-
raise ValueError(f'"{value}" - is not a valid file path.')
50-
return value
51-
52-
53-
class DsmcSequencingFile(BaseModel):
54-
"""Model representing the response from a PDC query."""
55-
56-
date: str
57-
sequencing_path: str
58-
59-
@field_validator("date")
60-
def parse_date(cls, value: str) -> datetime:
61-
return datetime.strptime(value, "%m/%d/%Y %H:%M:%S")
62-
63-
@field_validator("sequencing_path")
64-
def validate_sequencing_path(cls, value: str) -> str:
65-
if not value.endswith(f"{FileExtensions.TAR}{FileExtensions.GZIP}{FileExtensions.GPG}"):
66-
raise ValueError(f'"{value}" - is not the path to the archived sequencing file')
67-
if not Path(value):
68-
raise ValueError(f'"{value}" - is not a valid file path.')
69-
return value

tests/services/illumina/backup/test_backup_services.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import fnmatch
44
import logging
5+
from datetime import datetime
56
from pathlib import Path
67
from typing import Callable
78

@@ -18,6 +19,7 @@
1819
from cg.models.cg_config import CGConfig, PDCArchivingDirectory
1920
from cg.services.illumina.backup.backup_service import IlluminaBackupService
2021
from cg.services.illumina.backup.encrypt_service import IlluminaRunEncryptionService
22+
from cg.services.illumina.backup.utils import convert_string_to_datetime_object
2123
from cg.services.pdc_service.pdc_service import PdcService
2224
from cg.store.models import IlluminaSequencingRun
2325
from cg.store.store import Store
@@ -137,6 +139,40 @@ def test_maximum_processing_queue_full(store_with_illumina_sequencing_data: Stor
137139
assert backup_api.has_processing_queue_capacity() is False
138140

139141

142+
def test_convert_string_to_datetime_object_valid():
143+
# GIVEN a list of valid datetime strings with different formats
144+
valid_datetime_strings = [
145+
("2024-10-15 12:45:30", "%Y-%m-%d %H:%M:%S"),
146+
("2024/10/15 12:45:30", "%Y/%m/%d %H:%M:%S"),
147+
("15/10/2024 12:45:30", "%d/%m/%Y %H:%M:%S"),
148+
("10/15/2024 12:45:30", "%m/%d/%Y %H:%M:%S"),
149+
]
150+
151+
# WHEN the function is called with valid strings
152+
for date_str, expected_format in valid_datetime_strings:
153+
# THEN it should return the correct datetime object
154+
expected_datetime = datetime.strptime(date_str, expected_format)
155+
assert convert_string_to_datetime_object(date_str) == expected_datetime
156+
157+
158+
def test_convert_string_to_datetime_object_invalid():
159+
# GIVEN a list of invalid datetime strings
160+
invalid_datetime_strings = [
161+
"2024-15-10 12:45:30", # Invalid day format
162+
"15-10-2024 12:45", # Missing seconds
163+
"Invalid string", # Completely invalid
164+
]
165+
166+
# WHEN the function is called with invalid strings
167+
for date_str in invalid_datetime_strings:
168+
# THEN it should raise a ValueError
169+
try:
170+
convert_string_to_datetime_object(date_str)
171+
assert False, f"Expected ValueError for {date_str} but it didn't raise"
172+
except ValueError as e:
173+
assert str(e) == f"Could not convert '{date_str}' to a datetime object."
174+
175+
140176
def test_maximum_processing_queue_not_full(store_with_illumina_sequencing_data: Store):
141177
# GIVEN a store with a requested sequencing run
142178
sequencing_runs: list[IlluminaSequencingRun] = store_with_illumina_sequencing_data._get_query(

0 commit comments

Comments
 (0)