Skip to content

Commit 138cee2

Browse files
authored
(Archiving) Clean retrieved springs (#2815) (minor)
### Added - CLI command for deleting retrieved Spring files.
1 parent 9bfcfca commit 138cee2

File tree

7 files changed

+267
-13
lines changed

7 files changed

+267
-13
lines changed

cg/apps/housekeeper/hk.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
""" Module to decouple cg code from Housekeeper code """
2-
import datetime as dt
32
import logging
43
import os
4+
from datetime import datetime
55
from pathlib import Path
66

77
from housekeeper.include import checksum as hk_checksum
88
from housekeeper.include import include_version
99
from housekeeper.store import Store, models
10-
from housekeeper.store.database import create_all_tables, drop_all_tables, initialize_database
10+
from housekeeper.store.database import (
11+
create_all_tables,
12+
drop_all_tables,
13+
initialize_database,
14+
)
1115
from housekeeper.store.models import Archive, Bundle, File, Version
1216
from sqlalchemy.orm import Query
1317

@@ -33,7 +37,7 @@ def __getattr__(self, name):
3337
LOG.warning(f"Called undefined {name} on {self.__class__.__name__}, please wrap")
3438
return getattr(self._store, name)
3539

36-
def new_bundle(self, name: str, created_at: dt.datetime = None) -> Bundle:
40+
def new_bundle(self, name: str, created_at: datetime = None) -> Bundle:
3741
"""Create a new file bundle."""
3842
return self._store.new_bundle(name, created_at)
3943

@@ -237,11 +241,11 @@ def include_file(self, file_obj: File, version_obj: Version) -> File:
237241
file_obj.path = str(new_path).replace(f"{global_root_dir}/", "", 1)
238242
return file_obj
239243

240-
def new_version(self, created_at: dt.datetime, expires_at: dt.datetime = None) -> Version:
244+
def new_version(self, created_at: datetime, expires_at: datetime = None) -> Version:
241245
"""Create a new bundle version."""
242246
return self._store.new_version(created_at, expires_at)
243247

244-
def version(self, bundle: str, date: dt.datetime) -> Version:
248+
def version(self, bundle: str, date: datetime) -> Version:
245249
"""Fetch a version."""
246250
LOG.debug(f"Return version: {date}, from {bundle}")
247251
return self._store.get_version_by_date_and_bundle_name(
@@ -286,7 +290,7 @@ def get_create_version(self, bundle_name: str) -> Version:
286290
bundle_result: tuple[Bundle, Version] = self.add_bundle(
287291
bundle_data={
288292
"name": bundle_name,
289-
"created_at": dt.datetime.now(),
293+
"created_at": datetime.now(),
290294
"expires_at": None,
291295
"files": [],
292296
}
@@ -316,7 +320,7 @@ def get_tag_names_from_file(file: File) -> list[str]:
316320
def include(self, version_obj: Version):
317321
"""Call the include version function to import related assets."""
318322
include_version(self.get_root_dir(), version_obj)
319-
version_obj.included_at = dt.datetime.now()
323+
version_obj.included_at = datetime.now()
320324

321325
def add_commit(self, obj):
322326
"""Wrap method in Housekeeper Store."""
@@ -376,7 +380,7 @@ def include_files_to_latest_version(self, bundle_name: str) -> None:
376380
LOG.warning(
377381
f"File is already included in Housekeeper for bundle: {bundle_name}, version: {bundle_version}"
378382
)
379-
bundle_version.included_at = dt.datetime.now()
383+
bundle_version.included_at = datetime.now()
380384
self.commit()
381385

382386
def get_file_from_latest_version(self, bundle_name: str, tags: set[str]) -> File | None:
@@ -649,3 +653,14 @@ def update_archive_retrieved_at(
649653
for archive in archives_to_update:
650654
archive.retrieval_task_id = new_retrieval_job_id
651655
self._store.session.commit()
656+
657+
def get_spring_files_retrieved_before(self, date: datetime):
658+
return self._store.get_files_retrieved_before(date, tag_names=[SequencingFileTag.SPRING])
659+
660+
def reset_retrieved_archive_data(self, files_to_reset: list[File]):
661+
"""Resets 'retrieval_task_id' and 'retrieved_at' for all files' corresponding archive entries"""
662+
for file in files_to_reset:
663+
LOG.debug(f"Resetting retrieval data for file {file.path}")
664+
file.archive.retrieval_task_id = None
665+
file.archive.retrieved_at = None
666+
self.commit()

cg/cli/clean.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from cg.exc import CleanFlowCellFailedError, FlowCellError
2828
from cg.meta.clean.api import CleanAPI
2929
from cg.meta.clean.clean_flow_cells import CleanFlowCellAPI
30+
from cg.meta.clean.clean_retrieved_spring_files import CleanRetrievedSpringFilesAPI
3031
from cg.models.cg_config import CGConfig
3132
from cg.store import Store
3233
from cg.store.models import Analysis
@@ -277,6 +278,24 @@ def clean_flow_cells(context: CGConfig, dry_run: bool):
277278
continue
278279

279280

281+
@clean.command("retrieved-spring-files")
282+
@click.option(
283+
"--age-limit",
284+
type=int,
285+
default=7,
286+
help="Clean all Spring files which were retrieved more than given amount of days ago.",
287+
show_default=True,
288+
)
289+
@DRY_RUN
290+
@click.pass_obj
291+
def clean_retrieved_spring_files(context: CGConfig, age_limit: int, dry_run: bool):
292+
"""Clean Spring files which were retrieved more than given amount of days ago."""
293+
clean_retrieved_spring_files_api = CleanRetrievedSpringFilesAPI(
294+
housekeeper_api=context.housekeeper_api, dry_run=dry_run
295+
)
296+
clean_retrieved_spring_files_api.clean_retrieved_spring_files(age_limit)
297+
298+
280299
def _get_confirm_question(bundle, file_obj) -> str:
281300
"""Return confirmation question."""
282301
return (
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import logging
2+
from datetime import datetime, timedelta
3+
from pathlib import Path
4+
5+
from housekeeper.store.models import File
6+
7+
from cg.apps.housekeeper.hk import HousekeeperAPI
8+
9+
LOG = logging.getLogger(__name__)
10+
11+
12+
class CleanRetrievedSpringFilesAPI:
13+
"""API for cleaning archived Spring files which have been retrieved."""
14+
15+
def __init__(self, housekeeper_api: HousekeeperAPI, dry_run: bool):
16+
self.housekeeper_api: HousekeeperAPI = housekeeper_api
17+
self.dry_run = dry_run
18+
19+
def _get_files_to_remove(self, age_limit: int) -> list[File]:
20+
"""Returns all Spring files which were retrieved more than given amount of days ago."""
21+
return self.housekeeper_api.get_spring_files_retrieved_before(
22+
date=datetime.now() - timedelta(days=age_limit)
23+
)
24+
25+
def _unlink_files(self, files_to_unlink: list[File]) -> None:
26+
for file in files_to_unlink:
27+
file_path: str = file.full_path
28+
if self.dry_run:
29+
LOG.info(f"Dry run - would have unlinked {file_path}")
30+
LOG.info(f"Unlinking {file_path}")
31+
Path(file_path).unlink(missing_ok=True)
32+
33+
def clean_retrieved_spring_files(self, age_limit: int):
34+
"""Removes Spring files retrieved more than given amount of days ago from Hasta,
35+
and resets retrieval data in Housekeeper."""
36+
37+
LOG.info("Starting cleaning of retrieved Spring files.")
38+
files_to_remove: list[File] = self._get_files_to_remove(age_limit)
39+
if not files_to_remove:
40+
LOG.info("No retrieved Spring files to remove - exiting.")
41+
return
42+
self._unlink_files(files_to_remove)
43+
if not self.dry_run:
44+
self.housekeeper_api.reset_retrieved_archive_data(files_to_remove)
45+
else:
46+
LOG.info("Would have reset the files' retrieval data")

poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ urllib3 = "*"
6969

7070
# Apps
7171
genologics = "*"
72-
housekeeper = "*"
72+
housekeeper = ">=4.11.1"
7373

7474

7575
[tool.poetry.dev-dependencies]

tests/meta/clean/conftest.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from cg.constants import SequencingFileTag
99
from cg.constants.subject import Sex
1010
from cg.meta.clean.clean_flow_cells import CleanFlowCellAPI
11+
from cg.meta.clean.clean_retrieved_spring_files import CleanRetrievedSpringFilesAPI
1112
from cg.models.flow_cell.flow_cell import FlowCellDirectoryData
1213
from cg.store import Store
1314
from cg.store.models import Flowcell, Sample
@@ -234,3 +235,119 @@ def hk_sample_bundle_for_flow_cell_not_to_clean(
234235
"expires": timestamp_yesterday,
235236
"files": [],
236237
}
238+
239+
240+
@pytest.fixture
241+
def clean_retrieved_spring_files_api(
242+
real_housekeeper_api: HousekeeperAPI, tmp_path
243+
) -> CleanRetrievedSpringFilesAPI:
244+
"""Returns a CleanRetrievedSpringFilesAPI."""
245+
real_housekeeper_api.root_dir = tmp_path
246+
return CleanRetrievedSpringFilesAPI(housekeeper_api=real_housekeeper_api, dry_run=False)
247+
248+
249+
@pytest.fixture
250+
def path_to_old_retrieved_spring_file() -> str:
251+
return Path("path", "to", "old_retrieved_spring_file").as_posix()
252+
253+
254+
@pytest.fixture
255+
def path_to_newly_retrieved_spring_file() -> str:
256+
return Path("path", "to", "newly_retrieved_spring_file").as_posix()
257+
258+
259+
@pytest.fixture
260+
def path_to_archived_but_not_retrieved_spring_file() -> str:
261+
return Path("path", "to", "archived_spring_file").as_posix()
262+
263+
264+
@pytest.fixture
265+
def path_to_fastq_file() -> str:
266+
return Path("path", "to", "fastq_file").as_posix()
267+
268+
269+
@pytest.fixture
270+
def paths_for_populated_clean_retrieved_spring_files_api(
271+
path_to_old_retrieved_spring_file: str,
272+
path_to_newly_retrieved_spring_file: str,
273+
path_to_archived_but_not_retrieved_spring_file: str,
274+
path_to_fastq_file: str,
275+
) -> list[str]:
276+
return [
277+
path_to_old_retrieved_spring_file,
278+
path_to_newly_retrieved_spring_file,
279+
path_to_archived_but_not_retrieved_spring_file,
280+
path_to_fastq_file,
281+
]
282+
283+
284+
@pytest.fixture
285+
def retrieved_test_bundle_name() -> str:
286+
return "retrieved_test_bundle"
287+
288+
289+
@pytest.fixture
290+
def path_to_old_spring_file_in_housekeeper(
291+
retrieved_test_bundle_name: str, path_to_old_retrieved_spring_file
292+
) -> str:
293+
return Path(
294+
retrieved_test_bundle_name,
295+
str(datetime.today().date()),
296+
Path(path_to_old_retrieved_spring_file).name,
297+
).as_posix()
298+
299+
300+
@pytest.fixture
301+
def populated_clean_retrieved_spring_files_api(
302+
clean_retrieved_spring_files_api: CleanRetrievedSpringFilesAPI,
303+
paths_for_populated_clean_retrieved_spring_files_api: list[str],
304+
retrieved_test_bundle_name: str,
305+
archival_job_id_miria: int,
306+
retrieval_job_id_miria: int,
307+
timestamp: datetime,
308+
timestamp_yesterday: datetime,
309+
old_timestamp: datetime,
310+
tmp_path,
311+
) -> CleanRetrievedSpringFilesAPI:
312+
"""
313+
Returns a populated CleanRetrievedSpringFilesAPI, containing a bundle with one version and the following files:
314+
- an archived Spring file which has not been retrieved
315+
- an archived Spring file which was retrieved 1 day ago
316+
- an archived Spring file which was retrieved in the year 1900
317+
- a Fastq file
318+
"""
319+
clean_retrieved_spring_files_api.housekeeper_api.add_bundle_and_version_if_non_existent(
320+
bundle_name=retrieved_test_bundle_name
321+
)
322+
clean_retrieved_spring_files_api.housekeeper_api.commit()
323+
for path in paths_for_populated_clean_retrieved_spring_files_api:
324+
tags: list[str] = (
325+
[SequencingFileTag.SPRING]
326+
if SequencingFileTag.SPRING in path
327+
else [SequencingFileTag.FASTQ]
328+
)
329+
Path(tmp_path, path).parent.mkdir(parents=True, exist_ok=True)
330+
file_to_add = Path(tmp_path, path)
331+
file_to_add.touch()
332+
clean_retrieved_spring_files_api.housekeeper_api.add_and_include_file_to_latest_version(
333+
file=file_to_add, bundle_name=retrieved_test_bundle_name, tags=tags
334+
)
335+
for file in clean_retrieved_spring_files_api.housekeeper_api.get_files(
336+
bundle=retrieved_test_bundle_name
337+
):
338+
Path(file.full_path).parent.mkdir(parents=True, exist_ok=True)
339+
Path(file.full_path).touch()
340+
if "spring" in file.path:
341+
clean_retrieved_spring_files_api.housekeeper_api.add_archives(
342+
files=[file], archive_task_id=archival_job_id_miria
343+
)
344+
file.archive.archived_at = timestamp
345+
if "retrieved" in file.path:
346+
file.archive.retrieval_task_id = retrieval_job_id_miria
347+
file.archive.retrieved_at = (
348+
old_timestamp if "old" in file.path else timestamp_yesterday
349+
)
350+
clean_retrieved_spring_files_api.housekeeper_api.add_commit(file.archive)
351+
352+
clean_retrieved_spring_files_api.housekeeper_api.commit()
353+
return clean_retrieved_spring_files_api
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import logging
2+
from pathlib import Path
3+
4+
from housekeeper.store.models import File
5+
6+
from cg.meta.clean.clean_retrieved_spring_files import CleanRetrievedSpringFilesAPI
7+
8+
9+
def test_get_files_to_remove(
10+
populated_clean_retrieved_spring_files_api: CleanRetrievedSpringFilesAPI,
11+
path_to_old_retrieved_spring_file: str,
12+
path_to_old_spring_file_in_housekeeper: str,
13+
):
14+
"""Tests that only old retrieved files are cleaned with clean_retrieved_spring_files. With the provided populated
15+
api, this should not return a newly retrieved spring file, a fastq file nor an archived spring file which
16+
has not been retrieved."""
17+
18+
# GIVEN a CleanRetrievedSpringFilesAPI with a populated Housekeeper database
19+
20+
# WHEN getting files to remove when cleaning retrieved spring files
21+
files_to_remove: list[File] = populated_clean_retrieved_spring_files_api._get_files_to_remove(
22+
age_limit=7
23+
)
24+
25+
# THEN only the file with an old enough 'retrieved_at' should be returned
26+
assert [file.path for file in files_to_remove] == [path_to_old_spring_file_in_housekeeper]
27+
28+
29+
def test_clean_retrieved_spring_files_dry_run(
30+
populated_clean_retrieved_spring_files_api: CleanRetrievedSpringFilesAPI,
31+
path_to_old_retrieved_spring_file: str,
32+
retrieved_test_bundle_name: str,
33+
caplog,
34+
path_to_old_spring_file_in_housekeeper: str,
35+
):
36+
"""Tests that only the Spring file with an old enough 'retrieved_at' would be removed when cleaning retrieved
37+
Spring files."""
38+
39+
caplog.set_level(logging.INFO)
40+
41+
# GIVEN a CleanRetrievedSpringFilesAPI with a populated Housekeeper database
42+
# GIVEN that an old retrieved file exists
43+
files = populated_clean_retrieved_spring_files_api.housekeeper_api.files(
44+
bundle=retrieved_test_bundle_name
45+
).all()
46+
for file in files:
47+
assert Path(file.full_path).exists()
48+
49+
# WHEN running 'clean_retrieved_spring_files'
50+
populated_clean_retrieved_spring_files_api.clean_retrieved_spring_files(age_limit=7)
51+
52+
# THEN only the file with an old enough 'retrieved_at' should have been removed
53+
for file in files:
54+
if file.path == path_to_old_spring_file_in_housekeeper:
55+
assert not Path(file.full_path).exists()
56+
else:
57+
assert Path(file.full_path).exists()

0 commit comments

Comments
 (0)