diff --git a/alembic/versions/2023_10_31_fce8a2ca0fd1_add_is_compressible.py b/alembic/versions/2023_10_31_fce8a2ca0fd1_add_is_compressible.py new file mode 100644 index 0000000000..2b200783d3 --- /dev/null +++ b/alembic/versions/2023_10_31_fce8a2ca0fd1_add_is_compressible.py @@ -0,0 +1,26 @@ +"""Add is_compressible + +Revision ID: fce8a2ca0fd1 +Revises: 9073c61bc72b +Create Date: 2023-10-31 10:18:11.450637 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "fce8a2ca0fd1" +down_revision = "9073c61bc72b" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "family", + sa.Column("is_compressible", sa.Boolean(), nullable=False, server_default=sa.text("1")), + ) + + +def downgrade(): + op.drop_column("family", "is_compressible") diff --git a/cg/cli/compress/fastq.py b/cg/cli/compress/fastq.py index 63030af785..9e676f2811 100644 --- a/cg/cli/compress/fastq.py +++ b/cg/cli/compress/fastq.py @@ -10,7 +10,6 @@ compress_sample_fastqs_in_cases, correct_spring_paths, get_cases_to_process, - is_case_ignored, update_compress_api, ) from cg.constants.constants import DRY_RUN @@ -91,8 +90,6 @@ def clean_fastq(context: CGConfig, case_id: Optional[str], days_back: int, dry_r cleaned_inds = 0 for case in cases: - if is_case_ignored(case_id=case.internal_id): - continue samples: Iterable[str] = store.get_sample_ids_by_case_id(case_id=case.internal_id) for sample_id in samples: was_cleaned: bool = compress_api.clean_fastq(sample_id=sample_id) diff --git a/cg/cli/compress/helpers.py b/cg/cli/compress/helpers.py index b335ed5e59..50cd32a256 100644 --- a/cg/cli/compress/helpers.py +++ b/cg/cli/compress/helpers.py @@ -10,7 +10,6 @@ from cg.apps.housekeeper.hk import HousekeeperAPI from cg.constants.compression import ( - CASES_TO_IGNORE, CRUNCHY_MIN_GB_PER_PROCESS, MAX_READS_PER_GB, ) @@ -31,11 +30,12 @@ def get_cases_to_process( """Return cases to process.""" cases: list[Family] = [] if case_id: - 
case: Family = store.get_case_by_internal_id(internal_id=case_id) + case: Family = store.get_case_by_internal_id(case_id) if not case: LOG.warning(f"Could not find case {case_id}") return - cases.append(case) + if case.is_compressible: + cases.append(case) else: date_threshold: dt.datetime = get_date_days_ago(days_ago=days_back) cases: list[Family] = store.get_cases_to_compress(date_threshold=date_threshold) @@ -53,14 +53,6 @@ def get_fastq_individuals(store: Store, case_id: str = None) -> Iterator[str]: yield link_obj.sample.internal_id -def is_case_ignored(case_id: str) -> bool: - """Check if case should be skipped.""" - if case_id in CASES_TO_IGNORE: - LOG.debug(f"Skipping case: {case_id}") - return True - return False - - def set_memory_according_to_reads( sample_id: str, sample_reads: Optional[int] = None, sample_process_mem: Optional[int] = None ) -> Optional[int]: @@ -150,8 +142,6 @@ def compress_sample_fastqs_in_cases( case_converted = True if case_conversion_count >= number_of_conversions: break - if is_case_ignored(case_id=case.internal_id): - continue LOG.info(f"Searching for FASTQ files in case {case.internal_id}") if not case.links: diff --git a/cg/constants/compression.py b/cg/constants/compression.py index b02a05bced..a2ae9fab45 100644 --- a/cg/constants/compression.py +++ b/cg/constants/compression.py @@ -13,168 +13,3 @@ # Number of days until FASTQs counts as old FASTQ_DELTA = 21 FASTQ_DATETIME_DELTA = datetime.timedelta(days=FASTQ_DELTA) - -PROBLEMATIC_CASES = [ - "causalmite", - "deepcub", - "expertalien", - "fluenteagle", - "grandkoi", - "lovingmayfly", - "loyalegret", - "modernbee", - "proudcollie", - "richalien", - "suremako", - "wisestork", -] - -# List of cases used for validation that we should skip -BALSAMIC_VALIDATION_CASES = [ - "setamoeba", # BALSAMIC validation case tumor-only panel - "sweetelf", # BALSAMIC positive control tumor-only panel - "poeticghoul", # BALSAMIC positive control tumor-only panel - "equalbug", # UMI seracare 
validation case (AF 0.5%) tumor-normal panel - "stableraven", # UMI seracare validation case (AF 1%) tumor-normal panel - "uphippo", # UMI seracare validation case (AF 0.5%) tumor-only panel - "cleanfowl", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation - "proudsquid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation - "modestjaguar", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation - "dearmarmot", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation - "holykid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation - "civilsole", # BALSAMIC validation case tumor-only wgs - "fleetjay", # BALSAMIC validation case tumor-normal wgs - "grandmarmot", # BALSAMIC validation case tumor-normal wgs - "unitedbeagle", # BALSAMIC validation case tumor-normal panel - "rightthrush", # BALSAMIC validation case tumor-only wes - "properpigeon", # BALSAMIC validation case tumor-normal wes - "eagerox", # BALSAMIC validation case from cust087, tumor-only panel - "casualweasel", # BALSAMIC validation case from cust087, tumor-only panel - "acetuna", # BALSAMIC validation case from cust087, tumor-only panel - "suitedsnake", # BALSAMIC validation case from cust087, tumor-only panel - "savinghorse", # BALSAMIC validation case from cust087, tumor-only panel - "rightpup", # BALSAMIC validation case from cust087, tumor-only panel - "sureroughy", # BALSAMIC validation case from cust087, tumor-only panel - "notedshark", # BALSAMIC validation case from cust127, tumor-normal WGS (SV inversion positive control) - "wholewhale", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control) - "largeturtle", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control) - "lightkodiak", # BALSAMIC validation case from cust143, tumor-normal WGS (SV inversion positive control) - "wholecivet", # BALSAMIC validation case from cust110, tumor-normal WGS (SV inversion positive 
control) - "upwardstork", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control) - "suitedgrub", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control) -] - -FLUFFY_VALIDATION_CASES = [ - "simplesalmon", # Chromosome 13, 18, 21 Suspected -] - -MIP_VALIDATION_CASES = [ - "brightcaiman", # DNA rare disease positive control - "casualgannet", # DNA rare disease positive control - "civilkoala", # RNA rare disease positive control - "cleanshrimp", # DNA rare disease positive control - "drivenmolly", # RNA rare disease positive control - "easybeetle", # DNA rare disease positive control - "epicasp", # DNA rare disease positive control - "expertmole", # RNA rare disease positive control - "finequagga", # RNA rare disease positive control - "firstfawn", # DNA rare disease positive control - "gladthrush", # DNA rare disease positive control - "helpedfilly", # DNA rare disease positive control - "hotskink", # DNA rare disease positive control - "inferret", # DNA rare disease positive control - "intentcorgi", # DNA rare disease positive control - "intentmayfly", # DNA rare disease positive control - "justhusky", # DNA rare disease positive control - "kindcaiman", # DNA rare disease positive control - "lightprawn", # DNA rare disease positive control - "livingox", # DNA rare disease positive control - "newaphid", # RNA rare disease positive control - "nextjackal", # DNA rare disease positive control - "modernmule", # DNA rare disease positive control - "moralcattle", # RNA rare disease positive control - "onemite", # DNA rare disease positive control - "opencow", # DNA rare disease positive control - "proudcougar", # DNA rare disease positive control - "rightmacaw", # DNA rare disease positive control - "safeguinea", # DNA rare disease positive control - "sharpparrot", # RNA rare disease positive control - "sharppigeon", # DNA rare disease positive control - "sharpwhale", # DNA rare disease positive control - 
"stillant", # DNA rare disease positive control - "smoothboa", # RNA rare disease positive control - "strongbison", # DNA rare disease positive control - "tenderoriole", # DNA rare disease positive control - "topsrhino", # DNA rare disease positive control - "usablemarten", # DNA rare disease positive control - "vitalmouse", # DNA rare disease positive control -] - -# List of cases used for validation that we should skip -RNAFUSION_VALIDATION_CASES = [ - "ableheron", - "acecoyote", - "ampleray", - "bossmink", - "cuddlyhen", - "daringowl", - "expertboar", - "finerracer", - "growndoe", - "guidedfeline", - "handyturtle", - "hardygrouse", - "holyrodent", - "honestswine", - "inlab", - "justburro", - "movedmule", - "nearbyjoey", - "oncrab", - "politeglider", - "rareosprey", - "rightmoray", - "stablemoray", - "starnewt", - "tendergoose", - "truemole", - "truepigeon", - "valuedfowl", - "vocallocust", - "wantedsawfly", -] - -TAXPROFILER_VALIDATION_CASES: list[str] = [ - "richurchin", -] - -OTHER_VALIDATION_CASES = [ - "bigdrum", - "busycolt", - "daringpony", - "frankhusky", - "gamedeer", - "hotviper", - "keencalf", - "keenviper", - "luckyhog", - "maturejay", # sars-cov-2 case - "meetpossum", - "mintbaboon", - "mintyeti", - "proeagle", - "propercoral", - "pumpedcat", - "strongman", - "truecoyote", -] - -CASES_TO_IGNORE = ( - PROBLEMATIC_CASES - + OTHER_VALIDATION_CASES - + BALSAMIC_VALIDATION_CASES - + FLUFFY_VALIDATION_CASES - + MIP_VALIDATION_CASES - + RNAFUSION_VALIDATION_CASES - + TAXPROFILER_VALIDATION_CASES -) diff --git a/cg/store/api/status.py b/cg/store/api/status.py index 9972abb1ad..0f6856cdcc 100644 --- a/cg/store/api/status.py +++ b/cg/store/api/status.py @@ -229,6 +229,7 @@ def get_cases_to_compress(self, date_threshold: datetime) -> list[Family]: case_filter_functions: list[CaseFilter] = [ CaseFilter.FILTER_HAS_INACTIVE_ANALYSIS, CaseFilter.FILTER_OLD_BY_CREATION_DATE, + CaseFilter.FILTER_IS_COMPRESSIBLE, ] return apply_case_filter( 
filter_functions=case_filter_functions, diff --git a/cg/store/filters/status_case_filters.py b/cg/store/filters/status_case_filters.py index 0aa1f72f0b..5b30fd7034 100644 --- a/cg/store/filters/status_case_filters.py +++ b/cg/store/filters/status_case_filters.py @@ -190,6 +190,11 @@ def filter_running_cases(cases: Query, **kwargs) -> Query: return cases.filter(Family.action == CaseActions.RUNNING) +def filter_compressible_cases(cases: Query, **kwargs) -> Query: + """Filter cases which are compressible.""" + return cases.filter(Family.is_compressible) + + def order_cases_by_created_at(cases: Query, **kwargs) -> Query: """Order cases by created at.""" return cases.order_by(Family.created_at.desc()) @@ -256,6 +261,7 @@ class CaseFilter(Enum): FILTER_HAS_INACTIVE_ANALYSIS: Callable = filter_inactive_analysis_cases FILTER_HAS_SEQUENCE: Callable = filter_cases_has_sequence FILTER_IS_RUNNING: Callable = filter_running_cases + FILTER_IS_COMPRESSIBLE: Callable = filter_compressible_cases FILTER_NEW_BY_ORDER_DATE: Callable = filter_newer_cases_by_order_date FILTER_NOT_ANALYSED: Callable = filter_cases_not_analysed FILTER_OLD_BY_CREATION_DATE: Callable = filter_older_cases_by_creation_date diff --git a/cg/store/models.py b/cg/store/models.py index 2ccfea831e..fba7931657 100644 --- a/cg/store/models.py +++ b/cg/store/models.py @@ -390,6 +390,7 @@ class Family(Model, PriorityMixin): data_delivery = Column(types.Enum(*list(DataDelivery))) id = Column(types.Integer, primary_key=True) internal_id = Column(types.String(32), unique=True, nullable=False) + is_compressible = Column(types.Boolean, nullable=False, default=True) name = Column(types.String(128), nullable=False) ordered_at = Column(types.DateTime, default=dt.datetime.now) _panels = Column(types.Text) diff --git a/tests/cli/compress/test_cli_compress_fastq.py b/tests/cli/compress/test_cli_compress_fastq.py index 7c45eef877..e7fbd067fa 100644 --- a/tests/cli/compress/test_cli_compress_fastq.py +++ 
b/tests/cli/compress/test_cli_compress_fastq.py @@ -71,6 +71,26 @@ def test_get_cases_to_process_when_no_case( assert f"Could not find case {case_id_does_not_exist}" in caplog.text +def test_incompressible_cases_are_not_processable( + helpers: StoreHelpers, + populated_compress_context: CGConfig, +): + """Test that cases that are marked as incompressible are not processable.""" + + # GIVEN a store with a case that is marked as incompressible + status_db: Store = populated_compress_context.status_db + + incompressible_case: Family = helpers.add_case(store=status_db, internal_id="incompressible") + incompressible_case.created_at = dt.datetime.now() - dt.timedelta(days=1000) + incompressible_case.is_compressible = False + + # WHEN retrieving the processable cases + processable_cases: list[Family] = get_cases_to_process(days_back=1, store=status_db) + + # THEN assert that the incompressible case is not processable + assert incompressible_case not in processable_cases + + def test_compress_fastq_cli_no_family(compress_context: CGConfig, cli_runner: CliRunner, caplog): """Test to run the compress command with a database without samples,""" caplog.set_level(logging.DEBUG)