Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace hardcoded validation cases #2645

Merged
merged 9 commits into from
Oct 31, 2023
26 changes: 26 additions & 0 deletions alembic/versions/2023_10_31_fce8a2ca0fd1_add_is_compressible.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Add is_compressible

Revision ID: fce8a2ca0fd1
Revises: 9073c61bc72b
Create Date: 2023-10-31 10:18:11.450637

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it revises; both match the "Revision ID" / "Revises" lines in the
# module docstring above.
revision = "fce8a2ca0fd1"
down_revision = "9073c61bc72b"
branch_labels = None
depends_on = None


def upgrade():
    """Add the non-nullable boolean column ``is_compressible`` to the ``family`` table."""
    # The column is declared NOT NULL with a server-side default of the SQL literal 1.
    is_compressible_column = sa.Column(
        "is_compressible",
        sa.Boolean(),
        nullable=False,
        server_default=sa.text("1"),
    )
    op.add_column("family", is_compressible_column)


def downgrade():
    """Undo the upgrade by dropping ``is_compressible`` from the ``family`` table."""
    table_name, column_name = "family", "is_compressible"
    op.drop_column(table_name, column_name)
3 changes: 0 additions & 3 deletions cg/cli/compress/fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
compress_sample_fastqs_in_cases,
correct_spring_paths,
get_cases_to_process,
is_case_ignored,
update_compress_api,
)
from cg.constants.constants import DRY_RUN
Expand Down Expand Up @@ -91,8 +90,6 @@ def clean_fastq(context: CGConfig, case_id: Optional[str], days_back: int, dry_r

cleaned_inds = 0
for case in cases:
if is_case_ignored(case_id=case.internal_id):
continue
samples: Iterable[str] = store.get_sample_ids_by_case_id(case_id=case.internal_id)
for sample_id in samples:
was_cleaned: bool = compress_api.clean_fastq(sample_id=sample_id)
Expand Down
16 changes: 3 additions & 13 deletions cg/cli/compress/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.constants.compression import (
CASES_TO_IGNORE,
CRUNCHY_MIN_GB_PER_PROCESS,
MAX_READS_PER_GB,
)
Expand All @@ -31,11 +30,12 @@ def get_cases_to_process(
"""Return cases to process."""
cases: list[Family] = []
if case_id:
case: Family = store.get_case_by_internal_id(internal_id=case_id)
case: Family = store.get_case_by_internal_id(case_id)
if not case:
LOG.warning(f"Could not find case {case_id}")
return
cases.append(case)
if case.is_compressible:
cases.append(case)
else:
date_threshold: dt.datetime = get_date_days_ago(days_ago=days_back)
cases: list[Family] = store.get_cases_to_compress(date_threshold=date_threshold)
Expand All @@ -53,14 +53,6 @@ def get_fastq_individuals(store: Store, case_id: str = None) -> Iterator[str]:
yield link_obj.sample.internal_id


def is_case_ignored(case_id: str) -> bool:
    """Return True when the case id is listed in CASES_TO_IGNORE, logging the skip."""
    ignored: bool = case_id in CASES_TO_IGNORE
    if ignored:
        # Only ignored cases are logged; other cases pass through silently.
        LOG.debug(f"Skipping case: {case_id}")
    return ignored


def set_memory_according_to_reads(
sample_id: str, sample_reads: Optional[int] = None, sample_process_mem: Optional[int] = None
) -> Optional[int]:
Expand Down Expand Up @@ -150,8 +142,6 @@ def compress_sample_fastqs_in_cases(
case_converted = True
if case_conversion_count >= number_of_conversions:
break
if is_case_ignored(case_id=case.internal_id):
continue

LOG.info(f"Searching for FASTQ files in case {case.internal_id}")
if not case.links:
Expand Down
165 changes: 0 additions & 165 deletions cg/constants/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,168 +13,3 @@
# Number of days until FASTQs counts as old
FASTQ_DELTA = 21
FASTQ_DATETIME_DELTA = datetime.timedelta(days=FASTQ_DELTA)

# Case ids flagged as problematic. The list is concatenated into CASES_TO_IGNORE
# at the bottom of this module, so these cases are skipped by is_case_ignored.
PROBLEMATIC_CASES = [
"causalmite",
"deepcub",
"expertalien",
"fluenteagle",
"grandkoi",
"lovingmayfly",
"loyalegret",
"modernbee",
"proudcollie",
"richalien",
"suremako",
"wisestork",
]

# BALSAMIC validation and positive-control cases that should be skipped.
# Each entry is a case id; the trailing comment records what the case is used for.
# The list is concatenated into CASES_TO_IGNORE at the bottom of this module.
BALSAMIC_VALIDATION_CASES = [
"setamoeba", # BALSAMIC validation case tumor-only panel
"sweetelf", # BALSAMIC positive control tumor-only panel
"poeticghoul", # BALSAMIC positive control tumor-only panel
"equalbug", # UMI seracare validation case (AF 0.5%) tumor-normal panel
"stableraven", # UMI seracare validation case (AF 1%) tumor-normal panel
"uphippo", # UMI seracare validation case (AF 0.5%) tumor-only panel
"cleanfowl", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"proudsquid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"modestjaguar", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"dearmarmot", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"holykid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"civilsole", # BALSAMIC validation case tumor-only wgs
"fleetjay", # BALSAMIC validation case tumor-normal wgs
"grandmarmot", # BALSAMIC validation case tumor-normal wgs
"unitedbeagle", # BALSAMIC validation case tumor-normal panel
"rightthrush", # BALSAMIC validation case tumor-only wes
"properpigeon", # BALSAMIC validation case tumor-normal wes
"eagerox", # BALSAMIC validation case from cust087, tumor-only panel
"casualweasel", # BALSAMIC validation case from cust087, tumor-only panel
"acetuna", # BALSAMIC validation case from cust087, tumor-only panel
"suitedsnake", # BALSAMIC validation case from cust087, tumor-only panel
"savinghorse", # BALSAMIC validation case from cust087, tumor-only panel
"rightpup", # BALSAMIC validation case from cust087, tumor-only panel
"sureroughy", # BALSAMIC validation case from cust087, tumor-only panel
"notedshark", # BALSAMIC validation case from cust127, tumor-normal WGS (SV inversion positive control)
"wholewhale", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control)
"largeturtle", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control)
"lightkodiak", # BALSAMIC validation case from cust143, tumor-normal WGS (SV inversion positive control)
"wholecivet", # BALSAMIC validation case from cust110, tumor-normal WGS (SV inversion positive control)
"upwardstork", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control)
"suitedgrub", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control)
]

# Fluffy validation cases; concatenated into CASES_TO_IGNORE at the bottom of this module.
FLUFFY_VALIDATION_CASES = [
"simplesalmon", # Chromosome 13, 18, 21 Suspected
]

# MIP validation cases: DNA and RNA rare disease positive controls, as noted per entry.
# The list is concatenated into CASES_TO_IGNORE at the bottom of this module.
MIP_VALIDATION_CASES = [
"brightcaiman", # DNA rare disease positive control
"casualgannet", # DNA rare disease positive control
"civilkoala", # RNA rare disease positive control
"cleanshrimp", # DNA rare disease positive control
"drivenmolly", # RNA rare disease positive control
"easybeetle", # DNA rare disease positive control
"epicasp", # DNA rare disease positive control
"expertmole", # RNA rare disease positive control
"finequagga", # RNA rare disease positive control
"firstfawn", # DNA rare disease positive control
"gladthrush", # DNA rare disease positive control
"helpedfilly", # DNA rare disease positive control
"hotskink", # DNA rare disease positive control
"inferret", # DNA rare disease positive control
"intentcorgi", # DNA rare disease positive control
"intentmayfly", # DNA rare disease positive control
"justhusky", # DNA rare disease positive control
"kindcaiman", # DNA rare disease positive control
"lightprawn", # DNA rare disease positive control
"livingox", # DNA rare disease positive control
"newaphid", # RNA rare disease positive control
"nextjackal", # DNA rare disease positive control
"modernmule", # DNA rare disease positive control
"moralcattle", # RNA rare disease positive control
"onemite", # DNA rare disease positive control
"opencow", # DNA rare disease positive control
"proudcougar", # DNA rare disease positive control
"rightmacaw", # DNA rare disease positive control
"safeguinea", # DNA rare disease positive control
"sharpparrot", # RNA rare disease positive control
"sharppigeon", # DNA rare disease positive control
"sharpwhale", # DNA rare disease positive control
"stillant", # DNA rare disease positive control
"smoothboa", # RNA rare disease positive control
"strongbison", # DNA rare disease positive control
"tenderoriole", # DNA rare disease positive control
"topsrhino", # DNA rare disease positive control
"usablemarten", # DNA rare disease positive control
"vitalmouse", # DNA rare disease positive control
]

# RNAFUSION validation cases that should be skipped.
# The list is concatenated into CASES_TO_IGNORE at the bottom of this module.
RNAFUSION_VALIDATION_CASES = [
"ableheron",
"acecoyote",
"ampleray",
"bossmink",
"cuddlyhen",
"daringowl",
"expertboar",
"finerracer",
"growndoe",
"guidedfeline",
"handyturtle",
"hardygrouse",
"holyrodent",
"honestswine",
"inlab",
"justburro",
"movedmule",
"nearbyjoey",
"oncrab",
"politeglider",
"rareosprey",
"rightmoray",
"stablemoray",
"starnewt",
"tendergoose",
"truemole",
"truepigeon",
"valuedfowl",
"vocallocust",
"wantedsawfly",
]

# Taxprofiler validation cases; concatenated into CASES_TO_IGNORE at the bottom of this module.
TAXPROFILER_VALIDATION_CASES: list[str] = [
"richurchin",
]

# Validation cases that are not grouped under one of the workflow-specific lists.
# The list is concatenated into CASES_TO_IGNORE at the bottom of this module.
OTHER_VALIDATION_CASES = [
"bigdrum",
"busycolt",
"daringpony",
"frankhusky",
"gamedeer",
"hotviper",
"keencalf",
"keenviper",
"luckyhog",
"maturejay", # sars-cov-2 case
"meetpossum",
"mintbaboon",
"mintyeti",
"proeagle",
"propercoral",
"pumpedcat",
"strongman",
"truecoyote",
]

# Every case id that should be skipped: the problematic cases plus all
# validation-case lists, joined by list concatenation into one list.
# is_case_ignored tests membership with `case_id in CASES_TO_IGNORE`.
CASES_TO_IGNORE = (
PROBLEMATIC_CASES
+ OTHER_VALIDATION_CASES
+ BALSAMIC_VALIDATION_CASES
+ FLUFFY_VALIDATION_CASES
+ MIP_VALIDATION_CASES
+ RNAFUSION_VALIDATION_CASES
+ TAXPROFILER_VALIDATION_CASES
)
1 change: 1 addition & 0 deletions cg/store/api/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ def get_cases_to_compress(self, date_threshold: datetime) -> list[Family]:
case_filter_functions: list[CaseFilter] = [
CaseFilter.FILTER_HAS_INACTIVE_ANALYSIS,
CaseFilter.FILTER_OLD_BY_CREATION_DATE,
CaseFilter.FILTER_IS_COMPRESSIBLE,
]
return apply_case_filter(
filter_functions=case_filter_functions,
Expand Down
6 changes: 6 additions & 0 deletions cg/store/filters/status_case_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,11 @@ def filter_running_cases(cases: Query, **kwargs) -> Query:
return cases.filter(Family.action == CaseActions.RUNNING)


def filter_compressible_cases(cases: Query, **kwargs) -> Query:
    """Filter cases which are marked as compressible.

    Keeps only the cases whose ``Family.is_compressible`` column is true.
    Extra keyword arguments are accepted but not used by this filter.
    """
    return cases.filter(Family.is_compressible)


def order_cases_by_created_at(cases: Query, **kwargs) -> Query:
    """Sort the cases on creation date in descending order."""
    descending_created_at = Family.created_at.desc()
    return cases.order_by(descending_created_at)
Expand Down Expand Up @@ -256,6 +261,7 @@ class CaseFilter(Enum):
FILTER_HAS_INACTIVE_ANALYSIS: Callable = filter_inactive_analysis_cases
FILTER_HAS_SEQUENCE: Callable = filter_cases_has_sequence
FILTER_IS_RUNNING: Callable = filter_running_cases
FILTER_IS_COMPRESSIBLE: Callable = filter_compressible_cases
FILTER_NEW_BY_ORDER_DATE: Callable = filter_newer_cases_by_order_date
FILTER_NOT_ANALYSED: Callable = filter_cases_not_analysed
FILTER_OLD_BY_CREATION_DATE: Callable = filter_older_cases_by_creation_date
Expand Down
1 change: 1 addition & 0 deletions cg/store/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ class Family(Model, PriorityMixin):
data_delivery = Column(types.Enum(*list(DataDelivery)))
id = Column(types.Integer, primary_key=True)
internal_id = Column(types.String(32), unique=True, nullable=False)
is_compressible = Column(types.Boolean, nullable=False, default=True)
name = Column(types.String(128), nullable=False)
ordered_at = Column(types.DateTime, default=dt.datetime.now)
_panels = Column(types.Text)
Expand Down
20 changes: 20 additions & 0 deletions tests/cli/compress/test_cli_compress_fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,26 @@ def test_get_cases_to_process_when_no_case(
assert f"Could not find case {case_id_does_not_exist}" in caplog.text


def test_incompressible_cases_are_not_processable(
    helpers: StoreHelpers,
    populated_compress_context: CGConfig,
):
    """Test that a case flagged as not compressible is left out of the cases to process."""

    # GIVEN a store holding a case created 1000 days ago that is flagged as not compressible
    store: Store = populated_compress_context.status_db
    case: Family = helpers.add_case(store=store, internal_id="incompressible")
    case.created_at = dt.datetime.now() - dt.timedelta(days=1000)
    case.is_compressible = False

    # WHEN fetching the cases to process
    cases_to_process: list[Family] = get_cases_to_process(days_back=1, store=store)

    # THEN the incompressible case is not among them
    assert case not in cases_to_process


def test_compress_fastq_cli_no_family(compress_context: CGConfig, cli_runner: CliRunner, caplog):
"""Test to run the compress command with a database without samples,"""
caplog.set_level(logging.DEBUG)
Expand Down