Skip to content

Commit

Permalink
Replace hardcoded validation cases (#2645)(patch)
Browse files Browse the repository at this point in the history
Replace hardcoded validation case ids with a field on the Family model in status db
  • Loading branch information
seallard authored Oct 31, 2023
1 parent 83ecd4f commit e9b0108
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 181 deletions.
26 changes: 26 additions & 0 deletions alembic/versions/2023_10_31_fce8a2ca0fd1_add_is_compressible.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Add is_compressible
Revision ID: fce8a2ca0fd1
Revises: 9073c61bc72b
Create Date: 2023-10-31 10:18:11.450637
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "fce8a2ca0fd1"
down_revision = "9073c61bc72b"
branch_labels = None
depends_on = None


def upgrade():
    """Add the ``is_compressible`` column to the ``family`` table."""
    # Non-nullable boolean with a server-side default of 1.
    is_compressible = sa.Column(
        "is_compressible",
        sa.Boolean(),
        nullable=False,
        server_default=sa.text("1"),
    )
    op.add_column("family", is_compressible)


def downgrade():
    """Drop the ``is_compressible`` column from the ``family`` table."""
    table_name, column_name = "family", "is_compressible"
    op.drop_column(table_name, column_name)
3 changes: 0 additions & 3 deletions cg/cli/compress/fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
compress_sample_fastqs_in_cases,
correct_spring_paths,
get_cases_to_process,
is_case_ignored,
update_compress_api,
)
from cg.constants.constants import DRY_RUN
Expand Down Expand Up @@ -91,8 +90,6 @@ def clean_fastq(context: CGConfig, case_id: Optional[str], days_back: int, dry_r

cleaned_inds = 0
for case in cases:
if is_case_ignored(case_id=case.internal_id):
continue
samples: Iterable[str] = store.get_sample_ids_by_case_id(case_id=case.internal_id)
for sample_id in samples:
was_cleaned: bool = compress_api.clean_fastq(sample_id=sample_id)
Expand Down
16 changes: 3 additions & 13 deletions cg/cli/compress/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.constants.compression import (
CASES_TO_IGNORE,
CRUNCHY_MIN_GB_PER_PROCESS,
MAX_READS_PER_GB,
)
Expand All @@ -31,11 +30,12 @@ def get_cases_to_process(
"""Return cases to process."""
cases: list[Family] = []
if case_id:
case: Family = store.get_case_by_internal_id(internal_id=case_id)
case: Family = store.get_case_by_internal_id(case_id)
if not case:
LOG.warning(f"Could not find case {case_id}")
return
cases.append(case)
if case.is_compressible:
cases.append(case)
else:
date_threshold: dt.datetime = get_date_days_ago(days_ago=days_back)
cases: list[Family] = store.get_cases_to_compress(date_threshold=date_threshold)
Expand All @@ -53,14 +53,6 @@ def get_fastq_individuals(store: Store, case_id: str = None) -> Iterator[str]:
yield link_obj.sample.internal_id


def is_case_ignored(case_id: str) -> bool:
    """Return True when the case id is listed in CASES_TO_IGNORE.

    A debug message is logged for each case that is skipped.
    """
    if case_id not in CASES_TO_IGNORE:
        return False
    LOG.debug(f"Skipping case: {case_id}")
    return True


def set_memory_according_to_reads(
sample_id: str, sample_reads: Optional[int] = None, sample_process_mem: Optional[int] = None
) -> Optional[int]:
Expand Down Expand Up @@ -150,8 +142,6 @@ def compress_sample_fastqs_in_cases(
case_converted = True
if case_conversion_count >= number_of_conversions:
break
if is_case_ignored(case_id=case.internal_id):
continue

LOG.info(f"Searching for FASTQ files in case {case.internal_id}")
if not case.links:
Expand Down
165 changes: 0 additions & 165 deletions cg/constants/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,168 +13,3 @@
# Number of days until FASTQs counts as old
FASTQ_DELTA = 21
FASTQ_DATETIME_DELTA = datetime.timedelta(days=FASTQ_DELTA)

PROBLEMATIC_CASES = [
"causalmite",
"deepcub",
"expertalien",
"fluenteagle",
"grandkoi",
"lovingmayfly",
"loyalegret",
"modernbee",
"proudcollie",
"richalien",
"suremako",
"wisestork",
]

# List of cases used for validation that we should skip
BALSAMIC_VALIDATION_CASES = [
"setamoeba", # BALSAMIC validation case tumor-only panel
"sweetelf", # BALSAMIC positive control tumor-only panel
"poeticghoul", # BALSAMIC positive control tumor-only panel
"equalbug", # UMI seracare validation case (AF 0.5%) tumor-normal panel
"stableraven", # UMI seracare validation case (AF 1%) tumor-normal panel
"uphippo", # UMI seracare validation case (AF 0.5%) tumor-only panel
"cleanfowl", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"proudsquid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"modestjaguar", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"dearmarmot", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"holykid", # BALSAMIC validation case, HD829 reference for FLT3 Ascertation
"civilsole", # BALSAMIC validation case tumor-only wgs
"fleetjay", # BALSAMIC validation case tumor-normal wgs
"grandmarmot", # BALSAMIC validation case tumor-normal wgs
"unitedbeagle", # BALSAMIC validation case tumor-normal panel
"rightthrush", # BALSAMIC validation case tumor-only wes
"properpigeon", # BALSAMIC validation case tumor-normal wes
"eagerox", # BALSAMIC validation case from cust087, tumor-only panel
"casualweasel", # BALSAMIC validation case from cust087, tumor-only panel
"acetuna", # BALSAMIC validation case from cust087, tumor-only panel
"suitedsnake", # BALSAMIC validation case from cust087, tumor-only panel
"savinghorse", # BALSAMIC validation case from cust087, tumor-only panel
"rightpup", # BALSAMIC validation case from cust087, tumor-only panel
"sureroughy", # BALSAMIC validation case from cust087, tumor-only panel
"notedshark", # BALSAMIC validation case from cust127, tumor-normal WGS (SV inversion positive control)
"wholewhale", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control)
"largeturtle", # BALSAMIC validation case from cust143, tumor-normal WGS (SV translocation positive control)
"lightkodiak", # BALSAMIC validation case from cust143, tumor-normal WGS (SV inversion positive control)
"wholecivet", # BALSAMIC validation case from cust110, tumor-normal WGS (SV inversion positive control)
"upwardstork", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control)
"suitedgrub", # BALSAMIC validation case from cust110, tumor-normal WGS (SV deletion positive control)
]

FLUFFY_VALIDATION_CASES = [
"simplesalmon", # Chromosome 13, 18, 21 Suspected
]

MIP_VALIDATION_CASES = [
"brightcaiman", # DNA rare disease positive control
"casualgannet", # DNA rare disease positive control
"civilkoala", # RNA rare disease positive control
"cleanshrimp", # DNA rare disease positive control
"drivenmolly", # RNA rare disease positive control
"easybeetle", # DNA rare disease positive control
"epicasp", # DNA rare disease positive control
"expertmole", # RNA rare disease positive control
"finequagga", # RNA rare disease positive control
"firstfawn", # DNA rare disease positive control
"gladthrush", # DNA rare disease positive control
"helpedfilly", # DNA rare disease positive control
"hotskink", # DNA rare disease positive control
"inferret", # DNA rare disease positive control
"intentcorgi", # DNA rare disease positive control
"intentmayfly", # DNA rare disease positive control
"justhusky", # DNA rare disease positive control
"kindcaiman", # DNA rare disease positive control
"lightprawn", # DNA rare disease positive control
"livingox", # DNA rare disease positive control
"newaphid", # RNA rare disease positive control
"nextjackal", # DNA rare disease positive control
"modernmule", # DNA rare disease positive control
"moralcattle", # RNA rare disease positive control
"onemite", # DNA rare disease positive control
"opencow", # DNA rare disease positive control
"proudcougar", # DNA rare disease positive control
"rightmacaw", # DNA rare disease positive control
"safeguinea", # DNA rare disease positive control
"sharpparrot", # RNA rare disease positive control
"sharppigeon", # DNA rare disease positive control
"sharpwhale", # DNA rare disease positive control
"stillant", # DNA rare disease positive control
"smoothboa", # RNA rare disease positive control
"strongbison", # DNA rare disease positive control
"tenderoriole", # DNA rare disease positive control
"topsrhino", # DNA rare disease positive control
"usablemarten", # DNA rare disease positive control
"vitalmouse", # DNA rare disease positive control
]

# List of cases used for validation that we should skip
RNAFUSION_VALIDATION_CASES = [
"ableheron",
"acecoyote",
"ampleray",
"bossmink",
"cuddlyhen",
"daringowl",
"expertboar",
"finerracer",
"growndoe",
"guidedfeline",
"handyturtle",
"hardygrouse",
"holyrodent",
"honestswine",
"inlab",
"justburro",
"movedmule",
"nearbyjoey",
"oncrab",
"politeglider",
"rareosprey",
"rightmoray",
"stablemoray",
"starnewt",
"tendergoose",
"truemole",
"truepigeon",
"valuedfowl",
"vocallocust",
"wantedsawfly",
]

TAXPROFILER_VALIDATION_CASES: list[str] = [
"richurchin",
]

OTHER_VALIDATION_CASES = [
"bigdrum",
"busycolt",
"daringpony",
"frankhusky",
"gamedeer",
"hotviper",
"keencalf",
"keenviper",
"luckyhog",
"maturejay", # sars-cov-2 case
"meetpossum",
"mintbaboon",
"mintyeti",
"proeagle",
"propercoral",
"pumpedcat",
"strongman",
"truecoyote",
]

# All case ids to skip: the problematic cases followed by the validation
# cases of each group, concatenated into a single list in this order.
CASES_TO_IGNORE = [
    *PROBLEMATIC_CASES,
    *OTHER_VALIDATION_CASES,
    *BALSAMIC_VALIDATION_CASES,
    *FLUFFY_VALIDATION_CASES,
    *MIP_VALIDATION_CASES,
    *RNAFUSION_VALIDATION_CASES,
    *TAXPROFILER_VALIDATION_CASES,
]
1 change: 1 addition & 0 deletions cg/store/api/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ def get_cases_to_compress(self, date_threshold: datetime) -> list[Family]:
case_filter_functions: list[CaseFilter] = [
CaseFilter.FILTER_HAS_INACTIVE_ANALYSIS,
CaseFilter.FILTER_OLD_BY_CREATION_DATE,
CaseFilter.FILTER_IS_COMPRESSIBLE,
]
return apply_case_filter(
filter_functions=case_filter_functions,
Expand Down
6 changes: 6 additions & 0 deletions cg/store/filters/status_case_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,11 @@ def filter_running_cases(cases: Query, **kwargs) -> Query:
return cases.filter(Family.action == CaseActions.RUNNING)


def filter_compressible_cases(cases: Query, **kwargs) -> Query:
    """Filter cases which are marked as compressible.

    Args:
        cases: The case query to narrow down.
        **kwargs: Unused here; accepted so the signature matches the other
            case filter functions.

    Returns:
        The query restricted to cases where ``Family.is_compressible`` is true.
    """
    return cases.filter(Family.is_compressible)


def order_cases_by_created_at(cases: Query, **kwargs) -> Query:
    """Return the cases sorted on creation date in descending order."""
    newest_first = Family.created_at.desc()
    return cases.order_by(newest_first)
Expand Down Expand Up @@ -256,6 +261,7 @@ class CaseFilter(Enum):
FILTER_HAS_INACTIVE_ANALYSIS: Callable = filter_inactive_analysis_cases
FILTER_HAS_SEQUENCE: Callable = filter_cases_has_sequence
FILTER_IS_RUNNING: Callable = filter_running_cases
FILTER_IS_COMPRESSIBLE: Callable = filter_compressible_cases
FILTER_NEW_BY_ORDER_DATE: Callable = filter_newer_cases_by_order_date
FILTER_NOT_ANALYSED: Callable = filter_cases_not_analysed
FILTER_OLD_BY_CREATION_DATE: Callable = filter_older_cases_by_creation_date
Expand Down
1 change: 1 addition & 0 deletions cg/store/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ class Family(Model, PriorityMixin):
data_delivery = Column(types.Enum(*list(DataDelivery)))
id = Column(types.Integer, primary_key=True)
internal_id = Column(types.String(32), unique=True, nullable=False)
is_compressible = Column(types.Boolean, nullable=False, default=True)
name = Column(types.String(128), nullable=False)
ordered_at = Column(types.DateTime, default=dt.datetime.now)
_panels = Column(types.Text)
Expand Down
20 changes: 20 additions & 0 deletions tests/cli/compress/test_cli_compress_fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,26 @@ def test_get_cases_to_process_when_no_case(
assert f"Could not find case {case_id_does_not_exist}" in caplog.text


def test_incompressible_cases_are_not_processable(
    helpers: StoreHelpers,
    populated_compress_context: CGConfig,
):
    """Test that a case flagged as not compressible is left out of the cases to process."""

    # GIVEN a store holding an old case that is flagged as not compressible
    store: Store = populated_compress_context.status_db
    case: Family = helpers.add_case(store=store, internal_id="incompressible")
    case.created_at = dt.datetime.now() - dt.timedelta(days=1000)
    case.is_compressible = False

    # WHEN retrieving the cases to process
    # NOTE(review): the case could also be excluded by the other filters applied
    # when fetching cases to compress - confirm it would be returned if it were
    # compressible, otherwise this assertion passes for the wrong reason.
    cases_to_process: list[Family] = get_cases_to_process(days_back=1, store=store)

    # THEN the incompressible case is not among them
    assert case not in cases_to_process


def test_compress_fastq_cli_no_family(compress_context: CGConfig, cli_runner: CliRunner, caplog):
"""Test to run the compress command with a database without samples,"""
caplog.set_level(logging.DEBUG)
Expand Down

0 comments on commit e9b0108

Please sign in to comment.