Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add automatic quality control of analyses for microbial samples #2754

Merged
merged 69 commits into from
Jan 2, 2024
Merged
Show file tree
Hide file tree
Changes from 60 commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
3abe921
Add microsalt qc checks
seallard Dec 7, 2023
c3f669e
Extract qc logic to separate module
seallard Dec 8, 2023
7da4955
Merge branch 'master' into add-microsalt-qc
seallard Dec 8, 2023
0d79d1e
Fix imports
seallard Dec 11, 2023
139c888
Merge branch 'master' into add-microsalt-qc
seallard Dec 11, 2023
1a4d661
Formatting
seallard Dec 11, 2023
da79782
Remove failing test
seallard Dec 11, 2023
2f34f6d
Add sample total reads check
seallard Dec 11, 2023
79feb5c
Fix tests
seallard Dec 11, 2023
09ddae9
Quality control for total reads given sample id
seallard Dec 11, 2023
cca1d73
Validate negative control total reads
seallard Dec 11, 2023
09830e6
Add models for parsing quality metrics
seallard Dec 11, 2023
0c3aa9b
Validate mapped rate
seallard Dec 11, 2023
418837d
Add validation of duplication rate
seallard Dec 12, 2023
4cabeca
Validate median insert size
seallard Dec 12, 2023
72b3f99
Validate average coverage
seallard Dec 12, 2023
71634c0
Validate 10x coverage
seallard Dec 12, 2023
0227515
Formatting
seallard Dec 12, 2023
48e247b
Validate that negative control sample for cases passes control
seallard Dec 12, 2023
ec60b96
Fix quality result model
seallard Dec 12, 2023
ea264ca
Validate that all urgent samples pass quality control
seallard Dec 12, 2023
5da77a4
Validate that most non urgent samples pass qc
seallard Dec 12, 2023
d33a84c
Handle div by zero
seallard Dec 12, 2023
738fa8b
Remove deprecated code
seallard Dec 12, 2023
dbd596f
Fix parsing of metrics
seallard Dec 12, 2023
c118eef
Pass metrics file path as single parameter to qc
seallard Dec 12, 2023
296b12f
Handle possible null values from metrics
seallard Dec 12, 2023
603c3a5
Restructure methods
seallard Dec 12, 2023
b1750d0
Restructure microsalt module
seallard Dec 13, 2023
151b157
Generate quality report
seallard Dec 13, 2023
e8c5a4b
Fix naming
seallard Dec 13, 2023
45869be
Cleaning
seallard Dec 13, 2023
d6372b9
Test report generator
seallard Dec 13, 2023
22078c8
Add tests for utils
seallard Dec 13, 2023
475d0fb
Add tests for utils
seallard Dec 13, 2023
84fde0b
Add tests for utils
seallard Dec 13, 2023
ff10f87
Add tests for utils
seallard Dec 13, 2023
f8ff37e
Add tests for quality controller
seallard Dec 13, 2023
5464dc5
Remove old tests
seallard Dec 13, 2023
1a1e320
Add integration tests
seallard Dec 14, 2023
4a3eb85
Cleanup
seallard Dec 14, 2023
3e4a44b
Add integration test
seallard Dec 14, 2023
39894b4
Add logging
seallard Dec 14, 2023
899af79
Add new lines
seallard Dec 14, 2023
2289485
Fix comments
seallard Dec 14, 2023
85966cb
Extract method
seallard Dec 14, 2023
fd5c4bf
Extract method
seallard Dec 15, 2023
b08ec67
Fix comments
seallard Dec 18, 2023
20f94a6
Improve report and logging
seallard Dec 18, 2023
24ad13c
Merge branch 'master' into add-microsalt-qc
seallard Dec 18, 2023
408e5b6
Formatting
seallard Dec 18, 2023
f52c801
Add logging
seallard Dec 18, 2023
f485804
Improve messages
seallard Dec 18, 2023
a6df7b8
Improve logging
seallard Dec 18, 2023
f946950
Simplify run dir retrieval
seallard Dec 18, 2023
5ad401a
Fix test
seallard Dec 19, 2023
d62abd9
Add log messages
seallard Dec 19, 2023
6f0a0ab
Add logging
seallard Dec 19, 2023
f152d2d
Improve logging
seallard Dec 19, 2023
bc1b6cb
Fix exc
seallard Dec 19, 2023
fde8536
Fix comments
seallard Dec 22, 2023
9900e26
Handle missing metrics file
seallard Dec 22, 2023
517fe95
Handle missing negative sample
seallard Dec 22, 2023
8ab1cbe
Use application guaranteed reads
seallard Dec 22, 2023
0538866
Fix tests
seallard Dec 22, 2023
053283f
Fix name for microsalt qc command
seallard Dec 22, 2023
4f840d1
Add summary to report
seallard Jan 2, 2024
d463810
Add summary to trailblazer comment
seallard Jan 2, 2024
6a4fbda
Merge branch 'master' into add-microsalt-qc
seallard Jan 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 4 additions & 10 deletions cg/cli/workflow/microsalt/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from cg.meta.workflow.analysis import AnalysisAPI
from cg.meta.workflow.microsalt import MicrosaltAnalysisAPI
from cg.models.cg_config import CGConfig
from cg.store.models import Sample
from cg.store.models import Case, Sample

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -224,15 +224,9 @@ def start_available(context: click.Context, dry_run: bool = False):
def qc_microsalt(context: click.Context, unique_id: str) -> None:
"""Perform QC on a microsalt case."""
analysis_api: MicrosaltAnalysisAPI = context.obj.meta_apis["analysis_api"]
metrics_file_path: Path = analysis_api.get_metrics_file_path(unique_id)
try:
analysis_api.microsalt_qc(
case_id=unique_id,
run_dir_path=analysis_api.get_latest_case_path(case_id=unique_id),
lims_project=analysis_api.get_project(
analysis_api.status_db.get_case_by_internal_id(internal_id=unique_id)
.samples[0]
.internal_id
),
)
LOG.info(f"Performing QC on case {unique_id}")
analysis_api.quality_checker.quality_control(metrics_file_path)
except IndexError:
LOG.error(f"No existing analysis directories found for case {unique_id}.")
7 changes: 6 additions & 1 deletion cg/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,15 @@ class APIMethods(StrEnum):


class MicrosaltQC:
QC_PERCENT_THRESHOLD_MWX: float = 0.1
AVERAGE_COVERAGE_THRESHOLD: int = 10
QC_PERCENT_THRESHOLD_MWX: float = 0.9
seallard marked this conversation as resolved.
Show resolved Hide resolved
COVERAGE_10X_THRESHOLD: float = 0.75
DUPLICATION_RATE_THRESHOLD: float = 0.8
INSERT_SIZE_THRESHOLD: int = 100
MAPPED_RATE_THRESHOLD: float = 0.3
NEGATIVE_CONTROL_READS_THRESHOLD: float = 0.2
TARGET_READS: int = 6000000
TARGET_READS_FAIL_THRESHOLD: float = 0.7


class MicrosaltAppTags(StrEnum):
Expand Down
12 changes: 12 additions & 0 deletions cg/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,18 @@ class LimsDataError(CgError):
"""


class MicrosaltError(CgError):
"""
Error related to Microsalt analysis.
"""


class MissingAnalysisDir(CgError):
"""
Error related to missing analysis.
"""


class OrderError(CgError):
"""
Exception related to orders.
Expand Down
1 change: 1 addition & 0 deletions cg/meta/workflow/microsalt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from cg.meta.workflow.microsalt.microsalt import MicrosaltAnalysisAPI
4 changes: 4 additions & 0 deletions cg/meta/workflow/microsalt/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from cg.constants.constants import FileExtensions


QUALITY_REPORT_FILE_NAME: str = f"QC_done{FileExtensions.JSON}"
seallard marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions cg/meta/workflow/microsalt/metrics_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from cg.meta.workflow.microsalt.metrics_parser.metrics_parser import MetricsParser
from cg.meta.workflow.microsalt.metrics_parser.models import QualityMetrics, SampleMetrics
12 changes: 12 additions & 0 deletions cg/meta/workflow/microsalt/metrics_parser/metrics_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from pathlib import Path

from cg.io.json import read_json
from cg.meta.workflow.microsalt.metrics_parser.models import QualityMetrics


class MetricsParser:
@staticmethod
def parse(file_path: Path) -> QualityMetrics:
data = read_json(file_path)
formatted_data = {"samples": data}
return QualityMetrics.model_validate(formatted_data)
seallard marked this conversation as resolved.
Show resolved Hide resolved
27 changes: 27 additions & 0 deletions cg/meta/workflow/microsalt/metrics_parser/models.py
seallard marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Annotated
from pydantic import BaseModel, BeforeValidator


def empty_str_to_none(v: str) -> str | None:
return v or None


class PicardMarkduplicate(BaseModel):
insert_size: Annotated[int | None, BeforeValidator(empty_str_to_none)]
duplication_rate: Annotated[float | None, BeforeValidator(empty_str_to_none)]


class MicrosaltSamtoolsStats(BaseModel):
total_reads: Annotated[int | None, BeforeValidator(empty_str_to_none)]
mapped_rate: Annotated[float | None, BeforeValidator(empty_str_to_none)]
average_coverage: Annotated[float | None, BeforeValidator(empty_str_to_none)]
coverage_10x: Annotated[float | None, BeforeValidator(empty_str_to_none)]


class SampleMetrics(BaseModel):
picard_markduplicate: PicardMarkduplicate
microsalt_samtools_stats: MicrosaltSamtoolsStats


class QualityMetrics(BaseModel):
samples: dict[str, SampleMetrics]
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
""" API to manage Microsalt Analyses
Organism - Fallback based on reference, ‘Other species’ and ‘Comment’. Default to “Unset”.
Priority = Default to empty string. Weird response. Typically “standard” or “research”.
Reference = Defaults to “None”
Method: Outputted as “1273:23”. Defaults to “Not in LIMS”
Date: Returns latest == most recent date. Outputted as DT object “YYYY MM DD”. Defaults to
datetime.min"""
import glob
import logging
import os
Expand All @@ -17,13 +10,13 @@
import click

from cg.constants import EXIT_FAIL, EXIT_SUCCESS, Pipeline, Priority
from cg.constants.constants import MicrosaltAppTags, MicrosaltQC
from cg.exc import CgDataError
from cg.io.json import read_json, write_json
from cg.constants.constants import FileExtensions
from cg.constants.tb import AnalysisStatus
from cg.exc import CgDataError, MissingAnalysisDir
from cg.meta.workflow.analysis import AnalysisAPI
from cg.meta.workflow.fastq import MicrosaltFastqHandler
from cg.meta.workflow.microsalt.quality_controller import QualityController
from cg.models.cg_config import CGConfig
from cg.models.orders.sample_base import ControlEnum
from cg.store.models import Case, Sample
from cg.utils import Process

Expand All @@ -37,6 +30,7 @@ def __init__(self, config: CGConfig, pipeline: Pipeline = Pipeline.MICROSALT):
super().__init__(pipeline, config)
self.root_dir = config.microsalt.root
self.queries_path = config.microsalt.queries_path
self.quality_checker = QualityController(config.status_db)

@property
def use_read_count_threshold(self) -> bool:
Expand All @@ -60,33 +54,6 @@ def process(self) -> Process:
)
return self._process

def get_case_path(self, case_id: str) -> list[Path]:
"""Returns all paths associated with the case or single sample analysis."""
case_obj: Case = self.status_db.get_case_by_internal_id(internal_id=case_id)
lims_project: str = self.get_project(case_obj.links[0].sample.internal_id)
lims_project_dir_path: Path = Path(self.root_dir, "results", lims_project)

case_directories: list[Path] = [
Path(path) for path in glob.glob(f"{lims_project_dir_path}*", recursive=True)
]

return sorted(case_directories, key=os.path.getctime, reverse=True)

def get_latest_case_path(self, case_id: str) -> Path | None:
"""Return latest run dir for a microbial case, if no path found it returns None."""
lims_project: str = self.get_project(
self.status_db.get_case_by_internal_id(internal_id=case_id).links[0].sample.internal_id
)

return next(
(
path
for path in self.get_case_path(case_id=case_id)
if lims_project + "_" in str(path)
),
None,
)

def clean_run_dir(self, case_id: str, yes: bool, case_path: list[Path] | Path) -> int:
"""Remove workflow run directories for a MicroSALT case."""

Expand Down Expand Up @@ -167,10 +134,7 @@ def get_samples(self, case_id: str, sample_id: str | None = None) -> list[Sample
def get_lims_comment(self, sample_id: str) -> str:
"""Returns the comment associated with a sample stored in lims"""
comment: str = self.lims_api.get_sample_comment(sample_id) or ""
if re.match(r"\w{4}\d{2,3}", comment):
return comment

return ""
return comment if re.match(r"\w{4}\d{2,3}", comment) else ""

def get_organism(self, sample_obj: Sample) -> str:
"""Organism
Expand Down Expand Up @@ -282,94 +246,66 @@ def get_case_id_from_case(self, unique_id: str) -> tuple[str, None]:
case_id = case_obj.internal_id
return case_id, None

def microsalt_qc(self, case_id: str, run_dir_path: Path, lims_project: str) -> bool:
"""Check if given microSALT case passes QC check."""
failed_samples: dict = {}
case_qc: dict = read_json(file_path=Path(run_dir_path, f"{lims_project}.json"))

for sample_id in case_qc:
sample: Sample = self.status_db.get_sample_by_internal_id(internal_id=sample_id)
sample_check: dict | None = self.qc_sample_check(
sample=sample,
sample_qc=case_qc[sample_id],
)
if sample_check is not None:
failed_samples[sample_id] = sample_check

return self.qc_case_check(
case_id=case_id,
failed_samples=failed_samples,
number_of_samples=len(case_qc),
run_dir_path=run_dir_path,
)

def qc_case_check(
self, case_id: str, failed_samples: dict, number_of_samples: int, run_dir_path: Path
) -> bool:
"""Perform the final QC check for a microbial case based on failed samples."""
qc_pass: bool = True

for sample_id in failed_samples:
sample: Sample = self.status_db.get_sample_by_internal_id(internal_id=sample_id)
if sample.control == ControlEnum.negative:
qc_pass = False
if sample.application_version.application.tag == MicrosaltAppTags.MWRNXTR003:
qc_pass = False

# Check if more than 10% of MWX samples failed
if len(failed_samples) / number_of_samples > MicrosaltQC.QC_PERCENT_THRESHOLD_MWX:
qc_pass = False

if not qc_pass:
LOG.warning(
f"Case {case_id} failed QC, see {run_dir_path}/QC_done.json for more information."
)
else:
LOG.info(f"Case {case_id} passed QC.")
def get_cases_to_store(self) -> list[Case]:
cases_qc_ready: list[Case] = self.get_completed_cases()
cases_to_store: list[Case] = []
LOG.info(f"Found {len(cases_qc_ready)} cases to perform QC on!")

for case in cases_qc_ready:
case_run_dir: Path | None = self.get_case_path(case.internal_id)
LOG.info(f"Checking QC for case {case.internal_id} in {case_run_dir}")
if self.quality_checker.is_qc_required(case_run_dir):
LOG.info(f"QC required for case {case.internal_id}")
metrics_file_path = self.get_metrics_file_path(case.internal_id)
if self.quality_checker.quality_control(metrics_file_path):
self.trailblazer_api.add_comment(case_id=case.internal_id, comment="QC passed")
seallard marked this conversation as resolved.
Show resolved Hide resolved
cases_to_store.append(case)
else:
self.trailblazer_api.set_analysis_status(
case_id=case.internal_id, status=AnalysisStatus.FAILED
)
self.trailblazer_api.add_comment(case_id=case.internal_id, comment="QC failed")
else:
cases_to_store.append(case)

return cases_to_store

def get_completed_cases(self) -> list[Case]:
"""Return cases that are completed in trailblazer."""
return [
case
for case in self.status_db.get_running_cases_in_pipeline(self.pipeline)
if self.trailblazer_api.is_latest_analysis_completed(case.internal_id)
]

self.create_qc_done_file(
run_dir_path=run_dir_path,
failed_samples=failed_samples,
)
return qc_pass

def create_qc_done_file(self, run_dir_path: Path, failed_samples: dict) -> None:
"""Creates a QC_done when a QC check is performed."""
write_json(file_path=run_dir_path.joinpath("QC_done.json"), content=failed_samples)

def qc_sample_check(self, sample: Sample, sample_qc: dict) -> dict | None:
"""Perform a QC on a sample."""
if sample.control == ControlEnum.negative:
reads_pass: bool = self.check_external_negative_control_sample(sample)
if not reads_pass:
LOG.warning(f"Negative control sample {sample.internal_id} failed QC.")
return {"Passed QC Reads": reads_pass}
else:
reads_pass: bool = sample.sequencing_qc
coverage_10x_pass: bool = self.check_coverage_10x(
sample_name=sample.internal_id, sample_qc=sample_qc
)
if not reads_pass or not coverage_10x_pass:
LOG.warning(f"Sample {sample.internal_id} failed QC.")
return {"Passed QC Reads": reads_pass, "Passed Coverage 10X": coverage_10x_pass}

def check_coverage_10x(self, sample_name: str, sample_qc: dict) -> bool:
"""Check if a sample passed the coverage_10x criteria."""
try:
return (
sample_qc["microsalt_samtools_stats"]["coverage_10x"]
>= MicrosaltQC.COVERAGE_10X_THRESHOLD
)
except TypeError as e:
LOG.error(
f"There is no 10X coverage value for sample {sample_name}, setting qc to fail for this sample"
)
LOG.error(f"See error: {e}")
return False

def check_external_negative_control_sample(self, sample: Sample) -> bool:
"""Check if external negative control passed read check"""
return sample.reads < (
sample.application_version.application.target_reads
* MicrosaltQC.NEGATIVE_CONTROL_READS_THRESHOLD
)
def get_metrics_file_path(self, case_id: str) -> Path:
"""Return path to metrics file for a case."""
project_id: str = self.get_project_id(case_id)
case_run_dir: Path = self.get_case_path(case_id)
return Path(case_run_dir, f"{project_id}{FileExtensions.JSON}")

def extract_project_id(self, sample_id: str) -> str:
return sample_id.rsplit("A", maxsplit=1)[0]

def get_project_id(self, case_id: str) -> str:
case: Case = self.status_db.get_case_by_internal_id(case_id)
sample_id: str = case.links[0].sample.internal_id
return self.extract_project_id(sample_id)

def get_results_dir(self) -> Path:
return Path(self.root_dir, "results")

def get_analyses_result_dirs(self, case_id: str) -> list[str]:
project_id: str = self.get_project_id(case_id)
results_dir: Path = self.get_results_dir()
matches: list[str] = [d for d in os.listdir(results_dir) if d.startswith(project_id)]
if not matches:
LOG.error(f"No result directory found for {case_id} with project id {project_id}")
raise MissingAnalysisDir
return matches

def get_case_path(self, case_id: str) -> Path:
results_dir: Path = self.get_results_dir()
matching_cases: list[str] = self.get_analyses_result_dirs(case_id)
case_dir: str = max(matching_cases, default=None)
return Path(results_dir, case_dir)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from cg.meta.workflow.microsalt.quality_controller.quality_controller import QualityController
23 changes: 23 additions & 0 deletions cg/meta/workflow/microsalt/quality_controller/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from pydantic import BaseModel

from cg.constants.constants import MicrosaltAppTags


class SampleQualityResult(BaseModel):
sample_id: str
passes_qc: bool
is_control: bool
application_tag: MicrosaltAppTags
passes_reads_qc: bool
passes_mapping_qc: bool = True
passes_duplication_qc: bool = True
passes_inserts_qc: bool = True
passes_coverage_qc: bool = True
passes_10x_coverage_qc: bool = True


class CaseQualityResult(BaseModel):
passes_qc: bool
control_passes_qc: bool
urgent_passes_qc: bool
non_urgent_passes_qc: bool
Loading
Loading