Skip to content

Commit

Permalink
Add upload for raredisease to genotype (#3784)
Browse files Browse the repository at this point in the history
### Added

- Upload of raredisease cases to genotype

### Changed

- Add _mip suffix to mip related genotype functions
  • Loading branch information
rannick authored Oct 17, 2024
1 parent b295134 commit edb13ca
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 89 deletions.
5 changes: 2 additions & 3 deletions cg/cli/upload/genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,14 @@ def upload_genotypes(context: CGConfig, re_upload: bool, family_id: str | None):
status_db: Store = context.status_db
housekeeper_api: HousekeeperAPI = context.housekeeper_api
genotype_api: GenotypeAPI = context.genotype_api

click.echo(click.style("----------------- GENOTYPES -------------------"))

if not family_id:
suggest_cases_to_upload(status_db=status_db)
raise click.Abort
case_obj: Case = status_db.get_case_by_internal_id(internal_id=family_id)
case: Case = status_db.get_case_by_internal_id(internal_id=family_id)
upload_genotypes_api = UploadGenotypesAPI(hk_api=housekeeper_api, gt_api=genotype_api)
results: dict = upload_genotypes_api.data(case_obj.analyses[0])
results: dict = upload_genotypes_api.get_genotype_data(case.analyses[0])

if not results:
LOG.warning("Could not find any results to upload")
Expand Down
2 changes: 2 additions & 0 deletions cg/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ class HastaSlurmPartitions(StrEnum):

class FileExtensions(StrEnum):
BAM: str = ".bam"
BCF: str = ".bcf"
BED: str = ".bed"
COMPLETE: str = ".complete"
CONFIG: str = ".config"
Expand Down Expand Up @@ -224,6 +225,7 @@ class FileExtensions(StrEnum):
TSV: str = ".tsv"
TXT: str = ".txt"
VCF: str = ".vcf"
VCF_GZ: str = ".vcf.gz"
XLSX: str = ".xlsx"
XML: str = ".xml"
YAML: str = ".yaml"
Expand Down
8 changes: 8 additions & 0 deletions cg/constants/housekeeper_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,19 @@ class BalsamicAnalysisTag:
QC_METRICS: list[str] = ["qc-metrics", "deliverable"]


class HkAnalysisMetricsTag:
QC_METRICS: set[str] = {"qc-metrics", "deliverable"}


class GensAnalysisTag:
COVERAGE: list[str] = ["gens", "coverage", "bed"]
FRACSNP: list[str] = ["gens", "fracsnp", "bed"]


class GenotypeAnalysisTag:
GENOTYPE: str = "genotype"


class BalsamicProtectedTags:
"""Balsamic workflow protected tags by type."""

Expand Down
4 changes: 3 additions & 1 deletion cg/constants/nf_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ class NfTowerStatus(StrEnum):
UNKNOWN: str = "UNKNOWN"


RAREDISEASE_PREDICTED_SEX_METRIC = "predicted_sex_sex_check"

RAREDISEASE_METRIC_CONDITIONS: dict[str, dict[str, Any]] = {
"percent_duplicates": {"norm": "lt", "threshold": 20},
"PCT_PF_UQ_READS_ALIGNED": {"norm": "gt", "threshold": 0.95},
"MEDIAN_TARGET_COVERAGE": {"norm": "gt", "threshold": 25},
"PCT_TARGET_BASES_10X": {"norm": "gt", "threshold": 0.95},
"PCT_EXC_ADAPTER": {"norm": "lt", "threshold": 0.0005},
"predicted_sex_sex_check": {"norm": "eq", "threshold": None},
RAREDISEASE_PREDICTED_SEX_METRIC: {"norm": "eq", "threshold": None},
"gender": {"norm": "eq", "threshold": None},
}

Expand Down
191 changes: 126 additions & 65 deletions cg/meta/upload/gt.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import logging
from pathlib import Path

from housekeeper.store.models import File, Version
from housekeeper.store.models import File

from cg.apps.gt import GenotypeAPI
from cg.apps.housekeeper.hk import HousekeeperAPI
from cg.constants.constants import FileFormat, PrepCategory, Workflow
from cg.constants.housekeeper_tags import HkMipAnalysisTag
from cg.constants.constants import FileExtensions, FileFormat, PrepCategory, Workflow
from cg.constants.housekeeper_tags import GenotypeAnalysisTag, HkAnalysisMetricsTag
from cg.constants.nf_analysis import RAREDISEASE_PREDICTED_SEX_METRIC
from cg.constants.subject import Sex
from cg.io.controller import ReadFile
from cg.models.deliverables.metric_deliverables import MetricsBase
from cg.models.mip.mip_metrics_deliverables import MIPMetricsDeliverables
from cg.store.models import Analysis, Case, Sample

LOG = logging.getLogger(__name__)


class UploadGenotypesAPI(object):
"""Genotype upload API."""

def __init__(
self,
hk_api: HousekeeperAPI,
Expand All @@ -25,8 +29,8 @@ def __init__(
self.hk = hk_api
self.gt = gt_api

def data(self, analysis: Analysis) -> dict:
"""Fetch data about an analysis to load genotypes.
def get_genotype_data(self, analysis: Analysis) -> dict[dict[str, dict[str, str]]]:
"""Return data about an analysis to load genotypes.
Returns: dict on form
Expand All @@ -43,93 +47,150 @@ def data(self, analysis: Analysis) -> dict:
"""
case_id = analysis.case.internal_id
LOG.info(f"Fetching upload genotype data for {case_id}")
hk_version = self.hk.last_version(case_id)
hk_bcf = self.get_bcf_file(hk_version)
data = {"bcf": hk_bcf.full_path}
hk_bcf: File = self._get_genotype_file(case_id=case_id)
genotype_load_config: dict = {"bcf": hk_bcf.full_path}
if analysis.workflow in [Workflow.BALSAMIC, Workflow.BALSAMIC_UMI]:
data["samples_sex"] = self._get_samples_sex_balsamic(case_obj=analysis.case)
genotype_load_config["samples_sex"] = self._get_samples_sex_balsamic(case=analysis.case)
elif analysis.workflow == Workflow.MIP_DNA:
data["samples_sex"] = self._get_samples_sex_mip(
case_obj=analysis.case, hk_version=hk_version
genotype_load_config["samples_sex"] = self._get_samples_sex_mip_dna(case=analysis.case)
elif analysis.workflow == Workflow.RAREDISEASE:
genotype_load_config["samples_sex"] = self._get_samples_sex_raredisease(
case=analysis.case
)
else:
raise ValueError(f"Workflow {analysis.workflow} does not support Genotype upload")
return data

def _get_samples_sex_mip(self, case_obj: Case, hk_version: Version) -> dict:
qc_metrics_file = self.get_qcmetrics_file(hk_version)
analysis_sexes = self.analysis_sex(qc_metrics_file)
samples_sex = {}
for link_obj in case_obj.links:
sample_id = link_obj.sample.internal_id
return genotype_load_config

def upload(self, genotype_load_config: dict, replace: bool = False):
"""Upload a genotype config for a case."""
self.gt.upload(
str(genotype_load_config["bcf"]), genotype_load_config["samples_sex"], force=replace
)

@staticmethod
def _is_suitable_for_genotype_upload(case: Case) -> bool:
"""Returns True if there are any non-tumor WGS samples in the case."""
samples: list[Sample] = case.samples
return any(
(not sample.is_tumour and PrepCategory.WHOLE_GENOME_SEQUENCING == sample.prep_category)
for sample in samples
)

def _get_genotype_file(self, case_id: str) -> File:
"Returns latest genotype file in housekeeper for given case, raises FileNotFoundError if not found."
LOG.debug("Get Genotype files from Housekeeper")
tags: set[str] = {GenotypeAnalysisTag.GENOTYPE}
hk_genotype_files: list[File] = self.hk.get_files_from_latest_version(
bundle_name=case_id, tags=tags
)
hk_genotype: File = self._get_single_genotype_file(hk_genotype_files)
if not hk_genotype:
raise FileNotFoundError(f"Genotype file not found for {case_id}")
LOG.debug(f"Found genotype file {hk_genotype.full_path}")
return hk_genotype

def _get_samples_sex_balsamic(self, case: Case) -> dict[str, dict[str, str]]:
"""Return sex information from StatusDB and for analysis."""
samples_sex: dict[str, dict[str, str]] = {}
for sample in case.samples:
if sample.is_tumour:
continue
sample_id: str = sample.internal_id
samples_sex[sample_id] = {
"pedigree": link_obj.sample.sex,
"analysis": analysis_sexes[sample_id],
"pedigree": sample.sex,
"analysis": Sex.UNKNOWN,
}
return samples_sex

def _get_samples_sex_balsamic(self, case_obj: Case) -> dict:
samples_sex = {}
for link_obj in case_obj.links:
if link_obj.sample.is_tumour:
continue
sample_id = link_obj.sample.internal_id
def _get_samples_sex_mip_dna(self, case: Case) -> dict[str, dict[str, str]]:
"""Return sex information from StatusDB and from analysis prediction"""
qc_metrics_file: Path = self._get_qcmetrics_file(case_id=case.internal_id)
analysis_sex: dict = self._get_analysis_sex_mip_dna(qc_metrics_file)
samples_sex: dict[str, dict[str, str]] = {}
for sample in case.samples:
sample_id: str = sample.internal_id
samples_sex[sample_id] = {
"pedigree": link_obj.sample.sex,
"analysis": Sex.UNKNOWN,
"pedigree": sample.sex,
"analysis": analysis_sex[sample_id],
}
return samples_sex

def analysis_sex(self, qc_metrics_file: Path) -> dict:
"""Fetch analysis sex for each sample of an analysis."""
qc_metrics: MIPMetricsDeliverables = self.get_parsed_qc_metrics_data(qc_metrics_file)
@staticmethod
def _get_analysis_sex_mip_dna(qc_metrics_file: Path) -> dict:
"""Return analysis sex for each sample of an analysis."""
qc_metrics: MIPMetricsDeliverables = (
UploadGenotypesAPI._get_parsed_qc_metrics_deliverables_mip_dna(qc_metrics_file)
)
return {
sample_id_metric.sample_id: sample_id_metric.predicted_sex
for sample_id_metric in qc_metrics.sample_id_metrics
}

def get_bcf_file(self, hk_version_obj: Version) -> File:
"""Fetch a bcf file and return the file object"""
genotype_files: list = self._get_genotype_files(version_id=hk_version_obj.id)
for genotype_file in genotype_files:
if self._is_variant_file(genotype_file=genotype_file):
LOG.debug(f"Found bcf file {genotype_file.full_path}")
return genotype_file
raise FileNotFoundError(f"No vcf or bcf file found for bundle {hk_version_obj.bundle_id}")

def get_qcmetrics_file(self, hk_version_obj: Version) -> Path:
"""Fetch a qc_metrics file and return the path"""
hk_qcmetrics = self.hk.files(
version=hk_version_obj.id, tags=HkMipAnalysisTag.QC_METRICS
).first()
LOG.debug(f"Found qc metrics file {hk_qcmetrics.full_path}")
def _get_qcmetrics_file(self, case_id: str) -> Path:
"""Return a QC metrics file path.
Raises: FileNotFoundError if nothing is found in the Housekeeper bundle."""
tags: set[str] = HkAnalysisMetricsTag.QC_METRICS
hk_qcmetrics: File = self.hk.get_file_from_latest_version(bundle_name=case_id, tags=tags)
if not hk_qcmetrics:
raise FileNotFoundError("QC metrics file not found for the given Housekeeper version.")
LOG.debug(f"Found QC metrics file {hk_qcmetrics.full_path}")
return Path(hk_qcmetrics.full_path)

@staticmethod
def get_parsed_qc_metrics_data(qc_metrics: Path) -> MIPMetricsDeliverables:
"""Parse the information from a qc metrics file"""
def _get_parsed_qc_metrics_deliverables_mip_dna(
qc_metrics_file: Path,
) -> MIPMetricsDeliverables:
"""Parse and return a QC metrics file."""
qcmetrics_raw: dict = ReadFile.get_content_from_file(
file_format=FileFormat.YAML, file_path=qc_metrics
file_format=FileFormat.YAML, file_path=qc_metrics_file
)
return MIPMetricsDeliverables(**qcmetrics_raw)

def upload(self, data: dict, replace: bool = False):
"""Upload data about genotypes for a family of samples."""
self.gt.upload(str(data["bcf"]), data["samples_sex"], force=replace)
def _get_samples_sex_raredisease(self, case: Case) -> dict[str, dict[str, str]]:
"""Return sex information from StatusDB and from analysis prediction."""
qc_metrics_file: Path = self._get_qcmetrics_file(case_id=case.internal_id)
samples_sex: dict[str, dict[str, str]] = {}
for sample in case.samples:
sample_id: str = sample.internal_id
samples_sex[sample_id] = {
"pedigree": sample.sex,
"analysis": self._get_analysis_sex_raredisease(
qc_metrics_file=qc_metrics_file, sample_id=sample_id
),
}
return samples_sex

@staticmethod
def _is_variant_file(genotype_file: File):
return genotype_file.full_path.endswith("vcf.gz") or genotype_file.full_path.endswith("bcf")
def _get_analysis_sex_raredisease(self, qc_metrics_file: Path, sample_id: str) -> str:
"""Return analysis sex for each sample of an analysis."""
qc_metrics: list[MetricsBase] = self._get_parsed_qc_metrics_deliverables_raredisease(
qc_metrics_file
)
for metric in qc_metrics:
if metric.name == RAREDISEASE_PREDICTED_SEX_METRIC and metric.id == sample_id:
return str(metric.value)

def _get_genotype_files(self, version_id: int) -> list:
return self.hk.files(version=version_id, tags=["genotype"]).all()
@staticmethod
def _get_parsed_qc_metrics_deliverables_raredisease(qc_metrics: Path) -> list[MetricsBase]:
"""Parse and return a QC metrics file."""
qcmetrics_raw: dict = ReadFile.get_content_from_file(
file_format=FileFormat.YAML, file_path=qc_metrics
)
return [MetricsBase(**metric) for metric in qcmetrics_raw["metrics"]]

@staticmethod
def is_suitable_for_genotype_upload(case_obj: Case) -> bool:
"""Check if a cancer case is contains WGS and normal sample."""
def _is_variant_file(genotype_file: File):
return genotype_file.full_path.endswith(
FileExtensions.VCF_GZ
) or genotype_file.full_path.endswith(FileExtensions.BCF)

samples: list[Sample] = case_obj.samples
return any(
(not sample.is_tumour and PrepCategory.WHOLE_GENOME_SEQUENCING == sample.prep_category)
for sample in samples
)
def _get_single_genotype_file(self, hk_genotype_files: list[File]) -> File:
"""
Returns the single .bcf or .vcf.gz file expected to be found in the provided list. Raises an error if the amount of such files is different from 1.
"""
filtered_files = [file for file in hk_genotype_files if self._is_variant_file(file)]
if len(filtered_files) != 1:
raise ValueError(
f"Error: Expecte one genotype file, but found {len(filtered_files)} "
f"({', '.join(map(str, filtered_files))})."
)
return filtered_files[0]
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,12 @@ def case_qc_metrics_deliverables(apps_dir: Path) -> Path:
return Path(apps_dir, "mip", "case_metrics_deliverables.yaml")


@pytest.fixture
def case_qc_metrics_deliverables_raredisease(apps_dir: Path) -> Path:
"""Return the path to a qc metrics deliverables file with case data."""
return Path(apps_dir, "raredisease", "case_metrics_deliverables.yaml")


@pytest.fixture
def mip_analysis_dir(analysis_dir: Path) -> Path:
"""Return the path to the directory with mip analysis files."""
Expand Down
Loading

0 comments on commit edb13ca

Please sign in to comment.