From aa0095916a730ce1684bd0cf383ef664b5436d83 Mon Sep 17 00:00:00 2001
From: Daniel Suveges
Date: Fri, 24 Jan 2025 10:29:37 +0000
Subject: [PATCH] feat(qtls): flagging trans QTL credible sets (#973)

* test: adding tests for trans qtl flagging

* fix: failing tests due to changing schema

* chore: updating code for TargetIndex

* chore: addressing review comments
---
 src/gentropy/assets/schemas/study_locus.json   |  6 ++
 src/gentropy/config.py                         |  2 +
 src/gentropy/dataset/study_locus.py            | 89 +++++++++++++++++-
 src/gentropy/study_locus_validation.py         | 12 ++-
 tests/gentropy/dataset/test_study_locus.py     | 94 +++++++++++++++++++
 .../gentropy/step/test_colocalisation_step.py  |  4 +
 6 files changed, 204 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json
index 5c7bf1178..b9780f634 100644
--- a/src/gentropy/assets/schemas/study_locus.json
+++ b/src/gentropy/assets/schemas/study_locus.json
@@ -247,6 +247,12 @@
       "name": "confidence",
       "nullable": true,
       "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "isTransQtl",
+      "nullable": true,
+      "type": "boolean"
     }
   ],
   "type": "struct"
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 5cc4689e1..7d924f3fe 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -658,9 +658,11 @@ class StudyLocusValidationStepConfig(StepConfig):
     study_index_path: str = MISSING
     study_locus_path: list[str] = MISSING
+    target_index_path: str = MISSING
     valid_study_locus_path: str = MISSING
     invalid_study_locus_path: str = MISSING
     invalid_qc_reasons: list[str] = MISSING
+    trans_qtl_threshold: int = MISSING
     _target_: str = "gentropy.study_locus_validation.StudyLocusValidationStep"
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 5abb30c8e..30b91541b 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pyspark.sql.functions as f
-from pyspark.sql.types import ArrayType, FloatType, StringType
+from pyspark.sql.types import ArrayType, FloatType, LongType, StringType

 from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
 from gentropy.common.schemas import parse_spark_schema
@@ -34,6 +34,7 @@
     from gentropy.dataset.ld_index import LDIndex
     from gentropy.dataset.study_index import StudyIndex
     from gentropy.dataset.summary_statistics import SummaryStatistics
+    from gentropy.dataset.target_index import TargetIndex
     from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader
@@ -681,6 +682,92 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]:
         """
         return {member.name: member.value for member in StudyLocusQualityCheck}

+    def flag_trans_qtls(
+        self: StudyLocus,
+        study_index: StudyIndex,
+        target_index: TargetIndex,
+        trans_threshold: int = 5_000_000,
+    ) -> StudyLocus:
+        """Flag trans QTL credible sets based on the genomic location of the measured gene.
+
+        Process:
+        1. Enrich the study-locus dataset with geneId from the study metadata (only QTL studies are considered).
+        2. Enrich with the transcription start site and chromosome of the studied gene.
+        3. Flag each tagging variant of a QTL credible set if its chromosome differs from the gene's chromosome or its distance from the TSS exceeds the threshold.
+        4. Propagate the tag-level flags to the credible set: a credible set is considered trans only if all of its tagging variants are trans.
+        5. Return the study locus object with the annotation stored in the `isTransQtl` boolean column; for GWAS credible sets the value is `null`.
+
+        Args:
+            study_index (StudyIndex): Study index used to extract the identifier of the measured gene.
+            target_index (TargetIndex): Target index providing the TSS and chromosome of the measured gene.
+            trans_threshold (int): Distance above which the QTL is considered trans. Default: 5_000_000 bp.
+
+        Returns:
+            StudyLocus: dataset with a new column indicating if the QTL credible sets are trans.
+        """
+        # As the `geneId` column in the study index is optional, we have to test for that:
+        if "geneId" not in study_index.df.columns:
+            return self
+
+        # Process study index:
+        processed_studies = (
+            study_index.df
+            # Dropping GWAS studies. This ensures that only QTLs will have the "isTransQtl" annotation:
+            .filter(f.col("studyType") != "gwas").select(
+                "studyId", "geneId", "projectId"
+            )
+        )
+
+        # Process study locus:
+        processed_credible_set = (
+            self.df
+            # Exploding locus to test all tag variants:
+            .withColumn("locus", f.explode("locus")).select(
+                "studyLocusId",
+                "studyId",
+                f.split("locus.variantId", "_")[0].alias("chromosome"),
+                f.split("locus.variantId", "_")[1].cast(LongType()).alias("position"),
+            )
+        )
+
+        # Process target index:
+        processed_targets = target_index.df.select(
+            f.col("id").alias("geneId"),
+            f.col("tss"),
+            f.col("genomicLocation.chromosome").alias("geneChromosome"),
+        )
+
+        # Pool datasets:
+        joined_data = (
+            processed_credible_set
+            # Join processed studies:
+            .join(processed_studies, on="studyId", how="inner")
+            # Join processed targets:
+            .join(processed_targets, on="geneId", how="left")
+            # Assign True/False for QTL studies:
+            .withColumn(
+                "isTagTrans",
+                # The QTL signal is considered trans if the locus is on a different chromosome than the measured gene,
+                # OR the distance from the gene's transcription start site is above the threshold.
+                f.when(
+                    (f.col("chromosome") != f.col("geneChromosome"))
+                    | (f.abs(f.col("tss") - f.col("position")) > trans_threshold),
+                    f.lit(True),
+                ).otherwise(f.lit(False)),
+            )
+            .groupby("studyLocusId")
+            .agg(
+                # If all tagging variants of the locus are in trans position, the QTL is considered trans:
+                f.when(
+                    f.array_contains(f.collect_set("isTagTrans"), f.lit(False)), False
+                )
+                .otherwise(f.lit(True))
+                .alias("isTransQtl")
+            )
+        )
+        # Adding the new column; the value is null for GWAS loci:
+        return StudyLocus(self.df.join(joined_data, on="studyLocusId", how="left"))
+
     def filter_credible_set(
         self: StudyLocus,
         credible_interval: CredibleInterval,
diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
index ce2201f80..f5dd37889 100644
--- a/src/gentropy/study_locus_validation.py
+++ b/src/gentropy/study_locus_validation.py
@@ -5,6 +5,7 @@
 from gentropy.common.session import Session
 from gentropy.dataset.study_index import StudyIndex
 from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
+from gentropy.dataset.target_index import TargetIndex


 class StudyLocusValidationStep:
@@ -17,24 +18,29 @@ class StudyLocusValidationStep:
     def __init__(
         self,
         session: Session,
-        study_index_path: str,
         study_locus_path: list[str],
+        study_index_path: str,
+        target_index_path: str,
         valid_study_locus_path: str,
         invalid_study_locus_path: str,
+        trans_qtl_threshold: int,
         invalid_qc_reasons: list[str] = [],
     ) -> None:
         """Initialize step.

         Args:
             session (Session): Session object.
-            study_index_path (str): Path to study index file.
             study_locus_path (list[str]): Path to study locus dataset.
+            study_index_path (str): Path to study index file.
+            target_index_path (str): Path to the target index.
             valid_study_locus_path (str): Path to write the valid records.
             invalid_study_locus_path (str): Path to write the output file.
+            trans_qtl_threshold (int): Genomic distance above which a QTL is considered trans.
             invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']).
         """
         # Reading datasets:
         study_index = StudyIndex.from_parquet(session, study_index_path)
+        target_index = TargetIndex.from_parquet(session, target_index_path)

         # Running validation then writing output:
         study_locus_with_qc = (
@@ -54,6 +60,8 @@ def __init__(
             )
             # Annotate credible set confidence:
             .assign_confidence()
+            # Flagging trans QTLs:
+            .flag_trans_qtls(study_index, target_index, trans_qtl_threshold)
         ).persist()  # we will need this for 2 types of outputs

         # Valid study locus partitioned to simplify the finding of overlaps
diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index 19b833124..f8e59d97e 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -12,6 +12,7 @@
     ArrayType,
     BooleanType,
     DoubleType,
+    IntegerType,
     StringType,
     StructField,
     StructType,
@@ -29,6 +30,7 @@
 )
 from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
 from gentropy.dataset.summary_statistics import SummaryStatistics
+from gentropy.dataset.target_index import TargetIndex
 from gentropy.dataset.variant_index import VariantIndex
 from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader
@@ -1189,3 +1191,95 @@ def test_duplication_flag_correctness(

         assert self.validated.df.filter(f.size("qualityControls") == 0).count() == 2
         assert self.validated.df.filter(f.size("qualityControls") > 0).count() == 2
+
+
+class TestTransQtlFlagging:
+    """Test flagging trans QTL credible sets."""
+
+    THRESHOLD = 30
+    STUDY_LOCUS_DATA = [
+        # QTL in cis position -> flag: False
+        ("sl1", "c1_50", "s1"),
+        # QTL in trans position (by distance) -> flag: True
+        ("sl2", "c1_100", "s1"),
+        # QTL in trans position (by chromosome) -> flag: True
+        ("sl3", "c2_50", "s1"),
+        # Not a QTL -> flag: Null
+        ("sl4", "c1_50", "s2"),
+    ]
+
+    STUDY_LOCUS_COLUMNS = ["studyLocusId", "variantId", "studyId"]
+
+    STUDY_DATA = [
+        ("s1", "p1", "qtl", "g1"),
+        ("s2", "p2", "gwas", None),
+    ]
+
+    STUDY_COLUMNS = ["studyId", "projectId", "studyType", "geneId"]
+
+    GENE_DATA = [("g1", -1, 10, 30, "c1", 30)]
+    GENE_COLUMNS = ["id", "strand", "start", "end", "chromosome", "tss"]
+
+    @pytest.fixture(autouse=True)
+    def _setup(self: TestTransQtlFlagging, spark: SparkSession) -> None:
+        """Set up study locus, study index and target index for testing."""
+        self.study_locus = StudyLocus(
+            _df=(
+                spark.createDataFrame(
+                    self.STUDY_LOCUS_DATA, self.STUDY_LOCUS_COLUMNS
+                ).withColumn("locus", f.array(f.struct("variantId")))
+            )
+        )
+        self.study_index = StudyIndex(
+            _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS)
+        )
+        self.target_index = TargetIndex(
+            _df=(
+                spark.createDataFrame(self.GENE_DATA, self.GENE_COLUMNS).select(
+                    f.struct(
+                        f.col("strand").cast(IntegerType()).alias("strand"),
+                        "start",
+                        "end",
+                        "chromosome",
+                    ).alias("genomicLocation"),
+                    f.col("id"),
+                    f.col("tss"),
+                )
+            )
+        )
+
+        self.qtl_flagged = self.study_locus.flag_trans_qtls(
+            self.study_index, self.target_index, self.THRESHOLD
+        )
+
+    def test_return_type(self: TestTransQtlFlagging) -> None:
+        """Test that trans QTL flagging returns a StudyLocus object."""
+        assert isinstance(self.qtl_flagged, StudyLocus)
+
+    def test_number_of_rows(self: TestTransQtlFlagging) -> None:
+        """Test that trans QTL flagging does not drop any rows."""
+        assert self.qtl_flagged.df.count() == self.study_locus.df.count()
+
+    def test_column_added(self: TestTransQtlFlagging) -> None:
+        """Test that the isTransQtl column is added."""
+        assert "isTransQtl" in self.qtl_flagged.df.columns
+
+    def test_correctness_no_gwas_flagged(self: TestTransQtlFlagging) -> None:
+        """Make sure the flag is null for GWAS credible sets."""
+        gwas_studies = self.study_index.df.filter(f.col("studyId") == "s2")
+
+        assert (
+            self.qtl_flagged.df.join(gwas_studies, on="studyId", how="inner")
+            .filter(f.col("isTransQtl").isNotNull())
+            .count()
+        ) == 0
+
+    def test_correctness_all_qtls_are_flagged(self: TestTransQtlFlagging) -> None:
+        """Make sure all QTLs have non-null flags."""
+        assert self.qtl_flagged.df.filter(f.col("isTransQtl").isNotNull()).count() == 3
+
+    def test_correctness_found_trans(self: TestTransQtlFlagging) -> None:
+        """Make sure trans QTLs are flagged."""
+        assert (
+            self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2
+        ), "Expected number of rows differs from observed."
diff --git a/tests/gentropy/step/test_colocalisation_step.py b/tests/gentropy/step/test_colocalisation_step.py
index 4b06adb69..95ceaf976 100644
--- a/tests/gentropy/step/test_colocalisation_step.py
+++ b/tests/gentropy/step/test_colocalisation_step.py
@@ -61,6 +61,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                     )
                 ],
                 "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
             ),
             (
                 "-1245591334543437941",
@@ -104,6 +105,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                     )
                 ],
                 "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
            (
                "-0.20241232721094407",
@@ -147,6 +149,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                    )
                ],
                "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
            (
                "-2271857845883525223",
@@ -190,6 +193,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                    )
                ],
                "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
        ]
        self.credible_set_path = str(tmp_path / "credible_set_datasets")
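
For reference, below is a minimal usage sketch of the new trans-QTL flagging; it is not part of the patch, and the Parquet paths and default session settings are illustrative assumptions rather than documented gentropy inputs.

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.target_index import TargetIndex

# Hypothetical input locations; point these at real gentropy outputs.
session = Session()
credible_sets = StudyLocus.from_parquet(session, "data/credible_set")
study_index = StudyIndex.from_parquet(session, "data/study_index")
target_index = TargetIndex.from_parquet(session, "data/target_index")

# Flag QTL credible sets whose measured gene lies on a different chromosome,
# or more than 5 Mb from a tagging variant; GWAS credible sets remain null.
flagged = credible_sets.flag_trans_qtls(
    study_index, target_index, trans_threshold=5_000_000
)
flagged.df.groupBy("isTransQtl").count().show()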