From aa0095916a730ce1684bd0cf383ef664b5436d83 Mon Sep 17 00:00:00 2001
From: Daniel Suveges
Date: Fri, 24 Jan 2025 10:29:37 +0000
Subject: [PATCH] feat(qtls): flagging trans QTL credible sets (#973)

* test: adding tests for trans qtl flagging

* fix: failing tests due to changing schema

* chore: updating code for TargetIndex

* chore: addressing review comments
---
 src/gentropy/assets/schemas/study_locus.json   |  6 ++
 src/gentropy/config.py                         |  2 +
 src/gentropy/dataset/study_locus.py            | 89 +++++++++++++++++-
 src/gentropy/study_locus_validation.py         | 12 ++-
 tests/gentropy/dataset/test_study_locus.py     | 94 +++++++++++++++++++
 .../gentropy/step/test_colocalisation_step.py  |  4 +
 6 files changed, 204 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json
index 5c7bf1178..b9780f634 100644
--- a/src/gentropy/assets/schemas/study_locus.json
+++ b/src/gentropy/assets/schemas/study_locus.json
@@ -247,6 +247,12 @@
       "name": "confidence",
       "nullable": true,
       "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "isTransQtl",
+      "nullable": true,
+      "type": "boolean"
     }
   ],
   "type": "struct"
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 5cc4689e1..7d924f3fe 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -658,9 +658,11 @@ class StudyLocusValidationStepConfig(StepConfig):
     study_index_path: str = MISSING
     study_locus_path: list[str] = MISSING
+    target_index_path: str = MISSING
     valid_study_locus_path: str = MISSING
     invalid_study_locus_path: str = MISSING
     invalid_qc_reasons: list[str] = MISSING
+    trans_qtl_threshold: int = MISSING
     _target_: str = "gentropy.study_locus_validation.StudyLocusValidationStep"
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 5abb30c8e..30b91541b 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pyspark.sql.functions as f
-from pyspark.sql.types import ArrayType, FloatType, StringType
+from pyspark.sql.types import ArrayType, FloatType, LongType, StringType

 from gentropy.common.genomic_region import GenomicRegion, KnownGenomicRegions
 from gentropy.common.schemas import parse_spark_schema
@@ -34,6 +34,7 @@
     from gentropy.dataset.ld_index import LDIndex
     from gentropy.dataset.study_index import StudyIndex
     from gentropy.dataset.summary_statistics import SummaryStatistics
+    from gentropy.dataset.target_index import TargetIndex
     from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader
@@ -681,6 +682,92 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]:
         """
         return {member.name: member.value for member in StudyLocusQualityCheck}

+    def flag_trans_qtls(
+        self: StudyLocus,
+        study_index: StudyIndex,
+        target_index: TargetIndex,
+        trans_threshold: int = 5_000_000,
+    ) -> StudyLocus:
+        """Flag trans QTL credible sets based on the genomic location of the measured gene.
+
+        Process:
+        1. Enrich the study-locus dataset with geneId from the study metadata (only QTL studies are considered).
+        2. Enrich with the transcription start site and chromosome of the studied gene.
+        3. Flag each tagging variant of a QTL credible set if its chromosome differs from the gene's chromosome or its distance from the TSS exceeds the threshold.
+        4. Propagate the tag-level flags to the credible set: a credible set is considered trans only if all of its tagging variants are trans.
+        5. Return the study locus object with the annotation stored in the `isTransQtl` boolean column; for GWAS credible sets the value is `null`.
+
+        Args:
+            study_index (StudyIndex): Study index used to extract the identifier of the measured gene.
+            target_index (TargetIndex): Target index providing the TSS and chromosome of the measured gene.
+            trans_threshold (int): Distance above which the QTL is considered trans. Default: 5_000_000 bp.
+
+        Returns:
+            StudyLocus: dataset with a new column indicating if the QTL credible sets are trans.
+        """
+        # As the `geneId` column in the study index is optional, we have to test for that:
+        if "geneId" not in study_index.df.columns:
+            return self
+
+        # Process study index:
+        processed_studies = (
+            study_index.df
+            # Dropping GWAS studies. This ensures that only QTLs will have the "isTransQtl" annotation:
+            .filter(f.col("studyType") != "gwas").select(
+                "studyId", "geneId", "projectId"
+            )
+        )
+
+        # Process study locus:
+        processed_credible_set = (
+            self.df
+            # Exploding locus to test all tag variants:
+            .withColumn("locus", f.explode("locus")).select(
+                "studyLocusId",
+                "studyId",
+                f.split("locus.variantId", "_")[0].alias("chromosome"),
+                f.split("locus.variantId", "_")[1].cast(LongType()).alias("position"),
+            )
+        )
+
+        # Process target index:
+        processed_targets = target_index.df.select(
+            f.col("id").alias("geneId"),
+            f.col("tss"),
+            f.col("genomicLocation.chromosome").alias("geneChromosome"),
+        )
+
+        # Pool datasets:
+        joined_data = (
+            processed_credible_set
+            # Join processed studies:
+            .join(processed_studies, on="studyId", how="inner")
+            # Join processed targets:
+            .join(processed_targets, on="geneId", how="left")
+            # Assign True/False for QTL studies:
+            .withColumn(
+                "isTagTrans",
+                # The QTL signal is considered trans if the locus is on a different chromosome than the measured gene,
+                # OR the distance from the gene's transcription start site is above the threshold.
+                f.when(
+                    (f.col("chromosome") != f.col("geneChromosome"))
+                    | (f.abs(f.col("tss") - f.col("position")) > trans_threshold),
+                    f.lit(True),
+                ).otherwise(f.lit(False)),
+            )
+            .groupby("studyLocusId")
+            .agg(
+                # If all tagging variants of the locus are in trans position, the QTL is considered trans:
+                f.when(
+                    f.array_contains(f.collect_set("isTagTrans"), f.lit(False)), False
+                )
+                .otherwise(f.lit(True))
+                .alias("isTransQtl")
+            )
+        )
+        # Adding the new column; the value is null for GWAS loci:
+        return StudyLocus(self.df.join(joined_data, on="studyLocusId", how="left"))
+
     def filter_credible_set(
         self: StudyLocus,
         credible_interval: CredibleInterval,
diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
index ce2201f80..f5dd37889 100644
--- a/src/gentropy/study_locus_validation.py
+++ b/src/gentropy/study_locus_validation.py
@@ -5,6 +5,7 @@
 from gentropy.common.session import Session
 from gentropy.dataset.study_index import StudyIndex
 from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
+from gentropy.dataset.target_index import TargetIndex


 class StudyLocusValidationStep:
@@ -17,24 +18,29 @@ class StudyLocusValidationStep:
     def __init__(
         self,
         session: Session,
-        study_index_path: str,
         study_locus_path: list[str],
+        study_index_path: str,
+        target_index_path: str,
         valid_study_locus_path: str,
         invalid_study_locus_path: str,
+        trans_qtl_threshold: int,
         invalid_qc_reasons: list[str] = [],
     ) -> None:
         """Initialize step.

         Args:
             session (Session): Session object.
-            study_index_path (str): Path to study index file.
             study_locus_path (list[str]): Path to study locus dataset.
+            study_index_path (str): Path to study index file.
+            target_index_path (str): Path to the target index.
             valid_study_locus_path (str): Path to write the valid records.
             invalid_study_locus_path (str): Path to write the output file.
+            trans_qtl_threshold (int): Genomic distance above which a QTL is considered trans.
             invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']).
         """
         # Reading datasets:
         study_index = StudyIndex.from_parquet(session, study_index_path)
+        target_index = TargetIndex.from_parquet(session, target_index_path)

         # Running validation then writing output:
         study_locus_with_qc = (
@@ -54,6 +60,8 @@ def __init__(
             )
             # Annotate credible set confidence:
             .assign_confidence()
+            # Flagging trans QTLs:
+            .flag_trans_qtls(study_index, target_index, trans_qtl_threshold)
         ).persist()  # we will need this for 2 types of outputs

         # Valid study locus partitioned to simplify the finding of overlaps
diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index 19b833124..f8e59d97e 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -12,6 +12,7 @@
     ArrayType,
     BooleanType,
     DoubleType,
+    IntegerType,
     StringType,
     StructField,
     StructType,
@@ -29,6 +30,7 @@
 )
 from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
 from gentropy.dataset.summary_statistics import SummaryStatistics
+from gentropy.dataset.target_index import TargetIndex
 from gentropy.dataset.variant_index import VariantIndex
 from gentropy.method.l2g.feature_factory import L2GFeatureInputLoader
@@ -1189,3 +1191,95 @@ def test_duplication_flag_correctness(

         assert self.validated.df.filter(f.size("qualityControls") == 0).count() == 2
         assert self.validated.df.filter(f.size("qualityControls") > 0).count() == 2
+
+
+class TestTransQtlFlagging:
+    """Test flagging trans QTL credible sets."""
+
+    THRESHOLD = 30
+    STUDY_LOCUS_DATA = [
+        # QTL in cis position -> flag: False
+        ("sl1", "c1_50", "s1"),
+        # QTL in trans position (by distance) -> flag: True
+        ("sl2", "c1_100", "s1"),
+        # QTL in trans position (by chromosome) -> flag: True
+        ("sl3", "c2_50", "s1"),
+        # Not a QTL -> flag: Null
+        ("sl4", "c1_50", "s2"),
+    ]
+
+    STUDY_LOCUS_COLUMNS = ["studyLocusId", "variantId", "studyId"]
+
+    STUDY_DATA = [
+        ("s1", "p1", "qtl", "g1"),
+        ("s2", "p2", "gwas", None),
+    ]
+
+    STUDY_COLUMNS = ["studyId", "projectId", "studyType", "geneId"]
+
+    GENE_DATA = [("g1", -1, 10, 30, "c1", 30)]
+    GENE_COLUMNS = ["id", "strand", "start", "end", "chromosome", "tss"]
+
+    @pytest.fixture(autouse=True)
+    def _setup(self: TestTransQtlFlagging, spark: SparkSession) -> None:
+        """Set up study locus, study index and target index for testing."""
+        self.study_locus = StudyLocus(
+            _df=(
+                spark.createDataFrame(
+                    self.STUDY_LOCUS_DATA, self.STUDY_LOCUS_COLUMNS
+                ).withColumn("locus", f.array(f.struct("variantId")))
+            )
+        )
+        self.study_index = StudyIndex(
+            _df=spark.createDataFrame(self.STUDY_DATA, self.STUDY_COLUMNS)
+        )
+        self.target_index = TargetIndex(
+            _df=(
+                spark.createDataFrame(self.GENE_DATA, self.GENE_COLUMNS).select(
+                    f.struct(
+                        f.col("strand").cast(IntegerType()).alias("strand"),
+                        "start",
+                        "end",
+                        "chromosome",
+                    ).alias("genomicLocation"),
+                    f.col("id"),
+                    f.col("tss"),
+                )
+            )
+        )
+
+        self.qtl_flagged = self.study_locus.flag_trans_qtls(
+            self.study_index, self.target_index, self.THRESHOLD
+        )
+
+    def test_return_type(self: TestTransQtlFlagging) -> None:
+        """Test that trans QTL flagging returns a StudyLocus object."""
+        assert isinstance(self.qtl_flagged, StudyLocus)
+
+    def test_number_of_rows(self: TestTransQtlFlagging) -> None:
+        """Test that trans QTL flagging does not drop any rows."""
+        assert self.qtl_flagged.df.count() == self.study_locus.df.count()
+
+    def test_column_added(self: TestTransQtlFlagging) -> None:
+        """Test that the isTransQtl column is added."""
+        assert "isTransQtl" in self.qtl_flagged.df.columns
+
+    def test_correctness_no_gwas_flagged(self: TestTransQtlFlagging) -> None:
+        """Make sure the flag is null for GWAS credible sets."""
+        gwas_studies = self.study_index.df.filter(f.col("studyId") == "s2")
+
+        assert (
+            self.qtl_flagged.df.join(gwas_studies, on="studyId", how="inner")
+            .filter(f.col("isTransQtl").isNotNull())
+            .count()
+        ) == 0
+
+    def test_correctness_all_qtls_are_flagged(self: TestTransQtlFlagging) -> None:
+        """Make sure all QTLs have non-null flags."""
+        assert self.qtl_flagged.df.filter(f.col("isTransQtl").isNotNull()).count() == 3
+
+    def test_correctness_found_trans(self: TestTransQtlFlagging) -> None:
+        """Make sure trans QTLs are flagged."""
+        assert (
+            self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2
+        ), "Expected number of rows differs from observed."
diff --git a/tests/gentropy/step/test_colocalisation_step.py b/tests/gentropy/step/test_colocalisation_step.py
index 4b06adb69..95ceaf976 100644
--- a/tests/gentropy/step/test_colocalisation_step.py
+++ b/tests/gentropy/step/test_colocalisation_step.py
@@ -61,6 +61,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                     )
                 ],
                 "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
             ),
             (
                 "-1245591334543437941",
@@ -104,6 +105,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                     )
                 ],
                 "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
            (
                "-0.20241232721094407",
@@ -147,6 +149,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                    )
                ],
                "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
            (
                "-2271857845883525223",
@@ -190,6 +193,7 @@ def _setup(self, session: Session, tmp_path: Path) -> None:
                    )
                ],
                "SuSiE fine-mapped credible set with out-of-sample LD",
+                None,
            ),
        ]
        self.credible_set_path = str(tmp_path / "credible_set_datasets")
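
For reference, below is a minimal usage sketch of the new trans-QTL flagging; it is not part of the patch, and the Parquet paths and default session settings are illustrative assumptions rather than documented gentropy inputs.

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.target_index import TargetIndex

# Hypothetical input locations; point these at real gentropy outputs.
session = Session()
credible_sets = StudyLocus.from_parquet(session, "data/credible_set")
study_index = StudyIndex.from_parquet(session, "data/study_index")
target_index = TargetIndex.from_parquet(session, "data/target_index")

# Flag QTL credible sets whose measured gene lies on a different chromosome,
# or more than 5 Mb from a tagging variant; GWAS credible sets remain null.
flagged = credible_sets.flag_trans_qtls(
    study_index, target_index, trans_threshold=5_000_000
)
flagged.df.groupBy("isTransQtl").count().show()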