Merge branch 'dev' into vh-integrate-lof-data

vivienho authored Feb 7, 2025
2 parents 8383ba5 + 30a6046 commit 597cf05

Showing 27 changed files with 865 additions and 148 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -45,7 +45,7 @@ repos:
       - id: python-check-blanket-noqa

   - repo: https://github.com/hadialqattan/pycln
-    rev: v2.4.0
+    rev: v2.5.0
     hooks:
       - id: pycln
         args: [--all]
4 changes: 3 additions & 1 deletion Makefile
@@ -3,6 +3,7 @@ PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
 APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
 PACKAGE_VERSION ?= $(shell grep -m 1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+USER_SAFE ?= $(shell echo $(USER) | tr '[:upper:]' '[:lower:]')
 # NOTE: git rev-parse will always return the HEAD if it sits in the tag,
 # this way we can distinguish the tag vs branch name
 ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
@@ -57,7 +58,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 	@./utils/clean_status.sh || (echo "ERROR: Commit and push or stash local changes, to have up to date cluster"; exit 1)
 	@echo "Creating Dataproc Dev Cluster"
 	gcloud config set project ${PROJECT_ID}
-	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
+	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
 		--image-version 2.2 \
 		--region ${REGION} \
 		--master-machine-type n1-standard-2 \
@@ -70,6 +71,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 		--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
 		--optional-components=JUPYTER \
 		--enable-component-gateway \
+		--labels team=open-targets,subteam=gentropy,created_by=${USER_SAFE},environment=development, \
 		--max-idle=60m

 update-dev-cluster: build ## Reinstalls the package on the dev-cluster
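The `USER_SAFE` variable matters because Dataproc cluster names and label values must be lowercase, while `$(USER)` often is not. A quick check of what the Makefile's own substitution produces (the user name is hypothetical):

$ echo "VivienHo" | tr '[:upper:]' '[:lower:]'
vivienho

So the cluster name becomes "ot-genetics-dev-<version>-vivienho", which satisfies Dataproc's lowercase naming rule, and `created_by` gets a valid label value.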
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -10,10 +10,10 @@ requires-python = ">=3.10, <3.13"
 dependencies = [
     "pyspark (>=3.5.0, <3.6)",
     "hail (>=0.2.133, <0.3.0)",
-    "scipy (>=1.11.4, <1.12.0)",
+    "scipy (>=1.11.4, <1.16.0)",
     "hydra-core (>=1.3.2, <1.4.0)",
     "pyliftover (>=0.4.1, <0.5.0)",
-    "numpy (>=1.26.4, <1.27.0)",
+    "numpy (>=1.26.4, <2.3.0)",
     "wandb (>=0.19.4, <0.20.0)",
     "omegaconf (>=2.3.0, <2.4.0)",
     "typing-extensions (>=4.12.2, <4.13.0)",
@@ -23,7 +23,7 @@ dependencies = [
     "shap (>=0.46, <0.47)",
     "matplotlib (>=3.10.0, <3.11.0)",
     "google-cloud-secret-manager (>=2.12.6, <2.13.0)",
-    "google-cloud-storage (>=2.14.0, <2.15.0)",
+    "google-cloud-storage (>=2.14.0, <3.1.0)",
 ]
 classifiers = [
     "Programming Language :: Python :: 3.10",
67 changes: 67 additions & 0 deletions src/gentropy/assets/schemas/amino_acid_variants.json
@@ -0,0 +1,67 @@
+{
+  "fields": [
+    {
+      "metadata": {},
+      "name": "uniprotAccession",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "aminoAcidChange",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "inSilicoPredictors",
+      "nullable": true,
+      "type": {
+        "containsNull": true,
+        "elementType": {
+          "fields": [
+            {
+              "metadata": {},
+              "name": "method",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "assessment",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "score",
+              "nullable": true,
+              "type": "float"
+            },
+            {
+              "metadata": {},
+              "name": "assessmentFlag",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "targetId",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "normalisedScore",
+              "nullable": true,
+              "type": "double"
+            }
+          ],
+          "type": "struct"
+        },
+        "type": "array"
+      }
+    }
+  ],
+  "type": "struct"
+}
3 changes: 2 additions & 1 deletion src/gentropy/common/session.py
@@ -42,7 +42,8 @@ def __init__(  # noqa: D107
         )

         self.spark = (
-            SparkSession.builder.config(conf=merged_conf)
+            SparkSession.Builder()
+            .config(conf=merged_conf)
             .master(spark_uri)
             .appName(app_name)
             .getOrCreate()
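Instantiating `SparkSession.Builder()` rather than using the `SparkSession.builder` attribute plausibly guards against builder state shared at class level, where in some PySpark versions options set while building one session could leak into the next. A minimal sketch of the distinction (the sharing behaviour is version-dependent; treat this as an assumption, not a statement of this commit's motivation):

from pyspark.sql import SparkSession

fresh = SparkSession.Builder()   # always a brand-new builder, no inherited .config() options
shared = SparkSession.builder    # historically a class-level builder in older PySpark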
26 changes: 26 additions & 0 deletions src/gentropy/common/spark_helpers.py
@@ -886,3 +886,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
         / f.pow(x["pos"], 2)
         / f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
     )
+
+
+def clean_strings_from_symbols(source: Column) -> Column:
+    """Make strings URL-safe and consistent by lower-casing and replacing special characters with underscores.
+
+    Args:
+        source (Column): Source string
+
+    Returns:
+        Column: Cleaned string
+
+    Examples:
+        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
+        >>> df = spark.createDataFrame(d).toDF("source")
+        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
+        +-------------+---------+
+        |source       |cleaned  |
+        +-------------+---------+
+        |AbCd-12.2    |abcd-12_2|
+        |AaBb..123?   |aabb_123_|
+        |cDd!@#$%^&*()|cdd_     |
+        +-------------+---------+
+        <BLANKLINE>
+    """
+    characters_to_replace = r"[^a-z0-9-_]+"
+    return f.regexp_replace(f.lower(source), characters_to_replace, "_")
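For a quick check outside Spark, a plain-Python equivalent of the same cleaning rule (a sketch, not part of the commit) behaves identically:

import re

def clean_string(s: str) -> str:
    # mirror of the Spark expression: lower-case, then collapse every run of
    # characters outside [a-z0-9-_] into a single underscore
    return re.sub(r"[^a-z0-9-_]+", "_", s.lower())

assert clean_string("AbCd-12.2") == "abcd-12_2"
assert clean_string("cDd!@#$%^&*()") == "cdd_"

This complements the Makefile's `tr`-based lowering of `$(USER)`: both feed lowercase, symbol-free identifiers into cluster names and labels.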
13 changes: 13 additions & 0 deletions src/gentropy/config.py
@@ -103,6 +103,17 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig):
     )


+@dataclass
+class FoldXVariantAnnotationConfig(StepConfig):
+    """Step to ingest FoldX amino acid variation data."""
+
+    foldx_dataset_path: str = MISSING
+    plddt_threshold: float = 0.7
+    annotation_path: str = MISSING
+
+    _target_: str = "gentropy.foldx_ingestion.FoldXIngestionStep"
+
+
 @dataclass
 class EqtlCatalogueConfig(StepConfig):
     """eQTL Catalogue step configuration."""
@@ -517,6 +528,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
         {"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
         {"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
     ]
+    amino_acid_change_annotations: list[str] = MISSING

     _target_: str = "gentropy.variant_index.VariantIndexStep"

@@ -773,3 +785,4 @@ def register_config() -> None:
     )
     cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
     cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
+    cs.store(group="step", name="foldx_integration", node=FoldXVariantAnnotationConfig)
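With the step registered under `foldx_integration`, it can be launched like any other gentropy step through the Hydra CLI. A hypothetical invocation (the entrypoint follows the project's usual pattern; the paths are placeholders, and `plddt_threshold` keeps its 0.7 default from the dataclass — the name suggests filtering on AlphaFold per-residue confidence, though that is inferred from the field name alone):

gentropy step=foldx_integration \
    step.foldx_dataset_path=gs://my-bucket/foldx_input \
    step.annotation_path=gs://my-bucket/amino_acid_variants \
    step.plddt_threshold=0.7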
26 changes: 26 additions & 0 deletions src/gentropy/dataset/amino_acid_variants.py
@@ -0,0 +1,26 @@
+"""Dataset representing consequence of amino-acid changes in protein."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class AminoAcidVariants(Dataset):
+    """Dataset representing consequence of amino-acid changes in protein."""
+
+    @classmethod
+    def get_schema(cls: type[AminoAcidVariants]) -> StructType:
+        """Provides the schema for the AminoAcidVariants dataset.
+
+        Returns:
+            StructType: Schema for the AminoAcidVariants dataset
+        """
+        return parse_spark_schema("amino_acid_variants.json")
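As with other `Dataset` subclasses, an instance wraps a DataFrame validated against this schema. A minimal construction sketch (the `_df`/`_schema` pattern is taken from the other datasets in this diff; the accession, change, and gene values are illustrative only):

from gentropy.dataset.amino_acid_variants import AminoAcidVariants

schema = AminoAcidVariants.get_schema()
df = session.spark.createDataFrame(
    [
        (
            "P38398",  # hypothetical UniProt accession
            "C61G",    # amino-acid change
            # one in-silico predictor entry: method, assessment, score,
            # assessmentFlag, targetId, normalisedScore
            [("foldx", None, 3.2, None, "ENSG00000012048", 0.9)],
        )
    ],
    schema=schema,
)
aa_variants = AminoAcidVariants(_df=df, _schema=schema)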
26 changes: 17 additions & 9 deletions src/gentropy/dataset/l2g_prediction.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING

 import pyspark.sql.functions as f
@@ -29,6 +29,8 @@ class L2GPrediction(Dataset):
     confidence of the prediction that a gene is causal to an association.
     """

+    model: LocusToGeneModel | None = field(default=None, repr=False)
+
     @classmethod
     def get_schema(cls: type[L2GPrediction]) -> StructType:
         """Provides the schema for the L2GPrediction dataset.
@@ -44,7 +46,6 @@ def from_credible_set(
         session: Session,
         credible_set: StudyLocus,
         feature_matrix: L2GFeatureMatrix,
-        features_list: list[str],
         model_path: str | None,
         hf_token: str | None = None,
         download_from_hub: bool = True,
@@ -55,7 +56,6 @@ def from_credible_set(
             session (Session): Session object that contains the Spark session
             credible_set (StudyLocus): Dataset containing credible sets from GWAS only
             feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
-            features_list (list[str]): List of features to use for the model
             model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
             hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
             download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.
@@ -82,9 +82,8 @@ def from_credible_set(
                 )
             )
             .fill_na()
-            .select_features(features_list)
+            .select_features(l2g_model.features_list)
         )
-
         return l2g_model.predict(fm, session)

     def to_disease_target_evidence(
@@ -129,17 +128,22 @@ def to_disease_target_evidence(
         )

     def add_locus_to_gene_features(
-        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
+        self: L2GPrediction,
+        feature_matrix: L2GFeatureMatrix,
     ) -> L2GPrediction:
         """Add features used to extract the L2G predictions.

         Args:
             feature_matrix (L2GFeatureMatrix): Feature matrix dataset
-            features_list (list[str]): List of features used in the model

         Returns:
             L2GPrediction: L2G predictions with additional features
+
+        Raises:
+            ValueError: If the model is not set, the feature list won't be available.
         """
+        if self.model is None:
+            raise ValueError("Model not set, feature annotation cannot be created.")
         # Testing if `locusToGeneFeatures` column already exists:
         if "locusToGeneFeatures" in self.df.columns:
             self.df = self.df.drop("locusToGeneFeatures")
@@ -150,7 +154,10 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.create_map(
                     *sum(
-                        ((f.lit(feature), f.col(feature)) for feature in features_list),
+                        (
+                            (f.lit(feature), f.col(feature))
+                            for feature in self.model.features_list
+                        ),
                         (),
                     )
                 ),
@@ -159,11 +166,12 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
             )
-            .drop(*features_list)
+            .drop(*self.model.features_list)
         )
         return L2GPrediction(
             _df=self.df.join(
                 aggregated_features, on=["studyLocusId", "geneId"], how="left"
            ),
             _schema=self.get_schema(),
+            model=self.model,
         )
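With the model now carried on the dataset itself, callers no longer thread a `features_list` through both calls. A sketch of the updated call pattern (object names and the Hub repo are illustrative; that `predict` attaches the model to the returned dataset is assumed from the new `model` field and the `Raises` clause):

predictions = L2GPrediction.from_credible_set(
    session,
    credible_set,
    feature_matrix,
    model_path="username/l2g-model",  # hypothetical Hugging Face Hub name
    download_from_hub=True,
)
# the feature list now comes from predictions.model.features_list internally
annotated = predictions.add_locus_to_gene_features(feature_matrix)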
4 changes: 3 additions & 1 deletion src/gentropy/dataset/study_index.py
@@ -15,7 +15,9 @@

 from gentropy.assets import data
 from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.spark_helpers import convert_from_wide_to_long
+from gentropy.common.spark_helpers import (
+    convert_from_wide_to_long,
+)
 from gentropy.dataset.dataset import Dataset

 if TYPE_CHECKING:
10 changes: 9 additions & 1 deletion src/gentropy/dataset/study_locus.py
@@ -433,7 +433,8 @@ def _qc_subsignificant_associations(
     def qc_abnormal_pips(
         self: StudyLocus,
         sum_pips_lower_threshold: float = 0.99,
-        sum_pips_upper_threshold: float = 1.0001,  # Set slightly above 1 to account for floating point errors
+        # Set slightly above 1 to account for floating point errors
+        sum_pips_upper_threshold: float = 1.0001,
     ) -> StudyLocus:
         """Filter study-locus by sum of posterior inclusion probabilities to ensure that the sum of PIPs is within a given range.

@@ -691,6 +692,7 @@ def flag_trans_qtls(
         """Flagging transQTL credible sets based on genomic location of the measured gene.

         Process:
+        0. Make sure that the `isTransQtl` column does not exist (remove if it exists)
         1. Enrich study-locus dataset with geneId based on study metadata (only QTL studies are considered).
         2. Enrich with transcription start site and chromosome of the studied gene.
         3. Flag any tagging variant of QTL credible sets if the chromosome differs from the gene's or the distance is above the threshold.
@@ -709,6 +711,12 @@ def flag_trans_qtls(
         if "geneId" not in study_index.df.columns:
             return self

+        # We have to remove the `isTransQtl` column to ensure it is not duplicated.
+        # The duplication can happen when one reads the StudyLocus from parquet with a
+        # predefined schema that already contains the `isTransQtl` column.
+        if "isTransQtl" in self.df.columns:
+            self.df = self.df.drop("isTransQtl")
+
         # Process study index:
         processed_studies = (
             study_index.df