Merge branches 'xg1_l2g_intervals' and 'dev' of https://github.com/opentargets/gentropy into xg1_l2g_intervals
opentargets · Feb 6, 2025 · 8cc1f2c · 8cc1f2c
2 parents 2fce3d6 + 8622b5e
commit 8cc1f2c
Show file tree

Hide file tree

Showing 44 changed files with 735 additions and 290 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:
   autofix_commit_msg: "chore: pre-commit auto fixes [...]"
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.3
+    rev: v0.7.4
     hooks:
       - id: ruff
         args:
@@ -57,14 +57,14 @@ repos:
         exclude: "CHANGELOG.md"
 
   - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
-    rev: v9.20.0
+    rev: v9.18.0
     hooks:
       - id: commitlint
         additional_dependencies: ["@commitlint/config-conventional@18.6.3"]
         stages: [commit-msg]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.14.1"
+    rev: "v1.13.0"
     hooks:
       - id: mypy
         args:
@@ -97,7 +97,7 @@ repos:
       - id: beautysh
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 0.6.0
+    rev: 0.5.9
     hooks:
       - id: pydoclint
   - repo: https://github.com/astral-sh/uv-pre-commit

diff --git a/Makefile b/Makefile
@@ -3,6 +3,7 @@ PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
 APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed  's/"//g')
 PACKAGE_VERSION ?= $(shell grep -m 1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+USER_SAFE ?= $(shell echo $(USER) | tr '[:upper:]' '[:lower:]')
 # NOTE: git rev-parse will always return the HEAD if it sits in the tag,
 # this way we can distinguish the tag vs branch name
 ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
@@ -57,7 +58,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 	@./utils/clean_status.sh || (echo "ERROR: Commit and push or stash local changes, to have up to date cluster"; exit 1)
 	@echo "Creating Dataproc Dev Cluster"
 	gcloud config set project ${PROJECT_ID}
-	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
+	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
 		--image-version 2.2 \
 		--region ${REGION} \
 		--master-machine-type n1-standard-2 \
@@ -70,6 +71,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 		--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
 		--optional-components=JUPYTER \
 		--enable-component-gateway \
+		--labels team=open-targets,subteam=gentropy,created_by=${USER_SAFE},environment=development, \
 		--max-idle=60m
 
 update-dev-cluster: build ## Reinstalls the package on the dev-cluster

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,10 +10,10 @@ requires-python = ">=3.10, <3.13"
 dependencies = [
   "pyspark (>=3.5.0, <3.6)",
   "hail (>=0.2.133, <0.3.0)",
-  "scipy (>=1.11.4, <1.12.0)",
+  "scipy (>=1.11.4, <1.16.0)",
   "hydra-core (>=1.3.2, <1.4.0)",
   "pyliftover (>=0.4.1, <0.5.0)",
-  "numpy (>=1.26.4, <1.27.0)",
+  "numpy (>=1.26.4, <2.3.0)",
   "wandb (>=0.19.4, <0.20.0)",
   "omegaconf (>=2.3.0, <2.4.0)",
   "typing-extensions (>=4.12.2, <4.13.0)",
@@ -23,7 +23,7 @@ dependencies = [
   "shap (>=0.46, <0.47)",
   "matplotlib (>=3.10.0, <3.11.0)",
   "google-cloud-secret-manager (>=2.12.6, <2.13.0)",
-  "google-cloud-storage (>=2.14.0, <2.15.0)",
+  "google-cloud-storage (>=2.14.0, <3.1.0)",
 ]
 classifiers = [
   "Programming Language :: Python :: 3.10",

diff --git a/src/gentropy/assets/schemas/amino_acid_variants.json b/src/gentropy/assets/schemas/amino_acid_variants.json
@@ -0,0 +1,67 @@
+{
+  "fields": [
+    {
+      "metadata": {},
+      "name": "uniprotAccession",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "aminoAcidChange",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "inSilicoPredictors",
+      "nullable": true,
+      "type": {
+        "containsNull": true,
+        "elementType": {
+          "fields": [
+            {
+              "metadata": {},
+              "name": "method",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "assessment",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "score",
+              "nullable": true,
+              "type": "float"
+            },
+            {
+              "metadata": {},
+              "name": "assessmentFlag",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "targetId",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "normalisedScore",
+              "nullable": true,
+              "type": "double"
+            }
+          ],
+          "type": "struct"
+        },
+        "type": "array"
+      }
+    }
+  ],
+  "type": "struct"
+}
diff --git a/src/gentropy/common/session.py b/src/gentropy/common/session.py
@@ -42,7 +42,8 @@ def __init__(  # noqa: D107
         )
 
         self.spark = (
-            SparkSession.builder.config(conf=merged_conf)
+            SparkSession.Builder()
+            .config(conf=merged_conf)
             .master(spark_uri)
             .appName(app_name)
             .getOrCreate()

diff --git a/src/gentropy/common/spark_helpers.py b/src/gentropy/common/spark_helpers.py
@@ -886,3 +886,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
         / f.pow(x["pos"], 2)
         / f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
     )
+
+
+def clean_strings_from_symbols(source: Column) -> Column:
+    """Make strings URL-safe and consistent by lower-casing and replacing special characters with underscores.
+
+    Args:
+        source (Column): Source string
+
+    Returns:
+        Column: Cleaned string
+
+    Examples:
+        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
+        >>> df = spark.createDataFrame(d).toDF("source")
+        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
+        +-------------+---------+
+        |source       |cleaned  |
+        +-------------+---------+
+        |AbCd-12.2    |abcd-12_2|
+        |AaBb..123?   |aabb_123_|
+        |cDd!@#$%^&*()|cdd_     |
+        +-------------+---------+
+        <BLANKLINE>
+    """
+    characters_to_replace = r"[^a-z0-9-_]+"
+    return f.regexp_replace(f.lower(source), characters_to_replace, "_")
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -103,6 +103,17 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig):
     )
 
 
+@dataclass
+class FoldXVariantAnnotationConfig(StepConfig):
+    """Step to ingest FoldX amino acid variation data."""
+
+    foldx_dataset_path: str = MISSING
+    plddt_threshold: float = 0.7
+    annotation_path: str = MISSING
+
+    _target_: str = "gentropy.foldx_ingestion.FoldXIngestionStep"
+
+
 @dataclass
 class EqtlCatalogueConfig(StepConfig):
     """eQTL Catalogue step configuration."""
@@ -532,6 +543,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
         {"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
         {"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
     ]
+    amino_acid_change_annotations: list[str] = MISSING
 
     _target_: str = "gentropy.variant_index.VariantIndexStep"
 
@@ -787,3 +799,4 @@ def register_config() -> None:
     )
     cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
     cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
+    cs.store(group="step", name="foldx_integration", node=FoldXVariantAnnotationConfig)
diff --git a/src/gentropy/dataset/amino_acid_variants.py b/src/gentropy/dataset/amino_acid_variants.py
@@ -0,0 +1,26 @@
+"""Dataset representing consequence of amino-acid changes in protein."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class AminoAcidVariants(Dataset):
+    """Dataset representing consequence of amino-acid changes in protein."""
+
+    @classmethod
+    def get_schema(cls: type[AminoAcidVariants]) -> StructType:
+        """Provides the schema for the AminoAcidVariants dataset.
+
+        Returns:
+            StructType: Schema for the AminoAcidVariants dataset
+        """
+        return parse_spark_schema("amino_acid_variants.json")
diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
 import pyspark.sql.functions as f
@@ -29,6 +29,8 @@ class L2GPrediction(Dataset):
     confidence of the prediction that a gene is causal to an association.
     """
 
+    model: LocusToGeneModel | None = field(default=None, repr=False)
+
     @classmethod
     def get_schema(cls: type[L2GPrediction]) -> StructType:
         """Provides the schema for the L2GPrediction dataset.
@@ -44,7 +46,6 @@ def from_credible_set(
         session: Session,
         credible_set: StudyLocus,
         feature_matrix: L2GFeatureMatrix,
-        features_list: list[str],
         model_path: str | None,
         hf_token: str | None = None,
         download_from_hub: bool = True,
@@ -55,7 +56,6 @@ def from_credible_set(
             session (Session): Session object that contains the Spark session
             credible_set (StudyLocus): Dataset containing credible sets from GWAS only
             feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
-            features_list (list[str]): List of features to use for the model
             model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
             hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
             download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.
@@ -82,9 +82,8 @@ def from_credible_set(
                 )
             )
             .fill_na()
-            .select_features(features_list)
+            .select_features(l2g_model.features_list)
         )
-
         return l2g_model.predict(fm, session)
 
     def to_disease_target_evidence(
@@ -129,17 +128,22 @@ def to_disease_target_evidence(
         )
 
     def add_locus_to_gene_features(
-        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
+        self: L2GPrediction,
+        feature_matrix: L2GFeatureMatrix,
     ) -> L2GPrediction:
         """Add features used to extract the L2G predictions.
 
         Args:
             feature_matrix (L2GFeatureMatrix): Feature matrix dataset
-            features_list (list[str]): List of features used in the model
 
         Returns:
             L2GPrediction: L2G predictions with additional features
+
+        Raises:
+            ValueError: If model is not set, feature list won't be available
         """
+        if self.model is None:
+            raise ValueError("Model not set, feature annotation cannot be created.")
         # Testing if `locusToGeneFeatures` column already exists:
         if "locusToGeneFeatures" in self.df.columns:
             self.df = self.df.drop("locusToGeneFeatures")
@@ -150,7 +154,10 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.create_map(
                     *sum(
-                        ((f.lit(feature), f.col(feature)) for feature in features_list),
+                        (
+                            (f.lit(feature), f.col(feature))
+                            for feature in self.model.features_list
+                        ),
                         (),
                     )
                 ),
@@ -159,11 +166,12 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
             )
-            .drop(*features_list)
+            .drop(*self.model.features_list)
         )
         return L2GPrediction(
             _df=self.df.join(
                 aggregated_features, on=["studyLocusId", "geneId"], how="left"
             ),
             _schema=self.get_schema(),
+            model=self.model,
         )
diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
@@ -15,7 +15,9 @@
 
 from gentropy.assets import data
 from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.spark_helpers import convert_from_wide_to_long
+from gentropy.common.spark_helpers import (
+    convert_from_wide_to_long,
+)
 from gentropy.dataset.dataset import Dataset
 
 if TYPE_CHECKING: