Skip to content

Commit

Permalink
Merge branches 'xg1_l2g_intervals' and 'dev' of https://github.com/op…
Browse files Browse the repository at this point in the history
…entargets/gentropy into xg1_l2g_intervals
  • Loading branch information
xyg123 committed Feb 6, 2025
2 parents 2fce3d6 + 8622b5e commit 8cc1f2c
Show file tree
Hide file tree
Showing 44 changed files with 735 additions and 290 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ci:
autofix_commit_msg: "chore: pre-commit auto fixes [...]"
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
rev: v0.7.4
hooks:
- id: ruff
args:
Expand Down Expand Up @@ -57,14 +57,14 @@ repos:
exclude: "CHANGELOG.md"

- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.20.0
rev: v9.18.0
hooks:
- id: commitlint
additional_dependencies: ["@commitlint/[email protected]"]
stages: [commit-msg]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v1.14.1"
rev: "v1.13.0"
hooks:
- id: mypy
args:
Expand Down Expand Up @@ -97,7 +97,7 @@ repos:
- id: beautysh

- repo: https://github.com/jsh9/pydoclint
rev: 0.6.0
rev: 0.5.9
hooks:
- id: pydoclint
- repo: https://github.com/astral-sh/uv-pre-commit
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ PROJECT_ID ?= open-targets-genetics-dev
REGION ?= europe-west1
APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
PACKAGE_VERSION ?= $(shell grep -m 1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
USER_SAFE ?= $(shell echo $(USER) | tr '[:upper:]' '[:lower:]')
# NOTE: git rev-parse will always return the HEAD if it sits in the tag,
# this way we can distinguish the tag vs branch name
ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
Expand Down Expand Up @@ -57,7 +58,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
@./utils/clean_status.sh || (echo "ERROR: Commit and push or stash local changes, to have up to date cluster"; exit 1)
@echo "Creating Dataproc Dev Cluster"
gcloud config set project ${PROJECT_ID}
gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
--image-version 2.2 \
--region ${REGION} \
--master-machine-type n1-standard-2 \
Expand All @@ -70,6 +71,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
--optional-components=JUPYTER \
--enable-component-gateway \
--labels team=open-targets,subteam=gentropy,created_by=${USER_SAFE},environment=development, \
--max-idle=60m

update-dev-cluster: build ## Reinstalls the package on the dev-cluster
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ requires-python = ">=3.10, <3.13"
dependencies = [
"pyspark (>=3.5.0, <3.6)",
"hail (>=0.2.133, <0.3.0)",
"scipy (>=1.11.4, <1.12.0)",
"scipy (>=1.11.4, <1.16.0)",
"hydra-core (>=1.3.2, <1.4.0)",
"pyliftover (>=0.4.1, <0.5.0)",
"numpy (>=1.26.4, <1.27.0)",
"numpy (>=1.26.4, <2.3.0)",
"wandb (>=0.19.4, <0.20.0)",
"omegaconf (>=2.3.0, <2.4.0)",
"typing-extensions (>=4.12.2, <4.13.0)",
Expand All @@ -23,7 +23,7 @@ dependencies = [
"shap (>=0.46, <0.47)",
"matplotlib (>=3.10.0, <3.11.0)",
"google-cloud-secret-manager (>=2.12.6, <2.13.0)",
"google-cloud-storage (>=2.14.0, <2.15.0)",
"google-cloud-storage (>=2.14.0, <3.1.0)",
]
classifiers = [
"Programming Language :: Python :: 3.10",
Expand Down
67 changes: 67 additions & 0 deletions src/gentropy/assets/schemas/amino_acid_variants.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{
"fields": [
{
"metadata": {},
"name": "uniprotAccession",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "aminoAcidChange",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "inSilicoPredictors",
"nullable": true,
"type": {
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "method",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "assessment",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "score",
"nullable": true,
"type": "float"
},
{
"metadata": {},
"name": "assessmentFlag",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "targetId",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "normalisedScore",
"nullable": true,
"type": "double"
}
],
"type": "struct"
},
"type": "array"
}
}
],
"type": "struct"
}
3 changes: 2 additions & 1 deletion src/gentropy/common/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def __init__( # noqa: D107
)

self.spark = (
SparkSession.builder.config(conf=merged_conf)
SparkSession.Builder()
.config(conf=merged_conf)
.master(spark_uri)
.appName(app_name)
.getOrCreate()
Expand Down
26 changes: 26 additions & 0 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
/ f.pow(x["pos"], 2)
/ f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
)


def clean_strings_from_symbols(source: Column) -> Column:
    """Make strings URL-safe and consistent by lower-casing and replacing special characters with underscores.

    The value is lower-cased first; afterwards every run of one or more characters
    that is not a lower-case letter, a digit, a hyphen or an underscore is replaced
    by a single underscore.

    Args:
        source (Column): Source string

    Returns:
        Column: Cleaned string

    Examples:
        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
        >>> df = spark.createDataFrame(d).toDF("source")
        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
        +-------------+---------+
        |source       |cleaned  |
        +-------------+---------+
        |AbCd-12.2    |abcd-12_2|
        |AaBb..123?   |aabb_123_|
        |cDd!@#$%^&*()|cdd_     |
        +-------------+---------+
        <BLANKLINE>
    """
    # Lower-case before matching so the character class only needs to list a-z.
    lowered = f.lower(source)
    return f.regexp_replace(lowered, r"[^a-z0-9-_]+", "_")
13 changes: 13 additions & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,17 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig):
)


@dataclass
class FoldXVariantAnnotationConfig(StepConfig):
    """Step to ingest FoldX amino acid variation data."""

    # Path of the FoldX dataset. Declared as MISSING, i.e. no default is provided here.
    foldx_dataset_path: str = MISSING
    # pLDDT threshold; defaults to 0.7.
    # NOTE(review): presumably a structure-confidence cut-off applied by the step - confirm in the step implementation.
    plddt_threshold: float = 0.7
    # Path of the annotation dataset. Declared as MISSING, i.e. no default is provided here.
    # NOTE(review): whether the step reads or writes this path is not visible from this class - confirm.
    annotation_path: str = MISSING

    # Dotted path of the step class this configuration points at.
    # NOTE(review): the config is stored under the step name "foldx_integration" while the
    # target module is "foldx_ingestion" - confirm the naming difference is intended.
    _target_: str = "gentropy.foldx_ingestion.FoldXIngestionStep"


@dataclass
class EqtlCatalogueConfig(StepConfig):
"""eQTL Catalogue step configuration."""
Expand Down Expand Up @@ -532,6 +543,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
{"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
{"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
]
amino_acid_change_annotations: list[str] = MISSING

_target_: str = "gentropy.variant_index.VariantIndexStep"

Expand Down Expand Up @@ -787,3 +799,4 @@ def register_config() -> None:
)
cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
cs.store(group="step", name="foldx_integration", node=FoldXVariantAnnotationConfig)
26 changes: 26 additions & 0 deletions src/gentropy/dataset/amino_acid_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Dataset representing consequence of amino-acid changes in protein."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from gentropy.common.schemas import parse_spark_schema
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
from pyspark.sql.types import StructType


@dataclass
class AminoAcidVariants(Dataset):
    """Dataset representing the consequences of amino-acid changes in proteins."""

    @classmethod
    def get_schema(cls: type[AminoAcidVariants]) -> StructType:
        """Return the Spark schema of the AminoAcidVariants dataset.

        Returns:
            StructType: Schema parsed from the `amino_acid_variants.json` schema file
        """
        schema_file = "amino_acid_variants.json"
        return parse_spark_schema(schema_file)
26 changes: 17 additions & 9 deletions src/gentropy/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import TYPE_CHECKING

import pyspark.sql.functions as f
Expand All @@ -29,6 +29,8 @@ class L2GPrediction(Dataset):
confidence of the prediction that a gene is causal to an association.
"""

model: LocusToGeneModel | None = field(default=None, repr=False)

@classmethod
def get_schema(cls: type[L2GPrediction]) -> StructType:
"""Provides the schema for the L2GPrediction dataset.
Expand All @@ -44,7 +46,6 @@ def from_credible_set(
session: Session,
credible_set: StudyLocus,
feature_matrix: L2GFeatureMatrix,
features_list: list[str],
model_path: str | None,
hf_token: str | None = None,
download_from_hub: bool = True,
Expand All @@ -55,7 +56,6 @@ def from_credible_set(
session (Session): Session object that contains the Spark session
credible_set (StudyLocus): Dataset containing credible sets from GWAS only
feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
features_list (list[str]): List of features to use for the model
model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.
Expand All @@ -82,9 +82,8 @@ def from_credible_set(
)
)
.fill_na()
.select_features(features_list)
.select_features(l2g_model.features_list)
)

return l2g_model.predict(fm, session)

def to_disease_target_evidence(
Expand Down Expand Up @@ -129,17 +128,22 @@ def to_disease_target_evidence(
)

def add_locus_to_gene_features(
self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
self: L2GPrediction,
feature_matrix: L2GFeatureMatrix,
) -> L2GPrediction:
"""Add features used to extract the L2G predictions.
Args:
feature_matrix (L2GFeatureMatrix): Feature matrix dataset
features_list (list[str]): List of features used in the model
Returns:
L2GPrediction: L2G predictions with additional features
Raises:
ValueError: If model is not set, feature list won't be available
"""
if self.model is None:
raise ValueError("Model not set, feature annotation cannot be created.")
# Testing if `locusToGeneFeatures` column already exists:
if "locusToGeneFeatures" in self.df.columns:
self.df = self.df.drop("locusToGeneFeatures")
Expand All @@ -150,7 +154,10 @@ def add_locus_to_gene_features(
"locusToGeneFeatures",
f.create_map(
*sum(
((f.lit(feature), f.col(feature)) for feature in features_list),
(
(f.lit(feature), f.col(feature))
for feature in self.model.features_list
),
(),
)
),
Expand All @@ -159,11 +166,12 @@ def add_locus_to_gene_features(
"locusToGeneFeatures",
f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
)
.drop(*features_list)
.drop(*self.model.features_list)
)
return L2GPrediction(
_df=self.df.join(
aggregated_features, on=["studyLocusId", "geneId"], how="left"
),
_schema=self.get_schema(),
model=self.model,
)
4 changes: 3 additions & 1 deletion src/gentropy/dataset/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

from gentropy.assets import data
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.common.spark_helpers import (
convert_from_wide_to_long,
)
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand Down
Loading

0 comments on commit 8cc1f2c

Please sign in to comment.