Merge branch 'dev' into vh-integrate-lof-data

vivienho authored Feb 7, 2025
2 parents 8383ba5 + 30a6046 commit 597cf05

Showing 27 changed files with 865 additions and 148 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -45,7 +45,7 @@ repos:
       - id: python-check-blanket-noqa

   - repo: https://github.com/hadialqattan/pycln
-    rev: v2.4.0
+    rev: v2.5.0
     hooks:
       - id: pycln
         args: [--all]
4 changes: 3 additions & 1 deletion Makefile
@@ -3,6 +3,7 @@ PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
 APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
 PACKAGE_VERSION ?= $(shell grep -m 1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+USER_SAFE ?= $(shell echo $(USER) | tr '[:upper:]' '[:lower:]')
 # NOTE: git rev-parse will always return the HEAD if it sits in the tag,
 # this way we can distinguish the tag vs branch name
 ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
@@ -57,7 +58,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 	@./utils/clean_status.sh || (echo "ERROR: Commit and push or stash local changes, to have up to date cluster"; exit 1)
 	@echo "Creating Dataproc Dev Cluster"
 	gcloud config set project ${PROJECT_ID}
-	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER)" \
+	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
 		--image-version 2.2 \
 		--region ${REGION} \
 		--master-machine-type n1-standard-2 \
@@ -70,6 +71,7 @@ create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up
 		--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
 		--optional-components=JUPYTER \
 		--enable-component-gateway \
+		--labels team=open-targets,subteam=gentropy,created_by=${USER_SAFE},environment=development, \
 		--max-idle=60m

 update-dev-cluster: build ## Reinstalls the package on the dev-cluster
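The `USER_SAFE` variable matters because Dataproc cluster names and label values must be lowercase, while `$(USER)` often is not. A quick check of what the Makefile's own substitution produces (the user name is hypothetical):

$ echo "VivienHo" | tr '[:upper:]' '[:lower:]'
vivienho

So the cluster name becomes "ot-genetics-dev-<version>-vivienho", which satisfies Dataproc's lowercase naming rule, and `created_by` gets a valid label value.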
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -10,10 +10,10 @@ requires-python = ">=3.10, <3.13"
 dependencies = [
     "pyspark (>=3.5.0, <3.6)",
     "hail (>=0.2.133, <0.3.0)",
-    "scipy (>=1.11.4, <1.12.0)",
+    "scipy (>=1.11.4, <1.16.0)",
     "hydra-core (>=1.3.2, <1.4.0)",
     "pyliftover (>=0.4.1, <0.5.0)",
-    "numpy (>=1.26.4, <1.27.0)",
+    "numpy (>=1.26.4, <2.3.0)",
     "wandb (>=0.19.4, <0.20.0)",
     "omegaconf (>=2.3.0, <2.4.0)",
     "typing-extensions (>=4.12.2, <4.13.0)",
@@ -23,7 +23,7 @@ dependencies = [
     "shap (>=0.46, <0.47)",
     "matplotlib (>=3.10.0, <3.11.0)",
     "google-cloud-secret-manager (>=2.12.6, <2.13.0)",
-    "google-cloud-storage (>=2.14.0, <2.15.0)",
+    "google-cloud-storage (>=2.14.0, <3.1.0)",
 ]
 classifiers = [
     "Programming Language :: Python :: 3.10",
67 changes: 67 additions & 0 deletions src/gentropy/assets/schemas/amino_acid_variants.json
@@ -0,0 +1,67 @@
+{
+  "fields": [
+    {
+      "metadata": {},
+      "name": "uniprotAccession",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "aminoAcidChange",
+      "nullable": true,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "inSilicoPredictors",
+      "nullable": true,
+      "type": {
+        "containsNull": true,
+        "elementType": {
+          "fields": [
+            {
+              "metadata": {},
+              "name": "method",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "assessment",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "score",
+              "nullable": true,
+              "type": "float"
+            },
+            {
+              "metadata": {},
+              "name": "assessmentFlag",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "targetId",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "normalisedScore",
+              "nullable": true,
+              "type": "double"
+            }
+          ],
+          "type": "struct"
+        },
+        "type": "array"
+      }
+    }
+  ],
+  "type": "struct"
+}
3 changes: 2 additions & 1 deletion src/gentropy/common/session.py
@@ -42,7 +42,8 @@ def __init__(  # noqa: D107
         )

         self.spark = (
-            SparkSession.builder.config(conf=merged_conf)
+            SparkSession.Builder()
+            .config(conf=merged_conf)
             .master(spark_uri)
             .appName(app_name)
             .getOrCreate()
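Instantiating `SparkSession.Builder()` rather than using the `SparkSession.builder` attribute plausibly guards against builder state shared at class level, where in some PySpark versions options set while building one session could leak into the next. A minimal sketch of the distinction (the sharing behaviour is version-dependent; treat this as an assumption, not a statement of this commit's motivation):

from pyspark.sql import SparkSession

fresh = SparkSession.Builder()   # always a brand-new builder, no inherited .config() options
shared = SparkSession.builder    # historically a class-level builder in older PySpark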
26 changes: 26 additions & 0 deletions src/gentropy/common/spark_helpers.py
@@ -886,3 +886,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
         / f.pow(x["pos"], 2)
         / f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
     )
+
+
+def clean_strings_from_symbols(source: Column) -> Column:
+    """Make strings URL-safe and consistent by lower-casing and replacing special characters with underscores.
+
+    Args:
+        source (Column): Source string
+
+    Returns:
+        Column: Cleaned string
+
+    Examples:
+        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
+        >>> df = spark.createDataFrame(d).toDF("source")
+        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
+        +-------------+---------+
+        |source       |cleaned  |
+        +-------------+---------+
+        |AbCd-12.2    |abcd-12_2|
+        |AaBb..123?   |aabb_123_|
+        |cDd!@#$%^&*()|cdd_     |
+        +-------------+---------+
+        <BLANKLINE>
+    """
+    characters_to_replace = r"[^a-z0-9-_]+"
+    return f.regexp_replace(f.lower(source), characters_to_replace, "_")
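For a quick check outside Spark, a plain-Python equivalent of the same cleaning rule (a sketch, not part of the commit) behaves identically:

import re

def clean_string(s: str) -> str:
    # mirror of the Spark expression: lower-case, then collapse every run of
    # characters outside [a-z0-9-_] into a single underscore
    return re.sub(r"[^a-z0-9-_]+", "_", s.lower())

assert clean_string("AbCd-12.2") == "abcd-12_2"
assert clean_string("cDd!@#$%^&*()") == "cdd_"

This complements the Makefile's `tr`-based lowering of `$(USER)`: both feed lowercase, symbol-free identifiers into cluster names and labels.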
13 changes: 13 additions & 0 deletions src/gentropy/config.py
@@ -103,6 +103,17 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig):
     )


+@dataclass
+class FoldXVariantAnnotationConfig(StepConfig):
+    """Step to ingest FoldX amino acid variation data."""
+
+    foldx_dataset_path: str = MISSING
+    plddt_threshold: float = 0.7
+    annotation_path: str = MISSING
+
+    _target_: str = "gentropy.foldx_ingestion.FoldXIngestionStep"
+
+
 @dataclass
 class EqtlCatalogueConfig(StepConfig):
     """eQTL Catalogue step configuration."""
@@ -517,6 +528,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
         {"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
         {"id": "SO_0001060", "label": "intergenic_variant", "score": 0.0},
     ]
+    amino_acid_change_annotations: list[str] = MISSING

     _target_: str = "gentropy.variant_index.VariantIndexStep"

@@ -773,3 +785,4 @@ def register_config() -> None:
     )
     cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
     cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
+    cs.store(group="step", name="foldx_integration", node=FoldXVariantAnnotationConfig)
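With the step registered under `foldx_integration`, it can be launched like any other gentropy step through the Hydra CLI. A hypothetical invocation (the entrypoint follows the project's usual pattern; the paths are placeholders, and `plddt_threshold` keeps its 0.7 default from the dataclass — the name suggests filtering on AlphaFold per-residue confidence, though that is inferred from the field name alone):

gentropy step=foldx_integration \
    step.foldx_dataset_path=gs://my-bucket/foldx_input \
    step.annotation_path=gs://my-bucket/amino_acid_variants \
    step.plddt_threshold=0.7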
26 changes: 26 additions & 0 deletions src/gentropy/dataset/amino_acid_variants.py
@@ -0,0 +1,26 @@
+"""Dataset representing consequence of amino-acid changes in protein."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class AminoAcidVariants(Dataset):
+    """Dataset representing consequence of amino-acid changes in protein."""
+
+    @classmethod
+    def get_schema(cls: type[AminoAcidVariants]) -> StructType:
+        """Provides the schema for the AminoAcidVariants dataset.
+
+        Returns:
+            StructType: Schema for the AminoAcidVariants dataset
+        """
+        return parse_spark_schema("amino_acid_variants.json")
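As with other `Dataset` subclasses, an instance wraps a DataFrame validated against this schema. A minimal construction sketch (the `_df`/`_schema` pattern is taken from the other datasets in this diff; the accession, change, and gene values are illustrative only):

from gentropy.dataset.amino_acid_variants import AminoAcidVariants

schema = AminoAcidVariants.get_schema()
df = session.spark.createDataFrame(
    [
        (
            "P38398",  # hypothetical UniProt accession
            "C61G",    # amino-acid change
            # one in-silico predictor entry: method, assessment, score,
            # assessmentFlag, targetId, normalisedScore
            [("foldx", None, 3.2, None, "ENSG00000012048", 0.9)],
        )
    ],
    schema=schema,
)
aa_variants = AminoAcidVariants(_df=df, _schema=schema)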
26 changes: 17 additions & 9 deletions src/gentropy/dataset/l2g_prediction.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING

 import pyspark.sql.functions as f
@@ -29,6 +29,8 @@ class L2GPrediction(Dataset):
     confidence of the prediction that a gene is causal to an association.
     """

+    model: LocusToGeneModel | None = field(default=None, repr=False)
+
     @classmethod
     def get_schema(cls: type[L2GPrediction]) -> StructType:
         """Provides the schema for the L2GPrediction dataset.
@@ -44,7 +46,6 @@ def from_credible_set(
         session: Session,
         credible_set: StudyLocus,
         feature_matrix: L2GFeatureMatrix,
-        features_list: list[str],
         model_path: str | None,
         hf_token: str | None = None,
         download_from_hub: bool = True,
@@ -55,7 +56,6 @@ def from_credible_set(
             session (Session): Session object that contains the Spark session
             credible_set (StudyLocus): Dataset containing credible sets from GWAS only
             feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
-            features_list (list[str]): List of features to use for the model
             model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
             hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
             download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.
@@ -82,9 +82,8 @@ def from_credible_set(
                 )
             )
             .fill_na()
-            .select_features(features_list)
+            .select_features(l2g_model.features_list)
         )
-
         return l2g_model.predict(fm, session)

     def to_disease_target_evidence(
@@ -129,17 +128,22 @@ def to_disease_target_evidence(
         )

     def add_locus_to_gene_features(
-        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
+        self: L2GPrediction,
+        feature_matrix: L2GFeatureMatrix,
     ) -> L2GPrediction:
         """Add features used to extract the L2G predictions.

         Args:
             feature_matrix (L2GFeatureMatrix): Feature matrix dataset
-            features_list (list[str]): List of features used in the model

         Returns:
             L2GPrediction: L2G predictions with additional features
+
+        Raises:
+            ValueError: If the model is not set, the feature list won't be available.
         """
+        if self.model is None:
+            raise ValueError("Model not set, feature annotation cannot be created.")
         # Testing if `locusToGeneFeatures` column already exists:
         if "locusToGeneFeatures" in self.df.columns:
             self.df = self.df.drop("locusToGeneFeatures")
@@ -150,7 +154,10 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.create_map(
                     *sum(
-                        ((f.lit(feature), f.col(feature)) for feature in features_list),
+                        (
+                            (f.lit(feature), f.col(feature))
+                            for feature in self.model.features_list
+                        ),
                         (),
                     )
                 ),
@@ -159,11 +166,12 @@ def add_locus_to_gene_features(
                 "locusToGeneFeatures",
                 f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
             )
-            .drop(*features_list)
+            .drop(*self.model.features_list)
         )
         return L2GPrediction(
             _df=self.df.join(
                 aggregated_features, on=["studyLocusId", "geneId"], how="left"
            ),
             _schema=self.get_schema(),
+            model=self.model,
         )
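With the model now carried on the dataset itself, callers no longer thread a `features_list` through both calls. A sketch of the updated call pattern (object names and the Hub repo are illustrative; that `predict` attaches the model to the returned dataset is assumed from the new `model` field and the `Raises` clause):

predictions = L2GPrediction.from_credible_set(
    session,
    credible_set,
    feature_matrix,
    model_path="username/l2g-model",  # hypothetical Hugging Face Hub name
    download_from_hub=True,
)
# the feature list now comes from predictions.model.features_list internally
annotated = predictions.add_locus_to_gene_features(feature_matrix)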
4 changes: 3 additions & 1 deletion src/gentropy/dataset/study_index.py
@@ -15,7 +15,9 @@

 from gentropy.assets import data
 from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.spark_helpers import convert_from_wide_to_long
+from gentropy.common.spark_helpers import (
+    convert_from_wide_to_long,
+)
 from gentropy.dataset.dataset import Dataset

 if TYPE_CHECKING:
10 changes: 9 additions & 1 deletion src/gentropy/dataset/study_locus.py
@@ -433,7 +433,8 @@ def _qc_subsignificant_associations(
     def qc_abnormal_pips(
         self: StudyLocus,
         sum_pips_lower_threshold: float = 0.99,
-        sum_pips_upper_threshold: float = 1.0001,  # Set slightly above 1 to account for floating point errors
+        # Set slightly above 1 to account for floating point errors
+        sum_pips_upper_threshold: float = 1.0001,
     ) -> StudyLocus:
         """Filter study-locus by sum of posterior inclusion probabilities to ensure that the sum of PIPs is within a given range.

@@ -691,6 +692,7 @@ def flag_trans_qtls(
         """Flagging transQTL credible sets based on genomic location of the measured gene.

         Process:
+        0. Make sure that the `isTransQtl` column does not exist (remove if it exists)
         1. Enrich study-locus dataset with geneId based on study metadata (only QTL studies are considered).
         2. Enrich with transcription start site and chromosome of the studied gene.
         3. Flag any tagging variant of QTL credible sets if the chromosome differs from the gene's or the distance is above the threshold.
@@ -709,6 +711,12 @@ def flag_trans_qtls(
         if "geneId" not in study_index.df.columns:
             return self

+        # We have to remove the `isTransQtl` column to ensure it is not duplicated.
+        # The duplication can happen when one reads the StudyLocus from parquet with a
+        # predefined schema that already contains the `isTransQtl` column.
+        if "isTransQtl" in self.df.columns:
+            self.df = self.df.drop("isTransQtl")
+
         # Process study index:
         processed_studies = (
             study_index.df