Skip to content

Commit

Permalink
chore: review study locus and study index configs (#326)
Browse files Browse the repository at this point in the history
* chore: make studylocus and study indices configs clearer

* chore: temporarily turn off removal of redundancies due to performance issues

* refactor: read studyindex and studylocus recursively
  • Loading branch information
ireneisdoomed authored Dec 12, 2023
1 parent 923684c commit 7dfce61
Show file tree
Hide file tree
Showing 12 changed files with 39 additions and 26 deletions.
6 changes: 3 additions & 3 deletions config/datasets/gcp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,16 @@ eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabi
gene_index: ${datasets.outputs}/gene_index
variant_annotation: ${datasets.outputs}/variant_annotation
variant_index: ${datasets.outputs}/variant_index
study_index: ${datasets.outputs}/study_index
study_locus: ${datasets.outputs}/study_locus
credible_set: ${datasets.outputs}/credible_set
study_index: ${datasets.outputs}/study_index
summary_statistics: ${datasets.outputs}/summary_statistics
study_locus_overlap: ${datasets.outputs}/study_locus_overlap
colocalisation: ${datasets.outputs}/colocalisation
v2g: ${datasets.outputs}/v2g
ld_index: ${datasets.outputs}/ld_index
catalog_study_index: ${datasets.study_index}/catalog_curated
catalog_study_locus: ${datasets.credible_set}/catalog_curated
catalog_study_index: ${datasets.study_index}/catalog
catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
finngen_study_index: ${datasets.study_index}/finngen
finngen_summary_stats: ${datasets.summary_statistics}/finngen
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
Expand Down
2 changes: 1 addition & 1 deletion config/step/locus_to_gene.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ predictions_path: ${datasets.l2g_predictions}
credible_set_path: ${datasets.credible_set}
variant_gene_path: ${datasets.v2g}
colocalisation_path: ${datasets.colocalisation}
study_index_path: ${datasets.catalog_study_index}
study_index_path: ${datasets.study_index}
study_locus_overlap_path: ${datasets.study_locus_overlap}
gold_standard_curation_path: ${datasets.l2g_gold_standard_curation}
gene_interactions_path: ${datasets.gene_interactions}
Expand Down
4 changes: 2 additions & 2 deletions config/step/pics.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
_target_: otg.pics.PICSStep
study_locus_ld_annotated_in: ${datasets.from_sumstats_study_locus}
picsed_study_locus_out: ${datasets.from_sumstats_pics}
study_locus_ld_annotated_in: ???
picsed_study_locus_out: ???
4 changes: 2 additions & 2 deletions config/step/study_locus_overlap.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
_target_: otg.overlaps.OverlapsIndexStep
study_locus_path: ${datasets.outputs}/catalog_study_locus
study_index_path: ${datasets.outputs}/catalog_study_index
study_locus_path: ${datasets.outputs}/credible_set
study_index_path: ${datasets.outputs}/study_index
overlaps_index_out: ${datasets.outputs}/study_locus_overlap
2 changes: 1 addition & 1 deletion config/step/variant_index.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
_target_: otg.variant_index.VariantIndexStep
variant_annotation_path: ${datasets.variant_annotation}
study_locus_path: ${datasets.study_locus}
credible_set_path: ${datasets.study_locus}
variant_index_path: ${datasets.variant_index}
3 changes: 0 additions & 3 deletions src/airflow/dags/configs/dag.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,12 @@
prerequisites:
- "variant_index"
- "gene_index"
- id: "ukbiobank"
- id: "study_locus_overlap"
prerequisites:
- "gwas_catalog"
- "ukbiobank"
- id: "locus_to_gene"
prerequisites:
- "gwas_catalog"
- "ukbiobank"
- "variant_index"
- "v2g"
- "study_locus_overlap"
8 changes: 7 additions & 1 deletion src/airflow/dags/dag_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@

CLUSTER_NAME = "otg-preprocess"

ALL_STEPS = ["eqtl_catalogue", "ld_index", "variant_annotation"]
ALL_STEPS = [
"finngen",
"eqtl_catalogue",
"ld_index",
"variant_annotation",
"ukbiobank",
]


with DAG(
Expand Down
2 changes: 1 addition & 1 deletion src/otg/dataset/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def from_otg_curation(

return (
OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
.filter_unique_associations(study_locus_overlap)
# .filter_unique_associations(study_locus_overlap)
.remove_false_negatives(interactions_df)
)

Expand Down
16 changes: 11 additions & 5 deletions src/otg/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dataclasses import dataclass, field
from typing import Any

import pyspark.sql.functions as f
import sklearn
from omegaconf import MISSING
from xgboost.spark import SparkXGBClassifier
Expand Down Expand Up @@ -114,7 +115,9 @@ def __post_init__(self: LocusToGeneStep) -> None:
credible_set = StudyLocus.from_parquet(
self.session, self.credible_set_path, recursiveFileLookup=True
)
studies = StudyIndex.from_parquet(self.session, self.study_index_path)
studies = StudyIndex.from_parquet(
self.session, self.study_index_path, recursiveFileLookup=True
)
v2g = V2G.from_parquet(self.session, self.variant_gene_path)
# coloc = Colocalisation.from_parquet(self.session, self.colocalisation_path) # TODO: run step

Expand Down Expand Up @@ -142,8 +145,12 @@ def __post_init__(self: LocusToGeneStep) -> None:

# Join and fill null values with 0
data = L2GFeatureMatrix(
_df=gold_standards.df.drop("sources").join(
fm.df, on=["studyLocusId", "geneId"], how="inner"
_df=fm.df.join(
f.broadcast(
gold_standards.df.drop("variantId", "studyId", "sources")
),
on=["studyLocusId", "geneId"],
how="inner",
),
_schema=L2GFeatureMatrix.get_schema(),
).fill_na()
Expand All @@ -168,7 +175,7 @@ def __post_init__(self: LocusToGeneStep) -> None:
)
else:
# Train model
model = LocusToGeneTrainer.train(
LocusToGeneTrainer.train(
data=data,
l2g_model=l2g_model,
features_list=list(self.features_list),
Expand All @@ -177,7 +184,6 @@ def __post_init__(self: LocusToGeneStep) -> None:
wandb_run_name=self.wandb_run_name,
**self.hyperparameters,
)
model.save(self.model_path)
self.session.logger.info(
f"Finished L2G step. L2G model saved to {self.model_path}"
)
Expand Down
2 changes: 1 addition & 1 deletion src/otg/method/l2g/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def evaluate(
hyperparameters: dict[str, Any],
wandb_run_name: str | None,
) -> None:
"""Perform evaluation of the model by applying it to a test set and tracking the results with W&B.
"""Perform evaluation of the model predictions for the test set and track the results with W&B.
Args:
results (DataFrame): Dataframe containing the predictions
Expand Down
8 changes: 6 additions & 2 deletions src/otg/overlaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@ class OverlapsIndexStep:
def __post_init__(self: OverlapsIndexStep) -> None:
"""Run step."""
# Extract
study_locus = StudyLocus.from_parquet(self.session, self.study_locus_path)
study_index = StudyIndex.from_parquet(self.session, self.study_index_path)
study_locus = StudyLocus.from_parquet(
self.session, self.study_locus_path, recursiveFileLookup=True
)
study_index = StudyIndex.from_parquet(
self.session, self.study_index_path, recursiveFileLookup=True
)
# Transform
overlaps_index = StudyLocusOverlap.from_associations(study_locus, study_index)
# Load
Expand Down
8 changes: 4 additions & 4 deletions src/otg/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,19 @@ class VariantIndexStep:

session: Session = MISSING
variant_annotation_path: str = MISSING
study_locus_path: str = MISSING
credible_set_path: str = MISSING
variant_index_path: str = MISSING

def __post_init__(self: VariantIndexStep) -> None:
"""Run step."""
# Extract
va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)
study_locus = StudyLocus.from_parquet(
self.session, self.study_locus_path, recursiveFileLookup=True
credible_set = StudyLocus.from_parquet(
self.session, self.credible_set_path, recursiveFileLookup=True
)

# Transform
vi = VariantIndex.from_variant_annotation(va, study_locus)
vi = VariantIndex.from_variant_annotation(va, credible_set)

# Load
self.session.logger.info(f"Writing variant index to: {self.variant_index_path}")
Expand Down

0 comments on commit 7dfce61

Please sign in to comment.