Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: new Loss-of-Function variant data from OTAR2075 #991

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/amino_acid_variants.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
},
{
"metadata": {},
"name": "inSilicoPredictors",
"name": "variantEffect",
"nullable": true,
"type": {
"containsNull": true,
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/variant_index.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
},
{
"metadata": {},
"name": "inSilicoPredictors",
"name": "variantEffect",
"nullable": true,
"type": {
"containsNull": true,
Expand Down
13 changes: 12 additions & 1 deletion src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,16 @@ class PanUKBBConfig(StepConfig):
_target_: str = "gentropy.pan_ukb_ingestion.PanUKBBVariantIndexStep"


@dataclass
class LOFIngestionConfig(StepConfig):
"""Step configuration for the ingestion of Loss-of-Function variant data generated by OTAR2075."""

# Both paths default to MISSING, so this class provides no usable default for them.
# NOTE(review): presumably both must be supplied when the step is configured, and
# which path is read vs. written is not visible here — confirm against LOFIngestionStep.
lof_curation_dataset_path: str = MISSING
lof_curation_variant_annotations_path: str = MISSING

# Dotted path of the step class this configuration is paired with.
# This config is stored under the "step" group as "lof_curation_ingestion" in register_config().
_target_: str = "gentropy.lof_curation_ingestion.LOFIngestionStep"


@dataclass
class VariantIndexConfig(StepConfig):
"""Variant index step configuration."""
Expand All @@ -454,7 +464,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
)
vep_output_json_path: str = MISSING
variant_index_path: str = MISSING
gnomad_variant_annotations_path: str | None = None
variant_annotations_path: list[str] | None = None
hash_threshold: int = 300
consequence_to_pathogenicity_score: ClassVar[
list[_ConsequenceToPathogenicityScoreMap]
Expand Down Expand Up @@ -739,6 +749,7 @@ def register_config() -> None:
cs.store(group="step", name="pics", node=PICSConfig)
cs.store(group="step", name="gnomad_variants", node=GnomadVariantConfig)
cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig)
cs.store(group="step", name="lof_curation_ingestion", node=LOFIngestionConfig)
cs.store(group="step", name="variant_index", node=VariantIndexConfig)
cs.store(group="step", name="variant_to_vcf", node=ConvertToVcfStepConfig)
cs.store(
Expand Down
97 changes: 66 additions & 31 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def add_annotation(
"""Import annotation from an other variant index dataset.

At this point the annotation can be extended with extra cross-references,
in-silico predictions and allele frequencies.
variant effects, allele frequencies, and variant descriptions.

Args:
annotation_source (VariantIndex): Annotation to add to the dataset
Expand Down Expand Up @@ -168,7 +168,12 @@ def add_annotation(
f.col(column), f.col(f"{prefix}{column}"), fields_order
).alias(column)
)
# Non-array columns are coalesced:
# variantDescription columns are concatenated:
elif column == "variantDescription":
select_expressions.append(
f.concat_ws(" ", f.col(column), f.col(f"{prefix}{column}")).alias(column)
)
# All other non-array columns are coalesced:
else:
select_expressions.append(
f.coalesce(f.col(column), f.col(f"{prefix}{column}")).alias(
Expand Down Expand Up @@ -279,15 +284,15 @@ def get_distance_to_gene(
def annotate_with_amino_acid_consequences(
self: VariantIndex, annotation: AminoAcidVariants
) -> VariantIndex:
"""Enriching in silico predictors with amino-acid derived predicted consequences.
"""Enriching variant effect assessments with amino-acid derived predicted consequences.

Args:
annotation (AminoAcidVariants): amino-acid level variant consequences.

Returns:
VariantIndex: where amino-acid causing variants are enriched with extra annotation
"""
w = Window.partitionBy("variantId").orderBy(f.size("inSilicoPredictors").desc())
w = Window.partitionBy("variantId").orderBy(f.size("variantEffect").desc())

return VariantIndex(
_df=self.df
Expand All @@ -308,17 +313,17 @@ def annotate_with_amino_acid_consequences(
)
# Joining with amino-acid predictions:
.join(
annotation.df.withColumnRenamed("inSilicoPredictors", "annotations"),
annotation.df.withColumnRenamed("variantEffect", "annotations"),
on=["uniprotAccession", "aminoAcidChange"],
how="left",
)
# Merge predictors:
.withColumn(
"inSilicoPredictors",
"variantEffect",
f.when(
f.col("annotations").isNotNull(),
f.array_union("inSilicoPredictors", "annotations"),
).otherwise(f.col("inSilicoPredictors")),
f.array_union("variantEffect", "annotations"),
).otherwise(f.col("variantEffect")),
)
# Dropping unused columns:
.drop("uniprotAccession", "aminoAcidChange", "annotations")
Expand Down Expand Up @@ -356,33 +361,33 @@ def get_loftee(self: VariantIndex) -> DataFrame:
)


class InSilicoPredictorNormaliser:
"""Class to normalise in silico predictor assessments.
class VariantEffectNormaliser:
"""Class to normalise variant effect assessments.

Essentially based on the raw scores, it normalises the scores to a range between -1 and 1, and appends the normalised
value to the in silico predictor struct.
value to the variant effect struct.

The higher negative values indicate increasingly confident prediction to be a benign variant,
while the higher positive values indicate increasingly deleterious predicted effect.

The point of these operations is to make the scores comparable across different in silico predictors.
The point of these operations is to make the scores comparable across different variant effect assessments.
"""

@classmethod
def normalise_in_silico_predictors(
cls: type[InSilicoPredictorNormaliser],
in_silico_predictors: Column,
def normalise_variant_effect(
cls: type[VariantEffectNormaliser],
variant_effect: Column,
) -> Column:
"""Normalise in silico predictors. Appends a normalised score to the in silico predictor struct.
"""Normalise variant effect assessments. Appends a normalised score to the variant effect struct.

Args:
in_silico_predictors (Column): Column containing in silico predictors (list of structs).
variant_effect (Column): Column containing variant effect assessments (list of structs).

Returns:
Column: Normalised in silico predictors.
Column: Normalised variant effect assessments.
"""
return f.transform(
in_silico_predictors,
variant_effect,
lambda predictor: f.struct(
# Extracting all existing columns:
predictor.method.alias("method"),
Expand All @@ -399,20 +404,20 @@ def normalise_in_silico_predictors(

@classmethod
def resolve_predictor_methods(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
method: Column,
assessment: Column,
) -> Column:
"""It takes a score, a method, and an assessment, and returns a normalized score for the in silico predictor.
"""It takes a score, a method, and an assessment, and returns a normalized score for the variant effect.

Args:
score (Column): The raw score from the in silico predictor.
score (Column): The raw score from the variant effect.
method (Column): The method used to generate the score.
assessment (Column): The assessment of the score.

Returns:
Column: Normalised score for the in silico predictor.
Column: Normalised score for the variant effect.
"""
return (
f.when(method == "LOFTEE", cls._normalise_loftee(assessment))
Expand All @@ -421,6 +426,7 @@ def resolve_predictor_methods(
.when(method == "AlphaMissense", cls._normalise_alpha_missense(score))
.when(method == "CADD", cls._normalise_cadd(score))
.when(method == "Pangolin", cls._normalise_pangolin(score))
.when(method == "LossOfFunctionCuration", cls._normalise_lof(assessment))
# The following predictors are not normalised:
.when(method == "SpliceAI", score)
.when(method == "VEP", score)
Expand Down Expand Up @@ -454,7 +460,7 @@ def _rescaleColumnValue(

@classmethod
def _normalise_foldx(
cls: type[InSilicoPredictorNormaliser], score: Column
cls: type[VariantEffectNormaliser], score: Column
) -> Column:
"""Normalise FoldX ddG energies.

Expand All @@ -477,7 +483,7 @@ def _normalise_foldx(

@classmethod
def _normalise_cadd(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise CADD scores.
Expand All @@ -503,7 +509,7 @@ def _normalise_cadd(

@classmethod
def _normalise_gerp(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise GERP scores.
Expand Down Expand Up @@ -533,9 +539,38 @@ def _normalise_gerp(
.when(score < -3, f.lit(-1.0))
)

@classmethod
def _normalise_lof(
    cls: type[VariantEffectNormaliser],
    assessment: Column,
) -> Column:
    """Normalise loss-of-function curation verdicts.

    There are five ordinal verdicts, each mapped to a fixed point on the
    -1 to 1 scale:
        - lof: 1.0
        - likely_lof: 0.5
        - uncertain: 0.0
        - likely_not_lof: -0.5
        - not_lof: -1.0

    The comparison is an exact string match. Any other value, including
    null, matches no branch and therefore yields null, because no
    `otherwise` clause is supplied.

    Args:
        assessment (Column): Loss-of-function assessment.

    Returns:
        Column: Normalised loss-of-function score.
    """
    # Every branch uses a float literal so the resulting column type does not
    # depend on implicit integer-to-double coercion between the branches.
    return (
        f.when(assessment == "lof", f.lit(1.0))
        .when(assessment == "likely_lof", f.lit(0.5))
        .when(assessment == "uncertain", f.lit(0.0))
        .when(assessment == "likely_not_lof", f.lit(-0.5))
        .when(assessment == "not_lof", f.lit(-1.0))
    )

@classmethod
def _normalise_loftee(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
assessment: Column,
) -> Column:
"""Normalise LOFTEE scores.
Expand All @@ -557,7 +592,7 @@ def _normalise_loftee(

@classmethod
def _normalise_sift(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
assessment: Column,
) -> Column:
Expand Down Expand Up @@ -601,7 +636,7 @@ def _normalise_sift(

@classmethod
def _normalise_polyphen(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
assessment: Column,
score: Column,
) -> Column:
Expand Down Expand Up @@ -632,7 +667,7 @@ def _normalise_polyphen(

@classmethod
def _normalise_alpha_missense(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise AlphaMissense scores.
Expand All @@ -656,7 +691,7 @@ def _normalise_alpha_missense(

@classmethod
def _normalise_pangolin(
cls: type[InSilicoPredictorNormaliser],
cls: type[VariantEffectNormaliser],
score: Column,
) -> Column:
"""Normalise Pangolin scores.
Expand Down
Loading
Loading