opentargets · vivienho · Feb 7, 2025 · Feb 7, 2025 · Feb 7, 2025 · Feb 7, 2025
diff --git a/src/gentropy/assets/schemas/amino_acid_variants.json b/src/gentropy/assets/schemas/amino_acid_variants.json
@@ -14,7 +14,7 @@
     },
     {
       "metadata": {},
-      "name": "inSilicoPredictors",
+      "name": "variantEffect",
       "nullable": true,
       "type": {
         "containsNull": true,

diff --git a/src/gentropy/assets/schemas/variant_index.json b/src/gentropy/assets/schemas/variant_index.json
@@ -32,7 +32,7 @@
     },
     {
       "metadata": {},
-      "name": "inSilicoPredictors",
+      "name": "variantEffect",
       "nullable": true,
       "type": {
         "containsNull": true,

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -436,6 +436,16 @@ class PanUKBBConfig(StepConfig):
     _target_: str = "gentropy.pan_ukb_ingestion.PanUKBBVariantIndexStep"
 
 
+@dataclass
+class LOFIngestionConfig(StepConfig):
+    """Step configuration for the ingestion of Loss-of-Function variant data generated by OTAR2075."""
+
+    lof_curation_dataset_path: str = MISSING  # no default value (MISSING); expected to be provided by the run configuration
+    lof_curation_variant_annotations_path: str = MISSING  # no default value (MISSING); expected to be provided by the run configuration
+
+    _target_: str = "gentropy.lof_curation_ingestion.LOFIngestionStep"  # dotted path of the step class this configuration refers to
+
+
 @dataclass
 class VariantIndexConfig(StepConfig):
     """Variant index step configuration."""
@@ -454,7 +464,7 @@ class _ConsequenceToPathogenicityScoreMap(TypedDict):
     )
     vep_output_json_path: str = MISSING
     variant_index_path: str = MISSING
-    gnomad_variant_annotations_path: str | None = None
+    variant_annotations_path: list[str] | None = None
     hash_threshold: int = 300
     consequence_to_pathogenicity_score: ClassVar[
         list[_ConsequenceToPathogenicityScoreMap]
@@ -739,6 +749,7 @@ def register_config() -> None:
     cs.store(group="step", name="pics", node=PICSConfig)
     cs.store(group="step", name="gnomad_variants", node=GnomadVariantConfig)
     cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig)
+    cs.store(group="step", name="lof_curation_ingestion", node=LOFIngestionConfig)
     cs.store(group="step", name="variant_index", node=VariantIndexConfig)
     cs.store(group="step", name="variant_to_vcf", node=ConvertToVcfStepConfig)
     cs.store(

diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py
@@ -130,7 +130,7 @@ def add_annotation(
         """Import annotation from an other variant index dataset.
 
         At this point the annotation can be extended with extra cross-references,
-        in-silico predictions and allele frequencies.
+        variant effects, allele frequencies, and variant descriptions.
 
         Args:
             annotation_source (VariantIndex): Annotation to add to the dataset
@@ -168,7 +168,12 @@ def add_annotation(
                             f.col(column), f.col(f"{prefix}{column}"), fields_order
                         ).alias(column)
                     )
-                # Non-array columns are coalesced:
+                # variantDescription columns are concatenated:
+                elif column == "variantDescription":
+                    select_expressions.append(
+                        f.concat_ws(" ", f.col(column), f.col(f"{prefix}{column}")).alias(column)
+                    )
+                # All other non-array columns are coalesced:
                 else:
                     select_expressions.append(
                         f.coalesce(f.col(column), f.col(f"{prefix}{column}")).alias(
@@ -279,15 +284,15 @@ def get_distance_to_gene(
     def annotate_with_amino_acid_consequences(
         self: VariantIndex, annotation: AminoAcidVariants
     ) -> VariantIndex:
-        """Enriching in silico predictors with amino-acid derived predicted consequences.
+        """Enriching variant effect assessments with amino-acid derived predicted consequences.
 
         Args:
             annotation (AminoAcidVariants): amio-acid level variant consequences.
 
         Returns:
             VariantIndex: where amino-acid causing variants are enriched with extra annotation
         """
-        w = Window.partitionBy("variantId").orderBy(f.size("inSilicoPredictors").desc())
+        w = Window.partitionBy("variantId").orderBy(f.size("variantEffect").desc())
 
         return VariantIndex(
             _df=self.df
@@ -308,17 +313,17 @@ def annotate_with_amino_acid_consequences(
             )
             # Joining with amino-acid predictions:
             .join(
-                annotation.df.withColumnRenamed("inSilicoPredictors", "annotations"),
+                annotation.df.withColumnRenamed("variantEffect", "annotations"),
                 on=["uniprotAccession", "aminoAcidChange"],
                 how="left",
             )
             # Merge predictors:
             .withColumn(
-                "inSilicoPredictors",
+                "variantEffect",
                 f.when(
                     f.col("annotations").isNotNull(),
-                    f.array_union("inSilicoPredictors", "annotations"),
-                ).otherwise(f.col("inSilicoPredictors")),
+                    f.array_union("variantEffect", "annotations"),
+                ).otherwise(f.col("variantEffect")),
             )
             # Dropping unused columns:
             .drop("uniprotAccession", "aminoAcidChange", "annotations")
@@ -356,33 +361,33 @@ def get_loftee(self: VariantIndex) -> DataFrame:
         )
 
 
-class InSilicoPredictorNormaliser:
-    """Class to normalise in silico predictor assessments.
+class VariantEffectNormaliser:
+    """Class to normalise variant effect assessments.
 
     Essentially based on the raw scores, it normalises the scores to a range between -1 and 1, and appends the normalised
-    value to the in silico predictor struct.
+    value to the variant effect struct.
 
     The higher negative values indicate increasingly confident prediction to be a benign variant,
     while the higher positive values indicate increasingly deleterious predicted effect.
 
-    The point of these operations to make the scores comparable across different in silico predictors.
+    The point of these operations is to make the scores comparable across different variant effect assessments.
     """
 
     @classmethod
-    def normalise_in_silico_predictors(
-        cls: type[InSilicoPredictorNormaliser],
-        in_silico_predictors: Column,
+    def normalise_variant_effect(
+        cls: type[VariantEffectNormaliser],
+        variant_effect: Column,
     ) -> Column:
-        """Normalise in silico predictors. Appends a normalised score to the in silico predictor struct.
+        """Normalise variant effect assessments. Appends a normalised score to the variant effect struct.
 
         Args:
-            in_silico_predictors (Column): Column containing in silico predictors (list of structs).
+            variant_effect (Column): Column containing variant effect assessments (list of structs).
 
         Returns:
-            Column: Normalised in silico predictors.
+            Column: Normalised variant effect assessments.
         """
         return f.transform(
-            in_silico_predictors,
+            variant_effect,
             lambda predictor: f.struct(
                 # Extracing all existing columns:
                 predictor.method.alias("method"),
@@ -399,20 +404,20 @@ def normalise_in_silico_predictors(
 
     @classmethod
     def resolve_predictor_methods(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
         method: Column,
         assessment: Column,
     ) -> Column:
-        """It takes a score, a method, and an assessment, and returns a normalized score for the in silico predictor.
+        """It takes a score, a method, and an assessment, and returns a normalized score for the variant effect.
 
         Args:
-            score (Column): The raw score from the in silico predictor.
+            score (Column): The raw score from the variant effect.
             method (Column): The method used to generate the score.
             assessment (Column): The assessment of the score.
 
         Returns:
-            Column: Normalised score for the in silico predictor.
+            Column: Normalised score for the variant effect.
         """
         return (
             f.when(method == "LOFTEE", cls._normalise_loftee(assessment))
@@ -421,6 +426,7 @@ def resolve_predictor_methods(
             .when(method == "AlphaMissense", cls._normalise_alpha_missense(score))
             .when(method == "CADD", cls._normalise_cadd(score))
             .when(method == "Pangolin", cls._normalise_pangolin(score))
+            .when(method == "LossOfFunctionCuration", cls._normalise_lof(assessment))
             # The following predictors are not normalised:
             .when(method == "SpliceAI", score)
             .when(method == "VEP", score)
@@ -454,7 +460,7 @@ def _rescaleColumnValue(
 
     @classmethod
     def _normalise_foldx(
-        cls: type[InSilicoPredictorNormaliser], score: Column
+        cls: type[VariantEffectNormaliser], score: Column
     ) -> Column:
         """Normalise FoldX ddG energies.
 
@@ -477,7 +483,7 @@ def _normalise_foldx(
 
     @classmethod
     def _normalise_cadd(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
     ) -> Column:
         """Normalise CADD scores.
@@ -503,7 +509,7 @@ def _normalise_cadd(
 
     @classmethod
     def _normalise_gerp(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
     ) -> Column:
         """Normalise GERP scores.
@@ -533,9 +539,38 @@ def _normalise_gerp(
             .when(score < -3, f.lit(-1.0))
         )
 
+    @classmethod
+    def _normalise_lof(
+        cls: type[VariantEffectNormaliser],
+        assessment: Column,
+    ) -> Column:
+        """Normalise loss-of-function curation verdicts to a score between -1 and 1.
+
+        The five ordinal verdicts map to fixed scores; any other (or null) verdict
+        yields null, because the expression has no fallback branch:
+         - lof: 1.0
+         - likely_lof: 0.5
+         - uncertain: 0.0
+         - likely_not_lof: -0.5
+         - not_lof: -1.0
+
+        Args:
+            assessment (Column): Loss-of-function assessment.
+
+        Returns:
+            Column: Normalised loss-of-function score (double), null if unmapped.
+        """
+        return (
+            f.when(assessment == "lof", f.lit(1.0))
+            .when(assessment == "likely_lof", f.lit(0.5))
+            .when(assessment == "uncertain", f.lit(0.0))
+            .when(assessment == "likely_not_lof", f.lit(-0.5))
+            .when(assessment == "not_lof", f.lit(-1.0))
+        )
+
     @classmethod
     def _normalise_loftee(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         assessment: Column,
     ) -> Column:
         """Normalise LOFTEE scores.
@@ -557,7 +592,7 @@ def _normalise_loftee(
 
     @classmethod
     def _normalise_sift(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
         assessment: Column,
     ) -> Column:
@@ -601,7 +636,7 @@ def _normalise_sift(
 
     @classmethod
     def _normalise_polyphen(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         assessment: Column,
         score: Column,
     ) -> Column:
@@ -632,7 +667,7 @@ def _normalise_polyphen(
 
     @classmethod
     def _normalise_alpha_missense(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
     ) -> Column:
         """Normalise AlphaMissense scores.
@@ -656,7 +691,7 @@ def _normalise_alpha_missense(
 
     @classmethod
     def _normalise_pangolin(
-        cls: type[InSilicoPredictorNormaliser],
+        cls: type[VariantEffectNormaliser],
         score: Column,
     ) -> Column:
         """Normalise Pangolin scores.