feat(L2GFeatureMatrix): track missingness rate for each feature

opentargets · Dec 13, 2023 · e69c47e · e69c47e
1 parent be480cd
commit e69c47e
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 3 deletions.
diff --git a/src/otg/dataset/l2g_feature_matrix.py b/src/otg/dataset/l2g_feature_matrix.py
@@ -93,6 +93,26 @@ def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
         """
         return parse_spark_schema("l2g_feature_matrix.json")
 
+    def calculate_feature_missingness_rate(
+        self: L2GFeatureMatrix,
+    ) -> dict[str, float]:
+        """Calculate the proportion of missing values in each feature.
+
+        Returns:
+            dict[str, float]: Missingness rate per feature name (0.0 if no rows).
+
+        Raises:
+            ValueError: If no features are found.
+        """
+        if not self.features_list:
+            raise ValueError("No features found")
+        # Row count of 0 falls back to 1 so empty input yields 0.0, not an error.
+        total_count = self._df.count() or 1
+        return {
+            feature: (self._df.filter(self._df[feature].isNull()).count() / total_count)
+            for feature in self.features_list
+        }
+
     def fill_na(
         self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
     ) -> L2GFeatureMatrix:

diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py
@@ -125,14 +125,19 @@ def log_to_wandb(
             wandb_evaluator.evaluate(results)
         ## Track feature importance
         wandb_run.log({"importances": self.get_feature_importance()})
-        ## Track training set metadata
+        ## Track training set
+        training_table = wandb.Table(dataframe=training_data.df.toPandas())
+        wandb_run.log({"trainingSet": training_table})
+        # Count number of positive and negative labels
         gs_counts_dict = {
             "goldStandard" + row["goldStandardSet"].capitalize(): row["count"]
             for row in training_data.df.groupBy("goldStandardSet").count().collect()
         }
         wandb_run.log(gs_counts_dict)
-        training_table = wandb.Table(dataframe=training_data.df.toPandas())
-        wandb_run.log({"trainingSet": training_table})
+        # Missingness rates
+        wandb_run.log(
+            "missingnessRates", training_data.calculate_feature_missingness_rate()
+        )
 
     @classmethod
     def load_from_disk(