Usability tweaks & new featurizer preset (#215)

* Add ability to disable out-of-bounds (o-o-b) remapping * Better debugging of target names * Add fast featurizer preset * Allow structure IDs to be passed via featurized df * Linting
ppdebreuck · May 7, 2024 · b7d1939 · b7d1939
1 parent f0e2e99
commit b7d1939
Show file tree

Hide file tree

Showing 4 changed files with 270 additions and 26 deletions.
diff --git a/modnet/featurizers/presets/matminer_2024_fast.py b/modnet/featurizers/presets/matminer_2024_fast.py
@@ -0,0 +1,219 @@
+"""This submodule contains the `Matminer2024FastFeaturizer` class. """
+
+import numpy as np
+import modnet.featurizers
+import contextlib
+
+
+class Matminer2024FastFeaturizer(modnet.featurizers.MODFeaturizer):
+    """A set of efficient featurizers for features implemented in matminer
+    at time of creation (matminer v0.9.2 from 2024).
+
+    Removes featurizers that are known to be slow (i.e., orders of magnitude
+    more intensive to compute than the rest of the featurizers).
+
+    """
+
+    def __init__(
+        self,
+        fast_oxid: bool = True,
+        continuous_only: bool = True,
+    ):
+        """Creates the featurizer and imports all featurizer functions.
+
+        Parameters:
+            fast_oxid: Whether to use the accelerated oxidation state parameters within
+                pymatgen when constructing features that constrain oxidation states such
+                that all sites with the same species in a structure will have the same
+                oxidation state (recommended if featurizing any structure
+                with large unit cells).
+            continuous_only: Whether to keep only the features that are continuous
+                with respect to the composition (only for composition featurizers).
+                Discontinuous features may lead to discontinuities in the model predictions.
+
+        """
+
+        super().__init__()
+        self.drop_allnan = False
+        self.fast_oxid = fast_oxid
+        self.continuous_only = continuous_only
+        self.load_featurizers()
+
+    def load_featurizers(self):
+        with contextlib.redirect_stdout(None):
+            from matminer.featurizers.composition import (
+                BandCenter,
+                ElementFraction,
+                ElementProperty,
+                Stoichiometry,
+                TMetalFraction,
+                ValenceOrbital,
+            )
+            from matminer.featurizers.structure import (
+                DensityFeatures,
+                EwaldEnergy,
+                GlobalSymmetryFeatures,
+                StructuralComplexity,
+            )
+            from matminer.utils.data import (
+                DemlData,
+                PymatgenData,
+            )
+
+            pymatgen_features = [
+                "block",
+                "mendeleev_no",
+                "electrical_resistivity",
+                "velocity_of_sound",
+                "thermal_conductivity",
+                "bulk_modulus",
+                "coefficient_of_linear_thermal_expansion",
+            ]
+
+            deml_features = [
+                "atom_radius",
+                "molar_vol",
+                "heat_fusion",
+                "boiling_point",
+                "heat_cap",
+                "first_ioniz",
+                "electric_pol",
+                "GGAU_Etot",
+                "mus_fere",
+                "FERE correction",
+            ]
+
+            magpie_featurizer = ElementProperty.from_preset("magpie")
+            magpie_featurizer.stats = ["mean", "avg_dev"]
+
+            pymatgen_featurizer = ElementProperty(
+                data_source=PymatgenData(),
+                stats=["mean", "avg_dev"],
+                features=pymatgen_features,
+            )
+
+            deml_featurizer = ElementProperty(
+                data_source=DemlData(),
+                stats=["mean", "avg_dev"],
+                features=deml_features,
+            )
+
+            self.composition_featurizers = (
+                BandCenter(),
+                ElementFraction(),
+                magpie_featurizer,
+                pymatgen_featurizer,
+                deml_featurizer,
+                Stoichiometry(p_list=[2, 3, 5, 7, 10]),
+                TMetalFraction(),
+                ValenceOrbital(props=["frac"]),
+            )
+
+            self.oxid_composition_featurizers = []
+
+            self.structure_featurizers = (
+                DensityFeatures(),
+                EwaldEnergy(),
+                GlobalSymmetryFeatures(),
+                StructuralComplexity(),
+            )
+
+            self.site_featurizers = []
+
+    def featurize_composition(self, df):
+        """Applies the preset composition featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+        from pymatgen.core.periodic_table import Element
+
+        df = super().featurize_composition(df)
+
+        if self.composition_featurizers and not self.continuous_only:
+            _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
+            df["AtomicOrbitals|HOMO_character"] = df[
+                "AtomicOrbitals|HOMO_character"
+            ].map(_orbitals)
+            df["AtomicOrbitals|LUMO_character"] = df[
+                "AtomicOrbitals|LUMO_character"
+            ].map(_orbitals)
+
+            df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
+                lambda x: -1 if not isinstance(x, str) else Element(x).Z
+            )
+            df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
+                lambda x: -1 if not isinstance(x, str) else Element(x).Z
+            )
+
+        if self.continuous_only:
+            # These are additional features that have shown discontinuities in my tests.
+            # Hopefully, I got them all...
+            df.drop(
+                columns=[
+                    "ElementProperty|DemlData mean electric_pol",
+                    "ElementProperty|DemlData mean FERE correction",
+                    "ElementProperty|DemlData mean GGAU_Etot",
+                    "ElementProperty|DemlData mean heat_fusion",
+                    "ElementProperty|DemlData mean mus_fere",
+                ],
+                inplace=True,
+                errors="ignore",
+            )
+
+            if self.oxid_composition_featurizers:
+                df.drop(columns=["IonProperty|max ionic char"], inplace=True)
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
+
+    def featurize_structure(self, df):
+        """Applies the preset structural featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+
+        if self.structure_featurizers:
+            df = super().featurize_structure(df)
+
+        _crystal_system = {
+            "cubic": 1,
+            "tetragonal": 2,
+            "orthorombic": 3,
+            "hexagonal": 4,
+            "trigonal": 5,
+            "monoclinic": 6,
+            "triclinic": 7,
+        }
+
+        def _int_map(x):
+            if x == np.nan:
+                return 0
+            elif x:
+                return 1
+            else:
+                return 0
+
+        df["GlobalSymmetryFeatures|crystal_system"] = df[
+            "GlobalSymmetryFeatures|crystal_system"
+        ].map(_crystal_system)
+        df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
+            "GlobalSymmetryFeatures|is_centrosymmetric"
+        ].map(_int_map)
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
+
+    def featurize_site(self, df):
+        """Applies the preset site featurizers to the input dataframe,
+        renames some fields and cleans the output dataframe.
+
+        """
+
+        # rename some features for backwards compatibility with pretrained models
+        aliases = {
+            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
+            "AGNIFingerprints": "AGNIFingerPrint",
+            "BondOrientationalParameter": "BondOrientationParameter",
+        }
+        df = super().featurize_site(df, aliases=aliases)
+        df = df.loc[:, (df != 0).any(axis=0)]
+
+        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
diff --git a/modnet/models/ensemble.py b/modnet/models/ensemble.py
@@ -144,7 +144,11 @@ def fit(
             pool.join()
 
     def predict(
-        self, test_data: MODData, return_unc=False, return_prob=False
+        self,
+        test_data: MODData,
+        return_unc: bool = False,
+        return_prob: bool = False,
+        remap_out_of_bounds: bool = True,
     ) -> pd.DataFrame:
         """Predict the target values for the passed MODData.
 
@@ -154,6 +158,7 @@ def predict(
             return_prob: For a classification task only: whether to return the probability of each
                 class OR only return the most probable class.
             return_unc: whether to return a second dataframe containing the uncertainties
+            remap_out_of_bounds: whether to remap out-of-bounds values to the nearest bound.
 
         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -163,7 +168,11 @@ class OR only return the most probable class.
 
         all_predictions = []
         for i in range(self.n_models):
-            p = self.models[i].predict(test_data, return_prob=return_prob)
+            p = self.models[i].predict(
+                test_data,
+                return_prob=return_prob,
+                remap_out_of_bounds=remap_out_of_bounds,
+            )
             all_predictions.append(p.values)
 
         p_mean = np.array(all_predictions).mean(axis=0)

diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py
@@ -693,14 +693,20 @@ def fit_preset(
 
         return models, val_losses, best_learning_curve, learning_curves, best_preset
 
-    def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
+    def predict(
+        self,
+        test_data: MODData,
+        return_prob: bool = False,
+        remap_out_of_bounds: bool = True,
+    ) -> pd.DataFrame:
         """Predict the target values for the passed MODData.
 
         Parameters:
             test_data: A featurized and feature-selected `MODData`
                 object containing the descriptors used in training.
             return_prob: For a classification tasks only: whether to return the probability of each
                 class OR only return the most probable class.
+            remap_out_of_bounds: Whether to remap out-of-bounds predictions to the training data distribution.
 
         Returns:
             A `pandas.DataFrame` containing the predicted values of the targets.
@@ -724,20 +730,22 @@ class OR only return the most probable class.
             p = [p]
 
         # post-process based on training data
-        if max(self.num_classes.values()) <= 2:  # regression
-            for i, vals in enumerate(p):
-                yrange = self.max_y[i] - self.min_y[i]
-                upper_bound = self.max_y[i] + 0.25 * yrange
-                lower_bound = self.min_y[i] - 0.25 * yrange
-                for j in range(len(self.targets_groups[i])):
-                    out_of_range_idxs = np.where(
-                        (vals[:, j] < lower_bound[j]) | (vals[:, j] > upper_bound[j])
-                    )
-                    vals[out_of_range_idxs, j] = (
-                        np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
-                        * (yrange[j])
-                        + self.min_y[i][j]
-                    )
+        if remap_out_of_bounds:
+            if max(self.num_classes.values()) <= 2:  # regression
+                for i, vals in enumerate(p):
+                    yrange = self.max_y[i] - self.min_y[i]
+                    upper_bound = self.max_y[i] + 0.25 * yrange
+                    lower_bound = self.min_y[i] - 0.25 * yrange
+                    for j in range(len(self.targets_groups[i])):
+                        out_of_range_idxs = np.where(
+                            (vals[:, j] < lower_bound[j])
+                            | (vals[:, j] > upper_bound[j])
+                        )
+                        vals[out_of_range_idxs, j] = (
+                            np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
+                            * (yrange[j])
+                            + self.min_y[i][j]
+                        )
 
         p_dic = {}
 

diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py
@@ -664,8 +664,12 @@ def __init__(
             LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")
 
         if target_names is not None:
+            if isinstance(target_names, str):
+                target_names = [target_names]
             if np.shape(targets)[-1] != len(target_names):
-                raise ValueError("Target names must be supplied for every target.")
+                raise ValueError(
+                    f"Target names must be supplied for every target: {np.shape(targets)} vs {target_names=}"
+                )
         elif targets is not None:
             if len(np.shape(targets)) == 1:
                 target_names = ["prop0"]
@@ -681,16 +685,20 @@ def __init__(
                     "List of IDs (`structure_ids`) provided must be unique."
                 )
 
-            if len(structure_ids) != len(materials):
-                raise ValueError(
-                    "List of IDs (`structure_ids`) must have same length as list of structure."
-                )
+            if materials is not None:
+                if len(structure_ids) != len(materials):
+                    raise ValueError(
+                        "List of IDs (`structure_ids`) must have same length as list of structure."
+                    )
 
         else:
-            num_entries = (
-                len(materials) if materials is not None else len(df_featurized)
-            )
-            structure_ids = [f"id{i}" for i in range(num_entries)]
+            if df_featurized is not None:
+                structure_ids = df_featurized.index
+            else:
+                num_entries = (
+                    len(materials) if materials is not None else len(df_featurized)
+                )
+                structure_ids = [f"id{i}" for i in range(num_entries)]
 
         if targets is not None:
             # set up dataframe for targets with columns (id, property_1, ..., property_n)