Skip to content

Commit

Permalink
Usability tweaks & new featurizer preset (#215)
Browse files Browse the repository at this point in the history
* Add ability to disable o-o-b remapping

* Better debugging of target names

* Add fast featurizer preset

* Allow structure IDs to be passed via featurized df

* Linting
  • Loading branch information
ml-evs authored May 7, 2024
1 parent f0e2e99 commit b7d1939
Show file tree
Hide file tree
Showing 4 changed files with 270 additions and 26 deletions.
219 changes: 219 additions & 0 deletions modnet/featurizers/presets/matminer_2024_fast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
"""This submodule contains the `Matminer2024FastFeaturizer` class. """

import numpy as np
import modnet.featurizers
import contextlib


class Matminer2024FastFeaturizer(modnet.featurizers.MODFeaturizer):
    """A set of efficient featurizers for features implemented in matminer
    at time of creation (matminer v0.9.2 from 2024).

    Removes featurizers that are known to be slow (i.e., orders of magnitude
    more intensive to compute than the rest of the featurizers).

    """

    # Integer encoding applied to the `AtomicOrbitals|*_character` columns.
    _ORBITAL_CODES = {"s": 1, "p": 2, "d": 3, "f": 4}

    # Integer encoding applied to `GlobalSymmetryFeatures|crystal_system`.
    _CRYSTAL_SYSTEM_CODES = {
        "cubic": 1,
        "tetragonal": 2,
        "orthorhombic": 3,
        # Misspelled key that this table originally used; kept as an alias so
        # that any input carrying that spelling is still encoded.
        "orthorombic": 3,
        "hexagonal": 4,
        "trigonal": 5,
        "monoclinic": 6,
        "triclinic": 7,
    }

    # Composition features dropped when `continuous_only` is set.
    _DISCONTINUOUS_FEATURES = (
        "ElementProperty|DemlData mean electric_pol",
        "ElementProperty|DemlData mean FERE correction",
        "ElementProperty|DemlData mean GGAU_Etot",
        "ElementProperty|DemlData mean heat_fusion",
        "ElementProperty|DemlData mean mus_fere",
    )

    def __init__(
        self,
        fast_oxid: bool = True,
        continuous_only: bool = True,
    ) -> None:
        """Creates the featurizer and imports all featurizer functions.

        Parameters:
            fast_oxid: Whether to use the accelerated oxidation state parameters within
                pymatgen when constructing features that constrain oxidation states such
                that all sites with the same species in a structure will have the same
                oxidation state (recommended if featurizing any structure
                with large unit cells). The flag is only stored here; this preset
                defines no oxidation-state composition featurizers itself
                (presumably the base class reads it -- TODO confirm).
            continuous_only: Whether to keep only the features that are continuous
                with respect to the composition (only for composition featurizers).
                Discontinuous features may lead to discontinuities in the model predictions.

        """

        super().__init__()
        self.drop_allnan = False
        self.fast_oxid = fast_oxid
        self.continuous_only = continuous_only
        self.load_featurizers()

    def load_featurizers(self) -> None:
        """Imports the matminer featurizers and populates the featurizer lists.

        Sets `composition_featurizers`, `oxid_composition_featurizers`,
        `structure_featurizers` and `site_featurizers` on the instance.

        """
        # Standard output is silenced for the duration of the matminer
        # imports and featurizer construction.
        with contextlib.redirect_stdout(None):
            from matminer.featurizers.composition import (
                BandCenter,
                ElementFraction,
                ElementProperty,
                Stoichiometry,
                TMetalFraction,
                ValenceOrbital,
            )
            from matminer.featurizers.structure import (
                DensityFeatures,
                EwaldEnergy,
                GlobalSymmetryFeatures,
                StructuralComplexity,
            )
            from matminer.utils.data import (
                DemlData,
                PymatgenData,
            )

            pymatgen_features = [
                "block",
                "mendeleev_no",
                "electrical_resistivity",
                "velocity_of_sound",
                "thermal_conductivity",
                "bulk_modulus",
                "coefficient_of_linear_thermal_expansion",
            ]

            deml_features = [
                "atom_radius",
                "molar_vol",
                "heat_fusion",
                "boiling_point",
                "heat_cap",
                "first_ioniz",
                "electric_pol",
                "GGAU_Etot",
                "mus_fere",
                "FERE correction",
            ]

            # Every ElementProperty featurizer below is restricted to the
            # same two statistics.
            magpie_featurizer = ElementProperty.from_preset("magpie")
            magpie_featurizer.stats = ["mean", "avg_dev"]

            pymatgen_featurizer = ElementProperty(
                data_source=PymatgenData(),
                stats=["mean", "avg_dev"],
                features=pymatgen_features,
            )

            deml_featurizer = ElementProperty(
                data_source=DemlData(),
                stats=["mean", "avg_dev"],
                features=deml_features,
            )

            self.composition_featurizers = (
                BandCenter(),
                ElementFraction(),
                magpie_featurizer,
                pymatgen_featurizer,
                deml_featurizer,
                Stoichiometry(p_list=[2, 3, 5, 7, 10]),
                TMetalFraction(),
                ValenceOrbital(props=["frac"]),
            )

            self.oxid_composition_featurizers = []

            self.structure_featurizers = (
                DensityFeatures(),
                EwaldEnergy(),
                GlobalSymmetryFeatures(),
                StructuralComplexity(),
            )

            self.site_featurizers = []

    @staticmethod
    def _flag_to_int(x):
        """Encodes a truthy/falsy flag as 1/0; missing values (None/NaN) give 0."""
        # NaN never compares equal to anything, itself included, so an
        # `x == np.nan` test can never succeed and NaN (which is truthy)
        # would be encoded as 1. Detect it explicitly instead.
        if x is None or (isinstance(x, (float, np.floating)) and np.isnan(x)):
            return 0
        return 1 if x else 0

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        encodes categorical `AtomicOrbitals` columns as integers (when present),
        drops known-discontinuous features if requested, and cleans the output
        dataframe.

        Parameters:
            df: The dataframe handed to the base class' `featurize_composition`.

        Returns:
            The featurized dataframe after `modnet.featurizers.clean_df`.

        """
        df = super().featurize_composition(df)

        if self.composition_featurizers and not self.continuous_only:
            from pymatgen.core.periodic_table import Element

            def _atomic_number(symbol):
                # Non-string entries (e.g. missing values) are encoded as -1.
                return Element(symbol).Z if isinstance(symbol, str) else -1

            # This preset does not load an `AtomicOrbitals` featurizer, so the
            # columns are only post-processed when they actually exist
            # (e.g. in a subclass that adds that featurizer); indexing them
            # unconditionally would raise a `KeyError`.
            for column in (
                "AtomicOrbitals|HOMO_character",
                "AtomicOrbitals|LUMO_character",
            ):
                if column in df.columns:
                    df[column] = df[column].map(self._ORBITAL_CODES)

            for column in (
                "AtomicOrbitals|HOMO_element",
                "AtomicOrbitals|LUMO_element",
            ):
                if column in df.columns:
                    df[column] = df[column].apply(_atomic_number)

        if self.continuous_only:
            # These are additional features that have shown discontinuities in my tests.
            # Hopefully, I got them all...
            df.drop(
                columns=list(self._DISCONTINUOUS_FEATURES),
                inplace=True,
                errors="ignore",
            )

        if self.oxid_composition_featurizers:
            # Tolerate a missing column, consistent with the drop above.
            df.drop(
                columns=["IonProperty|max ionic char"],
                inplace=True,
                errors="ignore",
            )

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        encodes the categorical symmetry columns as integers and cleans the
        output dataframe.

        Parameters:
            df: The dataframe handed to the base class' `featurize_structure`.

        Returns:
            The featurized dataframe after `modnet.featurizers.clean_df`.

        """

        if self.structure_featurizers:
            df = super().featurize_structure(df)

            # Only encode the columns that were actually produced, so that a
            # modified featurizer list does not trigger a `KeyError`.
            crystal_system = "GlobalSymmetryFeatures|crystal_system"
            if crystal_system in df.columns:
                df[crystal_system] = df[crystal_system].map(
                    self._CRYSTAL_SYSTEM_CODES
                )

            centrosymmetric = "GlobalSymmetryFeatures|is_centrosymmetric"
            if centrosymmetric in df.columns:
                df[centrosymmetric] = df[centrosymmetric].map(self._flag_to_int)

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields, removes all-zero columns and cleans the output
        dataframe.

        Parameters:
            df: The dataframe handed to the base class' `featurize_site`.

        Returns:
            The featurized dataframe after `modnet.featurizers.clean_df`.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
        }
        df = super().featurize_site(df, aliases=aliases)
        # Keep only the columns that contain at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
13 changes: 11 additions & 2 deletions modnet/models/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,11 @@ def fit(
pool.join()

def predict(
self, test_data: MODData, return_unc=False, return_prob=False
self,
test_data: MODData,
return_unc: bool = False,
return_prob: bool = False,
remap_out_of_bounds: bool = True,
) -> pd.DataFrame:
"""Predict the target values for the passed MODData.
Expand All @@ -154,6 +158,7 @@ def predict(
return_prob: For a classification task only: whether to return the probability of each
class OR only return the most probable class.
return_unc: whether to return a second dataframe containing the uncertainties
remap_out_of_bounds: whether to remap out-of-bounds predictions back into the range of the training data (forwarded to each model's `predict`).
Returns:
A `pandas.DataFrame` containing the predicted values of the targets.
Expand All @@ -163,7 +168,11 @@ class OR only return the most probable class.

all_predictions = []
for i in range(self.n_models):
p = self.models[i].predict(test_data, return_prob=return_prob)
p = self.models[i].predict(
test_data,
return_prob=return_prob,
remap_out_of_bounds=remap_out_of_bounds,
)
all_predictions.append(p.values)

p_mean = np.array(all_predictions).mean(axis=0)
Expand Down
38 changes: 23 additions & 15 deletions modnet/models/vanilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,14 +693,20 @@ def fit_preset(

return models, val_losses, best_learning_curve, learning_curves, best_preset

def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
def predict(
self,
test_data: MODData,
return_prob: bool = False,
remap_out_of_bounds: bool = True,
) -> pd.DataFrame:
"""Predict the target values for the passed MODData.
Parameters:
test_data: A featurized and feature-selected `MODData`
object containing the descriptors used in training.
return_prob: For classification tasks only: whether to return the probability of each
class OR only return the most probable class.
remap_out_of_bounds: Whether to remap out-of-bounds predictions to the training data distribution.
Returns:
A `pandas.DataFrame` containing the predicted values of the targets.
Expand All @@ -724,20 +730,22 @@ class OR only return the most probable class.
p = [p]

# post-process based on training data
if max(self.num_classes.values()) <= 2: # regression
for i, vals in enumerate(p):
yrange = self.max_y[i] - self.min_y[i]
upper_bound = self.max_y[i] + 0.25 * yrange
lower_bound = self.min_y[i] - 0.25 * yrange
for j in range(len(self.targets_groups[i])):
out_of_range_idxs = np.where(
(vals[:, j] < lower_bound[j]) | (vals[:, j] > upper_bound[j])
)
vals[out_of_range_idxs, j] = (
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
* (yrange[j])
+ self.min_y[i][j]
)
if remap_out_of_bounds:
if max(self.num_classes.values()) <= 2: # regression
for i, vals in enumerate(p):
yrange = self.max_y[i] - self.min_y[i]
upper_bound = self.max_y[i] + 0.25 * yrange
lower_bound = self.min_y[i] - 0.25 * yrange
for j in range(len(self.targets_groups[i])):
out_of_range_idxs = np.where(
(vals[:, j] < lower_bound[j])
| (vals[:, j] > upper_bound[j])
)
vals[out_of_range_idxs, j] = (
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
* (yrange[j])
+ self.min_y[i][j]
)

p_dic = {}

Expand Down
26 changes: 17 additions & 9 deletions modnet/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,8 +664,12 @@ def __init__(
LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")

if target_names is not None:
if isinstance(target_names, str):
target_names = [target_names]
if np.shape(targets)[-1] != len(target_names):
raise ValueError("Target names must be supplied for every target.")
raise ValueError(
f"Target names must be supplied for every target: {np.shape(targets)} vs {target_names=}"
)
elif targets is not None:
if len(np.shape(targets)) == 1:
target_names = ["prop0"]
Expand All @@ -681,16 +685,20 @@ def __init__(
"List of IDs (`structure_ids`) provided must be unique."
)

if len(structure_ids) != len(materials):
raise ValueError(
"List of IDs (`structure_ids`) must have same length as list of structure."
)
if materials is not None:
if len(structure_ids) != len(materials):
raise ValueError(
"List of IDs (`structure_ids`) must have same length as list of structure."
)

else:
num_entries = (
len(materials) if materials is not None else len(df_featurized)
)
structure_ids = [f"id{i}" for i in range(num_entries)]
if df_featurized is not None:
structure_ids = df_featurized.index
else:
num_entries = (
len(materials) if materials is not None else len(df_featurized)
)
structure_ids = [f"id{i}" for i in range(num_entries)]

if targets is not None:
# set up dataframe for targets with columns (id, property_1, ..., property_n)
Expand Down

0 comments on commit b7d1939

Please sign in to comment.