Merge pull request #70 from datamol-io/misc
Misc
maclandrol authored Jul 25, 2023
2 parents 8cf5c9a + 322503d commit 8bf7f42
Showing 9 changed files with 278 additions and 45 deletions.
3 changes: 3 additions & 0 deletions docs/api/molfeat.trans.pretrained.base.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Pretrained Model

::: molfeat.trans.pretrained.base
45 changes: 27 additions & 18 deletions docs/assets/css/custom-molfeat.css
@@ -1,20 +1,20 @@
:root {
--datamol-primary: #217EBB;
--datamol-secondary: #343a40;
--molfeat-primary: #217EBB;
--molfeat-secondary: #5f6d7a;

/* Primary color shades */
--md-primary-fg-color: var(--datamol-primary);
--md-primary-fg-color--light: var(--datamol-primary);
--md-primary-fg-color--dark: var(--datamol-primary);
--md-primary-bg-color: var(--datamol-secondary);
--md-primary-bg-color--light: var(--datamol-secondary);
--md-text-link-color: var(--datamol-secondary);
--md-primary-fg-color: var(--molfeat-primary);
--md-primary-fg-color--light: var(--molfeat-primary);
--md-primary-fg-color--dark: var(--molfeat-primary);
--md-primary-bg-color: var(--molfeat-secondary);
--md-primary-bg-color--light: var(--molfeat-secondary);
--md-text-link-color: var(--molfeat-secondary);

/* Accent color shades */
--md-accent-fg-color: var(--datamol-secondary);
--md-accent-fg-color--transparent: var(--datamol-secondary);
--md-accent-bg-color: var(--datamol-secondary);
--md-accent-bg-color--light: var(--datamol-secondary);
--md-accent-fg-color: var(--molfeat-secondary);
--md-accent-fg-color--transparent: var(--molfeat-secondary);
--md-accent-bg-color: var(--molfeat-secondary);
--md-accent-bg-color--light: var(--molfeat-secondary);
}

:root>* {
@@ -23,11 +23,11 @@
--md-code-fg-color: hsla(200, 18%, 26%, 1);

/* Footer */
--md-footer-bg-color: var(--datamol-primary);
--md-footer-bg-color: var(--molfeat-primary);
/* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */
--md-footer-fg-color: var(--datamol-secondary);
--md-footer-fg-color--light: var(--datamol-secondary);
--md-footer-fg-color--lighter: var(--datamol-secondary);
--md-footer-fg-color: var(--molfeat-secondary);
--md-footer-fg-color--light: var(--molfeat-secondary);
--md-footer-fg-color--lighter: var(--molfeat-secondary);

}

@@ -40,7 +40,7 @@
}

.md-tabs {
background-image: linear-gradient(to right, #F4F6F9, #C3CFE2);
background-image: linear-gradient(to right, #F4F6F9, #CCE3f8);
}

.md-header__topic {
@@ -63,7 +63,16 @@
}

.md-search__form {
background-color: rgba(255, 255, 255, 0.2);
background-color: rgba(255, 255, 255, 0.4);
}

.md-search-result__article:hover {
background-color: #CCE3f8;
}

.md-search-result__more:hover,
.md-search-result__more:focus {
background-color: #CCE3f8 !important;
}

.md-search__input {
70 changes: 70 additions & 0 deletions docs/usage.md
@@ -46,3 +46,73 @@ model_card = store.search(name="ChemBERTa-77M-MLM")[0]
model_card.usage()
```


## FAQ
#### What is a molecular featurizer?
A molecular featurizer is a function or model that provides numerical representations for molecular structures. These numerical features serve as inputs for machine learning models, enabling them to predict molecular properties and activities, design novel molecules, perform molecular analyses, or conduct searches for similar molecules.
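As a toy illustration (plain Python, not the molfeat API), a featurizer maps a molecule, here a SMILES string, to a numeric vector:

```python
# Toy featurizer: counts a few atom symbols in a SMILES string.
# Real featurizers (fingerprints, descriptors, deep learning embeddings)
# are far richer; this only illustrates the
# "molecule in, numeric vector out" contract.

def toy_featurize(smiles: str) -> list:
    """Return counts of C, N, O characters as a crude feature vector."""
    return [smiles.count(sym) for sym in ("C", "N", "O")]

features = toy_featurize("CCO")  # ethanol
print(features)  # [2, 0, 1]
```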

#### Why so many molecular featurizers in `molfeat`?

The reason for providing a diverse range of molecular featurizers in `molfeat` is to address the inherent uncertainty in determining which molecular representation performs best for a given task. Different featurization methods exist, such as using physico-chemical descriptors, molecular structure fingerprints, deep learning embeddings, and more. The effectiveness of these representations varies depending on the specific application. Therefore, the availability of multiple featurizers in `molfeat` ensures that users can access the most suitable featurizer for their unique needs.


#### What is the difference between a calculator and a featurizer in `molfeat`?

In `molfeat`,

- a `calculator` operates on a single molecule and defines how an input molecule is turned into a numerical representation.
- a `featurizer` works on batches of molecules, leveraging the efficiency of deep learning models at batch processing. Some `featurizers` use a `calculator` internally to featurize each molecule individually and then stitch the outputs together. `featurizers` also offer convenient tools, such as parallelism and caching, to compute molecular representations efficiently.

`molfeat` has been designed with utmost flexibility, recognizing that the actions users wish to perform with molecular data can be vast and diverse, and there often isn't a single "right" way to approach them.
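The calculator/featurizer split can be sketched in plain Python (a toy pattern, not molfeat's actual classes): the calculator handles one molecule, while the featurizer batches calls and can tolerate failures:

```python
class ToyCalculator:
    """Per-molecule: turn one SMILES string into a feature vector."""

    def __call__(self, smiles: str) -> list:
        if not smiles:
            raise ValueError("empty molecule")
        return [smiles.count(s) for s in ("C", "N", "O")]


class ToyFeaturizer:
    """Batch-level: apply a calculator to many molecules at once."""

    def __init__(self, calculator):
        self.calculator = calculator

    def transform(self, mols, ignore_errors=False):
        out = []
        for mol in mols:
            try:
                out.append(self.calculator(mol))
            except Exception:
                if not ignore_errors:
                    raise
                out.append(None)  # failed molecules keep their slot
        return out


featurizer = ToyFeaturizer(ToyCalculator())
print(featurizer.transform(["CCO", "", "c1ccccc1"], ignore_errors=True))
# [[2, 0, 1], None, [0, 0, 0]]
```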


#### What functions should I be familiar with when using the featurizer classes?

When using a `featurizer` in `molfeat`, you should be familiar with the following functions:

- `preprocess()`: This method preprocesses your input molecules to make them compatible with the featurizer class you are using. Note that the preprocessing steps **are not automatically applied to your inputs**, to keep preprocessing independent of the molecular transformation itself. The method takes your molecule inputs, along with optional labels, and can be redefined when creating a custom featurizer.

- `transform()`: This method operates on a batch of molecules and returns a list of representations; this is where the actual featurization happens. When featurization fails for a molecule and `ignore_errors` is set, its position in the output is filled with `None`.
- `_transform()`: This method featurizes a single input molecule.
- `__call__()`: This method uses `transform()` under the hood and adds convenient post-processing, such as enforcing on the outputs the datatype defined at initialization. If you specify `ignore_errors`, a vector of the indices where featurization succeeded is also returned.
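A minimal skeleton of how these methods typically relate (a hypothetical sketch in plain Python, not molfeat's implementation):

```python
class SketchTransformer:
    """Hypothetical sketch of the method contract described above."""

    def preprocess(self, mols, labels=None):
        # Explicit step: NOT called automatically by transform().
        mols = [m.strip() for m in mols]
        return (mols, labels) if labels is not None else mols

    def _transform(self, mol):
        # Featurize a single molecule.
        return [mol.count(s) for s in ("C", "N", "O")]

    def transform(self, mols, ignore_errors=False):
        feats = []
        for mol in mols:
            try:
                feats.append(self._transform(mol))
            except Exception:
                if not ignore_errors:
                    raise
                feats.append(None)  # keep the slot for failed molecules
        return feats

    def __call__(self, mols, ignore_errors=False):
        feats = self.transform(mols, ignore_errors=ignore_errors)
        if ignore_errors:
            # Also return the indices where featurization succeeded.
            idx = [i for i, f in enumerate(feats) if f is not None]
            return feats, idx
        return feats


t = SketchTransformer()
print(t(["CCO", "CO"]))  # [[2, 0, 1], [1, 0, 1]]
```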

In addition to the methods described above, `PretrainedMolTransformer` introduces the following:

- `_embed()`: For pretrained models that benefit from batched featurization, this method is called internally by `transform()` in place of a per-molecule calculator.
- `_convert()`: This method converts the molecule input into the format expected by the underlying ML model. For example, for a pretrained language model expecting SELFIES strings, the conversion to SELFIES happens here.
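To make these hooks concrete, here is a hypothetical pretrained-featurizer sketch; `StubPretrained` stands in for a real model, and the real `PretrainedMolTransformer` API may differ:

```python
class StubPretrained:
    """Stand-in for a pretrained model that embeds batches of strings."""

    def embed(self, batch):
        # Toy "embedding": one dimension holding the string length.
        return [[float(len(s))] for s in batch]


class SketchPretrainedTransformer:
    def __init__(self, model):
        self.model = model

    def _convert(self, mols):
        # Convert inputs into the format the model expects,
        # e.g. SMILES -> SELFIES for a language model. Toy: uppercase.
        return [m.upper() for m in mols]

    def _embed(self, batch):
        # Batched embedding replaces a per-molecule calculator.
        return self.model.embed(batch)

    def transform(self, mols):
        return self._embed(self._convert(mols))


pt = SketchPretrainedTransformer(StubPretrained())
print(pt.transform(["CCO", "c1ccccc1"]))  # [[3.0], [8.0]]
```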



#### I am getting an error. What should I do?

When encountering an error during the featurization process, you have a couple of options to handle it:

- Ignore Errors: You can choose to set the `ignore_errors` parameter to `True` when using the featurizer. This allows the featurizer to continue processing even if it encounters errors on some molecules in your dataset. The featurizer will still attempt to calculate representations for all molecules, and any molecules that failed featurization will have their position in the output list marked as `None`.

- Increase verbosity: If you are unsure which errors occur during featurization, set `verbose=True` on the featurizer. It will then log every error encountered, giving more detail about the cause of the issue. This matters because, when `ignore_errors` is set, some errors are caught silently and never propagated.

For example, the following will ensure that all errors are logged.

```python
from molfeat.trans.base import MoleculeTransformer
import numpy as np

featurizer = MoleculeTransformer(..., dtype=np.float32, verbose=True)
featurizer(["CSc1nc2cc3c(cc2[nH]1)N(Cc1ccc(S(=O)(=O)c2ccccc2)cc1)CCC3"], enforce_dtype=True)
```


#### What are the base featurizer classes in `molfeat` and how do I use them?


| Class | Module | Why? |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [BaseFeaturizer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.BaseFeaturizer) | `molfeat.trans.base` | Lowest level featurizer class. All featurizers (even if not molecular) inherit from this class. It's recommended to use `MoleculeTransformer` as the root class instead. |
| [MoleculeTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.MoleculeTransformer) | `molfeat.trans.base` | <ul><li> Base class for all molecule featurizers. This is where you start if you want to implement a new featurizer.</li><li> You can provide either an existing `calculator` or your own (a **python callable**) directly to define a new `featurizer`.</li></ul> |
| [PrecomputedMolTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.base.html#molfeat.trans.base.PrecomputedMolTransformer) | `molfeat.trans.base` | Class for dealing with precomputed features. You can leverage this class to compute features, save them in a file, and reload them after for other tasks efficiently. [See this tutorial!](https://molfeat-docs.datamol.io/stable/tutorials/datacache.html#using-a-cache-with-a-precomputed-transformer) |
| [FeatConcat](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.concat.html#molfeat.trans.concat.FeatConcat) | `molfeat.trans.concat` | Convenient class for concatenating multiple vector-featurizers automatically. If you want to combine multiple 'fingerprints' and descriptors, this is the class you use. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/types_of_featurizers.html#concatenate-featurizers) |
| [PretrainedMolTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.base.html) | `molfeat.trans.pretrained.base` | Base class for all `pretrained featurizers`. A `pretrained featurizer` is a `featurizer` that is derived from a pretrained machine learning model. Implement a subclass of this to define your new pretrained featurizer. [See example!](https://molfeat-docs.datamol.io/stable/tutorials/add_your_own.html#define-your-own-transformer) |
| [PretrainedDGLTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.dgl_pretrained.html#molfeat.trans.pretrained.dgl_pretrained.PretrainedDGLTransformer) | `molfeat.trans.pretrained.dgl_pretrained` | Base class for all `dgl pretrained featurizers`. You can initialize a new dgl/dgllife pretrained model as a `molfeat featurizer` easily using this class. You only need to add the dgl model object to a store. [See this example!](https://github.com/datamol-io/molfeat/blob/main/nb/etl/dgl-etl.ipynb) |
| [PretrainedHFTransformer](https://molfeat-docs.datamol.io/stable/api/molfeat.trans.pretrained.hf_transformers.html#molfeat.trans.pretrained.hf_transformers.PretrainedHFTransformer) | `molfeat.trans.pretrained.hf_transformers` | Base class for all `huggingface pretrained featurizers`. You can initialize a new 🤗 Transformers pretrained model as a `molfeat featurizer` easily using this class. [See this example!](https://github.com/datamol-io/molfeat/blob/main/nb/etl/molt5-etl.ipynb) |
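The concatenation idea behind `FeatConcat` can be sketched as follows (toy featurizers in plain Python, not the molfeat API):

```python
def atom_counts(smiles):
    """Toy featurizer #1: counts of a few atom symbols."""
    return [smiles.count(s) for s in ("C", "N", "O")]

def length_feature(smiles):
    """Toy featurizer #2: a single length descriptor."""
    return [len(smiles)]

def concat_features(featurizers, smiles):
    """Run each featurizer and concatenate the vectors end to end."""
    out = []
    for f in featurizers:
        out.extend(f(smiles))
    return out

print(concat_features([atom_counts, length_feature], "CCO"))  # [2, 0, 1, 3]
```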


1 change: 1 addition & 0 deletions mkdocs.yml
@@ -46,6 +46,7 @@ nav:
- molfeat.trans.struct: api/molfeat.trans.struct.md
- molfeat.trans.concat: api/molfeat.trans.concat.md
- molfeat.trans.pretrained:
- Base Pretrained Models: api/molfeat.trans.pretrained.base.md
- HuggingFace: api/molfeat.trans.pretrained.hf_transformers.md
- Graphormer: api/molfeat.trans.pretrained.graphormer.md
- DGL: api/molfeat.trans.pretrained.dgl_pretrained.md
9 changes: 8 additions & 1 deletion molfeat/trans/base.py
@@ -32,6 +32,7 @@
from molfeat.utils.cache import CacheList
from molfeat.utils.commons import fn_to_hex
from molfeat.utils.commons import hex_to_fn
from molfeat.utils.commons import is_callable
from molfeat.utils.parsing import get_input_args
from molfeat.utils.parsing import import_from_string
from molfeat.utils.state import map_dtype
@@ -198,6 +199,10 @@ def __init__(
self._fitted = False

self._save_input_args()
if self.featurizer and not (
isinstance(self.featurizer, str) or is_callable(self.featurizer)
):
raise AttributeError(f"Featurizer {self.featurizer} must be a callable or a string")

def _save_input_args(self):
"""Save the input arguments of a transformer to the attribute
@@ -319,7 +324,9 @@ def _to_mol(x):
if not ignore_errors:
for ind, feat in enumerate(features):
if feat is None:
raise ValueError(f"Cannot transform molecule at index {ind}")
raise ValueError(
f"Cannot transform molecule at index {ind}. Please check logs (set verbose to True) to see errors!"
)

return features

2 changes: 1 addition & 1 deletion molfeat/trans/pretrained/hf_transformers.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import List, Optional
from typing import List
from typing import Union
from typing import Optional

4 changes: 2 additions & 2 deletions molfeat/utils/cache.py
@@ -260,7 +260,7 @@ def fetch(
try:
cacher = copy.deepcopy(self)
n_jobs = self.n_jobs
except:
except: # noqa
# cannot parallelize process, ensure n_jobs is 0
cacher = self
n_jobs = 0
@@ -357,7 +357,7 @@ def clear(self, delete: bool = False):
for path in glob.glob(str(self.cache_file) + "*"):
try:
os.unlink(path)
except:
except: # noqa
pass
else:
self._initialize_cache()
11 changes: 11 additions & 0 deletions molfeat/utils/commons.py
@@ -6,6 +6,7 @@
from typing import List
from typing import Union

import types
import os
import inspect
import hashlib
@@ -25,6 +26,16 @@
from molfeat.utils import datatype


FUNCTYPES = (types.FunctionType, types.MethodType, functools.partial)


def is_callable(func):
r"""
Check if func is a function or a callable
"""
return func and (isinstance(func, FUNCTYPES) or callable(func))


def sha256sum(filepath: Union[str, os.PathLike]):
"""Return the sha256 sum hash of a file or a directory