Merge pull request #32 from nlesc-nano/filter

felipeZ · web-flow · commit 7741238d30d1 · 2020-08-21T14:57:45.000+02:00
make pipeline more robust
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,9 @@
 # Change Log
 
-# 0.3.0 [date]
+# 0.3.0 [21/08/2020]
 
 ## New
-* Introduce Pipeline to filter ligands (#26)
+* Introduce Pipeline to filter ligands (#13, #26)
 * Use [SCScore](https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622)
 * Use [Horovod](https://github.com/horovod/horovod) to distribute the training 
 * Add [mypy](http://mypy-lang.org/) test
diff --git a/notebooks/Filter_visualization.ipynb b/notebooks/Filter_visualization.ipynb
diff --git a/notebooks/candidates.csv b/notebooks/candidates.csv
@@ -0,0 +1,192 @@
+,smiles
+2,O=C/C(Cl)=C(/Cl)C(=O)O
+3,O=C(O)C/C=C/CC(=O)O
+4,O=C(O)C1CC(Br)(Br)C1
+6,O=C(O)C1CCOC1=O
+7,C#CCCC#CC(=O)O
+8,N#CNCC(=O)O
+9,C=CC(=C)C(=O)O
+25,C=CNCC(=O)O
+26,O=C(O)C1CS1
+28,O=C(O)CCCOCCOC1CCCCO1
+29,C=CCOCCOCC(=O)O
+31,O=C(O)c1nccnc1O
+34,N=C(S)C(=O)O
+37,O=C(O)CCCCCCCCCCCCCCBr
+38,O=C(O)COCCCl
+39,O=C(O)c1nc(O)ccc1F
+40,O=C(O)CC(=O)CCl
+42,O=C(O)c1nc(Br)ccc1F
+44,O=C(O)C1CSCN1
+45,O=C(O)C(O)(O)C(=O)O
+47,O=C(O)c1ncccc1F
+48,O=C(O)C=Cc1ccccc1F
+49,O=C(O)C=Cc1ccc(F)cc1
+51,O=C(O)C=Cc1ccc2c(c1)OCO2
+52,O=C(O)C(Cl)(Cl)Cl
+56,O=C(O)C1CNCCN1
+58,O=C(O)C(F)(F)C(F)F
+71,O=C(O)c1cnc(Cl)nc1
+77,O=C(O)/C(Cl)=C/Cl
+83,O=C(O)C(F)(F)I
+87,O=C(O)COCCBr
+88,O=C(O)C1CS(=O)(=O)C1
+89,O=C(O)CC(=O)C(F)(F)F
+122,N#CCCSCC(=O)O
+123,O=C(O)C=CI
+124,O=C(O)C(F)(Cl)Cl
+125,O=C(O)C(=O)CS
+126,O=C(O)C=Cc1ccc(Cl)cc1
+130,O=C(O)C1CO1
+131,O=C(O)C1C=CCC1
+132,O=C1CCC(F)(C(=O)O)CC1
+133,O=C(O)CCCCCCCCCCCCCBr
+137,O=C(O)/C(Cl)=C\c1ccccc1
+143,O=C(O)C(Br)C(=O)O
+146,O=C(O)COc1ccc(Br)cc1
+147,O=C(O)COc1ccc(Cl)cc1
+150,O=C(O)CI
+151,O=C(O)C#CC(=O)O
+153,C=CC(=O)O
+155,O=C(O)C=CC(=O)O
+156,O=C(O)CBr
+157,C=CCCC(=O)O
+159,O=C(O)C(F)(F)F
+160,O=C(O)CCBr
+163,O=Cc1ccc(C(=O)O)cc1
+164,O=C(O)c1ccc(F)cc1
+165,O=C(O)c1ccc(O)nc1
+166,O=C(O)C=Cc1cccs1
+168,O=C(O)CCCl
+169,C=CCC(=O)O
+176,O=C(O)C(Br)CBr
+177,O=C(O)C(F)F
+181,O=C(O)CCCCCBr
+182,O=C(O)c1ncccc1O
+183,O=C(O)c1cc(F)ccc1O
+185,O=C(O)c1ccc(Cl)nc1
+187,O=C(O)CCCCBr
+188,O=C(O)COc1ccccc1
+189,O=C(O)c1ccc(CBr)cc1
+191,O=C(O)C1CCCN1
+193,O=C(O)c1ccoc1
+194,O=C(O)C(F)(F)C(F)(F)C(F)(F)F
+196,O=C(O)C(=O)CBr
+198,O=C(O)CCCBr
+199,O=C(O)CCCCCCCCCCBr
+201,O=C(O)CCCCCCCCCCCBr
+202,O=C(O)CC(=O)C(=O)O
+205,C=C(CBr)C(=O)O
+209,O=C(O)CCCCCCCBr
+221,C#CCCC(=O)O
+224,O=C(O)CCCCl
+225,O=C(O)c1ccc(O)cc1
+226,N#CCC(=O)O
+228,O=C(O)C(F)(F)Cl
+230,O=C(O)c1ccc(Cl)cc1F
+231,O=C(O)c1ccsc1
+232,O=C(O)c1cc(F)ccc1F
+238,O=CC(=O)O
+240,O=C(O)c1ccc(F)cc1F
+242,O=C(O)c1ccc(CCl)cc1
+243,O=C(O)CCCCCl
+244,O=C(O)C(Cl)Cl
+246,O=C(O)CS
+247,O=C(O)CC(=O)O
+248,O=C(O)C=Cc1ccc(O)cc1
+249,O=C(O)C1CCC1
+253,O=C(O)C1CCCC1
+254,O=C(O)C1CC1
+261,O=C(O)C(Cl)CCl
+265,O=C(O)c1ccc(O)cc1O
+267,C=C(Br)C(=O)O
+270,O=C(O)C(F)(F)C(F)(F)C(=O)O
+271,O=C(O)C1CNC1
+272,O=C(O)c1ccc(Br)cc1F
+273,O=C(O)c1ccccn1
+275,O=C(O)COCCOCC(=O)O
+276,O=C(O)c1ccccc1F
+278,O=C(O)c1c[nH]cn1
+284,C#CCCCCC(=O)O
+285,O=C(O)CCCCCCCCCCCCCCCBr
+289,O=C(O)C1CC=CC1
+292,O=C(O)CSCCSCC(=O)O
+298,O=C(O)c1ccco1
+302,O=C(O)C(F)(F)S(=O)(=O)F
+304,O=C(O)c1cc(Cl)ccc1F
+306,O=C(O)C1CCC(C(=O)O)CC1
+307,O=C(O)CCCCCCCCCBr
+310,O=C(O)c1cc(I)ccc1F
+317,O=C(O)c1cc(Br)ccc1F
+318,O=C(O)CCS
+324,O=C(O)C1=CCSC1
+325,O=C(O)C1(O)CCOCC1
+327,O=C(O)CCOC1CCCCO1
+328,O=C(O)c1cc(F)c(F)cc1O
+330,O=C(O)C1CCC=CO1
+333,N#CC=CC(=O)O
+337,O=C(O)c1nc(Cl)ccc1F
+341,O=C(O)C(=O)n1ccnc1
+342,O=C(O)COCC(=O)n1ccnc1
+350,O=C(O)c1cc(Br)c[nH]1
+355,C=C=CCC(=O)O
+363,O=C(O)C=Cc1ccc(Br)cc1
+366,O=C(O)c1cc(Br)c(F)cc1F
+368,O=C(O)CSc1cc[n+](CC(=O)c2ccccc2)cc1
+371,O=C(O)C=CCBr
+373,C=C1CC(C(=O)O)C1
+376,O=C(O)c1cc[n+](Cc2ccc(F)cc2)cc1
+377,O=C(O)c1cc(Br)cnc1F
+379,O=C(O)CONC(=O)OCc1ccccc1
+390,O=C(O)C(S)C(=O)O
+391,C=CC(=O)C(=O)O
+392,O=C(O)C(=O)C(Cl)(Cl)Cl
+396,O=C(O)C=Cc1c(F)cc(F)cc1F
+397,O=C(O)C1CSC1
+398,O=C(O)C1SCCCS1
+403,O=C(O)CCCCCCCCCCCCCCCCBr
+408,O=C(O)CCCS
+412,O=C(O)C1CCC=CCC1
+416,O=C(O)C1CC(C(=O)O)S1
+417,O=C(O)CCCCCCCCC[P+](c1ccccc1)(c1ccccc1)c1ccccc1
+418,O=C(O)CC1(O)CC1
+419,O=C(O)CS(=O)(=O)C(F)(F)F
+430,C=CCOCC(=O)O
+432,C=CCCOCC(=O)O
+434,O=C(O)COC(F)(F)F
+439,O=C(O)C=C(Cl)Cl
+443,C#CC=CC(=O)O
+444,O=C(O)C1C=CC=CC=C1
+454,O=C(O)c1cc(F)c(Br)cc1F
+459,C=C(Cl)C(=O)O
+461,O=C(O)CSCSCC(=O)O
+470,O=C(O)C(F)(Br)Br
+471,O=C(O)C(F)(F)Br
+477,O=C(O)c1cc(O)ccc1F
+480,O=C(O)CCCCCCCCCCCCCCCCCCCCBr
+481,O=C(O)CCCCCCBr
+483,O=C(O)CCCCCCCCCCCCCCCCCCBr
+486,O=C(O)c1ccc(Br)cc1O
+488,C#CCC(=O)O
+489,O=C(O)CF
+491,O=C(O)c1ccc(Br)nc1
+494,O=C(O)C(=O)c1ccco1
+497,O=C(O)C(=O)c1ccccc1
+498,O=C(O)C#Cc1ccccc1
+499,O=C(O)C1CCNCC1
+500,C#CC(=O)O
+501,O=C(O)c1cnccn1
+502,O=C(O)c1ccc[nH]1
+504,O=C(O)c1cccs1
+510,O=C(O)CCI
+518,O=C/C(Cl)=C(\Cl)C(=O)O
+525,O=C(O)C(=O)c1cc(F)ccc1F
+529,O=C(O)C(=O)c1ccc(Br)cc1
+546,O=C(O)C1(O)CSC1
+548,C=CC(Cl)C(=O)O
+552,O=C(O)c1cc(F)ccn1
+559,O=C(O)C1CCOCC1
+562,O=C(O)c1cc(Cl)ccn1
+565,O=C(O)c1cc(Br)ccn1
+573,N#CCSCC(=O)O
+574,O=C(O)CSc1cccs1
diff --git a/setup.py b/setup.py
@@ -52,7 +52,7 @@
         'nano-CAT@git+https://github.com/nlesc-nano/nano-CAT@master',
         'data-CAT@git+https://github.com/nlesc-nano/data-CAT@master',
         'horovod', 'mendeleev', 'more_itertools', 'numpy', 'pandas',
-        'pyyaml>=5.1.1', 'seaborn', 'schema', 'sqlalchemy',
+        'pyyaml>=5.1.1', 'retry', 'seaborn', 'schema', 'sqlalchemy',
         'torch-geometric', 'typing-extensions'],
 
     extras_require={
diff --git a/swan/cosmo/cat_interface.py b/swan/cosmo/cat_interface.py
@@ -1,4 +1,15 @@
-"""Interface with CAT/PLAMS Packages."""
+"""Interface with CAT/PLAMS Packages.
+
+Index
+-----
+.. currentmodule:: swan.cosmo.cat_interface
+
+API
+---
+
+.. autofunction:: call_mopac
+.. autofunction:: call_cat_in_parallel
+"""
 import logging
 import os
 import shutil
@@ -16,6 +27,7 @@
 import yaml
 from more_itertools import chunked
 from scm.plams import CRSJob, Settings
+from retry import retry
 
 import CAT
 from CAT.base import prep
@@ -25,6 +37,9 @@
 from ..utils import Options
 from .functions import run_command
 
+__all__ = ["call_cat_in_parallel", "call_mopac"]
+
+
 T = TypeVar('T')
 
 # Starting logger
@@ -35,6 +50,7 @@
 logger.addHandler(handler)
 
 
+@retry(FileExistsError, tries=3, delay=1)
 def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") -> Path:
     """Call cat with a given `config` and returns a dataframe with the results.
 
@@ -58,7 +74,7 @@ def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") ->
     """
     # create workdir for cat
     path_workdir_cat = Path(opts["workdir"]) / "cat_workdir" / chunk_name
-    path_workdir_cat.mkdir(parents=True)
+    path_workdir_cat.mkdir(parents=True, exist_ok=True)
 
     path_smiles = (path_workdir_cat / "smiles.txt").absolute().as_posix()
 
@@ -129,20 +145,41 @@ def compute_bulkiness(smiles: pd.Series, opts: Mapping[str, T], indices: pd.Inde
     chunk_name = str(indices[0])
     try:
         values = compute_bulkiness_using_cat(chunk, opts, chunk_name)
-    except RuntimeError:
-        values = np.repeat(np.nan, len(smiles))
+    except (RuntimeError):
+        logger.error(f"There was an error processing:\n{chunk.values}")
+        values = np.repeat(np.nan, len(indices))
 
     return values
 
 
 def call_cat_in_parallel(smiles: pd.Series, opts: Options) -> np.ndarray:
-    """Compute a ligand/quantum dot property using CAT."""
+    """Compute a ligand/quantum dot property using CAT.
+
+    It runs several CAT instances concurrently with a multiprocessing pool, in chunks of 10 molecules.
+
+    Parameters
+    ----------
+    smiles
+        pandas.Series with the SMILES strings to process
+    opts
+        Options to call CAT
+
+    Returns
+    -------
+        NumPy array with one computed value per input SMILES
+    """
     worker = partial(compute_bulkiness, smiles, opts.to_dict())
 
     with Pool() as p:
         results = p.map(worker, chunked(smiles.index, 10))
 
-    return np.concatenate(results)
+    results = np.concatenate(results)
+
+    if len(smiles.index) != results.size:
+        msg = "WWW There is an incongruence in the bulkiness computed by CAT!"
+        raise RuntimeError(msg)
+
+    return results
 
 
 def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
@@ -161,7 +198,7 @@ def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
             return np.nan, np.nan
         return call_cat_mopac(Path(tmp), smile, solvents)
     except ValueError:
-        print(f"Error reading smile: {smile}")
+        logger.error(f"Error reading smile: {smile}")
         return np.nan, np.nan
 
     finally:
diff --git a/swan/filter/screen.py b/swan/filter/screen.py
@@ -9,6 +9,9 @@
 
 API
 ---
+.. autofunction:: split_filter_in_batches
+.. autofunction:: apply_filters
+
 {autodata}
 
 """
@@ -112,8 +115,8 @@ def split_filter_in_batches(opts: Options) -> None:
         try:
             apply_filters(batch, opts, output_file)
         except:
-            error = next(iter(sys.exc_info()))
-            logger.error(error)
+            error, msg, _ = sys.exc_info()
+            logger.error(f"Error processing batch: {k}\n{error} {msg}")
 
 
 def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) -> None:
@@ -195,6 +198,7 @@ def filter_by_bulkiness(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
         raise RuntimeError("A core molecular geometry is needed to compute bulkiness")
 
     molecules["bulkiness"] = call_cat_in_parallel(molecules.smiles, opts)
+    logger.debug("CAT has been called!")
 
     return apply_predicate(molecules, "bulkiness", opts)
 
diff --git a/tests/test_files/input_test_filter.yml b/tests/test_files/input_test_filter.yml
@@ -12,9 +12,7 @@ filters:
     - "C(=O)O"
   scscore:
     lower_than:
-      2.5
+      2.0
   bulkiness:
     lower_than:
-      20
-
-    
+      20