Skip to content

Commit 7741238

Browse files
authored
Merge pull request #32 from nlesc-nano/filter
make pipeline more robust
2 parents e80a0b6 + 192d37b commit 7741238

File tree

7 files changed

+951
-39
lines changed

7 files changed

+951
-39
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Change Log
22

3-
# 0.3.0 [date]
3+
# 0.3.0 [21/08/2020]
44

55
## New
6-
* Introduce Pipeline to filter ligands (#26)
6+
* Introduce Pipeline to filter ligands (#13, #26)
77
* Use [SCScore](https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622)
88
* Use [Horovod](https://github.com/horovod/horovod) to distribute the training
99
* Add [mypy](http://mypy-lang.org/) test

notebooks/Filter_visualization.ipynb

Lines changed: 704 additions & 23 deletions
Large diffs are not rendered by default.

notebooks/candidates.csv

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
,smiles
2+
2,O=C/C(Cl)=C(/Cl)C(=O)O
3+
3,O=C(O)C/C=C/CC(=O)O
4+
4,O=C(O)C1CC(Br)(Br)C1
5+
6,O=C(O)C1CCOC1=O
6+
7,C#CCCC#CC(=O)O
7+
8,N#CNCC(=O)O
8+
9,C=CC(=C)C(=O)O
9+
25,C=CNCC(=O)O
10+
26,O=C(O)C1CS1
11+
28,O=C(O)CCCOCCOC1CCCCO1
12+
29,C=CCOCCOCC(=O)O
13+
31,O=C(O)c1nccnc1O
14+
34,N=C(S)C(=O)O
15+
37,O=C(O)CCCCCCCCCCCCCCBr
16+
38,O=C(O)COCCCl
17+
39,O=C(O)c1nc(O)ccc1F
18+
40,O=C(O)CC(=O)CCl
19+
42,O=C(O)c1nc(Br)ccc1F
20+
44,O=C(O)C1CSCN1
21+
45,O=C(O)C(O)(O)C(=O)O
22+
47,O=C(O)c1ncccc1F
23+
48,O=C(O)C=Cc1ccccc1F
24+
49,O=C(O)C=Cc1ccc(F)cc1
25+
51,O=C(O)C=Cc1ccc2c(c1)OCO2
26+
52,O=C(O)C(Cl)(Cl)Cl
27+
56,O=C(O)C1CNCCN1
28+
58,O=C(O)C(F)(F)C(F)F
29+
71,O=C(O)c1cnc(Cl)nc1
30+
77,O=C(O)/C(Cl)=C/Cl
31+
83,O=C(O)C(F)(F)I
32+
87,O=C(O)COCCBr
33+
88,O=C(O)C1CS(=O)(=O)C1
34+
89,O=C(O)CC(=O)C(F)(F)F
35+
122,N#CCCSCC(=O)O
36+
123,O=C(O)C=CI
37+
124,O=C(O)C(F)(Cl)Cl
38+
125,O=C(O)C(=O)CS
39+
126,O=C(O)C=Cc1ccc(Cl)cc1
40+
130,O=C(O)C1CO1
41+
131,O=C(O)C1C=CCC1
42+
132,O=C1CCC(F)(C(=O)O)CC1
43+
133,O=C(O)CCCCCCCCCCCCCBr
44+
137,O=C(O)/C(Cl)=C\c1ccccc1
45+
143,O=C(O)C(Br)C(=O)O
46+
146,O=C(O)COc1ccc(Br)cc1
47+
147,O=C(O)COc1ccc(Cl)cc1
48+
150,O=C(O)CI
49+
151,O=C(O)C#CC(=O)O
50+
153,C=CC(=O)O
51+
155,O=C(O)C=CC(=O)O
52+
156,O=C(O)CBr
53+
157,C=CCCC(=O)O
54+
159,O=C(O)C(F)(F)F
55+
160,O=C(O)CCBr
56+
163,O=Cc1ccc(C(=O)O)cc1
57+
164,O=C(O)c1ccc(F)cc1
58+
165,O=C(O)c1ccc(O)nc1
59+
166,O=C(O)C=Cc1cccs1
60+
168,O=C(O)CCCl
61+
169,C=CCC(=O)O
62+
176,O=C(O)C(Br)CBr
63+
177,O=C(O)C(F)F
64+
181,O=C(O)CCCCCBr
65+
182,O=C(O)c1ncccc1O
66+
183,O=C(O)c1cc(F)ccc1O
67+
185,O=C(O)c1ccc(Cl)nc1
68+
187,O=C(O)CCCCBr
69+
188,O=C(O)COc1ccccc1
70+
189,O=C(O)c1ccc(CBr)cc1
71+
191,O=C(O)C1CCCN1
72+
193,O=C(O)c1ccoc1
73+
194,O=C(O)C(F)(F)C(F)(F)C(F)(F)F
74+
196,O=C(O)C(=O)CBr
75+
198,O=C(O)CCCBr
76+
199,O=C(O)CCCCCCCCCCBr
77+
201,O=C(O)CCCCCCCCCCCBr
78+
202,O=C(O)CC(=O)C(=O)O
79+
205,C=C(CBr)C(=O)O
80+
209,O=C(O)CCCCCCCBr
81+
221,C#CCCC(=O)O
82+
224,O=C(O)CCCCl
83+
225,O=C(O)c1ccc(O)cc1
84+
226,N#CCC(=O)O
85+
228,O=C(O)C(F)(F)Cl
86+
230,O=C(O)c1ccc(Cl)cc1F
87+
231,O=C(O)c1ccsc1
88+
232,O=C(O)c1cc(F)ccc1F
89+
238,O=CC(=O)O
90+
240,O=C(O)c1ccc(F)cc1F
91+
242,O=C(O)c1ccc(CCl)cc1
92+
243,O=C(O)CCCCCl
93+
244,O=C(O)C(Cl)Cl
94+
246,O=C(O)CS
95+
247,O=C(O)CC(=O)O
96+
248,O=C(O)C=Cc1ccc(O)cc1
97+
249,O=C(O)C1CCC1
98+
253,O=C(O)C1CCCC1
99+
254,O=C(O)C1CC1
100+
261,O=C(O)C(Cl)CCl
101+
265,O=C(O)c1ccc(O)cc1O
102+
267,C=C(Br)C(=O)O
103+
270,O=C(O)C(F)(F)C(F)(F)C(=O)O
104+
271,O=C(O)C1CNC1
105+
272,O=C(O)c1ccc(Br)cc1F
106+
273,O=C(O)c1ccccn1
107+
275,O=C(O)COCCOCC(=O)O
108+
276,O=C(O)c1ccccc1F
109+
278,O=C(O)c1c[nH]cn1
110+
284,C#CCCCCC(=O)O
111+
285,O=C(O)CCCCCCCCCCCCCCCBr
112+
289,O=C(O)C1CC=CC1
113+
292,O=C(O)CSCCSCC(=O)O
114+
298,O=C(O)c1ccco1
115+
302,O=C(O)C(F)(F)S(=O)(=O)F
116+
304,O=C(O)c1cc(Cl)ccc1F
117+
306,O=C(O)C1CCC(C(=O)O)CC1
118+
307,O=C(O)CCCCCCCCCBr
119+
310,O=C(O)c1cc(I)ccc1F
120+
317,O=C(O)c1cc(Br)ccc1F
121+
318,O=C(O)CCS
122+
324,O=C(O)C1=CCSC1
123+
325,O=C(O)C1(O)CCOCC1
124+
327,O=C(O)CCOC1CCCCO1
125+
328,O=C(O)c1cc(F)c(F)cc1O
126+
330,O=C(O)C1CCC=CO1
127+
333,N#CC=CC(=O)O
128+
337,O=C(O)c1nc(Cl)ccc1F
129+
341,O=C(O)C(=O)n1ccnc1
130+
342,O=C(O)COCC(=O)n1ccnc1
131+
350,O=C(O)c1cc(Br)c[nH]1
132+
355,C=C=CCC(=O)O
133+
363,O=C(O)C=Cc1ccc(Br)cc1
134+
366,O=C(O)c1cc(Br)c(F)cc1F
135+
368,O=C(O)CSc1cc[n+](CC(=O)c2ccccc2)cc1
136+
371,O=C(O)C=CCBr
137+
373,C=C1CC(C(=O)O)C1
138+
376,O=C(O)c1cc[n+](Cc2ccc(F)cc2)cc1
139+
377,O=C(O)c1cc(Br)cnc1F
140+
379,O=C(O)CONC(=O)OCc1ccccc1
141+
390,O=C(O)C(S)C(=O)O
142+
391,C=CC(=O)C(=O)O
143+
392,O=C(O)C(=O)C(Cl)(Cl)Cl
144+
396,O=C(O)C=Cc1c(F)cc(F)cc1F
145+
397,O=C(O)C1CSC1
146+
398,O=C(O)C1SCCCS1
147+
403,O=C(O)CCCCCCCCCCCCCCCCBr
148+
408,O=C(O)CCCS
149+
412,O=C(O)C1CCC=CCC1
150+
416,O=C(O)C1CC(C(=O)O)S1
151+
417,O=C(O)CCCCCCCCC[P+](c1ccccc1)(c1ccccc1)c1ccccc1
152+
418,O=C(O)CC1(O)CC1
153+
419,O=C(O)CS(=O)(=O)C(F)(F)F
154+
430,C=CCOCC(=O)O
155+
432,C=CCCOCC(=O)O
156+
434,O=C(O)COC(F)(F)F
157+
439,O=C(O)C=C(Cl)Cl
158+
443,C#CC=CC(=O)O
159+
444,O=C(O)C1C=CC=CC=C1
160+
454,O=C(O)c1cc(F)c(Br)cc1F
161+
459,C=C(Cl)C(=O)O
162+
461,O=C(O)CSCSCC(=O)O
163+
470,O=C(O)C(F)(Br)Br
164+
471,O=C(O)C(F)(F)Br
165+
477,O=C(O)c1cc(O)ccc1F
166+
480,O=C(O)CCCCCCCCCCCCCCCCCCCCBr
167+
481,O=C(O)CCCCCCBr
168+
483,O=C(O)CCCCCCCCCCCCCCCCCCBr
169+
486,O=C(O)c1ccc(Br)cc1O
170+
488,C#CCC(=O)O
171+
489,O=C(O)CF
172+
491,O=C(O)c1ccc(Br)nc1
173+
494,O=C(O)C(=O)c1ccco1
174+
497,O=C(O)C(=O)c1ccccc1
175+
498,O=C(O)C#Cc1ccccc1
176+
499,O=C(O)C1CCNCC1
177+
500,C#CC(=O)O
178+
501,O=C(O)c1cnccn1
179+
502,O=C(O)c1ccc[nH]1
180+
504,O=C(O)c1cccs1
181+
510,O=C(O)CCI
182+
518,O=C/C(Cl)=C(\Cl)C(=O)O
183+
525,O=C(O)C(=O)c1cc(F)ccc1F
184+
529,O=C(O)C(=O)c1ccc(Br)cc1
185+
546,O=C(O)C1(O)CSC1
186+
548,C=CC(Cl)C(=O)O
187+
552,O=C(O)c1cc(F)ccn1
188+
559,O=C(O)C1CCOCC1
189+
562,O=C(O)c1cc(Cl)ccn1
190+
565,O=C(O)c1cc(Br)ccn1
191+
573,N#CCSCC(=O)O
192+
574,O=C(O)CSc1cccs1

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
'nano-CAT@git+https://github.com/nlesc-nano/nano-CAT@master',
5353
'data-CAT@git+https://github.com/nlesc-nano/data-CAT@master',
5454
'horovod', 'mendeleev', 'more_itertools', 'numpy', 'pandas',
55-
'pyyaml>=5.1.1', 'seaborn', 'schema', 'sqlalchemy',
55+
'pyyaml>=5.1.1', 'retry', 'seaborn', 'schema', 'sqlalchemy',
5656
'torch-geometric', 'typing-extensions'],
5757

5858
extras_require={

swan/cosmo/cat_interface.py

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
1-
"""Interface with CAT/PLAMS Packages."""
1+
"""Interface with CAT/PLAMS Packages.
2+
3+
Index
4+
-----
5+
.. currentmodule:: swan.cosmo.cat_interface
6+
7+
API
8+
---
9+
10+
.. autofunction:: call_mopac
11+
.. autofunction:: call_cat_in_parallel
12+
"""
213
import logging
314
import os
415
import shutil
@@ -16,6 +27,7 @@
1627
import yaml
1728
from more_itertools import chunked
1829
from scm.plams import CRSJob, Settings
30+
from retry import retry
1931

2032
import CAT
2133
from CAT.base import prep
@@ -25,6 +37,9 @@
2537
from ..utils import Options
2638
from .functions import run_command
2739

40+
__all__ = ["call_cat_in_parallel", "call_mopac"]
41+
42+
2843
T = TypeVar('T')
2944

3045
# Starting logger
@@ -35,6 +50,7 @@
3550
logger.addHandler(handler)
3651

3752

53+
@retry(FileExistsError, tries=3, delay=1)
3854
def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") -> Path:
3955
"""Call cat with a given `config` and return a dataframe with the results.
4056
@@ -58,7 +74,7 @@ def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") ->
5874
"""
5975
# create workdir for cat
6076
path_workdir_cat = Path(opts["workdir"]) / "cat_workdir" / chunk_name
61-
path_workdir_cat.mkdir(parents=True)
77+
path_workdir_cat.mkdir(parents=True, exist_ok=True)
6278

6379
path_smiles = (path_workdir_cat / "smiles.txt").absolute().as_posix()
6480

@@ -129,20 +145,41 @@ def compute_bulkiness(smiles: pd.Series, opts: Mapping[str, T], indices: pd.Inde
129145
chunk_name = str(indices[0])
130146
try:
131147
values = compute_bulkiness_using_cat(chunk, opts, chunk_name)
132-
except RuntimeError:
133-
values = np.repeat(np.nan, len(smiles))
148+
except (RuntimeError):
149+
logger.error(f"There was an error processing:\n{chunk.values}")
150+
values = np.repeat(np.nan, len(indices))
134151

135152
return values
136153

137154

138155
def call_cat_in_parallel(smiles: pd.Series, opts: Options) -> np.ndarray:
139-
"""Compute a ligand/quantum dot property using CAT."""
156+
"""Compute a ligand/quantum dot property using CAT.
157+
158+
It creates several instances of CAT using multiprocessing.
159+
160+
Parameters
161+
----------
162+
smiles
163+
Pandas.Series with the smiles to compute
164+
opts
165+
Options to call CAT
166+
167+
Returns
168+
-------
169+
Numpy array with the computed properties
170+
"""
140171
worker = partial(compute_bulkiness, smiles, opts.to_dict())
141172

142173
with Pool() as p:
143174
results = p.map(worker, chunked(smiles.index, 10))
144175

145-
return np.concatenate(results)
176+
results = np.concatenate(results)
177+
178+
if len(smiles.index) != results.size:
179+
msg = "WWW There is an incongruence in the bulkiness computed by CAT!"
180+
raise RuntimeError(msg)
181+
182+
return results
146183

147184

148185
def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
@@ -161,7 +198,7 @@ def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
161198
return np.nan, np.nan
162199
return call_cat_mopac(Path(tmp), smile, solvents)
163200
except ValueError:
164-
print(f"Error reading smile: {smile}")
201+
logger.error(f"Error reading smile: {smile}")
165202
return np.nan, np.nan
166203

167204
finally:

swan/filter/screen.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
1010
API
1111
---
12+
.. autofunction:: split_filter_in_batches
13+
.. autofunction:: apply_filters
14+
1215
{autodata}
1316
1417
"""
@@ -112,8 +115,8 @@ def split_filter_in_batches(opts: Options) -> None:
112115
try:
113116
apply_filters(batch, opts, output_file)
114117
except:
115-
error = next(iter(sys.exc_info()))
116-
logger.error(error)
118+
error, msg, _ = sys.exc_info()
119+
logger.error(f"Error processing batch: {k}\n{error} {msg}")
117120

118121

119122
def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) -> None:
@@ -195,6 +198,7 @@ def filter_by_bulkiness(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
195198
raise RuntimeError("A core molecular geometry is needed to compute bulkiness")
196199

197200
molecules["bulkiness"] = call_cat_in_parallel(molecules.smiles, opts)
201+
logger.debug("CAT has been called!")
198202

199203
return apply_predicate(molecules, "bulkiness", opts)
200204

tests/test_files/input_test_filter.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,7 @@ filters:
1212
- "C(=O)O"
1313
scscore:
1414
lower_than:
15-
2.5
15+
2.0
1616
bulkiness:
1717
lower_than:
18-
20
19-
20-
18+
20

0 commit comments

Comments
 (0)