Skip to content

Commit cd7296a

Browse files
committed
new_notebook
1 parent afffdee commit cd7296a

File tree

2 files changed

+112
-68
lines changed

2 files changed

+112
-68
lines changed

baybe_hack.ipynb

Lines changed: 112 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,32 @@
99
},
1010
{
1111
"cell_type": "code",
12-
"execution_count": 10,
12+
"execution_count": 1,
1313
"metadata": {},
1414
"outputs": [
1515
{
1616
"name": "stdout",
1717
"output_type": "stream",
1818
"text": [
19-
" Time_h pH Inhib_Concentrat_M Efficiency\n",
20-
"count 611.000000 611.000000 611.000000 611.000000\n",
21-
"mean 135.801964 6.342062 0.006808 26.736841\n",
22-
"std 201.683867 2.529080 0.014059 288.788317\n",
23-
"min 0.500000 0.000000 0.000010 -4834.000000\n",
24-
"25% 24.000000 4.000000 0.000500 30.000000\n",
25-
"50% 24.000000 7.000000 0.001000 58.000000\n",
26-
"75% 144.000000 7.000000 0.003000 87.950000\n",
27-
"max 672.000000 10.000000 0.100000 100.000000\n"
19+
" Time_h pH Inhib_Concentrat_M Salt_Concentrat_M \\\n",
20+
"count 611.000000 611.000000 611.000000 611.000000 \n",
21+
"mean 135.801964 6.342062 0.006808 0.145450 \n",
22+
"std 201.683867 2.529080 0.014059 0.200575 \n",
23+
"min 0.500000 0.000000 0.000010 0.000000 \n",
24+
"25% 24.000000 4.000000 0.000500 0.010000 \n",
25+
"50% 24.000000 7.000000 0.001000 0.100000 \n",
26+
"75% 144.000000 7.000000 0.003000 0.100000 \n",
27+
"max 672.000000 10.000000 0.100000 0.600000 \n",
28+
"\n",
29+
" Efficiency \n",
30+
"count 611.000000 \n",
31+
"mean 26.736841 \n",
32+
"std 288.788317 \n",
33+
"min -4834.000000 \n",
34+
"25% 30.000000 \n",
35+
"50% 58.000000 \n",
36+
"75% 87.950000 \n",
37+
"max 100.000000 \n"
2838
]
2939
}
3040
],
@@ -38,7 +48,7 @@
3848
},
3949
{
4050
"cell_type": "code",
41-
"execution_count": 11,
51+
"execution_count": 2,
4252
"metadata": {},
4353
"outputs": [
4454
{
@@ -52,12 +62,12 @@
5262
"3 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 10.0 0.001 \n",
5363
"4 Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O 24.0 4.0 0.001 \n",
5464
"\n",
55-
" Efficiency \n",
56-
"0 0.0 \n",
57-
"1 0.0 \n",
58-
"2 30.0 \n",
59-
"3 30.0 \n",
60-
"4 30.0 \n"
65+
" Salt_Concentrat_M Efficiency \n",
66+
"0 0.1 0.0 \n",
67+
"1 0.1 0.0 \n",
68+
"2 0.1 30.0 \n",
69+
"3 0.1 30.0 \n",
70+
"4 0.1 30.0 \n"
6171
]
6272
}
6373
],
@@ -74,9 +84,18 @@
7484
},
7585
{
7686
"cell_type": "code",
77-
"execution_count": 12,
87+
"execution_count": 3,
7888
"metadata": {},
79-
"outputs": [],
89+
"outputs": [
90+
{
91+
"name": "stderr",
92+
"output_type": "stream",
93+
"text": [
94+
"/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPConnection object at 0x7fa2386f7fd0>: Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n",
95+
" warnings.warn(\n"
96+
]
97+
}
98+
],
8099
"source": [
81100
"from baybe.targets import NumericalTarget\n",
82101
"from baybe.objective import Objective\n",
@@ -97,7 +116,7 @@
97116
},
98117
{
99118
"cell_type": "code",
100-
"execution_count": 13,
119+
"execution_count": 4,
101120
"metadata": {},
102121
"outputs": [],
103122
"source": [
@@ -141,20 +160,9 @@
141160
},
142161
{
143162
"cell_type": "code",
144-
"execution_count": 4,
163+
"execution_count": null,
145164
"metadata": {},
146-
"outputs": [
147-
{
148-
"data": {
149-
"text/plain": [
150-
"SubstanceParameter(name='Solvent', data={'Water': 'O', '1-Octanol': 'CCCCCCCCO', 'Toluene': 'CC1=CC=CC=C1'}, decorrelate=0.7, encoding=<SubstanceEncoding.MORDRED: 'MORDRED'>)"
151-
]
152-
},
153-
"execution_count": 4,
154-
"metadata": {},
155-
"output_type": "execute_result"
156-
}
157-
],
165+
"outputs": [],
158166
"source": [
159167
"from baybe.parameters import SubstanceParameter\n",
160168
"\n",
@@ -178,22 +186,17 @@
178186
"These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data."
179187
]
180188
},
181-
{
182-
"cell_type": "markdown",
183-
"metadata": {},
184-
"source": [
185-
"The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n",
186-
"\n",
187-
"Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. Still, one could provide experimental measurements or common metrics used to classify polymers:"
188-
]
189-
},
190189
{
191190
"cell_type": "code",
192191
"execution_count": null,
193192
"metadata": {},
194193
"outputs": [],
195194
"source": [
196195
"\"\"\"\n",
196+
"The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n",
197+
"\n",
198+
"Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. \n",
199+
"Still, one could provide experimental measurements or common metrics used to classify polymers:\n",
197200
"from baybe.parameters import CustomDiscreteParameter\n",
198201
"\n",
199202
"# Create or import new dataframe containing custom descriptors\n",
@@ -216,7 +219,7 @@
216219
},
217220
{
218221
"cell_type": "code",
219-
"execution_count": 14,
222+
"execution_count": 5,
220223
"metadata": {},
221224
"outputs": [],
222225
"source": [
@@ -250,7 +253,7 @@
250253
},
251254
{
252255
"cell_type": "code",
253-
"execution_count": 15,
256+
"execution_count": 6,
254257
"metadata": {},
255258
"outputs": [],
256259
"source": [
@@ -282,12 +285,7 @@
282285
" sampling_percentage=0.3, # should be relatively low\n",
283286
" allow_repeated_recommendations=False,\n",
284287
" allow_recommending_already_measured=False,\n",
285-
" )\n",
286-
"\n",
287-
"hybrid_recommender = SequentialGreedyRecommender(\n",
288-
" allow_repeated_recommendations=False,\n",
289-
" allow_recommending_already_measured=False\n",
290-
")"
288+
" )"
291289
]
292290
},
293291
{
@@ -299,7 +297,7 @@
299297
},
300298
{
301299
"cell_type": "code",
302-
"execution_count": 16,
300+
"execution_count": 9,
303301
"metadata": {},
304302
"outputs": [
305303
{
@@ -316,12 +314,10 @@
316314
"from baybe import Campaign\n",
317315
"\n",
318316
"strategy = TwoPhaseStrategy(\n",
319-
" initial_recommender = RandomRecommender(), # Initial recommender, if no training data is available\n",
320-
" # Other initial recommenders don't seem to work for my hybrid search space/set of parameters\n",
321-
" # Doesn't matter since I already have training data\n",
317+
" initial_recommender = RandomRecommender(), # Initial recommender\n",
318+
"    # The choice does not matter here since training data is already available, but it can be used for benchmarking\n",
322319
" recommender = seq_greedy_recommender, # Bayesian model-based optimization\n",
323-
" # recommender = hybrid_recommender,\n",
324-
" switch_after=1 # Switch to the model-based recommender after 1 batch or iteration (so the initial training data)\n",
320+
"    switch_after=1 # Switch to the model-based recommender after 1 batch, i.e. immediately\n",
325321
")\n",
326322
"\n",
327323
"campaign = Campaign(searchspace, objective, strategy)"
@@ -336,7 +332,7 @@
336332
},
337333
{
338334
"cell_type": "code",
339-
"execution_count": 21,
335+
"execution_count": 10,
340336
"metadata": {},
341337
"outputs": [
342338
{
@@ -346,21 +342,23 @@
346342
"\n",
347343
"\n",
348344
"Recommended experiments: \n",
349-
"| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n",
350-
"|------:|-----------:|-----:|-------------------------:|------------------------------:|\n",
351-
"| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n"
345+
"| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) |\n",
346+
"|--------:|-----------:|-----:|------------------------------:|-------------------------:|\n",
347+
"| 4924793 | 16 | 2.5 | 0.01 | 0.92 |\n",
348+
"| 6006943 | 19 | 8 | 0.05 | 0.58 |\n",
349+
"| 6994486 | 22 | 8.8 | 0.08 | 0.88 |\n"
352350
]
353351
}
354352
],
355353
"source": [
356-
"new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance\n",
354+
"new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
357355
"print(\"\\n\\nRecommended experiments: \")\n",
358356
"print(new_rec.to_markdown())"
359357
]
360358
},
361359
{
362360
"cell_type": "code",
363-
"execution_count": 22,
361+
"execution_count": 11,
364362
"metadata": {},
365363
"outputs": [
366364
{
@@ -370,21 +368,49 @@
370368
"\n",
371369
"\n",
372370
"Recommended experiments with measured values: \n",
373-
"| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n",
374-
"|------:|-----------:|-----:|-------------------------:|------------------------------:|\n",
375-
"| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n"
371+
"| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n",
372+
"|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
373+
"| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n",
374+
"| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n",
375+
"| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n"
376376
]
377377
}
378378
],
379379
"source": [
380380
"# Look up the efficiency value in the Excel table and enter it below: match the specific SMILES component first,\n",
381381
"# then the closest values of the remaining parameters\n",
382382
"\n",
383-
"new_rec[\"efficiency\"] = [0.1]\n",
383+
"new_rec[\"efficiency\"] = [0.1, 0.2, 0.3]\n",
384384
"print(\"\\n\\nRecommended experiments with measured values: \")\n",
385385
"print(new_rec.to_markdown())"
386386
]
387387
},
388+
{
389+
"cell_type": "code",
390+
"execution_count": 12,
391+
"metadata": {},
392+
"outputs": [
393+
{
394+
"name": "stdout",
395+
"output_type": "stream",
396+
"text": [
397+
"\n",
398+
"\n",
399+
"Recommended experiments: \n",
400+
"| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n",
401+
"|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
402+
"| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n",
403+
"| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n",
404+
"| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n"
405+
]
406+
}
407+
],
408+
"source": [
409+
"new_new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
410+
"print(\"\\n\\nRecommended experiments: \")\n",
411+
"print(new_new_rec.to_markdown())"
412+
]
413+
},
388414
{
389415
"cell_type": "markdown",
390416
"metadata": {},
@@ -394,9 +420,27 @@
394420
},
395421
{
396422
"cell_type": "code",
397-
"execution_count": null,
423+
"execution_count": 14,
398424
"metadata": {},
399-
"outputs": [],
425+
"outputs": [
426+
{
427+
"name": "stdout",
428+
"output_type": "stream",
429+
"text": [
430+
"\n",
431+
"\n",
432+
"All experiments with measured values: \n",
433+
"| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n",
434+
"|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
435+
"| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n",
436+
"| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n",
437+
"| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n",
438+
"| 4924793 | 16 | 2.5 | 0.01 | 0.92 | nan |\n",
439+
"| 6006943 | 19 | 8 | 0.05 | 0.58 | nan |\n",
440+
"| 6994486 | 22 | 8.8 | 0.08 | 0.88 | nan |\n"
441+
]
442+
}
443+
],
400444
"source": [
401445
"results = pd.concat([new_rec, new_new_rec]) # etc.\n",
402446
"print(\"\\n\\nAll experiments with measured values: \")\n",
File renamed without changes.

0 commit comments

Comments
 (0)