|
9 | 9 | },
|
10 | 10 | {
|
11 | 11 | "cell_type": "code",
|
12 |
| - "execution_count": 10, |
| 12 | + "execution_count": 1, |
13 | 13 | "metadata": {},
|
14 | 14 | "outputs": [
|
15 | 15 | {
|
16 | 16 | "name": "stdout",
|
17 | 17 | "output_type": "stream",
|
18 | 18 | "text": [
|
19 |
| - " Time_h pH Inhib_Concentrat_M Efficiency\n", |
20 |
| - "count 611.000000 611.000000 611.000000 611.000000\n", |
21 |
| - "mean 135.801964 6.342062 0.006808 26.736841\n", |
22 |
| - "std 201.683867 2.529080 0.014059 288.788317\n", |
23 |
| - "min 0.500000 0.000000 0.000010 -4834.000000\n", |
24 |
| - "25% 24.000000 4.000000 0.000500 30.000000\n", |
25 |
| - "50% 24.000000 7.000000 0.001000 58.000000\n", |
26 |
| - "75% 144.000000 7.000000 0.003000 87.950000\n", |
27 |
| - "max 672.000000 10.000000 0.100000 100.000000\n" |
| 19 | + " Time_h pH Inhib_Concentrat_M Salt_Concentrat_M \\\n", |
| 20 | + "count 611.000000 611.000000 611.000000 611.000000 \n", |
| 21 | + "mean 135.801964 6.342062 0.006808 0.145450 \n", |
| 22 | + "std 201.683867 2.529080 0.014059 0.200575 \n", |
| 23 | + "min 0.500000 0.000000 0.000010 0.000000 \n", |
| 24 | + "25% 24.000000 4.000000 0.000500 0.010000 \n", |
| 25 | + "50% 24.000000 7.000000 0.001000 0.100000 \n", |
| 26 | + "75% 144.000000 7.000000 0.003000 0.100000 \n", |
| 27 | + "max 672.000000 10.000000 0.100000 0.600000 \n", |
| 28 | + "\n", |
| 29 | + " Efficiency \n", |
| 30 | + "count 611.000000 \n", |
| 31 | + "mean 26.736841 \n", |
| 32 | + "std 288.788317 \n", |
| 33 | + "min -4834.000000 \n", |
| 34 | + "25% 30.000000 \n", |
| 35 | + "50% 58.000000 \n", |
| 36 | + "75% 87.950000 \n", |
| 37 | + "max 100.000000 \n" |
28 | 38 | ]
|
29 | 39 | }
|
30 | 40 | ],
|
|
38 | 48 | },
|
39 | 49 | {
|
40 | 50 | "cell_type": "code",
|
41 |
| - "execution_count": 11, |
| 51 | + "execution_count": 2, |
42 | 52 | "metadata": {},
|
43 | 53 | "outputs": [
|
44 | 54 | {
|
|
52 | 62 | "3 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 10.0 0.001 \n",
|
53 | 63 | "4 Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O 24.0 4.0 0.001 \n",
|
54 | 64 | "\n",
|
55 |
| - " Efficiency \n", |
56 |
| - "0 0.0 \n", |
57 |
| - "1 0.0 \n", |
58 |
| - "2 30.0 \n", |
59 |
| - "3 30.0 \n", |
60 |
| - "4 30.0 \n" |
| 65 | + " Salt_Concentrat_M Efficiency \n", |
| 66 | + "0 0.1 0.0 \n", |
| 67 | + "1 0.1 0.0 \n", |
| 68 | + "2 0.1 30.0 \n", |
| 69 | + "3 0.1 30.0 \n", |
| 70 | + "4 0.1 30.0 \n" |
61 | 71 | ]
|
62 | 72 | }
|
63 | 73 | ],
|
|
74 | 84 | },
|
75 | 85 | {
|
76 | 86 | "cell_type": "code",
|
77 |
| - "execution_count": 12, |
| 87 | + "execution_count": 3, |
78 | 88 | "metadata": {},
|
79 |
| - "outputs": [], |
| 89 | + "outputs": [ |
| 90 | + { |
| 91 | + "name": "stderr", |
| 92 | + "output_type": "stream", |
| 93 | + "text": [ |
| 94 | + "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPConnection object at 0x7fa2386f7fd0>: Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n", |
| 95 | + " warnings.warn(\n" |
| 96 | + ] |
| 97 | + } |
| 98 | + ], |
80 | 99 | "source": [
|
81 | 100 | "from baybe.targets import NumericalTarget\n",
|
82 | 101 | "from baybe.objective import Objective\n",
|
|
97 | 116 | },
|
98 | 117 | {
|
99 | 118 | "cell_type": "code",
|
100 |
| - "execution_count": 13, |
| 119 | + "execution_count": 4, |
101 | 120 | "metadata": {},
|
102 | 121 | "outputs": [],
|
103 | 122 | "source": [
|
|
141 | 160 | },
|
142 | 161 | {
|
143 | 162 | "cell_type": "code",
|
144 |
| - "execution_count": 4, |
| 163 | + "execution_count": null, |
145 | 164 | "metadata": {},
|
146 |
| - "outputs": [ |
147 |
| - { |
148 |
| - "data": { |
149 |
| - "text/plain": [ |
150 |
| - "SubstanceParameter(name='Solvent', data={'Water': 'O', '1-Octanol': 'CCCCCCCCO', 'Toluene': 'CC1=CC=CC=C1'}, decorrelate=0.7, encoding=<SubstanceEncoding.MORDRED: 'MORDRED'>)" |
151 |
| - ] |
152 |
| - }, |
153 |
| - "execution_count": 4, |
154 |
| - "metadata": {}, |
155 |
| - "output_type": "execute_result" |
156 |
| - } |
157 |
| - ], |
| 165 | + "outputs": [], |
158 | 166 | "source": [
|
159 | 167 | "from baybe.parameters import SubstanceParameter\n",
|
160 | 168 | "\n",
|
|
178 | 186 | "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data."
|
179 | 187 | ]
|
180 | 188 | },
|
181 |
| - { |
182 |
| - "cell_type": "markdown", |
183 |
| - "metadata": {}, |
184 |
| - "source": [ |
185 |
| - "The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n", |
186 |
| - "\n", |
187 |
| - "Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. Still, one could provide experimental measurements or common metrics used to classify polymers:" |
188 |
| - ] |
189 |
| - }, |
190 | 189 | {
|
191 | 190 | "cell_type": "code",
|
192 | 191 | "execution_count": null,
|
193 | 192 | "metadata": {},
|
194 | 193 | "outputs": [],
|
195 | 194 | "source": [
|
196 | 195 | "\"\"\"\n",
|
| 196 | + "The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n", |
| 197 | + "\n", |
| 198 | + "Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. \n", |
| 199 | + "Still, one could provide experimental measurements or common metrics used to classify polymers:\n", |
197 | 200 | "from baybe.parameters import CustomDiscreteParameter\n",
|
198 | 201 | "\n",
|
199 | 202 | "# Create or import new dataframe containing custom descriptors\n",
|
|
216 | 219 | },
|
217 | 220 | {
|
218 | 221 | "cell_type": "code",
|
219 |
| - "execution_count": 14, |
| 222 | + "execution_count": 5, |
220 | 223 | "metadata": {},
|
221 | 224 | "outputs": [],
|
222 | 225 | "source": [
|
|
250 | 253 | },
|
251 | 254 | {
|
252 | 255 | "cell_type": "code",
|
253 |
| - "execution_count": 15, |
| 256 | + "execution_count": 6, |
254 | 257 | "metadata": {},
|
255 | 258 | "outputs": [],
|
256 | 259 | "source": [
|
|
282 | 285 | " sampling_percentage=0.3, # should be relatively low\n",
|
283 | 286 | " allow_repeated_recommendations=False,\n",
|
284 | 287 | " allow_recommending_already_measured=False,\n",
|
285 |
| - " )\n", |
286 |
| - "\n", |
287 |
| - "hybrid_recommender = SequentialGreedyRecommender(\n", |
288 |
| - " allow_repeated_recommendations=False,\n", |
289 |
| - " allow_recommending_already_measured=False\n", |
290 |
| - ")" |
| 288 | + " )" |
291 | 289 | ]
|
292 | 290 | },
|
293 | 291 | {
|
|
299 | 297 | },
|
300 | 298 | {
|
301 | 299 | "cell_type": "code",
|
302 |
| - "execution_count": 16, |
| 300 | + "execution_count": 9, |
303 | 301 | "metadata": {},
|
304 | 302 | "outputs": [
|
305 | 303 | {
|
|
316 | 314 | "from baybe import Campaign\n",
|
317 | 315 | "\n",
|
318 | 316 | "strategy = TwoPhaseStrategy(\n",
|
319 |
| - " initial_recommender = RandomRecommender(), # Initial recommender, if no training data is available\n", |
320 |
| - " # Other initial recommenders don't seem to work for my hybrid search space/set of parameters\n", |
321 |
| - " # Doesn't matter since I already have training data\n", |
| 317 | + " initial_recommender = RandomRecommender(), # Initial recommender\n", |
| 318 | + " # The choice doesn't matter here since training data already exists, but it can be used for benchmarking\n", |
322 | 319 | " recommender = seq_greedy_recommender, # Bayesian model-based optimization\n",
|
323 |
| - " # recommender = hybrid_recommender,\n", |
324 |
| - " switch_after=1 # Switch to the model-based recommender after 1 batch or iteration (so the initial training data)\n", |
| 320 | + " switch_after=1 # Switch to the model-based recommender after 1 batch (i.e. immediately)\n", |
325 | 321 | ")\n",
|
326 | 322 | "\n",
|
327 | 323 | "campaign = Campaign(searchspace, objective, strategy)"
|
|
336 | 332 | },
|
337 | 333 | {
|
338 | 334 | "cell_type": "code",
|
339 |
| - "execution_count": 21, |
| 335 | + "execution_count": 10, |
340 | 336 | "metadata": {},
|
341 | 337 | "outputs": [
|
342 | 338 | {
|
|
346 | 342 | "\n",
|
347 | 343 | "\n",
|
348 | 344 | "Recommended experiments: \n",
|
349 |
| - "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n", |
350 |
| - "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n", |
351 |
| - "| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n" |
| 345 | + "| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) |\n", |
| 346 | + "|--------:|-----------:|-----:|------------------------------:|-------------------------:|\n", |
| 347 | + "| 4924793 | 16 | 2.5 | 0.01 | 0.92 |\n", |
| 348 | + "| 6006943 | 19 | 8 | 0.05 | 0.58 |\n", |
| 349 | + "| 6994486 | 22 | 8.8 | 0.08 | 0.88 |\n" |
352 | 350 | ]
|
353 | 351 | }
|
354 | 352 | ],
|
355 | 353 | "source": [
|
356 |
| - "new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance\n", |
| 354 | + "new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n", |
357 | 355 | "print(\"\\n\\nRecommended experiments: \")\n",
|
358 | 356 | "print(new_rec.to_markdown())"
|
359 | 357 | ]
|
360 | 358 | },
|
361 | 359 | {
|
362 | 360 | "cell_type": "code",
|
363 |
| - "execution_count": 22, |
| 361 | + "execution_count": 11, |
364 | 362 | "metadata": {},
|
365 | 363 | "outputs": [
|
366 | 364 | {
|
|
370 | 368 | "\n",
|
371 | 369 | "\n",
|
372 | 370 | "Recommended experiments with measured values: \n",
|
373 |
| - "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n", |
374 |
| - "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n", |
375 |
| - "| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n" |
| 371 | + "| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n", |
| 372 | + "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n", |
| 373 | + "| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n", |
| 374 | + "| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n", |
| 375 | + "| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n" |
376 | 376 | ]
|
377 | 377 | }
|
378 | 378 | ],
|
379 | 379 | "source": [
|
380 | 380 | "# Get and input efficiency value from Excel table, for specific SMILES component first, \n",
|
381 | 381 | "# then for the closest values of the rest of the parameters\n",
|
382 | 382 | "\n",
|
383 |
| - "new_rec[\"efficiency\"] = [0.1]\n", |
| 383 | + "new_rec[\"efficiency\"] = [0.1, 0.2, 0.3]\n", |
384 | 384 | "print(\"\\n\\nRecommended experiments with measured values: \")\n",
|
385 | 385 | "print(new_rec.to_markdown())"
|
386 | 386 | ]
|
387 | 387 | },
|
| 388 | + { |
| 389 | + "cell_type": "code", |
| 390 | + "execution_count": 12, |
| 391 | + "metadata": {}, |
| 392 | + "outputs": [ |
| 393 | + { |
| 394 | + "name": "stdout", |
| 395 | + "output_type": "stream", |
| 396 | + "text": [ |
| 397 | + "\n", |
| 398 | + "\n", |
| 399 | + "Recommended experiments: \n", |
| 400 | + "| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n", |
| 401 | + "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n", |
| 402 | + "| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n", |
| 403 | + "| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n", |
| 404 | + "| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n" |
| 405 | + ] |
| 406 | + } |
| 407 | + ], |
| 408 | + "source": [ |
| 409 | + "new_new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n", |
| 410 | + "print(\"\\n\\nRecommended experiments: \")\n", |
| 411 | + "print(new_new_rec.to_markdown())" |
| 412 | + ] |
| 413 | + }, |
388 | 414 | {
|
389 | 415 | "cell_type": "markdown",
|
390 | 416 | "metadata": {},
|
|
394 | 420 | },
|
395 | 421 | {
|
396 | 422 | "cell_type": "code",
|
397 |
| - "execution_count": null, |
| 423 | + "execution_count": 14, |
398 | 424 | "metadata": {},
|
399 |
| - "outputs": [], |
| 425 | + "outputs": [ |
| 426 | + { |
| 427 | + "name": "stdout", |
| 428 | + "output_type": "stream", |
| 429 | + "text": [ |
| 430 | + "\n", |
| 431 | + "\n", |
| 432 | + "All experiments with measured values: \n", |
| 433 | + "| | Time (h) | pH | Inhibitor Concentration (M) | Salt Concentration (M) | efficiency |\n", |
| 434 | + "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n", |
| 435 | + "| 4924793 | 16 | 2.5 | 0.01 | 0.92 | 0.1 |\n", |
| 436 | + "| 6006943 | 19 | 8 | 0.05 | 0.58 | 0.2 |\n", |
| 437 | + "| 6994486 | 22 | 8.8 | 0.08 | 0.88 | 0.3 |\n", |
| 438 | + "| 4924793 | 16 | 2.5 | 0.01 | 0.92 | nan |\n", |
| 439 | + "| 6006943 | 19 | 8 | 0.05 | 0.58 | nan |\n", |
| 440 | + "| 6994486 | 22 | 8.8 | 0.08 | 0.88 | nan |\n" |
| 441 | + ] |
| 442 | + } |
| 443 | + ], |
400 | 444 | "source": [
|
401 | 445 | "results = pd.concat([new_rec, new_new_rec]) # etc.\n",
|
402 | 446 | "print(\"\\n\\nAll experiments with measured values: \")\n",
|
|
0 commit comments