From 8cf4493727703bf729372f4d9edbf6e6caff4224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20W=C3=BCrger?= <44372393+koerper@users.noreply.github.com> Date: Wed, 27 Mar 2024 21:31:17 +0000 Subject: [PATCH] Implement campaign simulation --- src/tim_baybe-inhibitor.ipynb | 363 ++++++++++++++-------------------- 1 file changed, 150 insertions(+), 213 deletions(-) diff --git a/src/tim_baybe-inhibitor.ipynb b/src/tim_baybe-inhibitor.ipynb index 67fc765..4563a7d 100644 --- a/src/tim_baybe-inhibitor.ipynb +++ b/src/tim_baybe-inhibitor.ipynb @@ -37,7 +37,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n", + "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n", " warnings.warn(\n", "/home/vscode/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" @@ -197,13 +197,6 @@ "efficiency_min, efficiency_max = df_AA2024.Efficiency.min(), df_AA2024.Efficiency.max()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -213,34 +206,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import sys\n", - "sys.path.append('../utils')\n", - "from subsampling import random_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(50, 6)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "random_subsample(df_AA2024, 50).shape" - ] + "source": [] }, { "cell_type": "markdown", @@ -251,169 +220,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SMILESTime_hpHInhib_Concentrat_MSalt_Concentrat_M
0COCCOC(=O)OCSc1nc2c(s1)cccc224.04.00.00100.10
1COCCOC(=O)OCSc1nc2c(s1)cccc224.010.00.00100.10
2Cc1ccc(c(c1)n1nc2c(n1)cccc2)O24.04.00.00100.10
3Cc1ccc(c(c1)n1nc2c(n1)cccc2)O24.010.00.00100.10
4Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O24.04.00.00100.10
..................
606S=c1sc2c([nH]1)cccc224.07.00.00050.05
607C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O24.07.00.00050.05
608C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O24.07.00.00050.05
609C(=O)(C(=O)[O-])[O-]24.07.00.00050.05
610C(=O)(C(=O)[O-])[O-]24.07.00.00050.05
\n", - "

611 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " SMILES Time_h pH \\\n", - "0 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 4.0 \n", - "1 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 10.0 \n", - "2 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 4.0 \n", - "3 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 10.0 \n", - "4 Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O 24.0 4.0 \n", - ".. ... ... ... \n", - "606 S=c1sc2c([nH]1)cccc2 24.0 7.0 \n", - "607 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", - "608 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", - "609 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", - "610 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", - "\n", - " Inhib_Concentrat_M Salt_Concentrat_M \n", - "0 0.0010 0.10 \n", - "1 0.0010 0.10 \n", - "2 0.0010 0.10 \n", - "3 0.0010 0.10 \n", - "4 0.0010 0.10 \n", - ".. ... ... \n", - "606 0.0005 0.05 \n", - "607 0.0005 0.05 \n", - "608 0.0005 0.05 \n", - "609 0.0005 0.05 \n", - "610 0.0005 0.05 \n", - "\n", - "[611 rows x 5 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_AA2024[[\"SMILES\", \"Time_h\", \"pH\", \"Inhib_Concentrat_M\", \"Salt_Concentrat_M\"]]" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", @@ -424,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -489,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -602,7 +412,7 @@ "[611 rows x 388 columns]), continuous=SubspaceContinuous(parameters=[], constraints_lin_eq=[], constraints_lin_ineq=[]))" ] }, - "execution_count": 9, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -613,21 +423,25 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [ { - "ename": "IndexError", - "evalue": "boolean index did not match indexed array along dimension 0; dimension is 611 but corresponding boolean dimension is 921", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m N_DOE_ITERATIONS \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[1;32m 6\u001b[0m N_MC_ITERATIONS \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\n\u001b[0;32m----> 8\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43msimulate_experiment\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mcampaign\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mdf_AA2024\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_doe_iterations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mN_DOE_ITERATIONS\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43mimpute_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mignore\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/baybe/simulation.py:473\u001b[0m, in \u001b[0;36msimulate_experiment\u001b[0;34m(campaign, lookup, batch_size, n_doe_iterations, initial_data, random_seed, impute_mode, noise_percent)\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m impute_mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 472\u001b[0m searchspace \u001b[38;5;241m=\u001b[39m campaign\u001b[38;5;241m.\u001b[39msearchspace\u001b[38;5;241m.\u001b[39mdiscrete\u001b[38;5;241m.\u001b[39mexp_rep\n\u001b[0;32m--> 473\u001b[0m missing_inds \u001b[38;5;241m=\u001b[39m \u001b[43msearchspace\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 474\u001b[0m \u001b[43m \u001b[49m\u001b[43msearchspace\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlookup\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mleft\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindicator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m_merge\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 475\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mleft_only\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 476\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 477\u001b[0m campaign\u001b[38;5;241m.\u001b[39msearchspace\u001b[38;5;241m.\u001b[39mdiscrete\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mloc[\n\u001b[1;32m 478\u001b[0m missing_inds, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdont_recommend\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 479\u001b[0m ] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 481\u001b[0m \u001b[38;5;66;03m# Run the DOE loop\u001b[39;00m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/range.py:1030\u001b[0m, in \u001b[0;36mRangeIndex.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_scalar(key):\n\u001b[1;32m 1024\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\n\u001b[1;32m 1025\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly integers, slices (`:`), \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1026\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mellipsis (`...`), numpy.newaxis (`None`) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1027\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mand integer or boolean \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1028\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marrays are valid indices\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1029\u001b[0m )\n\u001b[0;32m-> 1030\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:5416\u001b[0m, in \u001b[0;36mIndex.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 5407\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 5408\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 5409\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUsing a boolean indexer with length 0 on an Index with \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 5410\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlength greater than 0 is deprecated and will raise in a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5413\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 5414\u001b[0m )\n\u001b[0;32m-> 5416\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mgetitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5417\u001b[0m \u001b[38;5;66;03m# Because we ruled out integer above, we always get an arraylike here\u001b[39;00m\n\u001b[1;32m 5418\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "\u001b[0;31mIndexError\u001b[0m: boolean index did not match indexed array along dimension 0; dimension is 611 but corresponding boolean dimension is 921" + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/vscode/.local/lib/python3.10/site-packages/botorch/models/transforms/outcome.py:304: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at ../aten/src/ATen/native/ReduceOps.cpp:1760.)\n", + " stdvs = Y.std(dim=-2, keepdim=True)\n", + "/home/vscode/.local/lib/python3.10/site-packages/botorch/models/utils/assorted.py:194: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at ../aten/src/ATen/native/ReduceOps.cpp:1760.)\n", + " Ymean, Ystd = torch.mean(Y, dim=-2), torch.std(Y, dim=-2)\n", + "The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.\n", + "The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.\n", + "The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.\n", + "The lookup rows with indexes [297 300 303 306] seem to be duplicates regarding parameter values. Choosing a random one.\n", + "Input row with index 297 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.\n", + "Input row with index 300 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.\n", + "Input row with index 303 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.\n", + "Input row with index 306 has multiple matches with the search space. This could indicate that something went wrong. Matching only first occurrence.\n" ] } ], @@ -644,8 +458,131 @@ " df_AA2024,\n", " batch_size=BATCH_SIZE,\n", " n_doe_iterations=N_DOE_ITERATIONS,\n", - " impute_mode=\"ignore\",\n", - ")" + " impute_mode=\"best\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IterationNum_ExperimentsEfficiency_MeasurementsEfficiency_IterBestEfficiency_CumBest
001[60.35]60.3560.35
115[40.0, 73.0, 40.0, 43.0]73.0073.00
226[78.26]78.2678.26
337[89.68]89.6889.68
448[50.0]50.0089.68
\n", + "
" + ], + "text/plain": [ + " Iteration Num_Experiments Efficiency_Measurements Efficiency_IterBest \\\n", + "0 0 1 [60.35] 60.35 \n", + "1 1 5 [40.0, 73.0, 40.0, 43.0] 73.00 \n", + "2 2 6 [78.26] 78.26 \n", + "3 3 7 [89.68] 89.68 \n", + "4 4 8 [50.0] 50.00 \n", + "\n", + " Efficiency_CumBest \n", + "0 60.35 \n", + "1 73.00 \n", + "2 78.26 \n", + "3 89.68 \n", + "4 89.68 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_AA2024.Efficiency.max()" ] }, {