From db3e4b5bd5d30ea3aef2afc6accc79a26536328e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20W=C3=BCrger?= <44372393+koerper@users.noreply.github.com> Date: Wed, 27 Mar 2024 20:35:43 +0000 Subject: [PATCH] Add bullocks --- src/tim_baybe-inhibitor.ipynb | 600 +++++++++++++++++++++++++++++----- 1 file changed, 525 insertions(+), 75 deletions(-) diff --git a/src/tim_baybe-inhibitor.ipynb b/src/tim_baybe-inhibitor.ipynb index 3213fad..d4fd932 100644 --- a/src/tim_baybe-inhibitor.ipynb +++ b/src/tim_baybe-inhibitor.ipynb @@ -30,9 +30,20 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n", + " warnings.warn(\n", + "/home/vscode/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -45,7 +56,22 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def random_subsample(df, num_samples):\n", + " np.random.seed(42)\n", + " indices = np.random.choice(df.index, num_samples, replace=False)\n", + " subsampled_df = df.loc[indices]\n", + " return subsampled_df " + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -143,7 +169,7 @@ "4 0.1 30.0 " ] }, - "execution_count": 15, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -152,6 +178,50 @@ "df_AA2024.head()" ] }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "unique_smiles = df_AA2024.SMILES.unique()\n", + "unique_times = df_AA2024.Time_h.unique()\n", + "unique_pH = df_AA2024.pH.unique()\n", + "unique_inhib_conc = df_AA2024.Inhib_Concentrat_M.unique()\n", + "unique_salt_conc = df_AA2024.Salt_Concentrat_M.unique()\n", + "\n", + "time_min, time_max = df_AA2024.Time_h.min(), df_AA2024.Time_h.max()\n", + "pH_min, pH_max = df_AA2024.pH.min(), df_AA2024.pH.max()\n", + "inhib_conc_min, inhib_conc_max = df_AA2024.Inhib_Concentrat_M.min(), df_AA2024.Inhib_Concentrat_M.max()\n", + "salt_conc_min, salt_conc_max = df_AA2024.Salt_Concentrat_M.min(), df_AA2024.Salt_Concentrat_M.max()\n", + "efficiency_min, efficiency_max = df_AA2024.Efficiency.min(), df_AA2024.Efficiency.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../utils')\n", + "from subsampling import random_subsample" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -160,65 +230,7 @@ { "data": { "text/plain": [ - "array(['COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O',\n", - " 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O', 'On1nnc2c1cccc2',\n", - " 'c1ncn[nH]1', 'Sc1n[nH]cn1', 'S[C]1NC2=C[CH]C=NC2=N1',\n", - " 'S=c1[nH]c2c([nH]1)nccn2', 'Sc1ncc[nH]1',\n", - " 'C=CC(=O)OCCOC(=O)OCCSc1ncccn1', 'CCSc1nnc(s1)N', 'CSc1nnc(s1)N',\n", - " 'Cc1ccc2c(c1)nc([nH]2)S', 'OC(=O)CS', 'Sc1nc2c([nH]1)cccc2',\n", - " 'OC(=O)c1ccccc1S', 'S=c1sc2c([nH]1)cccc2', 'OC(=O)c1cccnc1S',\n", - " 'Sc1ncccn1', 'c1ccc(nc1)c1ccccn1', 'Sc1nnc(s1)S',\n", - " 'Nc1cc(S)nc(n1)N', 'Nc1nc([nH]n1)C(=O)O', 'Nc1n[nH]cn1',\n", - " 'OC(=O)c1n[nH]c(n1)N', 'Nc1n[nH]c(n1)S', 'CS[C]1N[N]C(=N1)N',\n", - " 'C1=CC(=CC(=C1)S)C(=O)O', 'OC(=O)CCS', 'Oc1ccccc1c1nnc([nH]1)S',\n", - " 'Nn1cnnc1', 'Nc1ccnc(n1)S', 'Nn1c(NN)nnc1S', 'Nn1c(S)nnc1c1ccccc1',\n", - " 'Sc1nc(N)c2c(n1)[nH]nc2', 'Oc1ccc(cc1)C(=O)O', 'OC(=O)c1ccc(cc1)S',\n", - " 'Cn1cnnc1S', 'Sc1nc(N)c(c(n1)S)N', 'Nc1ncncc1N', 'Nc1cc(N)nc(n1)S',\n", - " 'Cc1cc(C)nc(n1)S', 'Clc1cccc(c1)c1n[nH]c(=S)[nH]1',\n", - " 'COc1cccc(c1)c1n[nH]c(=S)[nH]1', 'Clc1ccc(cc1Cl)c1n[nH]c(=S)[nH]1',\n", - " 'c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]', 'S=c1[nH]nc([nH]1)c1ccco1',\n", - " 'S=c1[nH]nc([nH]1)c1cccnc1', 'S=c1[nH]nc([nH]1)c1ccncc1',\n", - " 'Nc1n[nH]c(=S)s1', 'Cc1nsc(c1)N', 'Clc1ccc2c(c1)[nH]c(n2)S',\n", - " 'CCOc1ccc2c(c1)nc([nH]2)S', 'Cn1nnnc1S', 'OC(=O)Cn1nnnc1S',\n", - " 'COc1ccc2c(c1)[nH]c(=S)[nH]2', 'Cc1n[nH]c(=S)s1',\n", - " 'ClC([C]1N[N]C=N1)(Cl)Cl', 'Clc1cc2[nH]c(=S)[nH]c2cc1Cl',\n", - " 'CSc1[nH]c2c(n1)cc(c(c2)C)C', 'Nc1ccc2c(c1)sc(=S)[nH]2',\n", - " 'OC(=O)c1ccc(=S)[nH]c1', 'Oc1cccc2c1nccc2',\n", - " 'S=c1[nH]c2c([nH]1)c(=O)n(cn2)C', 'S=c1[nH]c2c([nH]1)cncn2',\n", - " 'CC(=O)O', 'OC(=O)CCCCC(=O)O', 'OC(=O)c1ccccc1',\n", - " 'c1ccc2c(c1)[nH]nn2', 'OC(=O)c1ccc(cc1)c1ccccc1',\n", - " 'OC(=O)/C=C/c1ccccc1', 'C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O',\n", - " 'O[C@H]1C(=O)OCC1(C)C', 'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O',\n", - " 'OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O', 'CC(=O)SSC(=O)C',\n", - " 'CCCCOP(=O)(OCCCC)O', 'CCN(C(=S)S)CC', 'O/N=C(/C(=N/O)/C)\\\\C',\n", - " 'CCCCCCCCCCCCc1ccccc1S([O])([O])O', 'CCCCCCCCCCCCOS(=O)(=O)O',\n", - " 'OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O',\n", - " 'O/N=C(\\\\C(=N/O)\\\\c1ccco1)/c1ccco1',\n", - " 'OC[C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O', 'OCC(CO)O',\n", - " 'NCC(=O)O', 'OC(=O)CCCCCCCCCCCCCCC(=O)O', 'C1N2CN3CN1CN(C2)C3',\n", - " 'NO', 'COC(=O)CCCC1=CNC2=CC=CC=C21', 'OC(=O)c1ccncc1',\n", - " 'C1COCCN1CCCS(=O)(=O)O', 'OC(=O)c1cccnc1',\n", - " 'CCCCCCCC/C=C\\\\CCCCCCCC(=O)O', 'C(=O)(C(=O)[O-])[O-]',\n", - " 'OC(=O)c1ccc(cc1)N', 'Oc1ccc(cc1)S([O])([O])O', 'OC(=O)c1ccccn1',\n", - " 'OC(=O)c1ccccc1O', 'CCCCCCCCCCCCCCCCCC(=O)O', 'SC#N',\n", - " 'C1=CC(=C(C=C1SSC2=CC(=C(C=C2)[N+](=O)[O-])C(=O)O)C(=O)O)[N+](=O)[O-]',\n", - " '[O-]S(=O)[O-].[Na+].[Na+]', 'CCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]',\n", - " 'CCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]',\n", - " 'CCCCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]',\n", - " 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C',\n", - " 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',\n", - " 'N.N.[N+](=O)(O)[O-].[N+](=O)(O)[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].O.O.O.O.[Ce+3]',\n", - " '[NH4+].[NH4+].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+4]',\n", - " '[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+3]',\n", - " '[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Ce+3].[Ce+3]',\n", - " '[Cl-].[Cl-].[Cl-].[Ce+3]', 'CNCC(C1=CC(=CC=C1)O)O',\n", - " 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]',\n", - " 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]',\n", - " 'C1=CC=C(C(=C1)C=NNC(=S)N)O', 'C1=CC(=C(C=C1O)O)C=NNC(=S)N',\n", - " 'NC(=S)NN=CC1=C(C(=C(C=C1)O)O)O',\n", - " 'CCCCN(CCCC)C1=NC(=NC(=N1)NC(CCSC)C(=O)O)NC(CCSC)C(=O)O',\n", - " 'C1=CC2=NNN=C2C=C1Cl', 'O=C([O-])C(O)C(O)C(O)C(O)CO.[Na+]',\n", - " 'COC(=O)n1nnc2ccccc12'], dtype=object)" + "(50, 6)" ] }, "execution_count": 8, @@ -227,42 +239,466 @@ } ], "source": [ - "df_AA2024.SMILES.unique()" + "random_subsample(df_AA2024, 50).shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Data Processing" + "# Data Anaylsis" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SMILESTime_hpHInhib_Concentrat_MSalt_Concentrat_M
0COCCOC(=O)OCSc1nc2c(s1)cccc224.04.00.00100.10
1COCCOC(=O)OCSc1nc2c(s1)cccc224.010.00.00100.10
2Cc1ccc(c(c1)n1nc2c(n1)cccc2)O24.04.00.00100.10
3Cc1ccc(c(c1)n1nc2c(n1)cccc2)O24.010.00.00100.10
4Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O24.04.00.00100.10
..................
606S=c1sc2c([nH]1)cccc224.07.00.00050.05
607C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O24.07.00.00050.05
608C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O24.07.00.00050.05
609C(=O)(C(=O)[O-])[O-]24.07.00.00050.05
610C(=O)(C(=O)[O-])[O-]24.07.00.00050.05
\n", + "

611 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " SMILES Time_h pH \\\n", + "0 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 4.0 \n", + "1 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 10.0 \n", + "2 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 4.0 \n", + "3 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 10.0 \n", + "4 Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O 24.0 4.0 \n", + ".. ... ... ... \n", + "606 S=c1sc2c([nH]1)cccc2 24.0 7.0 \n", + "607 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", + "608 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", + "609 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", + "610 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", + "\n", + " Inhib_Concentrat_M Salt_Concentrat_M \n", + "0 0.0010 0.10 \n", + "1 0.0010 0.10 \n", + "2 0.0010 0.10 \n", + "3 0.0010 0.10 \n", + "4 0.0010 0.10 \n", + ".. ... ... \n", + "606 0.0005 0.05 \n", + "607 0.0005 0.05 \n", + "608 0.0005 0.05 \n", + "609 0.0005 0.05 \n", + "610 0.0005 0.05 \n", + "\n", + "[611 rows x 5 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_AA2024[[\"SMILES\", \"Time_h\", \"pH\", \"Inhib_Concentrat_M\", \"Salt_Concentrat_M\"]]" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Data Anaylsis" + "# Bayesian Optimization" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "ename": "AttributeError", + "evalue": "type object 'SubstanceParameter' has no attribute 'name'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[36], line 36\u001b[0m\n\u001b[1;32m 23\u001b[0m parameters \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 24\u001b[0m SubstanceParameter(\n\u001b[1;32m 25\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSolvent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;66;03m# NumericalDiscreteParameter(name=\"salt_conc\", values=unique_salt_conc),\u001b[39;00m\n\u001b[1;32m 34\u001b[0m ]\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# searchspace = SubspaceDiscrete.from_product(parameters=parameters)\u001b[39;00m\n\u001b[0;32m---> 36\u001b[0m searchspace \u001b[38;5;241m=\u001b[39m \u001b[43mSubspaceDiscrete\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_dataframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_AA2024\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mSMILES\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTime_h\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpH\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mInhib_Concentrat_M\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mSalt_Concentrat_M\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mSubstanceParameter\u001b[49m\u001b[43m,\u001b[49m\u001b[43mNumericalDiscreteParameter\u001b[49m\u001b[43m,\u001b[49m\u001b[43mNumericalDiscreteParameter\u001b[49m\u001b[43m,\u001b[49m\u001b[43mNumericalDiscreteParameter\u001b[49m\u001b[43m,\u001b[49m\u001b[43mNumericalDiscreteParameter\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m campaign \u001b[38;5;241m=\u001b[39m Campaign(\n\u001b[1;32m 39\u001b[0m searchspace\u001b[38;5;241m=\u001b[39msearchspace, \u001b[38;5;66;03m# Required\u001b[39;00m\n\u001b[1;32m 40\u001b[0m objective\u001b[38;5;241m=\u001b[39mobjective, \u001b[38;5;66;03m# Required\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# recommender=recommender, # Optional\u001b[39;00m\n\u001b[1;32m 42\u001b[0m )\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/baybe/searchspace/discrete.py:261\u001b[0m, in \u001b[0;36mSubspaceDiscrete.from_dataframe\u001b[0;34m(cls, df, parameters, empty_encoding)\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CategoricalParameter(name\u001b[38;5;241m=\u001b[39mname, values\u001b[38;5;241m=\u001b[39mvalues)\n\u001b[1;32m 260\u001b[0m \u001b[38;5;66;03m# Get the full list of both explicitly and implicitly defined parameter\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m parameters \u001b[38;5;241m=\u001b[39m \u001b[43mget_parameters_from_dataframe\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 262\u001b[0m \u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdiscrete_parameter_factory\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\n\u001b[1;32m 263\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(parameters\u001b[38;5;241m=\u001b[39mparameters, exp_rep\u001b[38;5;241m=\u001b[39mdf, empty_encoding\u001b[38;5;241m=\u001b[39mempty_encoding)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/baybe/parameters/utils.py:49\u001b[0m, in \u001b[0;36mget_parameters_from_dataframe\u001b[0;34m(df, factory, parameters)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m parameters \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m param \u001b[38;5;129;01min\u001b[39;00m parameters:\n\u001b[0;32m---> 49\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mparam\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m \u001b[38;5;129;01min\u001b[39;00m specified_params:\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou provided several parameters with the name \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mparam\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 52\u001b[0m )\n\u001b[1;32m 53\u001b[0m specified_params[param\u001b[38;5;241m.\u001b[39mname] \u001b[38;5;241m=\u001b[39m param\n", + "\u001b[0;31mAttributeError\u001b[0m: type object 'SubstanceParameter' has no attribute 'name'" + ] + } + ], + "source": [ + "from baybe import Campaign\n", + "\n", + "from baybe.targets import NumericalTarget\n", + "from baybe.objective import Objective\n", + "from baybe.searchspace import SubspaceDiscrete\n", + "from baybe.parameters import NumericalDiscreteParameter\n", + "from baybe.parameters import SubstanceParameter\n", + "\n", + "\n", + "def list_to_dict(input_list):\n", + " return {item: item for item in input_list}\n", + "\n", + "smiles_dict =list_to_dict(unique_smiles)\n", + "\n", + "\n", + "target = NumericalTarget(name=\"Efficiency\", mode=\"MAX\", bounds=(efficiency_min, efficiency_max), transformation=\"LINEAR\")\n", + "objective = Objective(mode=\"SINGLE\", targets=[target])\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "parameters = [\n", + " SubstanceParameter(\n", + " name=\"Solvent\",\n", + " data=smiles_dict,\n", + " encoding=\"MORGAN_FP\", # optional\n", + " decorrelate=0.7, # optional\n", + "),\n", + " # NumericalDiscreteParameter(name=\"time\", values=unique_times),\n", + " # NumericalDiscreteParameter(name=\"pH\", values=unique_pH),\n", + " # NumericalDiscreteParameter(name=\"inhib_conc\", values=unique_inhib_conc),\n", + " # NumericalDiscreteParameter(name=\"salt_conc\", values=unique_salt_conc),\n", + "]\n", + "# searchspace = SubspaceDiscrete.from_product(parameters=parameters)\n", + "searchspace = SubspaceDiscrete.from_dataframe(df_AA2024[[\"SMILES\", \"Time_h\", \"pH\", \"Inhib_Concentrat_M\", \"Salt_Concentrat_M\"]])\n", + "\n", + "campaign = Campaign(\n", + " searchspace=searchspace, # Required\n", + " objective=objective, # Required\n", + " # recommender=recommender, # Optional\n", + ")" + ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 34, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SubspaceDiscrete(parameters=[CategoricalParameter(name='SMILES', _values=('COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O', 'On1nnc2c1cccc2', 'c1ncn[nH]1', 'Sc1n[nH]cn1', 'S[C]1NC2=C[CH]C=NC2=N1', 'S=c1[nH]c2c([nH]1)nccn2', 'Sc1ncc[nH]1', 'C=CC(=O)OCCOC(=O)OCCSc1ncccn1', 'CCSc1nnc(s1)N', 'CSc1nnc(s1)N', 'Cc1ccc2c(c1)nc([nH]2)S', 'OC(=O)CS', 'Sc1nc2c([nH]1)cccc2', 'OC(=O)c1ccccc1S', 'S=c1sc2c([nH]1)cccc2', 'OC(=O)c1cccnc1S', 'Sc1ncccn1', 'c1ccc(nc1)c1ccccn1', 'Sc1nnc(s1)S', 'Nc1cc(S)nc(n1)N', 'Nc1nc([nH]n1)C(=O)O', 'Nc1n[nH]cn1', 'OC(=O)c1n[nH]c(n1)N', 'Nc1n[nH]c(n1)S', 'CS[C]1N[N]C(=N1)N', 'C1=CC(=CC(=C1)S)C(=O)O', 'OC(=O)CCS', 'Oc1ccccc1c1nnc([nH]1)S', 'Nn1cnnc1', 'Nc1ccnc(n1)S', 'Nn1c(NN)nnc1S', 'Nn1c(S)nnc1c1ccccc1', 'Sc1nc(N)c2c(n1)[nH]nc2', 'Oc1ccc(cc1)C(=O)O', 'OC(=O)c1ccc(cc1)S', 'Cn1cnnc1S', 'Sc1nc(N)c(c(n1)S)N', 'Nc1ncncc1N', 'Nc1cc(N)nc(n1)S', 'Cc1cc(C)nc(n1)S', 'Clc1cccc(c1)c1n[nH]c(=S)[nH]1', 'COc1cccc(c1)c1n[nH]c(=S)[nH]1', 'Clc1ccc(cc1Cl)c1n[nH]c(=S)[nH]1', 'c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]', 'S=c1[nH]nc([nH]1)c1ccco1', 'S=c1[nH]nc([nH]1)c1cccnc1', 'S=c1[nH]nc([nH]1)c1ccncc1', 'Nc1n[nH]c(=S)s1', 'Cc1nsc(c1)N', 'Clc1ccc2c(c1)[nH]c(n2)S', 'CCOc1ccc2c(c1)nc([nH]2)S', 'Cn1nnnc1S', 'OC(=O)Cn1nnnc1S', 'COc1ccc2c(c1)[nH]c(=S)[nH]2', 'Cc1n[nH]c(=S)s1', 'ClC([C]1N[N]C=N1)(Cl)Cl', 'Clc1cc2[nH]c(=S)[nH]c2cc1Cl', 'CSc1[nH]c2c(n1)cc(c(c2)C)C', 'Nc1ccc2c(c1)sc(=S)[nH]2', 'OC(=O)c1ccc(=S)[nH]c1', 'Oc1cccc2c1nccc2', 'S=c1[nH]c2c([nH]1)c(=O)n(cn2)C', 'S=c1[nH]c2c([nH]1)cncn2', 'CC(=O)O', 'OC(=O)CCCCC(=O)O', 'OC(=O)c1ccccc1', 'c1ccc2c(c1)[nH]nn2', 'OC(=O)c1ccc(cc1)c1ccccc1', 'OC(=O)/C=C/c1ccccc1', 'C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O', 'O[C@H]1C(=O)OCC1(C)C', 'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O', 'OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O', 'CC(=O)SSC(=O)C', 'CCCCOP(=O)(OCCCC)O', 'CCN(C(=S)S)CC', 'O/N=C(/C(=N/O)/C)\\\\C', 'CCCCCCCCCCCCc1ccccc1S([O])([O])O', 'CCCCCCCCCCCCOS(=O)(=O)O', 'OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O', 'O/N=C(\\\\C(=N/O)\\\\c1ccco1)/c1ccco1', 'OC[C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O', 'OCC(CO)O', 'NCC(=O)O', 'OC(=O)CCCCCCCCCCCCCCC(=O)O', 'C1N2CN3CN1CN(C2)C3', 'NO', 'COC(=O)CCCC1=CNC2=CC=CC=C21', 'OC(=O)c1ccncc1', 'C1COCCN1CCCS(=O)(=O)O', 'OC(=O)c1cccnc1', 'CCCCCCCC/C=C\\\\CCCCCCCC(=O)O', 'C(=O)(C(=O)[O-])[O-]', 'OC(=O)c1ccc(cc1)N', 'Oc1ccc(cc1)S([O])([O])O', 'OC(=O)c1ccccn1', 'OC(=O)c1ccccc1O', 'CCCCCCCCCCCCCCCCCC(=O)O', 'SC#N', 'C1=CC(=C(C=C1SSC2=CC(=C(C=C2)[N+](=O)[O-])C(=O)O)C(=O)O)[N+](=O)[O-]', '[O-]S(=O)[O-].[Na+].[Na+]', 'CCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CCCCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C', 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C', 'N.N.[N+](=O)(O)[O-].[N+](=O)(O)[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].O.O.O.O.[Ce+3]', '[NH4+].[NH4+].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+4]', '[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+3]', '[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Ce+3].[Ce+3]', '[Cl-].[Cl-].[Cl-].[Ce+3]', 'CNCC(C1=CC(=CC=C1)O)O', 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]', 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]', 'C1=CC=C(C(=C1)C=NNC(=S)N)O', 'C1=CC(=C(C=C1O)O)C=NNC(=S)N', 'NC(=S)NN=CC1=C(C(=C(C=C1)O)O)O', 'CCCCN(CCCC)C1=NC(=NC(=N1)NC(CCSC)C(=O)O)NC(CCSC)C(=O)O', 'C1=CC2=NNN=C2C=C1Cl', 'O=C([O-])C(O)C(O)C(O)C(O)CO.[Na+]', 'COC(=O)n1nnc2ccccc12'), encoding=), NumericalDiscreteParameter(name='Time_h', encoding=None, _values=[0.5, 1.0, 2.0, 3.0, 6.0, 24.0, 48.0, 72.0, 96.0, 120.0, 144.0, 168.0, 192.0, 240.0, 288.0, 336.0, 360.0, 384.0, 432.0, 480.0, 528.0, 576.0, 600.0, 624.0, 672.0], tolerance=0.0), NumericalDiscreteParameter(name='pH', encoding=None, _values=[0.0, 3.3, 4.0, 4.4, 5.4, 5.5, 5.6, 7.0, 10.0], tolerance=0.0), NumericalDiscreteParameter(name='Inhib_Concentrat_M', encoding=None, _values=[1e-05, 5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0008, 0.001, 0.0012, 0.0018, 0.0024, 0.003, 0.005, 0.01, 0.011, 0.021, 0.022, 0.031, 0.033, 0.042, 0.044, 0.05, 0.1], tolerance=0.0), NumericalDiscreteParameter(name='Salt_Concentrat_M', encoding=None, _values=[0.0, 0.01, 0.05, 0.1, 0.5, 0.6], tolerance=0.0)], exp_rep= SMILES Time_h pH \\\n", + "0 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 4.0 \n", + "1 COCCOC(=O)OCSc1nc2c(s1)cccc2 24.0 10.0 \n", + "2 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 4.0 \n", + "3 Cc1ccc(c(c1)n1nc2c(n1)cccc2)O 24.0 10.0 \n", + "4 Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O 24.0 4.0 \n", + ".. ... ... ... \n", + "606 S=c1sc2c([nH]1)cccc2 24.0 7.0 \n", + "607 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", + "608 C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 24.0 7.0 \n", + "609 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", + "610 C(=O)(C(=O)[O-])[O-] 24.0 7.0 \n", + "\n", + " Inhib_Concentrat_M Salt_Concentrat_M \n", + "0 0.0010 0.10 \n", + "1 0.0010 0.10 \n", + "2 0.0010 0.10 \n", + "3 0.0010 0.10 \n", + "4 0.0010 0.10 \n", + ".. ... ... \n", + "606 0.0005 0.05 \n", + "607 0.0005 0.05 \n", + "608 0.0005 0.05 \n", + "609 0.0005 0.05 \n", + "610 0.0005 0.05 \n", + "\n", + "[611 rows x 5 columns], metadata= was_recommended was_measured dont_recommend\n", + "0 False False False\n", + "1 False False False\n", + "2 False False False\n", + "3 False False False\n", + "4 False False False\n", + ".. ... ... ...\n", + "606 False False False\n", + "607 False False False\n", + "608 False False False\n", + "609 False False False\n", + "610 False False False\n", + "\n", + "[611 rows x 3 columns], empty_encoding=False, constraints=[], comp_rep= SMILES_COCCOC(=O)OCSc1nc2c(s1)cccc2 \\\n", + "0 1 \n", + "1 1 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "606 0 \n", + "607 0 \n", + "608 0 \n", + "609 0 \n", + "610 0 \n", + "\n", + " SMILES_Cc1ccc(c(c1)n1nc2c(n1)cccc2)O \\\n", + "0 0 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 0 \n", + ".. ... \n", + "606 0 \n", + "607 0 \n", + "608 0 \n", + "609 0 \n", + "610 0 \n", + "\n", + " SMILES_Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O SMILES_On1nnc2c1cccc2 \\\n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 1 0 \n", + ".. ... ... \n", + "606 0 0 \n", + "607 0 0 \n", + "608 0 0 \n", + "609 0 0 \n", + "610 0 0 \n", + "\n", + " SMILES_c1ncn[nH]1 SMILES_Sc1n[nH]cn1 SMILES_S[C]1NC2=C[CH]C=NC2=N1 \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + ".. ... ... ... \n", + "606 0 0 0 \n", + "607 0 0 0 \n", + "608 0 0 0 \n", + "609 0 0 0 \n", + "610 0 0 0 \n", + "\n", + " SMILES_S=c1[nH]c2c([nH]1)nccn2 SMILES_Sc1ncc[nH]1 \\\n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + ".. ... ... \n", + "606 0 0 \n", + "607 0 0 \n", + "608 0 0 \n", + "609 0 0 \n", + "610 0 0 \n", + "\n", + " SMILES_C=CC(=O)OCCOC(=O)OCCSc1ncccn1 ... \\\n", + "0 0 ... \n", + "1 0 ... \n", + "2 0 ... \n", + "3 0 ... \n", + "4 0 ... \n", + ".. ... ... \n", + "606 0 ... \n", + "607 0 ... \n", + "608 0 ... \n", + "609 0 ... \n", + "610 0 ... \n", + "\n", + " SMILES_C1=CC(=C(C=C1O)O)C=NNC(=S)N \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "606 0 \n", + "607 0 \n", + "608 0 \n", + "609 0 \n", + "610 0 \n", + "\n", + " SMILES_NC(=S)NN=CC1=C(C(=C(C=C1)O)O)O \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "606 0 \n", + "607 0 \n", + "608 0 \n", + "609 0 \n", + "610 0 \n", + "\n", + " SMILES_CCCCN(CCCC)C1=NC(=NC(=N1)NC(CCSC)C(=O)O)NC(CCSC)C(=O)O \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "606 0 \n", + "607 0 \n", + "608 0 \n", + "609 0 \n", + "610 0 \n", + "\n", + " SMILES_C1=CC2=NNN=C2C=C1Cl SMILES_O=C([O-])C(O)C(O)C(O)C(O)CO.[Na+] \\\n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + ".. ... ... \n", + "606 0 0 \n", + "607 0 0 \n", + "608 0 0 \n", + "609 0 0 \n", + "610 0 0 \n", + "\n", + " SMILES_COC(=O)n1nnc2ccccc12 Time_h pH Inhib_Concentrat_M \\\n", + "0 0 24.0 4.0 0.0010 \n", + "1 0 24.0 10.0 0.0010 \n", + "2 0 24.0 4.0 0.0010 \n", + "3 0 24.0 10.0 0.0010 \n", + "4 0 24.0 4.0 0.0010 \n", + ".. ... ... ... ... \n", + "606 0 24.0 7.0 0.0005 \n", + "607 0 24.0 7.0 0.0005 \n", + "608 0 24.0 7.0 0.0005 \n", + "609 0 24.0 7.0 0.0005 \n", + "610 0 24.0 7.0 0.0005 \n", + "\n", + " Salt_Concentrat_M \n", + "0 0.10 \n", + "1 0.10 \n", + "2 0.10 \n", + "3 0.10 \n", + "4 0.10 \n", + ".. ... \n", + "606 0.05 \n", + "607 0.05 \n", + "608 0.05 \n", + "609 0.05 \n", + "610 0.05 \n", + "\n", + "[611 rows x 127 columns])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Bayesian Optimization" + "searchspace" ] }, { @@ -270,7 +706,21 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "results = simulate_scenarios(\n", + " # Necessary\n", + " campaign=campaign,\n", + " # Technically optional but should always be set\n", + " lookup=lookup,\n", + " # Optional\n", + " batch_size=batch_size,\n", + " n_doe_iterations=n_doe_iterations,\n", + " initial_data=initial_data,\n", + " random_seed=random_seed,\n", + " impute_mode=impute_mode,\n", + " noise_percent=noise_percent,\n", + ")" + ] }, { "cell_type": "markdown",