From fa0c6cdb4364f6bce7a60a3b3f26bdad51c06d88 Mon Sep 17 00:00:00 2001
From: Michael <165048583+lordmikerahl@users.noreply.github.com>
Date: Wed, 27 Mar 2024 19:05:04 +0000
Subject: [PATCH] Restructure inhibitor notebook: clear outputs, reorder sections, build SMILES dict

---
 michalis-baybe-inhibitor.ipynb | 401 +++++++--------------------------
 1 file changed, 78 insertions(+), 323 deletions(-)

diff --git a/michalis-baybe-inhibitor.ipynb b/michalis-baybe-inhibitor.ipynb
index c7ffb88..bb07df0 100644
--- a/michalis-baybe-inhibitor.ipynb
+++ b/michalis-baybe-inhibitor.ipynb
@@ -36,11 +36,10 @@
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
-    "from baybe import Campaign\n",
     "\n",
     "df_AA2024 = pd.read_excel('data/filtered_AA2024.xlsx')\n",
-    "df_AA1000 = pd.read_excel('data/filtered_AA1000.xlsx')\n",
-    "df_Al = pd.read_excel('data/filtered_Al.xlsx')"
+    "# df_AA1000 = pd.read_excel('data/filtered_AA1000.xlsx')\n",
+    "# df_Al = pd.read_excel('data/filtered_Al.xlsx')"
    ]
   },
   {
@@ -48,22 +47,8 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
    "source": [
-    "# Data Processing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Can is the best"
+    "print(df_AA2024.describe())"
    ]
   },
   {
@@ -72,28 +57,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Can is the best"
+    "print(df_AA2024.head())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Data Anaylsis"
+    "# Data Processing"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Bayesian Optimization"
+    "### Extract all unique SMILES values into dictionary"
    ]
   },
   {
@@ -101,41 +79,20 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
    "source": [
-    "## Search Space"
+    "unique_SMILES = df_AA2024.SMILES.unique()\n",
+    "\n",
+    "def list_to_dict(input_list):\n",
+    "    return {item: item for item in input_list}\n",
+    "\n",
+    "smiles_dict =list_to_dict(unique_SMILES)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Objective"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Recommender"
+    "# Data Analysis"
    ]
   },
   {
@@ -149,21 +106,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Benchmarking"
+    "# Bayesian Optimization"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Transfer Learning"
+    "## Search Space"
    ]
   },
   {
@@ -171,148 +121,6 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "           Time_h          pH  Inhib_Concentrat_M  Salt_Concentrat_M  \\\n",
-      "count  611.000000  611.000000          611.000000         611.000000   \n",
-      "mean   135.801964    6.342062            0.006808           0.145450   \n",
-      "std    201.683867    2.529080            0.014059           0.200575   \n",
-      "min      0.500000    0.000000            0.000010           0.000000   \n",
-      "25%     24.000000    4.000000            0.000500           0.010000   \n",
-      "50%     24.000000    7.000000            0.001000           0.100000   \n",
-      "75%    144.000000    7.000000            0.003000           0.100000   \n",
-      "max    672.000000   10.000000            0.100000           0.600000   \n",
-      "\n",
-      "        Efficiency  \n",
-      "count   611.000000  \n",
-      "mean     26.736841  \n",
-      "std     288.788317  \n",
-      "min   -4834.000000  \n",
-      "25%      30.000000  \n",
-      "50%      58.000000  \n",
-      "75%      87.950000  \n",
-      "max     100.000000  \n"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "\n",
-    "df_AA2024 = pd.read_excel('data/filtered_AA2024.xlsx')\n",
-    "print(df_AA2024.describe())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                                    SMILES  Time_h    pH  Inhib_Concentrat_M  \\\n",
-      "0             COCCOC(=O)OCSc1nc2c(s1)cccc2    24.0   4.0               0.001   \n",
-      "1             COCCOC(=O)OCSc1nc2c(s1)cccc2    24.0  10.0               0.001   \n",
-      "2            Cc1ccc(c(c1)n1nc2c(n1)cccc2)O    24.0   4.0               0.001   \n",
-      "3            Cc1ccc(c(c1)n1nc2c(n1)cccc2)O    24.0  10.0               0.001   \n",
-      "4  Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O    24.0   4.0               0.001   \n",
-      "\n",
-      "   Salt_Concentrat_M  Efficiency  \n",
-      "0                0.1         0.0  \n",
-      "1                0.1         0.0  \n",
-      "2                0.1        30.0  \n",
-      "3                0.1        30.0  \n",
-      "4                0.1        30.0  \n"
-     ]
-    }
-   ],
-   "source": [
-    "print(df_AA2024.head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Set targets/objectives = efficiency for now"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPConnection object at 0x7fa2386f7fd0>: Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
-   "source": [
-    "from baybe.targets import NumericalTarget\n",
-    "from baybe.objective import Objective\n",
-    "\n",
-    "target = NumericalTarget(\n",
-    "    name=\"Efficiency\",\n",
-    "    mode=\"MAX\",\n",
-    ")\n",
-    "objective = Objective(mode=\"SINGLE\", targets=[target])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Search Space"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "from baybe.parameters import NumericalDiscreteParameter, NumericalContinuousParameter\n",
     "from baybe.searchspace import SearchSpace\n",
@@ -341,17 +149,6 @@
     "]"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Substance parameter**\n",
-    "\n",
-    "Instead of values, this parameter accepts data in form of a dictionary. The items correspond to pairs of labels and SMILES. SMILES are string-based representations of molecular structures. Based on these, BayBE can assign each label a set of molecular descriptors as encoding.\n",
-    "\n",
-    "For instance, a parameter corresponding to a choice of solvents can be initialized with:"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -360,14 +157,11 @@
    "source": [
     "from baybe.parameters import SubstanceParameter\n",
     "\n",
+    "encoding_choice = [\"MORDRED\", \"RDKIT\", \"MORGAN_FP\"]\n",
+    "\n",
     "SubstanceParameter(\n",
-    "    name=\"SMILES\",\n",
-    "    data={\n",
-    "        # INCORPORATE TRAINING DATA FROM DATAFRAME SMILES COLUMN\n",
-    "        \"Water\": \"O\",\n",
-    "        \"1-Octanol\": \"CCCCCCCCO\",\n",
-    "        \"Toluene\": \"CC1=CC=CC=C1\",\n",
-    "    },\n",
+    "    name=\"Inhibitor\",\n",
+    "    data=smiles_dict,\n",
     "    encoding=\"MORDRED\",  # Can be also RDKIT or MORGAN_FP - WHICH IS BETTER?\n",
     "    decorrelate=0.7,  # Change threshold to avoid overfitting?\n",
     ")"
@@ -380,6 +174,13 @@
     "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Custom descriptors"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -413,7 +214,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -424,30 +225,35 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Recommenders"
+    "## Objective"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "The **SequentialGreedyRecommender** is a powerful recommender that leverages BoTorch optimization functions to perform sequential Greedy optimization. It can be applied for discrete, continuous and hybrid sarch spaces. It is an implementation of the BoTorch optimization functions for discrete, continuous and mixed spaces. **It is important to note that this recommender performs a brute-force search when applied in hybrid search spaces, as it optimizes the continuous part of the space while exhaustively searching choices in the discrete subspace.** You can customize this behavior to only sample a certain percentage of the discrete subspace via the sample_percentage attribute and to choose different sampling strategies via the hybrid_sampler attribute. \n",
+    "from baybe.targets import NumericalTarget\n",
+    "from baybe.objective import Objective\n",
     "\n",
-    "e.g.\n",
-    "strategy = TwoPhaseStrategy(recommender=SequentialGreedyRecommender(hybrid_sampler=\"Farthest\", sampling_percentage=0.3))"
+    "target = NumericalTarget(\n",
+    "    name=\"Efficiency\",\n",
+    "    mode=\"MAX\",\n",
+    ")\n",
+    "objective = Objective(mode=\"SINGLE\", targets=[target])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For implementing fully customized surrogate models e.g. from sklearn or PyTorch, see:\n",
-    "https://emdgroup.github.io/baybe/examples/Custom_Surrogates/Custom_Surrogates.html\n"
+    "## Recommender"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -486,23 +292,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Campaign Strategy"
+    "# Campaign"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/vscode/.local/lib/python3.10/site-packages/baybe/strategies/deprecation.py:26: DeprecationWarning: 'TwoPhaseStrategy' is deprecated and will be removed in a future version. Please use 'recommenders.TwoPhaseMetaRecommender' class instead.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "from baybe.strategies import TwoPhaseStrategy\n",
     "from baybe import Campaign\n",
@@ -526,24 +323,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Recommended experiments: \n",
-      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |\n",
-      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|\n",
-      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |\n",
-      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |\n",
-      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
     "print(\"\\n\\nRecommended experiments: \")\n",
@@ -552,24 +334,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Recommended experiments with measured values: \n",
-      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
-      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
-      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
-      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
-      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# Get and input efficiency value from Excel table, for specific SMILES component first, \n",
     "# then for the closest values of the rest of the parameters\n",
@@ -588,24 +355,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "Recommended experiments: \n",
-      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
-      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
-      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
-      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
-      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "new_new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
     "print(\"\\n\\nRecommended experiments: \")\n",
@@ -621,27 +373,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "All experiments with measured values: \n",
-      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
-      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
-      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
-      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
-      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n",
-      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |        nan   |\n",
-      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |        nan   |\n",
-      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |        nan   |\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "results = pd.concat([new_rec, new_new_rec]) # etc.\n",
     "print(\"\\n\\nAll experiments with measured values: \")\n",
@@ -652,7 +386,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Transfer learning + Initial Data INFO"
+    "# Benchmarking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Transfer Learning"
    ]
   },
   {
@@ -672,6 +420,13 @@
     "\n",
     "https://emdgroup.github.io/baybe/examples/Backtesting/full_lookup.html"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://emdgroup.github.io/baybe/userguide/simulation.html"
+   ]
   }
  ],
  "metadata": {