AC-BO-Hackathon
diff --git a/‎baybe_hack.ipynb
Lines changed: 112 additions & 68 deletions b/‎baybe_hack.ipynb
Lines changed: 112 additions & 68 deletions
diff --git a/‎baybe-inhibitor.ipynb renamed to ‎michael-baybe-inhibitor.ipynb b/‎baybe-inhibitor.ipynb renamed to ‎michael-baybe-inhibitor.ipynb
@@ -9,22 +9,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "           Time_h          pH  Inhib_Concentrat_M   Efficiency\n",
-      "count  611.000000  611.000000          611.000000   611.000000\n",
-      "mean   135.801964    6.342062            0.006808    26.736841\n",
-      "std    201.683867    2.529080            0.014059   288.788317\n",
-      "min      0.500000    0.000000            0.000010 -4834.000000\n",
-      "25%     24.000000    4.000000            0.000500    30.000000\n",
-      "50%     24.000000    7.000000            0.001000    58.000000\n",
-      "75%    144.000000    7.000000            0.003000    87.950000\n",
-      "max    672.000000   10.000000            0.100000   100.000000\n"
+      "           Time_h          pH  Inhib_Concentrat_M  Salt_Concentrat_M  \\\n",
+      "count  611.000000  611.000000          611.000000         611.000000   \n",
+      "mean   135.801964    6.342062            0.006808           0.145450   \n",
+      "std    201.683867    2.529080            0.014059           0.200575   \n",
+      "min      0.500000    0.000000            0.000010           0.000000   \n",
+      "25%     24.000000    4.000000            0.000500           0.010000   \n",
+      "50%     24.000000    7.000000            0.001000           0.100000   \n",
+      "75%    144.000000    7.000000            0.003000           0.100000   \n",
+      "max    672.000000   10.000000            0.100000           0.600000   \n",
+      "\n",
+      "        Efficiency  \n",
+      "count   611.000000  \n",
+      "mean     26.736841  \n",
+      "std     288.788317  \n",
+      "min   -4834.000000  \n",
+      "25%      30.000000  \n",
+      "50%      58.000000  \n",
+      "75%      87.950000  \n",
+      "max     100.000000  \n"
      ]
     }
    ],
@@ -38,7 +48,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -52,12 +62,12 @@
       "3            Cc1ccc(c(c1)n1nc2c(n1)cccc2)O    24.0  10.0               0.001   \n",
       "4  Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O    24.0   4.0               0.001   \n",
       "\n",
-      "   Efficiency  \n",
-      "0         0.0  \n",
-      "1         0.0  \n",
-      "2        30.0  \n",
-      "3        30.0  \n",
-      "4        30.0  \n"
+      "   Salt_Concentrat_M  Efficiency  \n",
+      "0                0.1         0.0  \n",
+      "1                0.1         0.0  \n",
+      "2                0.1        30.0  \n",
+      "3                0.1        30.0  \n",
+      "4                0.1        30.0  \n"
      ]
     }
    ],
@@ -74,9 +84,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/vscode/.local/lib/python3.10/site-packages/baybe/telemetry.py:222: UserWarning: WARNING: BayBE Telemetry endpoint https://public.telemetry.baybe.p.uptimize.merckgroup.com:4317 cannot be reached. Disabling telemetry. The exception encountered was: ConnectionError, HTTPConnectionPool(host='verkehrsnachrichten.merck.de', port=80): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPConnection object at 0x7fa2386f7fd0>: Failed to resolve 'verkehrsnachrichten.merck.de' ([Errno -2] Name or service not known)\"))\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "from baybe.targets import NumericalTarget\n",
     "from baybe.objective import Objective\n",
@@ -97,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,20 +160,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "SubstanceParameter(name='Solvent', data={'Water': 'O', '1-Octanol': 'CCCCCCCCO', 'Toluene': 'CC1=CC=CC=C1'}, decorrelate=0.7, encoding=<SubstanceEncoding.MORDRED: 'MORDRED'>)"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from baybe.parameters import SubstanceParameter\n",
     "\n",
@@ -178,22 +186,17 @@
     "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n",
-    "\n",
-    "Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. Still, one could provide experimental measurements or common metrics used to classify polymers:"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "\"\"\"\n",
+    "The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n",
+    "\n",
+    "Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. \n",
+    "Still, one could provide experimental measurements or common metrics used to classify polymers:\n",
     "from baybe.parameters import CustomDiscreteParameter\n",
     "\n",
     "# Create or import new dataframe containing custom descriptors\n",
@@ -216,7 +219,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -250,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -282,12 +285,7 @@
     "        sampling_percentage=0.3, # should be relatively low\n",
     "        allow_repeated_recommendations=False,\n",
     "        allow_recommending_already_measured=False,\n",
-    "    )\n",
-    "\n",
-    "hybrid_recommender = SequentialGreedyRecommender(\n",
-    "    allow_repeated_recommendations=False,\n",
-    "    allow_recommending_already_measured=False\n",
-    ")"
+    "    )"
    ]
   },
   {
@@ -299,7 +297,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -316,12 +314,10 @@
     "from baybe import Campaign\n",
     "\n",
     "strategy = TwoPhaseStrategy(\n",
-    "    initial_recommender = RandomRecommender(),  # Initial recommender, if no training data is available\n",
-    "    # Other initial recommenders don't seem to work for my hybrid search space/set of parameters\n",
-    "    # Doesn't matter since I already have training data\n",
+    "    initial_recommender = RandomRecommender(),  # Initial recommender\n",
+    "    # The initial recommender doesn't matter here since I already have training data, but it can be used for benchmarking\n",
     "    recommender = seq_greedy_recommender,  # Bayesian model-based optimization\n",
-    "    # recommender = hybrid_recommender,\n",
-    "    switch_after=1  # Switch to the model-based recommender after 1 batch or iteration (so the initial training data)\n",
+    "    switch_after=1  # Switch to the model-based recommender after 1 batch = immediately\n",
     ")\n",
     "\n",
     "campaign = Campaign(searchspace, objective, strategy)"
@@ -336,7 +332,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -346,21 +342,23 @@
       "\n",
       "\n",
       "Recommended experiments: \n",
-      "|       |   Time (h) |   pH |   Salt Concentration (M) |   Inhibitor Concentration (M) |\n",
-      "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n",
-      "| 11808 |          1 |  4.8 |                      1.5 |                    0.00858356 |\n"
+      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |\n",
+      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|\n",
+      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |\n",
+      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |\n",
+      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |\n"
      ]
     }
    ],
    "source": [
-    "new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance\n",
+    "new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
     "print(\"\\n\\nRecommended experiments: \")\n",
     "print(new_rec.to_markdown())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -370,21 +368,49 @@
       "\n",
       "\n",
       "Recommended experiments with measured values: \n",
-      "|       |   Time (h) |   pH |   Salt Concentration (M) |   Inhibitor Concentration (M) |\n",
-      "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n",
-      "| 11808 |          1 |  4.8 |                      1.5 |                    0.00858356 |\n"
+      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
+      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
+      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
+      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
+      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n"
      ]
     }
    ],
    "source": [
     "# Get and input efficiency value from Excel table, for specific SMILES component first, \n",
     "# then for the closest values of the rest of the parameters\n",
     "\n",
-    "new_rec[\"efficiency\"] = [0.1]\n",
+    "new_rec[\"efficiency\"] = [0.1, 0.2, 0.3]\n",
     "print(\"\\n\\nRecommended experiments with measured values: \")\n",
     "print(new_rec.to_markdown())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Recommended experiments: \n",
+      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
+      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
+      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
+      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
+      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n"
+     ]
+    }
+   ],
+   "source": [
+    "new_new_rec = campaign.recommend(batch_size=3) # TEST with different batch sizes for optimal performance\n",
+    "print(\"\\n\\nRecommended experiments: \")\n",
+    "print(new_new_rec.to_markdown())"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -394,9 +420,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "All experiments with measured values: \n",
+      "|         |   Time (h) |   pH |   Inhibitor Concentration (M) |   Salt Concentration (M) |   efficiency |\n",
+      "|--------:|-----------:|-----:|------------------------------:|-------------------------:|-------------:|\n",
+      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |          0.1 |\n",
+      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |          0.2 |\n",
+      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |          0.3 |\n",
+      "| 4924793 |         16 |  2.5 |                          0.01 |                     0.92 |        nan   |\n",
+      "| 6006943 |         19 |  8   |                          0.05 |                     0.58 |        nan   |\n",
+      "| 6994486 |         22 |  8.8 |                          0.08 |                     0.88 |        nan   |\n"
+     ]
+    }
+   ],
    "source": [
     "results = pd.concat([new_rec, new_new_rec]) # etc.\n",
     "print(\"\\n\\nAll experiments with measured values: \")\n",