diff --git a/baybe_hack.ipynb b/baybe_hack.ipynb index 638325b..a137079 100644 --- a/baybe_hack.ipynb +++ b/baybe_hack.ipynb @@ -108,16 +108,16 @@ "NumericalDiscreteParameter(\n", " name=\"Time (h)\",\n", " values=np.arange(1, 25, 1)\n", - " # tolerance = 0.004\n", + " # tolerance = 0.004, assume certain experimental noise for each parameter measurement?\n", "),\n", "NumericalDiscreteParameter(\n", " name=\"pH\",\n", " values=np.arange(-1, 15.1, 0.1)\n", " # tolerance = 0.004\n", " ), \n", - "NumericalContinuousParameter( # Set this as continuous, the values seem quite small?\n", + "NumericalDiscreteParameter( # Set this as continuous, the values seem quite small?\n", " name=\"Inhibitor Concentration (M)\",\n", - " bounds=(0, 0.02)\n", + " values=np.arange(0, 0.1, 0.01), # Remove data outliers like 0.1?\n", " # tolerance = 0.004\n", " ),\n", "NumericalDiscreteParameter(\n", @@ -161,13 +161,13 @@ "SubstanceParameter(\n", " name=\"SMILES\",\n", " data={\n", - " # df_AA2024 SMILES column\n", + " # INCORPORATE TRAINING DATA FROM DATAFRAME SMILES COLUMN\n", " \"Water\": \"O\",\n", " \"1-Octanol\": \"CCCCCCCCO\",\n", " \"Toluene\": \"CC1=CC=CC=C1\",\n", " },\n", - " encoding=\"MORDRED\", # optional\n", - " decorrelate=0.7, # optional\n", + " encoding=\"MORDRED\", # Can be also RDKIT or MORGAN_FP - WHICH IS BETTER?\n", + " decorrelate=0.7, # Change threshold to avoid overfitting?\n", ")" ] }, @@ -175,19 +175,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "The encoding option defines what kind of descriptors are calculated:\n", - "\n", - "MORDRED: 2D descriptors from the Mordred package. Since the original package is now unmaintained, baybe requires the community replacement mordredcommunity\n", - "\n", - "RDKIT: 2D descriptors from the RDKit package\n", - "\n", - "MORGAN_FP: Morgan fingerprints calculated with RDKit (1024 bits, radius 4)\n", - "\n", - "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data.\n", - "\n", - "**WARNING:**\n", - "The descriptors calculated for a SubstanceParameter were developed to describe small molecules and are not suitable for other substances. If you deal with large molecules like polymers or arbitrary substance mixtures, we recommend to provide your own descriptors via the CustomParameter." + "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data." ] }, { @@ -201,23 +189,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CustomDiscreteParameter(name='Polymer', data= Glass_Transition_TempC Weight_kDalton\n", - "Polymer A 20 120\n", - "Polymer B -71 32\n", - "Polymer C -39 241, decorrelate=True, encoding=)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "\"\"\"\n", "from baybe.parameters import CustomDiscreteParameter\n", @@ -353,9 +327,16 @@ "campaign = Campaign(searchspace, objective, strategy)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get recommendations" + ] + }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -365,23 +346,21 @@ "\n", "\n", "Recommended experiments: \n", - "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n", - "|-------:|-----------:|-----:|-------------------------:|------------------------------:|\n", - "| 492324 | 16 | 2.4 | 0.75 | 0.0150329 |\n", - "| 341299 | 11 | 7.8 | 0.01 | 0.00234041 |\n", - "| 470340 | 15 | 7.6 | 0 | 0.00838565 |\n" + "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n", + "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n", + "| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n" ] } ], "source": [ - "new_rec = campaign.recommend(batch_size=3)\n", + "new_rec = campaign.recommend(batch_size=1) # TEST with different batch sizes for optimal performance\n", "print(\"\\n\\nRecommended experiments: \")\n", "print(new_rec.to_markdown())" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -391,27 +370,26 @@ "\n", "\n", "Recommended experiments with measured values: \n", - "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) | efficiency |\n", - "|-------:|-----------:|-----:|-------------------------:|------------------------------:|-------------:|\n", - "| 492324 | 16 | 2.4 | 0.75 | 0.0150329 | 0.1 |\n", - "| 341299 | 11 | 7.8 | 0.01 | 0.00234041 | 0.2 |\n", - "| 470340 | 15 | 7.6 | 0 | 0.00838565 | 0.3 |\n" + "| | Time (h) | pH | Salt Concentration (M) | Inhibitor Concentration (M) |\n", + "|------:|-----------:|-----:|-------------------------:|------------------------------:|\n", + "| 11808 | 1 | 4.8 | 1.5 | 0.00858356 |\n" ] } ], "source": [ - "new_rec[\"efficiency\"] = [0.1,0.2,0.3]\n", + "# Get and input efficiency value from Excel table, for specific SMILES component first, \n", + "# then for the closest values of the rest of the parameters\n", + "\n", + "new_rec[\"efficiency\"] = [0.1]\n", "print(\"\\n\\nRecommended experiments with measured values: \")\n", "print(new_rec.to_markdown())" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Merge all results into a dataframe" + "### Merge all results into a dataframe" ] }, { @@ -424,6 +402,31 @@ "print(\"\\n\\nAll experiments with measured values: \")\n", "print(results.to_markdown())" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transfer learning + Initial Data INFO" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://emdgroup.github.io/baybe/examples/Transfer_Learning/basic_transfer_learning.html\n", + "\n", + "https://emdgroup.github.io/baybe/userguide/transfer_learning.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://emdgroup.github.io/baybe/examples/Backtesting/full_initial_data.html\n", + "\n", + "https://emdgroup.github.io/baybe/examples/Backtesting/full_lookup.html" + ] } ], "metadata": { diff --git a/data/filtered_AA1000.xlsx b/data/filtered_AA1000.xlsx new file mode 100644 index 0000000..9b24f5e Binary files /dev/null and b/data/filtered_AA1000.xlsx differ diff --git a/data/filtered_AA2024.xlsx b/data/filtered_AA2024.xlsx index 618858a..2744c29 100644 Binary files a/data/filtered_AA2024.xlsx and b/data/filtered_AA2024.xlsx differ diff --git a/data/filtered_Al.xlsx b/data/filtered_Al.xlsx index 75f1c1a..d9be577 100644 Binary files a/data/filtered_Al.xlsx and b/data/filtered_Al.xlsx differ diff --git a/data/filtered_full.xlsx b/data/filtered_full.xlsx deleted file mode 100644 index ecd6388..0000000 Binary files a/data/filtered_full.xlsx and /dev/null differ