diff --git a/can_baybe-inhibitor.ipynb b/can_baybe-inhibitor.ipynb index 7132fe4..8ca1038 100644 --- a/can_baybe-inhibitor.ipynb +++ b/can_baybe-inhibitor.ipynb @@ -18,7 +18,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Initizalization" + "# Initialization" ] }, { @@ -217,6 +217,8 @@ "from baybe.simulation import simulate_scenarios\n", "from baybe.targets import NumericalTarget\n", "\n", + "# these datasets are already preprocessed, filtered, and grouped,\n", + "# so there is only one row for each unique combination of parameters\n", "df_AA2024 = pd.read_excel('data/averaged_filtered_AA2024.xlsx')\n", "df_AA5000 = pd.read_excel('data/averaged_filtered_AA5000.xlsx')\n", "df_AA6000 = pd.read_excel('data/averaged_filtered_AA6000.xlsx')\n", @@ -250,6 +252,7 @@ "metadata": {}, "outputs": [], "source": [ + "# def required from baybe package\n", "lookup = df_active" ] }, @@ -268,6 +271,13 @@ "smiles_dict =list_to_dict(unique_SMILES)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Defining parameters for the search space" + ] + }, { "cell_type": "code", "execution_count": 300, @@ -336,13 +346,36 @@ " ]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setting the target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_no_target = lookup.drop('Efficiency', axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating the searchspace\n", + "Multiple searchspaces and parameter groups are initialized to investigate the influence of built-in featurization methods on the Bayesian optimization process."
+ ] + }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [], "source": [ - "df_no_target = lookup.drop('Efficiency', axis=1)\n", "\n", "# searchspace = SearchSpace.from_dataframe(df = df_no_target, parameters=parameters)\n", "# print('Print test 1')\n", @@ -528,6 +561,13 @@ "searchspace_rdkit" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Defining the campaign = searchspace + objective" + ] + }, { "cell_type": "code", "execution_count": 303, @@ -539,6 +579,7 @@ "campaign_rdkit = Campaign(searchspace=searchspace_rdkit, objective=objective)\n", "campaign_ohe = Campaign(searchspace=searchspace_ohe, objective=objective)\n", "\n", + "# not all randoms are used but checked for differences in behaviour\n", "campaign_rand_mordred = Campaign(\n", " searchspace=searchspace_mordred,\n", " recommender=TwoPhaseMetaRecommender(recommender=RandomRecommender()),\n", @@ -556,9 +597,16 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Puttting the campaigns that we are interested in a scenario" + ] + }, { "cell_type": "code", - "execution_count": 304, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -570,6 +618,13 @@ " }" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start our simulations" + ] + }, { "cell_type": "code", "execution_count": 305, @@ -770,9 +825,9 @@ } ], "source": [ - "N_MC_ITERATIONS = 10\n", - "N_DOE_ITERATIONS = 50\n", - "BATCH_SIZE = 1\n", + "N_MC_ITERATIONS = 10 # number of Monte Carlo iterations\n", + "N_DOE_ITERATIONS = 50 # number of Design of Experiments iterations\n", + "BATCH_SIZE = 1 # number of experiments each DoE contains \n", "\n", "results = simulate_scenarios(\n", " scenarios,\n", @@ -790,9 +845,17 @@ "metadata": {}, "outputs": [], "source": [ + "# record results to excel\n", "results.to_excel(f\"./results/{exp_dataset_name}_simulation_{N_MC_ITERATIONS}MC_{N_DOE_ITERATIONS}exp_{BATCH_SIZE}batch.xlsx\")" ] }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting the results" + ] + }, { "cell_type": "code", "execution_count": 313, @@ -1155,27 +1218,3045 @@ "results" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Transfer Learning\n", + "### Use transfer learning to gain information from prior experimental campaigns." + ] + }, { "cell_type": "code", - "execution_count": 311, + "execution_count": 317, "metadata": {}, "outputs": [], "source": [ - "results.to_excel(f\"./results/{exp_dataset_name}_simulation_{N_MC_ITERATIONS}MC_{N_DOE_ITERATIONS}exp_{BATCH_SIZE}batch.xlsx\")\n" + "df_active = df_AA2024\n", + "df_transfer = df_AA1000" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 318, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from baybe.parameters import TaskParameter\n", + "\n", + "taskparam = TaskParameter(\n", + " name=\"Al_alloys\",\n", + " values=[\"AA1000\", \"AA2024\"],\n", + " active_values=[\"AA2024\"],\n", + ")" + ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Time_hpHInhib_Concentrat_MSalt_Concentrat_MEfficiency
count848.000000848.0000008.480000e+02848.000000848.000000
mean126.8431604.1895806.352976e-020.08896235.066659
std192.0556763.6961833.690920e-010.227758245.617010
min0.000000-0.6000001.000000e-070.000000-4834.000000
25%6.0000000.0000005.000000e-040.00000035.000000
50%24.0000004.0000001.000000e-030.01000060.000000
75%144.0000007.0000004.200000e-030.10000080.507500
max720.00000013.0000003.280000e+002.000000100.000000
\n", + "
" + ], + "text/plain": [ + " Time_h pH Inhib_Concentrat_M Salt_Concentrat_M \\\n", + "count 848.000000 848.000000 8.480000e+02 848.000000 \n", + "mean 126.843160 4.189580 6.352976e-02 0.088962 \n", + "std 192.055676 3.696183 3.690920e-01 0.227758 \n", + "min 0.000000 -0.600000 1.000000e-07 0.000000 \n", + "25% 6.000000 0.000000 5.000000e-04 0.000000 \n", + "50% 24.000000 4.000000 1.000000e-03 0.010000 \n", + "75% 144.000000 7.000000 4.200000e-03 0.100000 \n", + "max 720.000000 13.000000 3.280000e+00 2.000000 \n", + "\n", + " Efficiency \n", + "count 848.000000 \n", + "mean 35.066659 \n", + "std 245.617010 \n", + "min -4834.000000 \n", + "25% 35.000000 \n", + "50% 60.000000 \n", + "75% 80.507500 \n", + "max 100.000000 " + ] + }, + "execution_count": 321, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "df_combined = pd.concat([df_active, df_transfer], axis=0)\n", + "df_combined.describe()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "metadata": {}, + "outputs": [], + "source": [ + "unique_SMILES_transfer = df_transfer[\"SMILES\"].unique()\n", + "unique_SMILES = df_combined[\"SMILES\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 333, + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.parameters import NumericalContinuousParameter, CategoricalParameter, NumericalDiscreteParameter\n", + "from baybe.searchspace import SearchSpace\n", + "\n", + "transfer_parameters=[\n", + "NumericalDiscreteParameter(\n", + " name=\"Time_h\",\n", + " values=df_combined[\"Time_h\"].unique(),\n", + " tolerance=5/60,\n", + "),\n", + "NumericalDiscreteParameter(\n", + " name=\"pH\",\n", + " values=df_combined[\"pH\"].unique(),\n", + " ), \n", + "NumericalDiscreteParameter(\n", + " name=\"Inhib_Concentrat_M\",\n", + " values=df_combined[\"Inhib_Concentrat_M\"].unique(),\n", + " ),\n", + "NumericalDiscreteParameter(\n", + " name=\"Salt_Concentrat_M\",\n", + " 
values=df_combined[\"Salt_Concentrat_M\"].unique(),\n", + " ),\n", + "CategoricalParameter(\n", + " name=\"SMILES\",\n", + " values=unique_SMILES,\n", + " encoding=\"OHE\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 334, + "metadata": {}, + "outputs": [], + "source": [ + "searchspace_transfer = SearchSpace.from_dataframe(df_transfer.drop(\"Efficiency\", axis = 1), transfer_parameters)\n", + "\n", + "campaign_transfer = Campaign(searchspace_transfer, objective)" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [], + "source": [ + "df_features = df_active.drop(\"Efficiency\", axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 335, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/10 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Time_hpHInhib_Concentrat_MSalt_Concentrat_MEfficiency
count258.000000258.000000258.000000258.000000258.000000
mean167.6027136.6360470.0073860.11790728.268191
std220.4887882.1496130.0132020.166813265.800655
min0.5000000.0000000.0000100.000000-3813.000000
25%24.0000005.4000000.0010000.05000030.000000
50%24.0000007.0000000.0010000.10000055.000000
75%240.0000007.0000000.0045000.10000089.000000
max672.00000010.0000000.0440000.600000100.000000
\n", + "" + ], + "text/plain": [ + " Time_h pH Inhib_Concentrat_M Salt_Concentrat_M \\\n", + "count 258.000000 258.000000 258.000000 258.000000 \n", + "mean 167.602713 6.636047 0.007386 0.117907 \n", + "std 220.488788 2.149613 0.013202 0.166813 \n", + "min 0.500000 0.000000 0.000010 0.000000 \n", + "25% 24.000000 5.400000 0.001000 0.050000 \n", + "50% 24.000000 7.000000 0.001000 0.100000 \n", + "75% 240.000000 7.000000 0.004500 0.100000 \n", + "max 672.000000 10.000000 0.044000 0.600000 \n", + "\n", + " Efficiency \n", + "count 258.000000 \n", + "mean 28.268191 \n", + "std 265.800655 \n", + "min -3813.000000 \n", + "25% 30.000000 \n", + "50% 55.000000 \n", + "75% 89.000000 \n", + "max 100.000000 " + ] + }, + "execution_count": 338, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fraction_df.describe()" ] }, { @@ -1183,7 +4264,41 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "concatenated_df = pd.concat([result_fresh_start, result_transfer_learning], axis=0, ignore_index=True)\n", + "concatenated_df" + ] + }, + { + "cell_type": "code", + "execution_count": 339, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 339, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Cumulative-best efficiency per scenario; the x-axis is cut at `limit` experiments and `limit` is also used in the saved file name\n", + "limit = 50\n", + "exp_dataset_name = 'transferAA1000_to_AA2024'\n", + "sns.lineplot(\n", + " data=concatenated_df, x=\"Num_Experiments\", y=\"Efficiency_CumBest\", hue=\"Scenario\", marker=\"x\"\n", + ")\n", + "plt.plot([0.5, N_DOE_ITERATIONS+0.5], [max_yield, max_yield], \"--r\", alpha=0.4) # dashed red reference line at max_yield\n", + "plt.legend(loc=\"lower right\")\n", + "plt.xlim(0, limit+1)\n", + 
"plt.savefig(f\"./img/{exp_dataset_name}_simulation_{N_MC_ITERATIONS}MC_{N_DOE_ITERATIONS}exp_{BATCH_SIZE}batch_first{limit}.png\")" + ] } ], "metadata": { diff --git
a/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first25.png b/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first25.png new file mode 100644 index 0000000..96c5873 Binary files /dev/null and b/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first25.png differ diff --git a/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first50.png b/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first50.png new file mode 100644 index 0000000..d19a666 Binary files /dev/null and b/img/transferAA1000_to_AA2024_simulation_10MC_50exp_1batch_first50.png differ