From d40bb0069002c88fa9f335af4bb2b906e85cacf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Can=20=C3=96zkan?= <128815525+canozkan42@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:53:49 +0000 Subject: [PATCH] added SubstanceParameter script --- can_baybe-inhibitor.ipynb | 232 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 can_baybe-inhibitor.ipynb diff --git a/can_baybe-inhibitor.ipynb b/can_baybe-inhibitor.ipynb new file mode 100644 index 0000000..cf51e62 --- /dev/null +++ b/can_baybe-inhibitor.ipynb @@ -0,0 +1,232 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This project will focus on exploring the capabilities of Bayesian optimization, specifically employing BayBE, in the discovery of novel corrosion inhibitors for materials design. Initially, we will work with a randomly chosen subset from a comprehensive database of electrochemical responses of small organic molecules. Our goal is to assess how Bayesian optimization can speed up the screening process across the design space to identify promising compounds. We will compare different strategies for incorporating alloy information, while optimizing the experimental parameters with respect to the inhibitive performance of the screened compounds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initizalization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading libraries and data files:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from baybe import Campaign\n", + "\n", + "df_AA2024 = pd.read_excel('data/filtered_AA2024.xlsx')\n", + "df_AA1000 = pd.read_excel('data/filtered_AA1000.xlsx')\n", + "df_Al = pd.read_excel('data/filtered_Al.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "unique_SMILES = df_AA2024.SMILES.unique()\n", + "\n", + "def list_to_dict(input_list):\n", + " return {item: item for item in input_list}\n", + "\n", + "smiles_dict =list_to_dict(unique_SMILES)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SubstanceParameter(name='Inhibitor', data={'COCCOC(=O)OCSc1nc2c(s1)cccc2': 'COCCOC(=O)OCSc1nc2c(s1)cccc2', 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O': 'Cc1ccc(c(c1)n1nc2c(n1)cccc2)O', 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O': 'Clc1ccc(cc1)CC[C@](C(C)(C)C)(Cn1cncn1)O', 'On1nnc2c1cccc2': 'On1nnc2c1cccc2', 'c1ncn[nH]1': 'c1ncn[nH]1', 'Sc1n[nH]cn1': 'Sc1n[nH]cn1', 'S[C]1NC2=C[CH]C=NC2=N1': 'S[C]1NC2=C[CH]C=NC2=N1', 'S=c1[nH]c2c([nH]1)nccn2': 'S=c1[nH]c2c([nH]1)nccn2', 'Sc1ncc[nH]1': 'Sc1ncc[nH]1', 'C=CC(=O)OCCOC(=O)OCCSc1ncccn1': 'C=CC(=O)OCCOC(=O)OCCSc1ncccn1', 'CCSc1nnc(s1)N': 'CCSc1nnc(s1)N', 'CSc1nnc(s1)N': 'CSc1nnc(s1)N', 'Cc1ccc2c(c1)nc([nH]2)S': 'Cc1ccc2c(c1)nc([nH]2)S', 'OC(=O)CS': 'OC(=O)CS', 'Sc1nc2c([nH]1)cccc2': 'Sc1nc2c([nH]1)cccc2', 'OC(=O)c1ccccc1S': 'OC(=O)c1ccccc1S', 'S=c1sc2c([nH]1)cccc2': 'S=c1sc2c([nH]1)cccc2', 'OC(=O)c1cccnc1S': 'OC(=O)c1cccnc1S', 'Sc1ncccn1': 'Sc1ncccn1', 'c1ccc(nc1)c1ccccn1': 'c1ccc(nc1)c1ccccn1', 'Sc1nnc(s1)S': 'Sc1nnc(s1)S', 'Nc1cc(S)nc(n1)N': 'Nc1cc(S)nc(n1)N', 'Nc1nc([nH]n1)C(=O)O': 'Nc1nc([nH]n1)C(=O)O', 'Nc1n[nH]cn1': 'Nc1n[nH]cn1', 'OC(=O)c1n[nH]c(n1)N': 'OC(=O)c1n[nH]c(n1)N', 'Nc1n[nH]c(n1)S': 'Nc1n[nH]c(n1)S', 'CS[C]1N[N]C(=N1)N': 'CS[C]1N[N]C(=N1)N', 'C1=CC(=CC(=C1)S)C(=O)O': 'C1=CC(=CC(=C1)S)C(=O)O', 'OC(=O)CCS': 'OC(=O)CCS', 'Oc1ccccc1c1nnc([nH]1)S': 'Oc1ccccc1c1nnc([nH]1)S', 'Nn1cnnc1': 'Nn1cnnc1', 'Nc1ccnc(n1)S': 'Nc1ccnc(n1)S', 'Nn1c(NN)nnc1S': 'Nn1c(NN)nnc1S', 'Nn1c(S)nnc1c1ccccc1': 'Nn1c(S)nnc1c1ccccc1', 'Sc1nc(N)c2c(n1)[nH]nc2': 'Sc1nc(N)c2c(n1)[nH]nc2', 'Oc1ccc(cc1)C(=O)O': 'Oc1ccc(cc1)C(=O)O', 'OC(=O)c1ccc(cc1)S': 'OC(=O)c1ccc(cc1)S', 'Cn1cnnc1S': 'Cn1cnnc1S', 'Sc1nc(N)c(c(n1)S)N': 'Sc1nc(N)c(c(n1)S)N', 'Nc1ncncc1N': 'Nc1ncncc1N', 'Nc1cc(N)nc(n1)S': 'Nc1cc(N)nc(n1)S', 'Cc1cc(C)nc(n1)S': 'Cc1cc(C)nc(n1)S', 'Clc1cccc(c1)c1n[nH]c(=S)[nH]1': 'Clc1cccc(c1)c1n[nH]c(=S)[nH]1', 'COc1cccc(c1)c1n[nH]c(=S)[nH]1': 'COc1cccc(c1)c1n[nH]c(=S)[nH]1', 'Clc1ccc(cc1Cl)c1n[nH]c(=S)[nH]1': 'Clc1ccc(cc1Cl)c1n[nH]c(=S)[nH]1', 'c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]': 'c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]', 'S=c1[nH]nc([nH]1)c1ccco1': 'S=c1[nH]nc([nH]1)c1ccco1', 'S=c1[nH]nc([nH]1)c1cccnc1': 'S=c1[nH]nc([nH]1)c1cccnc1', 'S=c1[nH]nc([nH]1)c1ccncc1': 'S=c1[nH]nc([nH]1)c1ccncc1', 'Nc1n[nH]c(=S)s1': 'Nc1n[nH]c(=S)s1', 'Cc1nsc(c1)N': 'Cc1nsc(c1)N', 'Clc1ccc2c(c1)[nH]c(n2)S': 'Clc1ccc2c(c1)[nH]c(n2)S', 'CCOc1ccc2c(c1)nc([nH]2)S': 'CCOc1ccc2c(c1)nc([nH]2)S', 'Cn1nnnc1S': 'Cn1nnnc1S', 'OC(=O)Cn1nnnc1S': 'OC(=O)Cn1nnnc1S', 'COc1ccc2c(c1)[nH]c(=S)[nH]2': 'COc1ccc2c(c1)[nH]c(=S)[nH]2', 'Cc1n[nH]c(=S)s1': 'Cc1n[nH]c(=S)s1', 'ClC([C]1N[N]C=N1)(Cl)Cl': 'ClC([C]1N[N]C=N1)(Cl)Cl', 'Clc1cc2[nH]c(=S)[nH]c2cc1Cl': 'Clc1cc2[nH]c(=S)[nH]c2cc1Cl', 'CSc1[nH]c2c(n1)cc(c(c2)C)C': 'CSc1[nH]c2c(n1)cc(c(c2)C)C', 'Nc1ccc2c(c1)sc(=S)[nH]2': 'Nc1ccc2c(c1)sc(=S)[nH]2', 'OC(=O)c1ccc(=S)[nH]c1': 'OC(=O)c1ccc(=S)[nH]c1', 'Oc1cccc2c1nccc2': 'Oc1cccc2c1nccc2', 'S=c1[nH]c2c([nH]1)c(=O)n(cn2)C': 'S=c1[nH]c2c([nH]1)c(=O)n(cn2)C', 'S=c1[nH]c2c([nH]1)cncn2': 'S=c1[nH]c2c([nH]1)cncn2', 'CC(=O)O': 'CC(=O)O', 'OC(=O)CCCCC(=O)O': 'OC(=O)CCCCC(=O)O', 'OC(=O)c1ccccc1': 'OC(=O)c1ccccc1', 'c1ccc2c(c1)[nH]nn2': 'c1ccc2c(c1)[nH]nn2', 'OC(=O)c1ccc(cc1)c1ccccc1': 'OC(=O)c1ccc(cc1)c1ccccc1', 'OC(=O)/C=C/c1ccccc1': 'OC(=O)/C=C/c1ccccc1', 'C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O': 'C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O', 'O[C@H]1C(=O)OCC1(C)C': 'O[C@H]1C(=O)OCC1(C)C', 'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O': 'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O', 'OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O': 'OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O', 'CC(=O)SSC(=O)C': 'CC(=O)SSC(=O)C', 'CCCCOP(=O)(OCCCC)O': 'CCCCOP(=O)(OCCCC)O', 'CCN(C(=S)S)CC': 'CCN(C(=S)S)CC', 'O/N=C(/C(=N/O)/C)\\\\C': 'O/N=C(/C(=N/O)/C)\\\\C', 'CCCCCCCCCCCCc1ccccc1S([O])([O])O': 'CCCCCCCCCCCCc1ccccc1S([O])([O])O', 'CCCCCCCCCCCCOS(=O)(=O)O': 'CCCCCCCCCCCCOS(=O)(=O)O', 'OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O': 'OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O', 'O/N=C(\\\\C(=N/O)\\\\c1ccco1)/c1ccco1': 'O/N=C(\\\\C(=N/O)\\\\c1ccco1)/c1ccco1', 'OC[C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O': 'OC[C@H]([C@H]([C@@H]([C@H](C(=O)O)O)O)O)O', 'OCC(CO)O': 'OCC(CO)O', 'NCC(=O)O': 'NCC(=O)O', 'OC(=O)CCCCCCCCCCCCCCC(=O)O': 'OC(=O)CCCCCCCCCCCCCCC(=O)O', 'C1N2CN3CN1CN(C2)C3': 'C1N2CN3CN1CN(C2)C3', 'NO': 'NO', 'COC(=O)CCCC1=CNC2=CC=CC=C21': 'COC(=O)CCCC1=CNC2=CC=CC=C21', 'OC(=O)c1ccncc1': 'OC(=O)c1ccncc1', 'C1COCCN1CCCS(=O)(=O)O': 'C1COCCN1CCCS(=O)(=O)O', 'OC(=O)c1cccnc1': 'OC(=O)c1cccnc1', 'CCCCCCCC/C=C\\\\CCCCCCCC(=O)O': 'CCCCCCCC/C=C\\\\CCCCCCCC(=O)O', 'C(=O)(C(=O)[O-])[O-]': 'C(=O)(C(=O)[O-])[O-]', 'OC(=O)c1ccc(cc1)N': 'OC(=O)c1ccc(cc1)N', 'Oc1ccc(cc1)S([O])([O])O': 'Oc1ccc(cc1)S([O])([O])O', 'OC(=O)c1ccccn1': 'OC(=O)c1ccccn1', 'OC(=O)c1ccccc1O': 'OC(=O)c1ccccc1O', 'CCCCCCCCCCCCCCCCCC(=O)O': 'CCCCCCCCCCCCCCCCCC(=O)O', 'SC#N': 'SC#N', 'C1=CC(=C(C=C1SSC2=CC(=C(C=C2)[N+](=O)[O-])C(=O)O)C(=O)O)[N+](=O)[O-]': 'C1=CC(=C(C=C1SSC2=CC(=C(C=C2)[N+](=O)[O-])C(=O)O)C(=O)O)[N+](=O)[O-]', '[O-]S(=O)[O-].[Na+].[Na+]': '[O-]S(=O)[O-].[Na+].[Na+]', 'CCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]': 'CCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]': 'CCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CCCCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]': 'CCCCCCCCCCCCCCN(CC(=O)O[Na])CC(=O)O[Na]', 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C': 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C', 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C', 'N.N.[N+](=O)(O)[O-].[N+](=O)(O)[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].O.O.O.O.[Ce+3]': 'N.N.[N+](=O)(O)[O-].[N+](=O)(O)[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].O.O.O.O.[Ce+3]', '[NH4+].[NH4+].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+4]': '[NH4+].[NH4+].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+4]', '[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+3]': '[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[N+](=O)([O-])[O-].[Ce+3]', '[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Ce+3].[Ce+3]': '[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[O-]S(=O)(=O)[O-].[Ce+3].[Ce+3]', '[Cl-].[Cl-].[Cl-].[Ce+3]': '[Cl-].[Cl-].[Cl-].[Ce+3]', 'CNCC(C1=CC(=CC=C1)O)O': 'CNCC(C1=CC(=CC=C1)O)O', 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]': 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]', 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]': 'C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]', 'C1=CC=C(C(=C1)C=NNC(=S)N)O': 'C1=CC=C(C(=C1)C=NNC(=S)N)O', 'C1=CC(=C(C=C1O)O)C=NNC(=S)N': 'C1=CC(=C(C=C1O)O)C=NNC(=S)N', 'NC(=S)NN=CC1=C(C(=C(C=C1)O)O)O': 'NC(=S)NN=CC1=C(C(=C(C=C1)O)O)O', 'CCCCN(CCCC)C1=NC(=NC(=N1)NC(CCSC)C(=O)O)NC(CCSC)C(=O)O': 'CCCCN(CCCC)C1=NC(=NC(=N1)NC(CCSC)C(=O)O)NC(CCSC)C(=O)O', 'C1=CC2=NNN=C2C=C1Cl': 'C1=CC2=NNN=C2C=C1Cl', 'O=C([O-])C(O)C(O)C(O)C(O)CO.[Na+]': 'O=C([O-])C(O)C(O)C(O)C(O)CO.[Na+]', 'COC(=O)n1nnc2ccccc12': 'COC(=O)n1nnc2ccccc12'}, decorrelate=0.7, encoding=)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from baybe.parameters import SubstanceParameter\n", + "\n", + "SubstanceParameter(\n", + " name=\"Inhibitor\",\n", + " data=smiles_dict,\n", + " encoding=\"MORDRED\", # optional\n", + " decorrelate=0.7, # optional\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bayesian Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search Space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Objective" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Recommender" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmarking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Transfer Learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}