diff --git a/baybe_hack.ipynb b/baybe_hack.ipynb new file mode 100644 index 0000000..272c339 --- /dev/null +++ b/baybe_hack.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from baybe.targets import NumericalTarget\n", + "from baybe.objective import Objective\n", + "\n", + "from baybe.parameters import NumericalDiscreteParameter, NumericalContinuousParameter\n", + "from baybe.searchspace import SearchSpace\n", + "\n", + "from baybe.recommenders import RandomRecommender, SequentialGreedyRecommender, NaiveHybridRecommender\n", + "from baybe.surrogates import GaussianProcessSurrogate\n", + "\n", + "from baybe.strategies import TwoPhaseStrategy\n", + "from baybe import Campaign" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting the objectives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The DESIRABILITY mode enables the combination of multiple targets via scalarization into a single value.\n", + "\n", + "See MATCH mode, instead of MAX/MIN + For more details on transformation functions: \n", + "https://emdgroup.github.io/baybe/userguide/targets.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set targets/objectives, efficiency?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "overpotential = NumericalTarget(\n", + " name=\"overpotential\", \n", + " mode=\"MAX\", \n", + " bounds=(-400, 0),\n", + " transformation=\"LINEAR\" # optional, will be applied if bounds are not None, LINEAR only one available for MAX/MIN\n", + " ) \n", + "\n", + "overpotential_slope = NumericalTarget(\n", + " name=\"overpotential_slope\", \n", + " mode=\"MAX\", \n", + " bounds=(-0.05, 0.05),\n", + " transformation=\"LINEAR\" # optional, will be applied if bounds are not None, LINEAR only one available for MAX/MIN\n", + " )\n", + "\n", + "objective = Objective(\n", + " mode=\"DESIRABILITY\",\n", + " targets=[overpotential, overpotential_slope],\n", + " weights=[1.0, 1.0], # optional, by default all weights are equal\n", + " combine_func=\"GEOM_MEAN\", # optional, geometric mean is the default\n", + ")\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search Space" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "parameters = [\n", + "NumericalDiscreteParameter(\n", + " name=\"Time (h)\",\n", + " values=np.arange(6, 25, 1) # Assuming time below 6 hours is discarded\n", + "),\n", + "NumericalDiscreteParameter(\n", + " name=\"pH\",\n", + " values=np.arange(-1, 15.1, 0.1)\n", + " ), \n", + "NumericalContinuousParameter( # Set this as continuous, the values seem quite small?\n", + " name=\"Inhibitor Concentration (M)\",\n", + " bounds=(0, 0.02)\n", + " ),\n", + "NumericalDiscreteParameter(\n", + " name=\"Salt Concentration (M)\",\n", + " values=np.arange(0, 2.01, 0.01),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Substance parameter**\n", + "\n", + "Instead of values, this parameter accepts data in form of a dictionary. The items correspond to pairs of labels and SMILES. 
SMILES are string-based representations of molecular structures. Based on these, BayBE can assign each label a set of molecular descriptors as encoding.\n", + "\n", + "For instance, a parameter corresponding to a choice of solvents can be initialized with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from baybe.parameters import SubstanceParameter\n", + "\n", + "SubstanceParameter(\n", + " name=\"Solvent\",\n", + " data={\n", + " \"Water\": \"O\",\n", + " \"1-Octanol\": \"CCCCCCCCO\",\n", + " \"Toluene\": \"CC1=CC=CC=C1\",\n", + " },\n", + " encoding=\"MORDRED\", # optional\n", + " decorrelate=0.7, # optional\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The encoding option defines what kind of descriptors are calculated:\n", + "\n", + "MORDRED: 2D descriptors from the Mordred package. Since the original package is now unmaintained, baybe requires the community replacement mordredcommunity\n", + "\n", + "RDKIT: 2D descriptors from the RDKit package\n", + "\n", + "MORGAN_FP: Morgan fingerprints calculated with RDKit (1024 bits, radius 4)\n", + "\n", + "These calculations will typically result in 500 to 1500 numbers per molecule. **To avoid detrimental effects on the surrogate model fit, we reduce the number of descriptors via decorrelation before using them.** For instance, the decorrelate option in the example above specifies that only descriptors with a correlation lower than 0.7 to any other descriptor will be kept. This usually reduces the number of descriptors to 10-50, depending on the specific items in data.\n", + "\n", + "**WARNING:**\n", + "The descriptors calculated for a SubstanceParameter were developed to describe small molecules and are not suitable for other substances. If you deal with large molecules like polymers or arbitrary substance mixtures, we recommend to provide your own descriptors via the CustomParameter." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The encoding concept introduced above is generalized by the CustomParameter. Here, the user is expected to provide their own descriptors for the encoding.\n", + "\n", + "Take, for instance, a parameter that corresponds to the choice of a polymer. Polymers are not well represented by the small molecule descriptors utilized in the SubstanceParameter. Still, one could provide experimental measurements or common metrics used to classify polymers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from baybe.parameters import CustomDiscreteParameter\n", + "\n", + "# Create or import new dataframe containing custom descriptors\n", + "\n", + "descriptors = pd.DataFrame(\n", + " {\n", + " \"Glass_Transition_TempC\": [20, -71, -39],\n", + " \"Weight_kDalton\": [120, 32, 241],\n", + " },\n", + " index=[\"Polymer A\", \"Polymer B\", \"Polymer C\"], # put labels in the index\n", + ")\n", + "\n", + "CustomDiscreteParameter(\n", + " name=\"Polymer\",\n", + " data=descriptors,\n", + " decorrelate=True, # optional, uses default correlation threshold = 0.7?\n", + ")\n", + "\n", + "# Add this to the parameters list afterwards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "searchspace = SearchSpace.from_product(parameters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Recommenders" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The **SequentialGreedyRecommender** is a powerful recommender that leverages BoTorch optimization functions to perform sequential Greedy optimization. It can be applied to discrete, continuous and hybrid search spaces. It is an implementation of the BoTorch optimization functions for discrete, continuous and mixed spaces. 
**It is important to note that this recommender performs a brute-force search when applied in hybrid search spaces, as it optimizes the continuous part of the space while exhaustively searching choices in the discrete subspace.** You can customize this behavior to only sample a certain percentage of the discrete subspace via the sampling_percentage attribute and to choose different sampling strategies via the hybrid_sampler attribute. \n", + "\n", + "e.g.\n", + "strategy = TwoPhaseStrategy(recommender=SequentialGreedyRecommender(hybrid_sampler=\"Farthest\", sampling_percentage=0.3))\n", + "\n", + "The **NaiveHybridRecommender** can be applied to all search spaces, but is intended to be used in hybrid spaces. This recommender **combines individual recommenders for the continuous and the discrete subspaces. It independently optimizes each subspace and consolidates the best results to generate a candidate for the original hybrid space.** " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For implementing fully customized surrogate models e.g. from sklearn or PyTorch, see:\n", + "https://emdgroup.github.io/baybe/examples/Custom_Surrogates/Custom_Surrogates.html\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\lordm\\Desktop\\Projects\\baybe\\.venv\\lib\\site-packages\\baybe\\recommenders\\bayesian.py:492: UserWarning: The value of 'allow_recommending_already_measured' differs from what is specified in the discrete recommender. 
The value of the discrete recommender will be ignored.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "available_surr_models = [\n", + " \"GaussianProcessSurrogate\", \n", + " \"BayesianLinearSurrogate\",\n", + " \"MeanPredictionSurrogate\",\n", + " \"NGBoostSurrogate\",\n", + " \"RandomForestSurrogate\"\n", + "]\n", + "\n", + "available_acq_functions = [\n", + " \"qPI\", # q-Probability Of Improvement\n", + " \"qEI\", # q-Expected Improvement\n", + " \"qUCB\", # q-upper confidence bound with beta of 1.0\n", + "]\n", + "\n", + "# Defaults anyway\n", + "SURROGATE_MODEL = GaussianProcessSurrogate()\n", + "ACQ_FUNCTION = \"qEI\" # q-Expected Improvement, only q-functions are available for batch_size > 1\n", + "\n", + "seq_greedy_recommender = SequentialGreedyRecommender(\n", + " surrogate_model=SURROGATE_MODEL,\n", + " acquisition_function_cls=ACQ_FUNCTION,\n", + " hybrid_sampler=\"Farthest\", # find more details in the documentation\n", + " sampling_percentage=0.3, # should be relatively low\n", + " allow_repeated_recommendations=False,\n", + " allow_recommending_already_measured=False,\n", + " )\n", + "\n", + "hybrid_recommender = NaiveHybridRecommender(\n", + " allow_repeated_recommendations=False,\n", + " allow_recommending_already_measured=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Campaign Strategy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "strategy = TwoPhaseStrategy(\n", + " initial_recommender = RandomRecommender(), # Initial recommender, if no training data is available\n", + " # Other initial recommenders don't seem to work for my hybrid search space/set of parameters\n", + " # Doesn't matter since I already have training data\n", + " recommender = seq_greedy_recommender, # Bayesian model-based optimization\n", + " # recommender = hybrid_recommender,\n", + " switch_after=1 # Switch to the model-based recommender after 1 batch or 
iteration (so the initial training data)\n", + ")\n", + "\n", + "campaign = Campaign(searchspace, objective, strategy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import and read modified Excel file as dataframe? - Now containing only specific columns as training data - as in possibly this example: \n", + "\n", + "https://emdgroup.github.io/baybe/examples/Backtesting/full_initial_data.html\n", + "\n", + "\n", + "https://emdgroup.github.io/baybe/examples/Backtesting/full_lookup.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### For transfer learning see: \n", + "\n", + "https://emdgroup.github.io/baybe/userguide/transfer_learning\n", + "\n", + "&\n", + "\n", + "https://emdgroup.github.io/baybe/examples/Transfer_Learning/basic_transfer_learning.html" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/exploration.ipynb b/exploration.ipynb new file mode 100644 index 0000000..912d9c2 --- /dev/null +++ b/exploration.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'cordata.xlsx'", + "output_type": "error", + "traceback": [ + 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df_full \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_excel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcordata.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# contains all data\u001b[39;00m\n\u001b[0;32m 2\u001b[0m df_Al \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_excel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcordata_Al.xlsx\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;66;03m# only Al alloy class\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\lordm\\Desktop\\Projects\\baybe\\.venv\\lib\\site-packages\\pandas\\io\\excel\\_base.py:495\u001b[0m, in \u001b[0;36mread_excel\u001b[1;34m(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)\u001b[0m\n\u001b[0;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, ExcelFile):\n\u001b[0;32m 494\u001b[0m should_close \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m--> 495\u001b[0m io \u001b[38;5;241m=\u001b[39m \u001b[43mExcelFile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 498\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 501\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;129;01mand\u001b[39;00m engine \u001b[38;5;241m!=\u001b[39m io\u001b[38;5;241m.\u001b[39mengine:\n\u001b[0;32m 502\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 503\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine should not be specified when passing \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 504\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man ExcelFile - ExcelFile already has the engine set\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 505\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\lordm\\Desktop\\Projects\\baybe\\.venv\\lib\\site-packages\\pandas\\io\\excel\\_base.py:1550\u001b[0m, in \u001b[0;36mExcelFile.__init__\u001b[1;34m(self, path_or_buffer, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[0;32m 1548\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxls\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1549\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1550\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[43minspect_excel_format\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1551\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[0;32m 1552\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1553\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m ext \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1554\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1555\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExcel file format cannot be determined, you must specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1556\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man engine manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1557\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\lordm\\Desktop\\Projects\\baybe\\.venv\\lib\\site-packages\\pandas\\io\\excel\\_base.py:1402\u001b[0m, in \u001b[0;36minspect_excel_format\u001b[1;34m(content_or_path, storage_options)\u001b[0m\n\u001b[0;32m 1399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(content_or_path, \u001b[38;5;28mbytes\u001b[39m):\n\u001b[0;32m 1400\u001b[0m content_or_path \u001b[38;5;241m=\u001b[39m BytesIO(content_or_path)\n\u001b[1;32m-> 1402\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1403\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m 1404\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[0;32m 1405\u001b[0m stream \u001b[38;5;241m=\u001b[39m handle\u001b[38;5;241m.\u001b[39mhandle\n\u001b[0;32m 1406\u001b[0m stream\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n", + "File 
\u001b[1;32mc:\\Users\\lordm\\Desktop\\Projects\\baybe\\.venv\\lib\\site-packages\\pandas\\io\\common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m 874\u001b[0m handle,\n\u001b[0;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 879\u001b[0m )\n\u001b[0;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[0;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'cordata.xlsx'" + ] + } + ], + "source": [ + "df_full = pd.read_excel('cordata.xlsx') # contains all data\n", + "df_Al = pd.read_excel('cordata_Al.xlsx') # only Al alloy class" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 4973 entries, 0 to 4972\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Index 4973 non-null int64 \n", + " 1 Inhibitor 4973 non-null object \n", + " 2 Mol._weight 4973 non-null float64\n", + " 3 SMILES 4973 non-null object \n", + " 
4 Metal 4973 non-null object \n", + " 5 Alloy 4973 non-null object \n", + " 6 Time_h 4973 non-null float64\n", + " 7 Temperature_K 4973 non-null float64\n", + " 8 pH 4973 non-null float64\n", + " 9 Inhib_Concentrat_M 4973 non-null float64\n", + " 10 Salt_Concentrat_M 4973 non-null float64\n", + " 11 Synergistic_Inhib_type 265 non-null object \n", + " 12 Synergistic_Inhib_M 4925 non-null float64\n", + " 13 Methodology 4973 non-null object \n", + " 14 Reference 4973 non-null object \n", + " 15 Contributor 4973 non-null object \n", + " 16 Efficiency 4973 non-null float64\n", + "dtypes: float64(8), int64(1), object(8)\n", + "memory usage: 660.6+ KB\n", + "None\n" + ] + } + ], + "source": [ + "print(df_full.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 2011 entries, 0 to 2010\n", + "Data columns (total 19 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Inhibitor 2011 non-null object \n", + " 1 SMILES 2011 non-null object \n", + " 2 Number 2011 non-null int64 \n", + " 3 Metal 2011 non-null object \n", + " 4 Alloy 2011 non-null object \n", + " 5 Time_h 2011 non-null float64\n", + " 6 Temperature_K 2011 non-null int64 \n", + " 7 pH 2011 non-null float64\n", + " 8 Inhib_Concentrat_M 2011 non-null float64\n", + " 9 Salt_Concentrat_M 2011 non-null float64\n", + " 10 Synergistic_inhib 2011 non-null object \n", + " 11 Synergistic_inhib_type 2011 non-null object \n", + " 12 Synergistic_inhib_Concentrat_M 2011 non-null float64\n", + " 13 Encapsulated 2011 non-null object \n", + " 14 Efficiency 2011 non-null float64\n", + " 15 Methodology 2011 non-null object \n", + " 16 Reference 2011 non-null object \n", + " 17 Link 2011 non-null object \n", + " 18 Contributor 2011 non-null object \n", + "dtypes: float64(6), int64(2), object(11)\n", + "memory usage: 298.6+ KB\n", + "None\n" + ] + } + ], + 
"source": [ + "print(df_Al.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Al: 2001\n", + "Cu: 272\n", + "Fe: 1230\n", + "Mg: 1465\n", + "Zn: 5\n" + ] + } + ], + "source": [ + "# number of entries for every alloy class in the full dataset\n", + "for metal in df_full['Metal'].unique():\n", + " print(f\"{metal}: {len(df_full[df_full['Metal'] == metal])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### since Al has the biggest number it is indeed smart to start the alloy specific analysis with Al" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Efficiency
medianstd
SMILES
Brc1ncccn113.96.407808
C(=O)(C(=O)[O-])[O-]33.066.649155
C(C(=O)O)(C)(CO)CO63.514.066509
C(C(=O)O)N-289.0NaN
C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O63.6171.524937
.........
c1ncn[nH]1-16.5376.741314
c2ccc1[nH]cnc1c247.910.392243
c3ccc(Cn2cnc1ccccc12)cc397.50.223607
c4ccc(Cn3c(c1ccccc1)nc2ccccc23)cc497.80.316228
n1c2C(=O)NC(N)=Nc2ncc1CNc3ccc(cc3)C(=O)N[C@H](C(O)=O)CCC(O)=O82.420.836281
\n", + "

402 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Efficiency \n", + " median std\n", + "SMILES \n", + "Brc1ncccn1 13.9 6.407808\n", + "C(=O)(C(=O)[O-])[O-] 33.0 66.649155\n", + "C(C(=O)O)(C)(CO)CO 63.5 14.066509\n", + "C(C(=O)O)N -289.0 NaN\n", + "C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 63.6 171.524937\n", + "... ... ...\n", + "c1ncn[nH]1 -16.5 376.741314\n", + "c2ccc1[nH]cnc1c2 47.9 10.392243\n", + "c3ccc(Cn2cnc1ccccc12)cc3 97.5 0.223607\n", + "c4ccc(Cn3c(c1ccccc1)nc2ccccc23)cc4 97.8 0.316228\n", + "n1c2C(=O)NC(N)=Nc2ncc1CNc3ccc(cc3)C(=O)N[C@H](C... 82.4 20.836281\n", + "\n", + "[402 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# unique compounds for full dataset\n", + "df_full.groupby('SMILES').agg({'Efficiency':['median', 'std']})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Efficiency
medianstd
SMILES
C(=O)(C(=O)[O-])[O-]20.00015.146012
C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O71.30033.074888
C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]65.00022.500907
C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]61.00032.650756
C(C(CO)([N+](=O)[O-])Br)O84.2859.737557
.........
[O-]S(=O)[O-].[Na+].[Na+]85.20015.171265
c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]25.0005.773503
c1ccc(nc1)c1ccccn130.0008.164966
c1ccc2c(c1)[nH]nn291.00031.045126
c1ncn[nH]135.00034.034296
\n", + "

177 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Efficiency \n", + " median std\n", + "SMILES \n", + "C(=O)(C(=O)[O-])[O-] 20.000 15.146012\n", + "C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 71.300 33.074888\n", + "C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O... 65.000 22.500907\n", + "C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O... 61.000 32.650756\n", + "C(C(CO)([N+](=O)[O-])Br)O 84.285 9.737557\n", + "... ... ...\n", + "[O-]S(=O)[O-].[Na+].[Na+] 85.200 15.171265\n", + "c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-] 25.000 5.773503\n", + "c1ccc(nc1)c1ccccn1 30.000 8.164966\n", + "c1ccc2c(c1)[nH]nn2 91.000 31.045126\n", + "c1ncn[nH]1 35.000 34.034296\n", + "\n", + "[177 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# unique compounds for Al only dataset\n", + "df_Al.groupby('SMILES').agg({'Efficiency':['median', 'std']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 402 compounds for full, 177 for only Al. How about if we work for the largest alloy group?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank of unique values:\n", + "Mild steel 690\n", + "AA2024 590\n", + "Al 424\n", + "Cu 262\n", + "CP-Mg-220ppm 239\n", + "AA7075 227\n", + "AA1060 203\n", + "Cold rolled steel 182\n", + "AZ31 158\n", + "AlSi 155\n", + "ZE41 155\n", + "WE43 153\n", + "AM50 153\n", + "AZ91 145\n", + "Fe 143\n", + "E21 133\n", + "HP-Mg-50ppm 119\n", + "HP-Mg-51ppm 109\n", + "Carbon steel 92\n", + "AA5754 84\n", + "Steel 75\n", + "CP-Mg-342ppm 60\n", + "Al_rod 49\n", + "Stainless steel 48\n", + "AA6063 48\n", + "AA6061 48\n", + "AA5083 46\n", + "AA5052 45\n", + "Mg-0,8Ca 25\n", + "AA1100 25\n", + "AA3SR 18\n", + "AZ91E 15\n", + "AA63400 12\n", + "AA2014 12\n", + "Brass 10\n", + "AA5005 8\n", + "AA2017A 7\n", + "Galvanized 5\n", + "AZ91D 1\n", + "Name: Alloy, dtype: int64\n", + "\n", + "Unique alloys in full dataset: 39\n", + "Unique alloys in only Al dataset: 17\n" + ] + } + ], + "source": [ + "def value_counts(df, value):\n", + " # Count unique values in the 'value' column\n", + " value_counts = df[value].value_counts()\n", + " ranked_values = value_counts.sort_values(ascending=False)\n", + " print(\"Rank of unique values:\")\n", + " print(ranked_values)\n", + "\n", + "value_counts(df_full, 'Alloy')\n", + "print()\n", + "print(f'Unique alloys in full dataset: {len(df_full[\"Alloy\"].unique())}')\n", + "print(f'Unique alloys in only Al dataset: {len(df_Al[\"Alloy\"].unique())}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### I suggest we start from AA2024, as mild steel is too general of a category." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Efficiency
medianstd
SMILES
C(=O)(C(=O)[O-])[O-]20.0014.061383
C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O12.5540.887682
C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Fe+2]65.0022.500907
C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O-])O)O)O)O)O.[Zn+2]61.0032.650756
C1=CC(=C(C=C1O)O)C=NNC(=S)N86.6026.204889
.........
[O-]S(=O)[O-].[Na+].[Na+]85.2015.171265
c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-]25.007.071068
c1ccc(nc1)c1ccccn135.007.071068
c1ccc2c(c1)[nH]nn297.8020.848309
c1ncn[nH]160.0042.426407
\n", + "

123 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Efficiency \n", + " median std\n", + "SMILES \n", + "C(=O)(C(=O)[O-])[O-] 20.00 14.061383\n", + "C(C(=O)[O-])C(CC(=O)[O-])(C(=O)[O-])O 12.55 40.887682\n", + "C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O... 65.00 22.500907\n", + "C(C(C(C(C(C(=O)[O-])O)O)O)O)O.C(C(C(C(C(C(=O)[O... 61.00 32.650756\n", + "C1=CC(=C(C=C1O)O)C=NNC(=S)N 86.60 26.204889\n", + "... ... ...\n", + "[O-]S(=O)[O-].[Na+].[Na+] 85.20 15.171265\n", + "c1cc(ccc1c2[nH]c(nn2)S)[N+](=O)[O-] 25.00 7.071068\n", + "c1ccc(nc1)c1ccccn1 35.00 7.071068\n", + "c1ccc2c(c1)[nH]nn2 97.80 20.848309\n", + "c1ncn[nH]1 60.00 42.426407\n", + "\n", + "[123 rows x 2 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# composition of AA2000 series is nearly identical, grab alloys from this series\n", + "df_AA2024 = df_Al[df_Al['Alloy'].isin(['AA2024', 'AA2014', 'AA2017A'])]\n", + "df_AA2024.groupby('SMILES').agg({'Efficiency':['median', 'std']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 177 compounds for all Al, to 123 compounds to only AA2024." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2.40e+01 6.72e+02 0.00e+00 1.67e+00 3.30e-01 5.80e-01 1.00e+00 5.00e-01\n", + " 1.50e+00 2.00e+00 2.50e+00 4.80e+01 9.60e+01 1.44e+02 1.92e+02 2.40e+02\n", + " 2.88e+02 3.36e+02 3.84e+02 4.00e+00 6.00e+00 8.00e+00 2.50e-01 7.50e-01\n", + " 7.20e+01 3.60e+02 4.80e+02 6.00e+02 1.68e+02 1.20e+02 1.00e+01 3.00e+00\n", + " 4.32e+02 5.28e+02 5.76e+02 6.24e+02 3.50e+00 4.50e+00 5.00e+00 5.50e+00\n", + " 7.20e+02 6.70e-01]\n", + "42\n" + ] + } + ], + "source": [ + "# you can play around with the column names to see what unique values are in each column\n", + "column_name = 'Time_h'\n", + "print(df_Al[column_name].unique())\n", + "print((len(df_Al[column_name].unique())))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank of unique values:\n", + "24.00 656\n", + "0.50 306\n", + "2.00 268\n", + "1.00 134\n", + "672.00 54\n", + "3.00 48\n", + "144.00 48\n", + "48.00 47\n", + "72.00 41\n", + "1.67 40\n", + "168.00 28\n", + "0.33 28\n", + "240.00 25\n", + "96.00 22\n", + "336.00 20\n", + "4.00 19\n", + "288.00 19\n", + "6.00 17\n", + "480.00 16\n", + "0.67 16\n", + "0.00 14\n", + "192.00 14\n", + "384.00 14\n", + "576.00 13\n", + "432.00 13\n", + "10.00 9\n", + "360.00 8\n", + "120.00 8\n", + "528.00 8\n", + "624.00 8\n", + "600.00 7\n", + "0.58 7\n", + "2.50 6\n", + "1.50 6\n", + "720.00 5\n", + "5.00 4\n", + "8.00 4\n", + "4.50 3\n", + "3.50 3\n", + "5.50 3\n", + "0.25 1\n", + "0.75 1\n", + "Name: Time_h, dtype: int64\n" + ] + } + ], + "source": [ + "# previous counter function to check distribution\n", + "value_counts(df_Al, 'Time_h')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# you can play around with histograms below to check distribution\n", + "# \n", + "# # Plot histogram\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_hist(df, value, name):\n", + " plt.hist(df[value], bins=50, color='skyblue', edgecolor='black')\n", + " plt.xlabel(f'{value}')\n", + " plt.ylabel('Frequency')\n", + " plt.title(f'Histogram of {name}')\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "hist_value = 'Time_h'\n", + "plot_hist(df_full, value = hist_value, name = 'Full')\n", + "plot_hist(df_Al, value = hist_value, name = 'Al')\n", + "plot_hist(df_AA2024, value = hist_value, name = 'AA2024')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of entries in full dataset: 2011\n", + "Number of entries in full dataset with Time_h >= 1: 1638\n", + "Number of entries in full dataset with Time_h < 1: 373\n" + ] + } + ], + "source": [ + "# check filtering to see the effect of getting rid of some rows, to check whether its feasible to drop some columns\n", + "\n", + "def check_filter(dataset, column_name, filter):\n", + " print(f\"Number of entries in full dataset: {len(dataset)}\")\n", + " print(f\"Number of entries in full dataset with {column_name} >= {filter}: {len(dataset[dataset[column_name] >= filter])}\")\n", + " print(f\"Number of entries in full dataset with {column_name} < {filter}: {len(dataset[dataset[column_name] < filter])}\")\n", + "\n", + "\n", + "check_filter(dataset = df_Al, column_name = 'Time_h', filter = 1)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "121" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# quick check to check the amount of unique compounds\n", + 
"filtered_df_Al = df_AA2024[df_AA2024['Time_h'] >= 1] # 123 to 121 compounds, maybe not much information loss for the small dataset...\n", + "len(filtered_df_Al['SMILES'].unique())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Now to create a new dataframe cleaned of unnecessary details for analysis. \n", + "Entries with synergy and encapsulation is removed, then unnecessary columns dropped." + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "611" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dataframe_AA2024(df):\n", + " no_synergy_df = df[df['Synergistic_inhib'] == 'No']\n", + " no_encapsulation_df = no_synergy_df[no_synergy_df['Encapsulated'] == 'No']\n", + " filtered_df = no_encapsulation_df.drop(columns=['Inhibitor', 'Number', 'Metal', 'Alloy', 'Temperature_K', 'Salt_Concentrat_M', \n", + " 'Synergistic_inhib','Synergistic_inhib_type', 'Synergistic_inhib_Concentrat_M',\n", + " 'Encapsulated', 'Methodology','Reference', 'Link', 'Contributor'])\n", + " return filtered_df\n", + "\n", + "filtered_df_AA2024 = filter_dataframe_AA2024(df_AA2024)\n", + "len(filtered_df_AA2024['SMILES'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 611 datapoints for actual Bayesian optimization work. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "filtered_df_AA2024.to_excel('filtered_AA2024.xlsx', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1966" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dataframe_Al(df):\n", + " no_synergy_df = df[df['Synergistic_inhib'] == 'No']\n", + " no_encapsulation_df = no_synergy_df[no_synergy_df['Encapsulated'] == 'No']\n", + " filtered_df = no_encapsulation_df.drop(columns=['Inhibitor', 'Number', 'Metal', 'Temperature_K', 'Salt_Concentrat_M', \n", + " 'Synergistic_inhib','Synergistic_inhib_type', 'Synergistic_inhib_Concentrat_M',\n", + " 'Encapsulated', 'Methodology','Reference', 'Link', 'Contributor'])\n", + " return filtered_df\n", + "\n", + "filtered_df_Al = filter_dataframe_Al(df_Al)\n", + "len(filtered_df_Al['SMILES'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1966 datapoints, almost 3 times. Assuming we can just ignore the effect of alloy type, or find a way to featurize it, would be fun to work with. Maybe composition based?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "filtered_df_Al.to_excel('filtered_Al.xlsx', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4708" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dataframe_full(df):\n", + " no_synergy_df = df[df['Synergistic_Inhib_type'].isnull()]\n", + " filtered_df = no_synergy_df.drop(columns=['Inhibitor', 'Index', 'Mol._weight', 'Temperature_K', 'Salt_Concentrat_M',\n", + " 'Synergistic_Inhib_type', 'Synergistic_Inhib_M','Methodology','Reference',\n", + " 'Contributor'])\n", + " return filtered_df\n", + "\n", + "filtered_df_full = filter_dataframe_full(df_full)\n", + "len(filtered_df_full['SMILES'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4708 datapoints, more than double the previous set. Again, if we can featurize the alloy or substrate, this is a great number of datapoints to work with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "filtered_df_full.to_excel('filtered_full.xlsx', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/filtered_AA2024.xlsx b/filtered_AA2024.xlsx new file mode 100644 index 0000000..618858a Binary files /dev/null and b/filtered_AA2024.xlsx differ diff --git a/filtered_Al.xlsx b/filtered_Al.xlsx new file mode 100644 index 0000000..75f1c1a Binary files /dev/null and b/filtered_Al.xlsx differ diff --git a/filtered_full.xlsx b/filtered_full.xlsx new file mode 100644 index 0000000..ecd6388 Binary files /dev/null and b/filtered_full.xlsx differ diff --git a/hello.py b/hello.py deleted file mode 100644 index 5b312bc..0000000 --- a/hello.py +++ /dev/null @@ -1,2 +0,0 @@ -def hello_world(): - return "Hello World!" diff --git a/hello_test.py b/hello_test.py deleted file mode 100644 index 708a061..0000000 --- a/hello_test.py +++ /dev/null @@ -1,5 +0,0 @@ -import hello - - -def test_hello(): - assert hello.hello_world() == "Hello World!"