diff --git a/exploration.ipynb b/exploration.ipynb index 959bc40..dac88cb 100644 --- a/exploration.ipynb +++ b/exploration.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -19,25 +19,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'cordata.xlsx'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_full \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_excel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcordata.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# contains all data\u001b[39;00m\n\u001b[1;32m 2\u001b[0m df_Al \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_excel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcordata_Al.xlsx\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;66;03m# only Al alloy class\u001b[39;00m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/excel/_base.py:495\u001b[0m, in \u001b[0;36mread_excel\u001b[0;34m(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, ExcelFile):\n\u001b[1;32m 494\u001b[0m should_close \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 495\u001b[0m io \u001b[38;5;241m=\u001b[39m \u001b[43mExcelFile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;129;01mand\u001b[39;00m engine \u001b[38;5;241m!=\u001b[39m io\u001b[38;5;241m.\u001b[39mengine:\n\u001b[1;32m 502\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 503\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine should not be specified when passing \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man ExcelFile - ExcelFile already has the engine set\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 505\u001b[0m )\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/excel/_base.py:1550\u001b[0m, in \u001b[0;36mExcelFile.__init__\u001b[0;34m(self, path_or_buffer, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[1;32m 1548\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxls\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1549\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1550\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[43minspect_excel_format\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1551\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 1552\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1553\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ext \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1554\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1555\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExcel file format cannot be determined, you must specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1556\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man engine manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1557\u001b[0m )\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/excel/_base.py:1402\u001b[0m, in \u001b[0;36minspect_excel_format\u001b[0;34m(content_or_path, storage_options)\u001b[0m\n\u001b[1;32m 1399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(content_or_path, \u001b[38;5;28mbytes\u001b[39m):\n\u001b[1;32m 1400\u001b[0m content_or_path \u001b[38;5;241m=\u001b[39m BytesIO(content_or_path)\n\u001b[0;32m-> 1402\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1403\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 1404\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[1;32m 1405\u001b[0m stream \u001b[38;5;241m=\u001b[39m handle\u001b[38;5;241m.\u001b[39mhandle\n\u001b[1;32m 1406\u001b[0m stream\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'cordata.xlsx'" - ] - } - ], + "outputs": [], "source": [ "df_full = pd.read_excel('cordata.xlsx') # contains all data\n", "df_Al = pd.read_excel('cordata_Al.xlsx') # only Al alloy class" @@ -45,63 +29,27 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf_full\u001b[49m\u001b[38;5;241m.\u001b[39minfo())\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "print(df_full.info())" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_Al' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf_Al\u001b[49m\u001b[38;5;241m.\u001b[39minfo())\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_Al' is not defined" - ] - } - ], + "outputs": [], "source": [ "print(df_Al.info())" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# number of entries for every alloy class in the full dataset\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m metal \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdf_full\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMetal\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39munique():\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmetal\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(df_full[df_full[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMetal\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;250m \u001b[39m\u001b[38;5;241m==\u001b[39m\u001b[38;5;250m \u001b[39mmetal])\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "# number of entries for every alloy class in the full dataset\n", "for metal in df_full['Metal'].unique():\n", @@ -117,21 +65,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# unique compounds for full dataset\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mdf_full\u001b[49m\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSMILES\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39magg({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mEfficiency\u001b[39m\u001b[38;5;124m'\u001b[39m:[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstd\u001b[39m\u001b[38;5;124m'\u001b[39m]})\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "# unique compounds for full dataset\n", "df_full.groupby('SMILES').agg({'Efficiency':['median', 'std']})" @@ -139,21 +75,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_Al' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# unique compounds for Al only dataset\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mdf_Al\u001b[49m\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSMILES\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39magg({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mEfficiency\u001b[39m\u001b[38;5;124m'\u001b[39m:[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstd\u001b[39m\u001b[38;5;124m'\u001b[39m]})\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_Al' is not defined" - ] - } - ], + "outputs": [], "source": [ "# unique compounds for Al only dataset\n", "df_Al.groupby('SMILES').agg({'Efficiency':['median', 'std']})" @@ -168,21 +92,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRank of unique values:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(ranked_values)\n\u001b[0;32m----> 8\u001b[0m value_counts(\u001b[43mdf_full\u001b[49m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAlloy\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m()\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUnique alloys in full dataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(df_full[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAlloy\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39munique())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "def value_counts(df, value):\n", " # Count unique values in the 'value' column\n", @@ -206,21 +118,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_Al' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# composition of AA2000 series is nearly identical, grab alloys from this series\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_AA2024 \u001b[38;5;241m=\u001b[39m \u001b[43mdf_Al\u001b[49m[df_Al[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAlloy\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39misin([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAA2024\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAA2014\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAA2017A\u001b[39m\u001b[38;5;124m'\u001b[39m])]\n\u001b[1;32m 3\u001b[0m df_AA2024\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSMILES\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39magg({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mEfficiency\u001b[39m\u001b[38;5;124m'\u001b[39m:[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstd\u001b[39m\u001b[38;5;124m'\u001b[39m]})\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_Al' is not defined" - ] - } - ], + "outputs": [], "source": [ "# composition of AA2000 series is nearly identical, grab alloys from this series\n", "df_AA2024 = df_Al[df_Al['Alloy'].isin(['AA2024', 'AA2014', 'AA2017A'])]\n", @@ -236,21 +136,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_Al' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# you can play around with the column names to see what unique values are in each column\u001b[39;00m\n\u001b[1;32m 2\u001b[0m column_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTime_h\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf_Al\u001b[49m[column_name]\u001b[38;5;241m.\u001b[39munique())\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m((\u001b[38;5;28mlen\u001b[39m(df_Al[column_name]\u001b[38;5;241m.\u001b[39munique())))\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_Al' is not defined" - ] - } - ], + "outputs": [], "source": [ "# you can play around with the column names to see what unique values are in each column\n", "column_name = 'Time_h'\n", @@ -260,21 +148,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_Al' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# previous counter fuinction to check distribution\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m value_counts(\u001b[43mdf_Al\u001b[49m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTime_h\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_Al' is not defined" - ] - } - ], + "outputs": [], "source": [ "# previous counter fuinction to check distribution\n", "value_counts(df_Al, 'Time_h')" @@ -282,21 +158,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m plt\u001b[38;5;241m.\u001b[39mshow()\n\u001b[1;32m 14\u001b[0m hist_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTime_h\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 15\u001b[0m plot_hist(\u001b[43mdf_full\u001b[49m, value \u001b[38;5;241m=\u001b[39m hist_value, name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFull\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 16\u001b[0m plot_hist(df_Al, value \u001b[38;5;241m=\u001b[39m hist_value, name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAl\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 17\u001b[0m plot_hist(df_AA2024, value \u001b[38;5;241m=\u001b[39m hist_value, name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAA2024\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "# you can play around with histograms below to check distribution\n", "# \n", @@ -321,17 +185,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of entries in full dataset: 2011\n", - "Number of entries in full dataset with Time_h >= 1: 1638\n", - "Number of entries in full dataset with Time_h < 1: 373\n" - ] - } - ], + "outputs": [], "source": [ "# check filtering to see the effect of getting rid of some rows, to check whether its feasible to drop some columns\n", "\n", @@ -349,18 +203,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "121" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# quick check to check the amount of unique compounds\n", "filtered_df_Al = df_AA2024[df_AA2024['Time_h'] >= 1] # 123 to 121 compounds, maybe not much information loss for the small dataset...\n", @@ -379,18 +222,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "611" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def filter_dataframe_AA2024(df):\n", " no_synergy_df = df[df['Synergistic_inhib'] == 'No']\n", @@ -425,18 +257,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1966" - ] - }, - "execution_count": 93, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def filter_dataframe_Al(df):\n", " no_synergy_df = df[df['Synergistic_inhib'] == 'No']\n", @@ -469,21 +290,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 8\u001b[0m\n\u001b[1;32m 3\u001b[0m filtered_df \u001b[38;5;241m=\u001b[39m no_synergy_df\u001b[38;5;241m.\u001b[39mdrop(columns\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mInhibitor\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mIndex\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMol._weight\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTemperature_K\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSalt_Concentrat_M\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSynergistic_Inhib_type\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSynergistic_Inhib_M\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMethodology\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mReference\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mContributor\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filtered_df\n\u001b[0;32m----> 8\u001b[0m filtered_df_full \u001b[38;5;241m=\u001b[39m filter_dataframe_full(\u001b[43mdf_full\u001b[49m)\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mlen\u001b[39m(filtered_df_full[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSMILES\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "def filter_dataframe_full(df):\n", " no_synergy_df = df[df['Synergistic_Inhib_type'].isnull()]\n", @@ -505,32 +314,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'filtered_df_full' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# save\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mfiltered_df_full\u001b[49m\u001b[38;5;241m.\u001b[39mto_excel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfiltered_full.xlsx\u001b[39m\u001b[38;5;124m'\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'filtered_df_full' is not defined" - ] - } - ], + "outputs": [], "source": [ "# save\n", "filtered_df_full.to_excel('filtered_full.xlsx', index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {