diff --git a/README.md b/README.md index a4f169fe5c..b714e41539 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,17 @@ -# Real Python Materials - -Bonus materials, exercises, and example projects for Real Python's [Python tutorials](https://realpython.com). - -Build Status: -[![GitHub Actions](https://img.shields.io/github/actions/workflow/status/realpython/materials/linters.yml?branch=master)](https://github.com/realpython/materials/actions) - -## Got a Question? - -The best way to get support for Real Python courses, articles, and code in this repository is to join one of our [weekly Office Hours calls](https://realpython.com/office-hours/) or to ask your question in the [RP Community Chat](https://realpython.com/community/). - -Due to time constraints, we cannot provide 1:1 support via GitHub. See you on Slack or on the next Office Hours call 🙂 - -## Adding Source Code & Sample Projects to This Repo (RP Contributors) - -### Running Code Style Checks - -We use [flake8](http://flake8.pycqa.org/en/latest/) and [black](https://black.readthedocs.io/) to ensure a consistent code style for all of our sample code in this repository. - -Run the following commands to validate your code against the linters: - -```sh -$ flake8 -$ black --check . -``` - -### Running Python Code Formatter - -We're using a tool called [black](https://black.readthedocs.io/) on this repo to ensure consistent formatting. On CI it runs in "check" mode to ensure any new files added to the repo follow PEP 8. If you see linter warnings that say something like "would reformat some_file.py" it means that black disagrees with your formatting. - -**The easiest way to resolve these errors is to run Black locally on the code and then commit those changes, as explained below.** - -To automatically re-format your code to be consistent with our code style guidelines, run [black](https://black.readthedocs.io/) in the repository root folder: - -```sh -$ black . 
-``` +# Using Python for Data Analysis + +This folder contains completed notebooks and other files used in the Real Python tutorial on [Using Python for Data Analysis](https://realpython.com/using-python-for-data-analysis/). + +None of the files are mandatory to complete the tutorial; however, you may find them useful for reference as you work through it. + +## Available Files + +- `data_analysis_findings.ipynb` is a Jupyter Notebook containing all the code used in the tutorial. +- `data analysis results.ipynb` is a Jupyter Notebook containing the final version of the cleansing and analysis code. +- `james_bond_data.csv` contains the data to be cleansed and analyzed in its original form, in CSV format. +- `james_bond_data.json` contains the data to be cleansed and analyzed in its original form, in JSON format. +- `james_bond_data.parquet` contains the data to be cleansed and analyzed in its original form, in Parquet format. +- `james_bond_data.xlsx` contains the data to be cleansed and analyzed in its original form, in Microsoft Excel format. +- `james_bond_data_cleansed.csv` contains the cleansed data in its final form. + +Although the tutorial can be completed in a range of Python environments, the use of Jupyter Notebook within JupyterLab is highly recommended. 
\ No newline at end of file diff --git a/data analysis results.ipynb b/data analysis results.ipynb new file mode 100644 index 0000000000..fed90709d0 --- /dev/null +++ b/data analysis results.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Data Cleansing Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n", + "\n", + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " 
.str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ").drop_duplicates(ignore_index=True)\n", + "\n", + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Data Analysis Code" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb_rating\"]]\n", + "y = data.loc[:, \"rotten_tomatoes_rating\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": 
"b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data_analysis_findings.ipynb b/data_analysis_findings.ipynb new file mode 100644 index 0000000000..e1a3e0f216 --- /dev/null +++ b/data_analysis_findings.ipynb @@ -0,0 +1,1093 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Acquiring Your Data" + ] + }, + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e52f486-232e-440b-8585-90416e4300c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "797f69eb-3108-45d3-9a67-58c43593abf1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_json(\"james_bond_data.json\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import openpyxl\n", + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_parquet(\n", + " \"james_bond_data.parquet\"\n", + ").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_data = james_bond_data_html[1].convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "31068de2-9864-434a-9652-b115d1131684", + "metadata": {}, + "source": [ + "# Cleansing Your Data With Python" + ] + }, + { + "cell_type": "markdown", + "id": "e0dcca3b-6e71-481d-a071-6218012db962", + "metadata": {}, + "source": [ + "## Creating Meaningful 
Column Names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d70997b9-3c75-4165-b034-8544bd084c04", + "metadata": {}, + "outputs": [], + "source": [ + "new_column_names = {\n", + " \"Release\": \"release_date\",\n", + " \"Movie\": \"movie_title\",\n", + " \"Bond\": \"bond_actor\",\n", + " \"Bond_Car_MFG\": \"car_manufacturer\",\n", + " \"US_Gross\": \"gross_income_usa\",\n", + " \"World_Gross\": \"gross_income_world\",\n", + " \"Budget ($ 000s)\": \"movie_budget\",\n", + " \"Film_Length\": \"film_length\",\n", + " \"Avg_User_IMDB\": \"imdb_rating\",\n", + " \"Avg_User_Rtn_Tom\": \"rotten_tomatoes_rating\",\n", + " \"Martinis\": \"martinis_consumed\",\n", + " \"Kills_Bond\": \"bond_kills\",\n", + "}\n", + "\n", + "data = james_bond_data.rename(columns=new_column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937b9121-b7ae-4f7e-800d-bfcc2689c98a", + "metadata": {}, + "outputs": [], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d497e64c-aa7e-4d09-8de1-f529939d58f9", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.rename(columns=new_column_names).combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49d06d77-49b0-4e89-b228-8583650595af", + "metadata": {}, 
+ "outputs": [], + "source": [ + "pd.DataFrame({\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4af51fb-fd1f-4570-b16f-6f20e0b65473", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "markdown", + "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03", + "metadata": {}, + "source": [ + "## Correcting Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "916c91b8-7888-40fc-bce7-247837508adf", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001996e3-2fce-4228-a873-b78eef613bba", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " 
gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8331b98e-169f-4d3b-9b88-0ece7ddc8dea", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6340b1f-3b1c-42e6-9b69-e981f645d77b", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7025fbd2-ce44-4efe-88c9-9f51830776c2", + "metadata": {}, + "outputs": [], + "source": [ + "data[\n", + " [\"gross_income_usa\", \"gross_income_world\", \"movie_budget\", \"film_length\"]\n", + "].head()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "d45b9b42-7c71-422f-9ddb-ea659e5385c9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f5dacf7-2f6c-47f4-b875-7d36f2251627", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f147876-7348-43e9-ac6a-3f3df6ee2af9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d4868a-94d8-4d36-85b9-b0c9a6203a8a", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "b2c7922a-916e-4e01-829b-77cbb2205153", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Fixing Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47a41ef3-751a-41ed-869d-9f2c45509196", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"gross_income_usa\", \"gross_income_world\", \"movie_budget\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6cbd7ea-e168-442e-8dd9-e2955288fa57", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"gross_income_usa\", \"gross_income_world\", \"movie_budget\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + 
"metadata": {}, + "source": [ + "## Correcting Spelling Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e313152b-92b4-43a8-8483-637281a1f04d", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "fd4ae142-e339-4601-b0a4-84375eb28c02", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(int)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c596022b-02a4-40c0-ac5f-d0b0643a7a4a", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [ + "## Checking For Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", 
\"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2abb5b80-48be-4a00-9483-4732b9a5d802", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", \"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame(\n", + " {\"imdb_rating\": {10: 7.1}, \"rotten_tomatoes_rating\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " gross_income_usa=lambda data: (\n", + " data[\"gross_income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " gross_income_world=lambda data: (\n", + " data[\"gross_income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_Year=lambda data: data[\"release_date\"].dt.year,\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ").drop_duplicates(ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff05e0ae-4f9b-47a7-87f1-fb7630fabddc", + "metadata": {}, + "outputs": [], + "source": [ + 
"data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1216a25-4791-4601-83ba-62513e4cc880", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"movie_title\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba17e3f-3ce1-4885-a104-f60d254d9feb", + "metadata": {}, + "outputs": [], + "source": [ + " data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Using Python for Data Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"rotten_tomatoes_rating\"])\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + 
"from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb_rating\"]]\n", + "y = data.loc[:, \"rotten_tomatoes_rating\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + 
"outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb_rating\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/james_bond_data.csv b/james_bond_data.csv new file mode 100644 index 0000000000..4a983b2201 --- /dev/null +++ b/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. 
No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bently," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," 
$40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 "," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/james_bond_data.json b/james_bond_data.json new file mode 100644 index 0000000000..852810b38e --- /dev/null +++ b/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 1983","14":"October, 1985","15":"May, 
1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bently","2":"Aston Martin","3":"Aston Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 ","3":" $63,600,000.00 ","4":" 
$43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" $51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" $30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 mins","12":"127 mins","13":"131 
mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/james_bond_data.parquet b/james_bond_data.parquet new file mode 100644 index 0000000000..88bd22b4fb Binary files /dev/null and b/james_bond_data.parquet differ diff --git a/james_bond_data.xlsx b/james_bond_data.xlsx new file mode 100644 index 0000000000..1e042705fc Binary files /dev/null and b/james_bond_data.xlsx differ diff --git a/james_bond_data_cleansed.csv b/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..f57c20a4d8 --- /dev/null +++ b/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ +bond_actor,bond_kills,car_manufacturer,film_length,gross_income_usa,gross_income_world,imdb_rating,martinis_consumed,movie_budget,movie_title,release_date,rotten_tomatoes_rating,release_Year +Sean Connery,4,Sunbeam,110,16067035.0,59567035.0,7.3,2,1000000.0,Dr. 
No,1962-06-01,7.7,1962 +Sean Connery,11,Bently,115,24800000.0,78900000.0,7.5,0,2000000.0,From Russia with Love,1963-08-01,8.0,1963 +Sean Connery,9,Aston Martin,110,51100000.0,124900000.0,7.8,1,3000000.0,Goldfinger,1964-05-01,8.4,1964 +Sean Connery,20,Aston Martin,130,63600000.0,141200000.0,7.0,0,9000000.0,Thunderball,1965-09-01,6.8,1965 +Sean Connery,21,Toyota,117,43100000.0,111600000.0,6.9,1,9500000.0,You Only Live Twice,1967-11-01,6.3,1967 +George Lazenby,5,Mercury,142,22800000.0,82000000.0,6.8,1,8000000.0,On Her Majesty's Secret Service,1969-07-01,6.7,1969 +Sean Connery,7,Ford,120,43800000.0,116000000.0,6.7,0,7200000.0,Diamonds Are Forever,1971-03-01,6.3,1971 +Roger Moore,8,AMC,121,35400000.0,161800000.0,6.8,0,7000000.0,Live and Let Die,1973-08-01,5.9,1973 +Roger Moore,1,AMC,125,21000000.0,97600000.0,6.7,0,7000000.0,The Man with the Golden Gun,1974-07-01,5.1,1974 +Roger Moore,31,Lotus,125,46800000.0,185400000.0,7.1,1,14000000.0,The Spy Who Loved Me,1977-04-01,6.8,1977 +Roger Moore,12,Lotus,126,70300000.0,210300000.0,6.2,1,31000000.0,Moonraker,1979-10-01,5.7,1979 +Roger Moore,18,Citroen,127,54800000.0,195300000.0,6.8,0,28000000.0,For Your Eyes Only,1981-06-01,6.3,1981 +Roger Moore,15,Bajaj,131,67900000.0,187500000.0,6.5,0,27500000.0,Octopussy,1983-03-01,5.3,1983 +Roger Moore,5,Rolls Royce,131,50327960.0,152627960.0,6.2,0,30000000.0,A View to a Kill,1985-10-01,4.7,1985 +Timothy Dalton,13,Rolls Royce,130,51185000.0,191200000.0,6.7,2,40000000.0,The Living Daylights,1987-05-01,6.3,1987 +Timothy Dalton,10,Aston Martin,133,34667015.0,156167015.0,6.5,1,42000000.0,License to Kill,1989-01-01,6.0,1989 +Pierce Brosnan,47,BMW,130,106429941.0,356429941.0,7.2,1,60000000.0,GoldenEye,1995-09-01,6.9,1995 +Pierce Brosnan,30,Aston Martin,119,125304276.0,339504276.0,6.4,1,110000000.0,Tomorrow Never Dies,1997-07-01,6.0,1997 +Pierce Brosnan,27,BMW,128,126930660.0,361730660.0,6.3,1,135000000.0,The World Is Not Enough,1999-06-01,5.7,1999 +Pierce Brosnan,31,Aston 
Martin,133,160942139.0,431942139.0,6.0,2,142000000.0,Die Another Day,2002-08-01,6.1,2002 +Daniel Craig,11,Aston Martin,144,167365000.0,596365000.0,7.9,3,102000000.0,Casino Royale,2006-02-01,7.8,2006 +Daniel Craig,16,Aston Martin,106,169368427.0,591692078.0,6.7,6,230000000.0,Quantum of Solace,2008-12-01,6.1,2008 +Daniel Craig,26,Aston Martin,143,304360277.0,1108561108.0,7.8,1,200000000.0,Skyfall,2012-11-01,8.2,2012 +Daniel Craig,30,Aston Martin,148,200074175.0,879620923.0,6.8,1,245000000.0,Spectre,2015-09-01,6.4,2015 +Daniel Craig,14,Aston Martin,163,160891007.0,759959662.0,7.3,1,275000000.0,No Time to Die,2021-11-01,7.3,2021 diff --git a/python-for-data-analysis/README.md b/python-for-data-analysis/README.md new file mode 100644 index 0000000000..9281b87392 --- /dev/null +++ b/python-for-data-analysis/README.md @@ -0,0 +1,15 @@ +# Downloadable Files + +This folder contains completed notebooks and other files used in the Real Python tutorial on [Using Python for Data Analysis](https://realpython.com/using-python-for-data-analysis/). + +The `james_bond_data.csv` file contains the original uncleansed data and is the only mandatory file you will need to complete the tutorial. The same data is also available in JSON, Parquet and Excel versions to allow you to complete the optional exercises in reading from these file types. + +A cleansed version of the original data is available in the `james_bond_data_cleansed.csv` file. + +The complete code is available in the `james_bond_analysis.ipynb` Jupyter notebook. + +## Setup + +The easiest way to work through this tutorial is to install and use [JupyterLab](https://realpython.com/using-jupyterlab/). Using Jupyter Notebook within JupyterLab will allow you to run code and see its results cleanly, and in the same way they are presented in the tutorial. It will also make it easy for you to view the supporting files. 
+ + diff --git a/python-for-data-analysis/james_bond_analysis.ipynb b/python-for-data-analysis/james_bond_analysis.ipynb new file mode 100644 index 0000000000..4a8ce4b46e --- /dev/null +++ b/python-for-data-analysis/james_bond_analysis.ipynb @@ -0,0 +1,794 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Acquiring Your Data" + ] + }, + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e52f486-232e-440b-8585-90416e4300c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n", + "james_bond_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data_json = pd.read_json(\"james_bond_data.json\").convert_dtypes()\n", + "james_bond_data_json.head()" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! 
python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import openpyxl\n", + "import pandas as pd\n", + "\n", + "james_bond_data_excel = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()\n", + "james_bond_data_excel.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b3237b8-99a0-4070-81a9-e7f9e44c8973", + "metadata": {}, + "outputs": [], + "source": [ + "! python.exe -m pip install --upgrade pip" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data_parquet = pd.read_parquet(\n", + " \"james_bond_data.parquet\"\n", + ").convert_dtypes()\n", + "james_bond_data_parquet.head()" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_data_html = james_bond_data_html[1].convert_dtypes()\n", + "james_bond_data_html.head()" + ] + }, + { + "cell_type": "markdown", 
+ "id": "31068de2-9864-434a-9652-b115d1131684", + "metadata": {}, + "source": [ + "# Cleansing Your Data With Python" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38eb1abb-9f89-4a53-9e77-f7c71dbeff18", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data[james_bond_data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ")\n", + "\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03", + "metadata": {}, + "source": [ + "## Correcting Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001996e3-2fce-4228-a873-b78eef613bba", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"US_Gross\", \"World_Gross\", \"Budget ($ 000s)\", \"Film_Length\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " 
data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a542149d-35d1-4012-8638-25e59f2f3ae4", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].head()\n", + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + ")\n", + "\n", + "data[[\"Film_Length\"]].head()\n", + "data[[\"Film_Length\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " 
data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + ")\n", + "\n", + "data[[\"Release\"]].info()\n", + "data[[\"Release_Year\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Fixing Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.combine_first(\n", + " pd.DataFrame({\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}})\n", + ").assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " * 1000\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + ")\n", + "\n", + "data[[\"US_Gross\", \"World_Gross\", \"Budget\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Movie\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20067efb-e7c7-4690-b483-1d29847ad24f", + "metadata": {}, + "outputs": [], + "source": [ + 
"duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", + "data[data[\"Movie\"].isin(duplicate_movies)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "duplicate_movies = [\"The Man with the Golden Gun\", \"The Living Daylights\"]\n", + "data[data[\"Movie\"].isin(duplicate_movies)]" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + "metadata": {}, + "source": [ + "## Correcting Spelling Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " 
)\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[\"Bond\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data[\"Bond_Car_MFG\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4ae142-e339-4601-b0a4-84375eb28c02", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"].str.rstrip(\"mins\").astype(int)\n", + " ),\n", + " Release=lambda data: 
pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " Bond_Car_MFG=lambda data: data[\"Bond_Car_MFG\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[\"Bond_Car_MFG\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [ + "## Checking For Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"Film_Length\", \"Martinis\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.combine_first(\n", + " pd.DataFrame(\n", + " {\"Avg_User_IMDB\": {10: 7.1}, \"Avg_User_Rtn_Tom\": {10: 6.8}}\n", + " )\n", + " )\n", + " .assign(\n", + " US_Gross=lambda data: (\n", + " data[\"US_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " World_Gross=lambda data: (\n", + " data[\"World_Gross\"].replace(\"[$,]\", \"\", regex=True).astype(float)\n", + " ),\n", + " Budget=lambda data: (\n", + " data[\"Budget ($ 000s)\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(float)\n", + " ),\n", + " Film_Length=lambda data: (\n", + " data[\"Film_Length\"]\n", + " .str.rstrip(\"mins\")\n", + " .astype(int)\n", + " .replace(1200, 120)\n", + " ),\n", + " Release=lambda data: pd.to_datetime(data[\"Release\"], format=\"%B, %Y\"),\n", + " Release_Year=lambda data: data[\"Release\"].dt.year,\n", + " Bond=lambda data: (\n", + " data[\"Bond\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " 
.str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " Bond_Car_MFG=lambda data: data[\"Bond_Car_MFG\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " Martinis=lambda data: data[\"Martinis\"].replace(-6, 6),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data[[\"Film_Length\", \"Martinis\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Using Python for Data Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "data = pd.read_csv(\"james_bond_data_cleansed.csv\").convert_dtypes()\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"Avg_User_IMDB\"], data[\"Avg_User_Rtn_Tom\"])\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from 
sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "x = data.loc[:, [\"Avg_User_IMDB\"]]\n", + "y = data.loc[:, \"Avg_User_Rtn_Tom\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "length = data[\"Film_Length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"Film_Length\"].agg([\"mean\", \"max\", \"min\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"Avg_User_IMDB\"], data[\"Kills_Bond\"])\n", + "ax.set_title(\"Scatter 
Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python-for-data-analysis/james_bond_data.csv b/python-for-data-analysis/james_bond_data.csv new file mode 100644 index 0000000000..d8b675dddd --- /dev/null +++ b/python-for-data-analysis/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bently," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger 
Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 
"," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/python-for-data-analysis/james_bond_data.json b/python-for-data-analysis/james_bond_data.json new file mode 100644 index 0000000000..852810b38e --- /dev/null +++ b/python-for-data-analysis/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 1983","14":"October, 1985","15":"May, 1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. 
No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bently","2":"Aston Martin","3":"Aston Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 ","3":" $63,600,000.00 ","4":" $43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" 
$51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" $30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 mins","12":"127 mins","13":"131 mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 
mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/python-for-data-analysis/james_bond_data.parquet b/python-for-data-analysis/james_bond_data.parquet new file mode 100644 index 0000000000..88bd22b4fb Binary files /dev/null and b/python-for-data-analysis/james_bond_data.parquet differ diff --git a/python-for-data-analysis/james_bond_data.xlsx b/python-for-data-analysis/james_bond_data.xlsx new file mode 100644 index 0000000000..1e042705fc Binary files /dev/null and b/python-for-data-analysis/james_bond_data.xlsx differ diff --git a/python-for-data-analysis/james_bond_data_cleansed.csv b/python-for-data-analysis/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..f67cf98f34 --- /dev/null +++ b/python-for-data-analysis/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ +Avg_User_IMDB,Avg_User_Rtn_Tom,Bond,Bond_Car_MFG,Budget ($ 000s),Film_Length,Kills_Bond,Martinis,Movie,Release,US_Gross,World_Gross,Budget,Release_Year +7.3,7.7,Sean Connery,Sunbeam," $1,000.00 ",110,4,2,Dr. 
No,1962-06-01,16067035.0,59567035.0,1000.0,1962 +7.5,8.0,Sean Connery,Bently," $2,000.00 ",115,11,0,From Russia with Love,1963-08-01,24800000.0,78900000.0,2000.0,1963 +7.8,8.4,Sean Connery,Aston Martin," $3,000.00 ",110,9,1,Goldfinger,1964-05-01,51100000.0,124900000.0,3000.0,1964 +7.0,6.8,Sean Connery,Aston Martin," $9,000.00 ",130,20,0,Thunderball,1965-09-01,63600000.0,141200000.0,9000.0,1965 +6.9,6.3,Sean Connery,Toyota," $9,500.00 ",117,21,1,You Only Live Twice,1967-11-01,43100000.0,111600000.0,9500.0,1967 +6.8,6.7,George Lazenby,Mercury," $8,000.00 ",142,5,1,On Her Majesty's Secret Service,1969-07-01,22800000.0,82000000.0,8000.0,1969 +6.7,6.3,Sean Connery,Ford," $7,200.00 ",120,7,0,Diamonds Are Forever,1971-03-01,43800000.0,116000000.0,7200.0,1971 +6.8,5.9,Roger Moore,AMC," $7,000.00 ",121,8,0,Live and Let Die,1973-08-01,35400000.0,161800000.0,7000.0,1973 +6.7,5.1,Roger Moore,AMC," $7,000.00 ",125,1,0,The Man with the Golden Gun,1974-07-01,21000000.0,97600000.0,7000.0,1974 +7.1,6.8,Roger Moore,Lotus," $14,000.00 ",125,31,1,The Spy Who Loved Me,1977-04-01,46800000.0,185400000.0,14000.0,1977 +6.2,5.7,Roger Moore,Lotus," $31,000.00 ",126,12,1,Moonraker,1979-10-01,70300000.0,210300000.0,31000.0,1979 +6.8,6.3,Roger Moore,Citroen," $28,000.00 ",127,18,0,For Your Eyes Only,1981-06-01,54800000.0,195300000.0,28000.0,1981 +6.5,5.3,Roger Moore,Bajaj," $27,500.00 ",131,15,0,Octopussy,1983-03-01,67900000.0,187500000.0,27500.0,1983 +6.2,4.7,Roger Moore,Rolls Royce," $30,000.00 ",131,5,0,A View to a Kill,1985-10-01,50327960.0,152627960.0,30000.0,1985 +6.7,6.3,Timothy Dalton,Rolls Royce," $40,000.00 ",130,13,2,The Living Daylights,1987-05-01,51185000.0,191200000.0,40000.0,1987 +6.5,6.0,Timothy Dalton,Aston Martin," $42,000.00 ",133,10,1,License to Kill,1989-01-01,34667015.0,156167015.0,42000.0,1989 +7.2,6.9,Pierce Brosnan,BMW," $60,000.00 ",130,47,1,GoldenEye,1995-09-01,106429941.0,356429941.0,60000.0,1995 +6.4,6.0,Pierce Brosnan,Aston Martin," $110,000.00 ",119,30,1,Tomorrow 
Never Dies,1997-07-01,125304276.0,339504276.0,110000.0,1997 +6.3,5.7,Pierce Brosnan,BMW," $135,000.00 ",128,27,1,The World Is Not Enough,1999-06-01,126930660.0,361730660.0,135000.0,1999 +6.0,6.1,Pierce Brosnan,Aston Martin," $142,000.00 ",133,31,2,Die Another Day,2002-08-01,160942139.0,431942139.0,142000.0,2002 +7.9,7.8,Daniel Craig,Aston Martin," $102,000.00 ",144,11,3,Casino Royale,2006-02-01,167365000.0,596365000.0,102000.0,2006 +6.7,6.1,Daniel Craig,Aston Martin," $230,000.00 ",106,16,6,Quantum of Solace,2008-12-01,169368427.0,591692078.0,230000.0,2008 +7.8,8.2,Daniel Craig,Aston Martin," $200,000.00 ",143,26,1,Skyfall,2012-11-01,304360277.0,1108561108.0,200000.0,2012 +6.8,6.4,Daniel Craig,Aston Martin," $245,000.00 ",148,30,1,Spectre,2015-09-01,200074175.0,879620923.0,245000.0,2015 +7.3,7.3,Daniel Craig,Aston Martin," $275,000.00 ",163,14,1,No Time to Die,2021-11-01,160891007.0,759959662.0,275000.0,2021