Skip to content

Commit

Permalink
complete cleaning, data_check, eda, and train
Browse files Browse the repository at this point in the history
  • Loading branch information
sManohar201 committed Mar 1, 2024
1 parent b6ad271 commit b026c94
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 36 deletions.
6 changes: 3 additions & 3 deletions src/basic_cleaning/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ channels:
- conda-forge
- defaults
dependencies:
- pip=23.3.1
- pandas=2.2.1
- pip=20.3.3
- pandas=1.2.3
- mlflow=1.14.1
- pip:
- mlflow==2.8.1
- wandb==0.16.0
7 changes: 3 additions & 4 deletions src/data_check/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.10.0
- pandas=2.1.3
- pandas=1.1.4
- pytest=7.4
- scipy=1.8
- pip=23.3.1
- pip=20.3.3
- mlflow=1.14.1
- pip:
- mlflow==2.8.1
- wandb==0.16.0
211 changes: 196 additions & 15 deletions src/eda/EDA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -12,9 +12,83 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"wandb version 0.16.3 is available! To upgrade, please run:\n",
" $ pip install wandb --upgrade"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Tracking run with wandb version 0.16.0"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Run data is saved locally in <code>/home/sagemaker-user/project-3/ml-pipeline-for-short-term-rental-prices/src/eda/wandb/run-20240301_065933-uflrnl4w</code>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Syncing run <strong><a href='https://wandb.ai/sabarimooc/nyc_airbnb/runs/uflrnl4w' target=\"_blank\">resilient-cosmos-64</a></strong> to <a href='https://wandb.ai/sabarimooc/nyc_airbnb' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View project at <a href='https://wandb.ai/sabarimooc/nyc_airbnb' target=\"_blank\">https://wandb.ai/sabarimooc/nyc_airbnb</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run at <a href='https://wandb.ai/sabarimooc/nyc_airbnb/runs/uflrnl4w' target=\"_blank\">https://wandb.ai/sabarimooc/nyc_airbnb/runs/uflrnl4w</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"run = wandb.init(project=\"nyc_airbnb\", group=\"eda\", save_code=True)\n",
"local_path = wandb.use_artifact(\"sample.csv:latest\").file()\n",
Expand All @@ -23,28 +97,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"!pip install ydata_profiling"
"# import sys\n",
"# !pip install ydata_profiling\n",
"\n",
"# import pandas_profiling\n",
"\n",
"# import pandas_profiling\n",
"\n",
"# profile = pandas_profiling.ProfileReport(df)\n",
"# profile.to_widgets()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from ydata_profiling import ProfileReport"
"# from ydata_profiling import ProfileReport"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'ProfileReport' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/sagemaker-user/project-3/ml-pipeline-for-short-term-rental-prices/src/eda/EDA.ipynb Cell 5\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://lbmytgglfkt5hdk.studio.ap-south-1.sagemaker.aws/home/sagemaker-user/project-3/ml-pipeline-for-short-term-rental-prices/src/eda/EDA.ipynb#W4sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m profile \u001b[39m=\u001b[39m ProfileReport(df)\n",
"\u001b[0;31mNameError\u001b[0m: name 'ProfileReport' is not defined"
]
}
],
"source": [
"profile = ProfileReport(df)"
]
Expand All @@ -60,7 +153,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -74,18 +167,94 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 19001 entries, 0 to 19999\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 19001 non-null int64 \n",
" 1 name 18994 non-null object \n",
" 2 host_id 19001 non-null int64 \n",
" 3 host_name 18993 non-null object \n",
" 4 neighbourhood_group 19001 non-null object \n",
" 5 neighbourhood 19001 non-null object \n",
" 6 latitude 19001 non-null float64 \n",
" 7 longitude 19001 non-null float64 \n",
" 8 room_type 19001 non-null object \n",
" 9 price 19001 non-null int64 \n",
" 10 minimum_nights 19001 non-null int64 \n",
" 11 number_of_reviews 19001 non-null int64 \n",
" 12 last_review 15243 non-null datetime64[ns]\n",
" 13 reviews_per_month 15243 non-null float64 \n",
" 14 calculated_host_listings_count 19001 non-null int64 \n",
" 15 availability_365 19001 non-null int64 \n",
"dtypes: datetime64[ns](1), float64(3), int64(7), object(5)\n",
"memory usage: 2.5+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"wandb: WARNING Source type is set to 'repo' but some required information is missing from the environment. A job will not be created from this run. See https://docs.wandb.ai/guides/launch/create-job\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ce6110fc0129408a95b19667ba3bdbbf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Label(value='0.037 MB of 0.052 MB uploaded\\r'), FloatProgress(value=0.7201854270524825, max=1.0…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run <strong style=\"color:#cdcd00\">resilient-cosmos-64</strong> at: <a href='https://wandb.ai/sabarimooc/nyc_airbnb/runs/uflrnl4w' target=\"_blank\">https://wandb.ai/sabarimooc/nyc_airbnb/runs/uflrnl4w</a><br/>Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Find logs at: <code>./wandb/run-20240301_065933-uflrnl4w/logs</code>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"run.finish()"
]
Expand All @@ -103,6 +272,18 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
14 changes: 6 additions & 8 deletions src/eda/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.10
- hydra-core=1.3.2
- matplotlib=3.8.2
- pandas=2.1.3
- pip=23.3.1
- scikit-learn=1.3.2
- pandas=1.2.3
- seaborn=0.11.1
- pip=20.3.3
- scikit-learn=0.24.1
- jupyterlab=3.0.12
- jupyter=1.0
- mlflow=1.14.1
- pandas-profiling=2.11.0
- pyarrow=2.0
- pip:
- mlflow==2.8.1
- wandb==0.16.0
11 changes: 5 additions & 6 deletions src/train_random_forest/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@ channels:
- conda-forge
- defaults
dependencies:
- python=3.10
- hydra-core=1.3.2
- matplotlib=3.8.2
- pandas=2.1.3
- pip=23.3.1
- matplotlib=3.2.2
- pandas=1.1.4
- pip=20.3.3
- mlflow=1.14.1
- scikit-learn=1.3.2
- pillow=8.1.2
- pip:
- mlflow==2.10.1
- wandb==0.16.0

0 comments on commit b026c94

Please sign in to comment.