upload regression project rental prices
Daniele Ceriali authored and Daniele Ceriali committed Aug 26, 2024
1 parent 3e57919 commit 9dd7237
Showing 9 changed files with 1,248 additions and 81 deletions.
6 changes: 3 additions & 3 deletions config.yaml
@@ -22,12 +22,12 @@ modeling:
   stratify_by: "neighbourhood_group"
   # Maximum number of features to consider for the TFIDF applied to the title of the
   # insertion (the column called "name")
-  max_tfidf_features: 5
+  max_tfidf_features: 4
   # NOTE: you can put here any parameter that is accepted by the constructor of
   # RandomForestRegressor. This is a subsample, but more could be added:
   random_forest:
-    n_estimators: 100
-    max_depth: 15
+    n_estimators: 500
+    max_depth: 10
     min_samples_split: 4
     min_samples_leaf: 3
     # Here -1 means all available cores
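
The NOTE in this hunk means everything under random_forest is forwarded verbatim to the scikit-learn constructor. A minimal sketch of how the training step might consume the serialized configuration (the loading code is assumed; the rf_config.json file is created in main.py below):

import json

from sklearn.ensemble import RandomForestRegressor

# Load the hyperparameters that main.py serialized from config.yaml
with open("rf_config.json") as fp:
    rf_config = json.load(fp)

# Every key must be a valid RandomForestRegressor constructor argument,
# e.g. n_estimators=500, max_depth=10, min_samples_split=4, ...
model = RandomForestRegressor(**rf_config)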
83 changes: 55 additions & 28 deletions main.py
@@ -50,47 +50,74 @@ def go(config: DictConfig):
         )
 
     if "basic_cleaning" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
+            "main",
+            parameters={
+                "input_artifact": "sample.csv:latest",
+                "output_artifact": "clean_sample.csv",
+                "output_type": "clean_sample",
+                "output_description": "Data with outliers and null values removed",
+                "min_price": config['etl']['min_price'],
+                "max_price": config['etl']['max_price']
+            },
+        )
 
     if "data_check" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
+            "main",
+            parameters={
+                "csv": "clean_sample.csv:latest",
+                "ref": "clean_sample.csv:reference",
+                "kl_threshold": config["data_check"]["kl_threshold"],
+                "min_price": config['etl']['min_price'],
+                "max_price": config['etl']['max_price']
+            },
+        )
 
     if "data_split" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            f"{config['main']['components_repository']}/train_val_test_split",
+            "main",
+            parameters={
+                "input": "clean_sample.csv:latest",
+                "test_size": config['modeling']['test_size'],
+                "random_seed": config['modeling']['random_seed'],
+                "stratify_by": config['modeling']['stratify_by']
+            },
+        )
 
     if "train_random_forest" in active_steps:
 
         # NOTE: we need to serialize the random forest configuration into JSON
         rf_config = os.path.abspath("rf_config.json")
         with open(rf_config, "w+") as fp:
-            json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH
+            json.dump(dict(config["modeling"]["random_forest"].items()), fp)
 
         # NOTE: use the rf_config we just created as the rf_config parameter for the
         # train_random_forest step
 
-        ##################
-        # Implement here #
-        ##################
-
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
+            "main",
+            parameters={
+                "trainval_artifact": "trainval_data.csv:latest",
+                "val_size": config["modeling"]["val_size"],
+                "random_seed": config["modeling"]["random_seed"],
+                "stratify_by": config["modeling"]["stratify_by"],
+                "rf_config": rf_config,
+                "max_tfidf_features": config["modeling"]["max_tfidf_features"],
+                "output_artifact": "random_forest_export"
+            },
+        )
 
     if "test_regression_model" in active_steps:
-
-        ##################
-        # Implement here #
-        ##################
-
-        pass
-
+        _ = mlflow.run(
+            f"{config['main']['components_repository']}/test_regression_model",
+            "main",
+            parameters={
+                "mlflow_model": "random_forest_export:prod",
+                "test_dataset": "test_data.csv:latest"
+            },
+        )
 
 if __name__ == "__main__":
     go()
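
For context, active_steps is derived from the main.steps config entry earlier in main.py, before the hunk shown above; roughly as sketched below (variable names and the step list are assumed from the course starter, not part of this diff):

# Sketch: which pipeline steps run by default. test_regression_model is
# deliberately excluded because it requires a model promoted to the "prod"
# alias first, so it must be requested explicitly.
_steps = [
    "download",
    "basic_cleaning",
    "data_check",
    "data_split",
    "train_random_forest",
]

steps_par = config["main"]["steps"]
active_steps = steps_par.split(",") if steps_par != "all" else _steps

With this, an override such as steps=basic_cleaning,data_check runs only those two components.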
37 changes: 37 additions & 0 deletions src/basic_cleaning/MLproject
@@ -0,0 +1,37 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
  main:
    parameters:
      input_artifact:
        description: name of the input artifact
        type: string

      output_artifact:
        description: name of the artifact generated
        type: string

      output_type:
        description: type of the output artifact
        type: string

      output_description:
        description: description of the output artifact
        type: string

      min_price:
        description: minimum price to keep
        type: float

      max_price:
        description: maximum price to keep
        type: float

    command: >-
      python run.py --input_artifact {input_artifact}
      --output_artifact {output_artifact}
      --output_type {output_type}
      --output_description {output_description}
      --min_price {min_price}
      --max_price {max_price}
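
For local debugging, a component like this can also be launched on its own through the MLflow Projects API; a hedged sketch (parameter values are illustrative, not taken from this commit):

import mlflow

# Run the basic_cleaning entry point directly, outside the full pipeline.
# env_manager="conda" asks MLflow 2.x to build the environment from conda.yml.
mlflow.run(
    "src/basic_cleaning",
    entry_point="main",
    parameters={
        "input_artifact": "sample.csv:latest",
        "output_artifact": "clean_sample.csv",
        "output_type": "clean_sample",
        "output_description": "Data with outliers and null values removed",
        "min_price": 10,
        "max_price": 350,
    },
    env_manager="conda",
)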
9 changes: 9 additions & 0 deletions src/basic_cleaning/conda.yml
@@ -0,0 +1,9 @@
name: basic_cleaning
channels:
  - conda-forge
  - defaults
dependencies:
  - pip=23.3.1
  - pip:
      - mlflow==2.8.1
      - wandb==0.16.0
107 changes: 107 additions & 0 deletions src/basic_cleaning/run.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python
"""
Performs basic cleaning on the data and saves the result to Weights & Biases
"""
import argparse
import logging
import os

import pandas as pd
import wandb


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()


def go(args):

    run = wandb.init(job_type="basic_cleaning")
    run.config.update(args)

    # Download the input artifact. This also logs that this script is using
    # this particular version of the artifact
    logger.info("Downloading artifact")
    artifact_local_path = run.use_artifact(args.input_artifact).file()

    df = pd.read_csv(artifact_local_path)

    # Report which columns contain nulls, then drop rows with any nulls
    logger.info("Dropping duplicates and identifying nulls")
    logger.info("Columns with null values: %s", list(df.columns[df.isnull().any()]))
    df.dropna(inplace=True)

    df = df.drop_duplicates().reset_index(drop=True)

    # Drop price outliers outside the configured [min_price, max_price] band
    logger.info("Dropping rows with price outside [%s, %s]", args.min_price, args.max_price)
    df = df[(df['price'] >= args.min_price) & (df['price'] <= args.max_price)]

    filename = args.output_artifact

    df.to_csv(filename, index=False)

    artifact = wandb.Artifact(
        filename,
        type=args.output_type,
        description=args.output_description,
    )
    artifact.add_file(filename)

    logger.info("Logging artifact")
    run.log_artifact(artifact)

    # Wait for the upload to finish before deleting the local file
    artifact.wait()
    os.remove(filename)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This step cleans the data")

    parser.add_argument(
        "--input_artifact",
        type=str,
        help="name of the input artifact",
        required=True
    )

    parser.add_argument(
        "--output_artifact",
        type=str,
        help="name of the artifact generated",
        required=True
    )

    parser.add_argument(
        "--output_type",
        type=str,
        help="type of the output artifact",
        required=True
    )

    parser.add_argument(
        "--output_description",
        type=str,
        help="description of the output artifact",
        required=False
    )

    parser.add_argument(
        "--min_price",
        type=float,
        help="minimum price to keep",
        required=True
    )

    parser.add_argument(
        "--max_price",
        type=float,
        help="maximum price to keep",
        required=True
    )

    args = parser.parse_args()

    go(args)
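
As a quick sanity check of the cleaning order (nulls first, then duplicates, then the price band), a toy example with invented data:

import pandas as pd

# Toy frame: one null row, one exact duplicate, one price outlier.
df = pd.DataFrame({
    "name": ["cozy room", "loft", None, "loft", "penthouse"],
    "price": [60, 120, 80, 120, 99999],
})

df.dropna(inplace=True)                           # drops the null-name row
df = df.drop_duplicates().reset_index(drop=True)  # drops the second "loft"
df = df[(df["price"] >= 10) & (df["price"] <= 350)]  # drops the outlier

assert list(df["price"]) == [60, 120]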
4 changes: 2 additions & 2 deletions src/data_check/conda.yml
@@ -5,8 +5,8 @@ channels:
 dependencies:
   - python=3.10.0
   - pandas=2.1.3
-  - pytest=6.2.2
-  - scipy=1.5.2
+  - pytest=8.3.2
+  - scipy=1.13.1
   - pip=23.3.1
   - pip:
       - mlflow==2.8.1
9 changes: 6 additions & 3 deletions src/data_check/test_data.py
@@ -60,6 +60,9 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_threshold: float):
     assert scipy.stats.entropy(dist1, dist2, base=2) < kl_threshold
 
 
-########################################################
-# Implement here test_row_count and test_price_range  #
-########################################################
+def test_row_count(data: pd.DataFrame):
+    assert 15000 < data.shape[0] < 1000000
+
+
+def test_price_range(data: pd.DataFrame, min_price: float, max_price: float):
+    assert data['price'].between(min_price, max_price).all()
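
These tests receive data, ref_data, kl_threshold, min_price and max_price as pytest fixtures defined in the step's conftest.py, which is not part of this commit. A sketch of what such fixtures typically look like in the course starter (details assumed):

import pandas as pd
import pytest
import wandb


def pytest_addoption(parser):
    # Mirrors the parameters main.py passes to the data_check step
    parser.addoption("--csv", action="store")
    parser.addoption("--min_price", action="store")
    parser.addoption("--max_price", action="store")


@pytest.fixture(scope="session")
def data(request):
    run = wandb.init(job_type="data_tests", resume=True)
    path = run.use_artifact(request.config.option.csv).file()
    return pd.read_csv(path)


@pytest.fixture(scope="session")
def min_price(request):
    return float(request.config.option.min_price)


@pytest.fixture(scope="session")
def max_price(request):
    return float(request.config.option.max_price)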