update: final project ml pipeline

udacity · Mar 26, 2024 · ec70ff2 · ec70ff2
1 parent 3e57919
commit ec70ff2
Show file tree

Hide file tree

Showing 15 changed files with 19,729 additions and 41 deletions.
diff --git a/components/test_regression_model/MLproject b/components/test_regression_model/MLproject
@@ -1,4 +1,4 @@
-name: test_model
+name: test_regression_model
 conda_env: conda.yml
 
 entry_points:

diff --git a/components/test_regression_model/conda.yml b/components/test_regression_model/conda.yml
@@ -9,6 +9,6 @@ dependencies:
   - scikit-learn=1.3.2
   - pandas=2.1.3
   - pip:
-      - mlflow==2.8.1
+      - mlflow==2.11.3
       - wandb==0.16.0
       - git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
diff --git a/config.yaml b/config.yaml
@@ -22,7 +22,7 @@ modeling:
   stratify_by: "neighbourhood_group"
   # Maximum number of features to consider for the TFIDF applied to the title of the
   # insertion (the column called "name")
-  max_tfidf_features: 5
+  max_tfidf_features: 30
   # NOTE: you can put here any parameter that is accepted by the constructor of
   # RandomForestRegressor. This is a subsample, but more could be added:
   random_forest:
@@ -33,6 +33,6 @@ modeling:
     # Here -1 means all available cores
     n_jobs: -1
     criterion: squared_error
-    max_features: 0.5
+    max_features: 0.33
     # DO not change the following
     oob_score: true
diff --git a/main.py b/main.py
@@ -50,22 +50,43 @@ def go(config: DictConfig):
             )
 
         if "basic_cleaning" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
+                "main",
+                parameters={
+                    "input_artifact": "sample.csv:latest",
+                    "output_artifact": "clean_sample.csv",
+                    "output_type": "clean_sample",
+                    "output_description": "Data with outliers and null values removed",
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price'],
+                },
+            )
 
         if "data_check" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
+                "main",
+                parameters={
+                    "csv": "clean_sample.csv:latest",
+                    "ref": "clean_sample.csv:reference",
+                    "kl_threshold": config['data_check']['kl_threshold'],
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price'],
+                },
+            )
 
         if "data_split" in active_steps:
-            ##################
-            # Implement here #
-            ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "components", "train_val_test_split"),
+                "main",
+                parameters={
+                    "input": "clean_sample.csv:latest",
+                    "test_size": config['modeling']['test_size'],
+                    "random_seed": config['modeling']['random_seed'],
+                    "stratify_by": config['modeling']['stratify_by'],
+                },
+            )
 
         if "train_random_forest" in active_steps:
 
@@ -77,19 +98,31 @@ def go(config: DictConfig):
             # NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
             # step
 
-            ##################
-            # Implement here #
-            ##################
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
+                "main",
+                parameters={
+                    "trainval_artifact": "trainval_data.csv:latest",
+                    "val_size": config["modeling"]["val_size"],
+                    "random_seed": config["modeling"]["random_seed"],
+                    "stratify_by": config["modeling"]["stratify_by"],
+                    "rf_config": rf_config,
+                    "max_tfidf_features": config["modeling"]["max_tfidf_features"],
+                    "output_artifact":"model_export",
+                },
+            )
 
-            pass
 
         if "test_regression_model" in active_steps:
 
-            ##################
-            # Implement here #
-            ##################
-
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "components", "test_regression_model"),
+                "main",
+                parameters={
+                    "mlflow_model": "model_export:prod",
+                    "test_dataset": "test_data.csv:latest",
+                },
+            )
 
 
 if __name__ == "__main__":

diff --git a/src/basic_cleaning/MLproject b/src/basic_cleaning/MLproject
@@ -0,0 +1,27 @@
+name: basic_cleaning
+conda_env: conda.yml
+
+entry_points:
+  main:
+    parameters:
+      input_artifact:
+        description: The raw data file to be cleaned.
+        type: string
+      output_artifact:
+        description: The name for the output artifact.
+        type: string
+      output_type:
+        description: The type for the output artifact.
+        type: string
+      output_description:
+        description: A description for the output artifact.
+        type: string
+      min_price:
+        description: The minimum price to consider.
+        type: float
+      max_price:
+        description: The maximum price to consider.
+        type: float
+
+    command: >-
+        python run.py  --input_artifact {input_artifact}  --output_artifact {output_artifact}  --output_type {output_type}  --output_description {output_description}  --min_price {min_price}  --max_price {max_price}