Test run 6 #33

Open · wants to merge 20 commits into main

Changes from all commits (20 commits)
a4a0514
Working pipeline
LesterFreamon Mar 31, 2023
ec05db0
Working pipeline
LesterFreamon Mar 31, 2023
2cf32d2
Merge pull request #1 from LesterFreamon/ml_pipeline_1
LesterFreamon Mar 31, 2023
3da3b9d
Final Pipeline
LesterFreamon Mar 31, 2023
9b14f15
Merge pull request #2 from LesterFreamon/ml_pipeline_2
LesterFreamon Mar 31, 2023
559bfb6
Confining latitude and longitude
LesterFreamon Mar 31, 2023
5aaa382
Merge pull request #3 from LesterFreamon/ml_pipeline_3
LesterFreamon Mar 31, 2023
4ce04b9
Fixed the main.py file to accept longitude and latitude
LesterFreamon Mar 31, 2023
91acabf
Merge pull request #4 from LesterFreamon/ml_pipeline_4
LesterFreamon Mar 31, 2023
4ef313b
Fixed the MLproject file in basic cleaning
LesterFreamon Mar 31, 2023
f986aa9
Merge pull request #5 from LesterFreamon/test_run
LesterFreamon Mar 31, 2023
d026a31
Fixed the MLproject file in basic cleaning
LesterFreamon Mar 31, 2023
2e00589
Merge pull request #6 from LesterFreamon/test_run_2
LesterFreamon Mar 31, 2023
a828431
Fixed the run.py file for basic_cleaning
LesterFreamon Mar 31, 2023
0a3a223
Merge pull request #7 from LesterFreamon/test_run_3
LesterFreamon Mar 31, 2023
ae92cba
Fixed the run.py file in the split component
LesterFreamon Mar 31, 2023
b92ba1a
Merge pull request #8 from LesterFreamon/test_run_4
LesterFreamon Mar 31, 2023
f9c044f
Fixed split component
LesterFreamon Mar 31, 2023
f292efc
Merge pull request #9 from LesterFreamon/test_run_5
LesterFreamon Mar 31, 2023
036c098
Fixed split component
LesterFreamon Apr 1, 2023
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"python.analysis.typeCheckingMode": "basic"
}
2 changes: 1 addition & 1 deletion components/conda.yml
@@ -3,4 +3,4 @@ channels:
- conda-forge
- defaults
dependencies:
- mlflow=1.14.1
- mlflow=2.2.2
6 changes: 5 additions & 1 deletion components/get_data/MLproject
@@ -22,4 +22,8 @@ entry_points:
description: A brief description of the output artifact
type: string

command: "python run.py {sample} {artifact_name} {artifact_type} {artifact_description}"
command: >-
python run.py --sample {sample} \
--artifact_name {artifact_name} \
--artifact_type {artifact_type} \
--artifact_description {artifact_description}
10 changes: 5 additions & 5 deletions components/get_data/conda.yml
@@ -3,9 +3,9 @@ channels:
- conda-forge
- defaults
dependencies:
- pip=20.3.3
- requests=2.24.0
- mlflow=1.14.1
- pip=23.0.1
- requests=2.28.2
- mlflow=2.2.2
- pip:
- wandb==0.10.31
- git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
- wandb==0.14.0
- git+https://github.com/LesterFreamon/build-ml-pipeline-for-short-term-rental-prices.git#egg=wandb-utils&subdirectory=components
8 changes: 4 additions & 4 deletions components/get_data/run.py
@@ -33,14 +33,14 @@ def go(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download URL to a local destination")

parser.add_argument("sample", type=str, help="Name of the sample to download")
parser.add_argument("--sample", type=str, help="Name of the sample to download")

parser.add_argument("artifact_name", type=str, help="Name for the output artifact")
parser.add_argument("--artifact_name", type=str, help="Name for the output artifact")

parser.add_argument("artifact_type", type=str, help="Output artifact type.")
parser.add_argument("--artifact_type", type=str, help="Output artifact type.")

parser.add_argument(
"artifact_description", type=str, help="A brief description of this artifact"
"--artifact_description", type=str, help="A brief description of this artifact"
)

args = parser.parse_args()
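With the entry point now taking named flags, the component can still be exercised on its own. A minimal sketch of a standalone invocation using the local component path; the artifact type and description values are illustrative, not taken from this PR:

import mlflow

# Illustrative standalone run of the get_data step with the new named parameters.
mlflow.run(
    "components/get_data",
    "main",
    parameters={
        "sample": "sample1.csv",
        "artifact_name": "sample.csv",
        "artifact_type": "raw_data",
        "artifact_description": "Raw file as downloaded",
    },
)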
3 changes: 2 additions & 1 deletion components/test_regression_model/MLproject
@@ -13,4 +13,5 @@ entry_points:
description: The test artifact
type: string

command: "python run.py --mlflow_model {mlflow_model} --test_dataset {test_dataset}"
command: >-
python run.py --mlflow_model {mlflow_model} --test_dataset {test_dataset}
12 changes: 6 additions & 6 deletions components/test_regression_model/conda.yml
@@ -3,10 +3,10 @@ channels:
- conda-forge
- defaults
dependencies:
- pandas=1.1.4
- pip=20.3.3
- mlflow=1.14.1
- scikit-learn=0.24.1
- pandas=1.5.3
- pip=23.0.1
- mlflow=2.2.2
- scikit-learn=1.2.2
- pip:
- wandb==0.10.31
- git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
- wandb==0.14.0
- git+https://github.com/LesterFreamon/build-ml-pipeline-for-short-term-rental-prices.git#egg=wandb-utils&subdirectory=components
3 changes: 2 additions & 1 deletion components/train_val_test_split/MLproject
@@ -23,4 +23,5 @@ entry_points:
type: string
default: 'none'

command: "python run.py {input} {test_size} --random_seed {random_seed} --stratify_by {stratify_by}"
command: >-
python run.py --input {input} --test_size {test_size} --random_seed {random_seed} --stratify_by {stratify_by}
12 changes: 6 additions & 6 deletions components/train_val_test_split/conda.yml
@@ -3,10 +3,10 @@ channels:
- conda-forge
- defaults
dependencies:
- pip=20.3.3
- requests=2.24.0
- mlflow=1.14.1
- scikit-learn=0.24.1
- pip=23.0.1
- requests=2.28.2
- mlflow=2.2.2
- scikit-learn=1.2.2
- pip:
- wandb==0.10.31
- git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
- wandb==0.14.0
- git+https://github.com/LesterFreamon/build-ml-pipeline-for-short-term-rental-prices.git#egg=wandb-utils&subdirectory=components
8 changes: 4 additions & 4 deletions components/train_val_test_split/run.py
@@ -53,18 +53,18 @@ def go(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Split test and remainder")

parser.add_argument("input", type=str, help="Input artifact to split")
parser.add_argument("--input", type=str, help="Input artifact to split")

parser.add_argument(
"test_size", type=float, help="Size of the test split. Fraction of the dataset, or number of items"
"--test_size", type=float, help="Size of the test split. Fraction of the dataset, or number of items"
)

parser.add_argument(
"--random_seed", type=int, help="Seed for random number generator", default=42, required=False
"--random_seed", type=int, help="Seed for random number generator", default=42
)

parser.add_argument(
"--stratify_by", type=str, help="Column to use for stratification", default='none', required=False
"--stratify_by", type=str, help="Column to use for stratification", default='none'
)

args = parser.parse_args()
11 changes: 6 additions & 5 deletions conda.yml
@@ -3,9 +3,10 @@ channels:
- conda-forge
- defaults
dependencies:
- mlflow=1.14.1
- pyyaml=5.3.1
- hydra-core=1.0.6
- pip=20.3.3
- mlflow=2.2.2
- pyyaml=6.0
- hydra-core=1.3.2
- pip=23.0.1
- pip:
- wandb==0.10.31
- wandb==0.14.0
- hydra-joblib-launcher==1.2.0
10 changes: 7 additions & 3 deletions config.yaml
@@ -1,5 +1,5 @@
main:
components_repository: "https://github.com/udacity/build-ml-pipeline-for-short-term-rental-prices#components"
components_repository: "https://github.com/LesterFreamon/build-ml-pipeline-for-short-term-rental-prices#components"
# All the intermediate files will be copied to this directory at the end of the run.
# Set this to null if you are running in prod
project_name: nyc_airbnb
@@ -9,6 +9,10 @@ etl:
sample: "sample1.csv"
min_price: 10 # dollars
max_price: 350 # dollars
min_longitude: -74.25
max_longitude: -73.50
min_latitude: 40.50
max_latitude: 41.20
data_check:
kl_threshold: 0.2
modeling:
@@ -22,7 +26,7 @@ modeling:
stratify_by: "neighbourhood_group"
# Maximum number of features to consider for the TFIDF applied to the title of the
# insertion (the column called "name")
max_tfidf_features: 5
max_tfidf_features: 15
# NOTE: you can put here any parameter that is accepted by the constructor of
# RandomForestRegressor. This is a subsample, but more could be added:
random_forest:
@@ -32,7 +36,7 @@ modeling:
min_samples_leaf: 3
# Here -1 means all available cores
n_jobs: -1
criterion: mae
criterion: absolute_error
max_features: 0.5
# DO not change the following
oob_score: true
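The criterion rename is forced by the scikit-learn bump: version 1.2 (as pinned in the component environments above) no longer accepts "mae". The random_forest block feeds the regressor's constructor, roughly as in this sketch, which uses only the hyperparameters visible in this hunk:

from sklearn.ensemble import RandomForestRegressor

# scikit-learn 1.2 rejects criterion="mae"; the renamed value is required.
rf = RandomForestRegressor(
    min_samples_leaf=3,
    n_jobs=-1,                    # -1 means all available cores
    criterion="absolute_error",
    max_features=0.5,
    oob_score=True,
)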
4 changes: 2 additions & 2 deletions cookie-mlflow-step/{{cookiecutter.step_name}}/conda.yml
@@ -3,6 +3,6 @@ channels:
- conda-forge
- defaults
dependencies:
- pip=20.3.3
- pip=23.0.1
- pip:
- wandb==0.10.31
- wandb==0.14.0
22 changes: 11 additions & 11 deletions environment.yml
@@ -3,15 +3,15 @@ channels:
- conda-forge
- defaults
dependencies:
- mlflow=1.14.1
- ipython=7.21.0
- notebook=6.2.0
- jupyterlab=3.0.10
- cookiecutter=1.7.2
- hydra-core=1.0.6
- matplotlib=3.3.4
- pandas=1.2.3
- git=2.30.2
- pip=20.3.3
- mlflow=2.2.2
- ipython=8.11.0
- notebook=6.5.3
- jupyterlab=3.6.2
- cookiecutter=2.1.1
- hydra-core=1.3.2
- matplotlib=3.7.1
- pandas=1.5.3
- git=2.39.2
- pip=23.0.1
- pip:
- wandb==0.10.31
- wandb==0.14.0
70 changes: 63 additions & 7 deletions main.py
@@ -21,7 +21,7 @@


# This automatically reads in the configuration
@hydra.main(config_name='config')
@hydra.main(version_base="1.2", config_path=".", config_name='config')
def go(config: DictConfig):

# Setup the wandb experiment. All runs will be grouped under this name
@@ -32,13 +32,15 @@ def go(config: DictConfig):
steps_par = config['main']['steps']
active_steps = steps_par.split(",") if steps_par != "all" else _steps



# Move to a temporary directory
with tempfile.TemporaryDirectory() as tmp_dir:

if "download" in active_steps:
# Download file and load in W&B
_ = mlflow.run(
f"{config['main']['components_repository']}/get_data",
f'{config["main"]["components_repository"]}/get_data'
"main",
version='main',
parameters={
@@ -53,19 +55,53 @@
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"artifact_name": "clean_sample.csv",
"artifact_type": "clean_sample",
"artifact_description": "Data with outliers and null values removed",
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
"min_longitude": config["etl"]["min_longitude"],
"max_longitude": config["etl"]["max_longitude"],
"min_latitude": config["etl"]["min_latitude"],
"max_latitude": config["etl"]["max_latitude"]
},
)

if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config["etl"]["min_price"],
"max_price": config["etl"]["max_price"],
},
)

if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "train_val_test_split"),
"main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config["modeling"]["test_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify_by": config["modeling"]["stratify_by"]
},
)

if "train_random_forest" in active_steps:

@@ -81,15 +117,35 @@ def go(config: DictConfig):
# Implement here #
##################

pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config["modeling"]["val_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify_by": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"output_artifact": "random_forest_export",
},
)


if "test_regression_model" in active_steps:

##################
# Implement here #
##################
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_regression_model"),
"main",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest",
},
)

pass


if __name__ == "__main__":
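Because main.steps drives which branches of go() run, individual steps can be rerun without repeating the whole pipeline. A sketch, assuming the top-level MLproject still exposes the starter's steps and hydra_options parameters (that file is not part of this diff):

import mlflow

# Hypothetical: rerun only the split and training steps of the local project,
# overriding one Hydra config value on the way in.
mlflow.run(
    ".",
    "main",
    parameters={
        "steps": "data_split,train_random_forest",
        "hydra_options": "modeling.max_tfidf_features=30",
    },
)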
50 changes: 50 additions & 0 deletions src/basic_cleaning/MLproject
@@ -0,0 +1,50 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
main:
parameters:

input_artifact:
description: The name of the input artifact
type: string

artifact_name:
description: The name of the output artifact
type: string

artifact_type:
description: Type of the output artifact. This will be used to categorize the artifact in the W&B interface
type: string

artifact_description:
description: A brief description of the output artifact
type: string

min_price:
description: Minimum price of the rental
type: string

max_price:
description: Maximum price of the rental
type: string

min_longitude:
description: Minimum longitude of the rental
type: string

max_longitude:
description: Maximum longitude of the rental
type: string

min_latitude:
description: Minimum latitude of the rental
type: string

max_latitude:
description: Maximum latitude of the rental
type: string

command: >-
python run.py --input_artifact {input_artifact} --artifact_name {artifact_name} --artifact_type {artifact_type} --artifact_description {artifact_description} --min_price {min_price} --max_price {max_price} --min_longitude {min_longitude} --max_longitude {max_longitude} --min_latitude {min_latitude} --max_latitude {max_latitude}
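The run.py that this command invokes is not shown in the diff above. A minimal sketch of what the step is expected to do, based only on the parameters declared here and the etl values added in config.yaml; column names follow the NYC Airbnb sample data, and argument types and helper details are illustrative:

import argparse
import logging

import pandas as pd
import wandb

logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()


def go(args):
    run = wandb.init(job_type="basic_cleaning")
    run.config.update(vars(args))

    # Fetch the raw artifact produced by the get_data step and load it
    logger.info("Downloading %s", args.input_artifact)
    artifact_path = run.use_artifact(args.input_artifact).file()
    df = pd.read_csv(artifact_path)

    # Drop price outliers and rows outside the configured NYC bounding box
    df = df[df["price"].between(args.min_price, args.max_price)].copy()
    df = df[
        df["longitude"].between(args.min_longitude, args.max_longitude)
        & df["latitude"].between(args.min_latitude, args.max_latitude)
    ].copy()

    # Save the cleaned data and log it as a new W&B artifact
    df.to_csv(args.artifact_name, index=False)
    artifact = wandb.Artifact(
        args.artifact_name,
        type=args.artifact_type,
        description=args.artifact_description,
    )
    artifact.add_file(args.artifact_name)
    run.log_artifact(artifact)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Basic cleaning of the raw dataset")
    parser.add_argument("--input_artifact", type=str, required=True, help="Fully qualified name of the input artifact")
    parser.add_argument("--artifact_name", type=str, required=True, help="Name for the cleaned output artifact")
    parser.add_argument("--artifact_type", type=str, required=True, help="Type of the output artifact")
    parser.add_argument("--artifact_description", type=str, required=True, help="Description of the output artifact")
    parser.add_argument("--min_price", type=float, required=True, help="Minimum price to keep")
    parser.add_argument("--max_price", type=float, required=True, help="Maximum price to keep")
    parser.add_argument("--min_longitude", type=float, required=True, help="Western longitude bound")
    parser.add_argument("--max_longitude", type=float, required=True, help="Eastern longitude bound")
    parser.add_argument("--min_latitude", type=float, required=True, help="Southern latitude bound")
    parser.add_argument("--max_latitude", type=float, required=True, help="Northern latitude bound")
    go(parser.parse_args())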