Skip to content

Commit

Permalink
update: final project ml pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
samuel-haddad committed Mar 26, 2024
1 parent 3e57919 commit ec70ff2
Show file tree
Hide file tree
Showing 15 changed files with 19,729 additions and 41 deletions.
2 changes: 1 addition & 1 deletion components/test_regression_model/MLproject
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: test_model
name: test_regression_model
conda_env: conda.yml

entry_points:
Expand Down
2 changes: 1 addition & 1 deletion components/test_regression_model/conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ dependencies:
- scikit-learn=1.3.2
- pandas=2.1.3
- pip:
- mlflow==2.8.1
- mlflow==2.11.3
- wandb==0.16.0
- git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components
4 changes: 2 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ modeling:
stratify_by: "neighbourhood_group"
# Maximum number of features to consider for the TFIDF applied to the title of the
# insertion (the column called "name")
max_tfidf_features: 5
max_tfidf_features: 30
# NOTE: you can put here any parameter that is accepted by the constructor of
# RandomForestRegressor. This is a subsample, but more could be added:
random_forest:
Expand All @@ -33,6 +33,6 @@ modeling:
# Here -1 means all available cores
n_jobs: -1
criterion: squared_error
max_features: 0.5
max_features: 0.33
# Do not change the following
oob_score: true
75 changes: 54 additions & 21 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,43 @@ def go(config: DictConfig):
)

if "basic_cleaning" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)

if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config['data_check']['kl_threshold'],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)

if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "train_val_test_split"),
"main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config['modeling']['test_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by'],
},
)

if "train_random_forest" in active_steps:

Expand All @@ -77,19 +98,31 @@ def go(config: DictConfig):
# NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
# step

##################
# Implement here #
##################
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config["modeling"]["val_size"],
"random_seed": config["modeling"]["random_seed"],
"stratify_by": config["modeling"]["stratify_by"],
"rf_config": rf_config,
"max_tfidf_features": config["modeling"]["max_tfidf_features"],
"output_artifact":"model_export",
},
)

pass

if "test_regression_model" in active_steps:

##################
# Implement here #
##################

pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_regression_model"),
"main",
parameters={
"mlflow_model": "model_export:prod",
"test_dataset": "test_data.csv:latest",
},
)


if __name__ == "__main__":
Expand Down
27 changes: 27 additions & 0 deletions src/basic_cleaning/MLproject
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
main:
parameters:
input_artifact:
description: The raw data file to be cleaned.
type: string
output_artifact:
description: The name for the output artifact.
type: string
output_type:
description: The type for the output artifact.
type: string
output_description:
description: A description for the output artifact.
type: string
min_price:
description: The minimum price to consider.
type: float
max_price:
description: The maximum price to consider.
type: float

command: >-
python run.py --input_artifact {input_artifact} --output_artifact {output_artifact} --output_type {output_type} --output_description {output_description} --min_price {min_price} --max_price {max_price}
Loading

0 comments on commit ec70ff2

Please sign in to comment.