upload regression project rental prices
Daniele Ceriali authored and Daniele Ceriali committed Aug 26, 2024
1 parent 3e57919 commit 9dd7237
Showing 9 changed files with 1,248 additions and 81 deletions.
6 changes: 3 additions & 3 deletions config.yaml
@@ -22,12 +22,12 @@ modeling:
   stratify_by: "neighbourhood_group"
   # Maximum number of features to consider for the TFIDF applied to the title of the
   # insertion (the column called "name")
-  max_tfidf_features: 5
+  max_tfidf_features: 4
   # NOTE: you can put here any parameter that is accepted by the constructor of
   # RandomForestRegressor. This is a subsample, but more could be added:
   random_forest:
-    n_estimators: 100
-    max_depth: 15
+    n_estimators: 500
+    max_depth: 10
     min_samples_split: 4
     min_samples_leaf: 3
     # Here -1 means all available cores
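
The NOTE in this hunk means everything under random_forest is forwarded verbatim to the scikit-learn constructor. A minimal sketch of how the training step might consume the serialized configuration (the loading code is assumed; the rf_config.json file is created in main.py below):

import json

from sklearn.ensemble import RandomForestRegressor

# Load the hyperparameters that main.py serialized from config.yaml
with open("rf_config.json") as fp:
    rf_config = json.load(fp)

# Every key must be a valid RandomForestRegressor constructor argument,
# e.g. n_estimators=500, max_depth=10, min_samples_split=4, ...
model = RandomForestRegressor(**rf_config)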
83 changes: 55 additions & 28 deletions main.py
@@ -50,47 +50,74 @@ def go(config: DictConfig):
         )
 
     if "basic_cleaning" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
+            "main",
+            parameters={
+                "input_artifact": "sample.csv:latest",
+                "output_artifact": "clean_sample.csv",
+                "output_type": "clean_sample",
+                "output_description": "Data with outliers and null values removed",
+                "min_price": config['etl']['min_price'],
+                "max_price": config['etl']['max_price']
+            },
+        )
 
     if "data_check" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
+            "main",
+            parameters={
+                "csv": "clean_sample.csv:latest",
+                "ref": "clean_sample.csv:reference",
+                "kl_threshold": config["data_check"]["kl_threshold"],
+                "min_price": config['etl']['min_price'],
+                "max_price": config['etl']['max_price']
+            },
+        )
 
     if "data_split" in active_steps:
-        ##################
-        # Implement here #
-        ##################
-        pass
+        _ = mlflow.run(
+            f"{config['main']['components_repository']}/train_val_test_split",
+            "main",
+            parameters={
+                "input": "clean_sample.csv:latest",
+                "test_size": config['modeling']['test_size'],
+                "random_seed": config['modeling']['random_seed'],
+                "stratify_by": config['modeling']['stratify_by']
+            },
+        )
 
     if "train_random_forest" in active_steps:
 
         # NOTE: we need to serialize the random forest configuration into JSON
         rf_config = os.path.abspath("rf_config.json")
         with open(rf_config, "w+") as fp:
-            json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH
+            json.dump(dict(config["modeling"]["random_forest"].items()), fp)
 
         # NOTE: use the rf_config we just created as the rf_config parameter for the
         # train_random_forest step
 
-        ##################
-        # Implement here #
-        ##################
-
-        pass
+        _ = mlflow.run(
+            os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
+            "main",
+            parameters={
+                "trainval_artifact": "trainval_data.csv:latest",
+                "val_size": config["modeling"]["val_size"],
+                "random_seed": config["modeling"]["random_seed"],
+                "stratify_by": config["modeling"]["stratify_by"],
+                "rf_config": rf_config,
+                "max_tfidf_features": config["modeling"]["max_tfidf_features"],
+                "output_artifact": "random_forest_export"
+            },
+        )
 
     if "test_regression_model" in active_steps:
-
-        ##################
-        # Implement here #
-        ##################
-
-        pass
-
+        _ = mlflow.run(
+            f"{config['main']['components_repository']}/test_regression_model",
+            "main",
+            parameters={
+                "mlflow_model": "random_forest_export:prod",
+                "test_dataset": "test_data.csv:latest"
+            },
+        )
 
 if __name__ == "__main__":
     go()
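
For context, active_steps is derived from the main.steps config entry earlier in main.py, before the hunk shown above; roughly as sketched below (variable names and the step list are assumed from the course starter, not part of this diff):

# Sketch: which pipeline steps run by default. test_regression_model is
# deliberately excluded because it requires a model promoted to the "prod"
# alias first, so it must be requested explicitly.
_steps = [
    "download",
    "basic_cleaning",
    "data_check",
    "data_split",
    "train_random_forest",
]

steps_par = config["main"]["steps"]
active_steps = steps_par.split(",") if steps_par != "all" else _steps

With this, an override such as steps=basic_cleaning,data_check runs only those two components.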
37 changes: 37 additions & 0 deletions src/basic_cleaning/MLproject
@@ -0,0 +1,37 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
  main:
    parameters:
      input_artifact:
        description: name of the input artifact
        type: string

      output_artifact:
        description: name of the artifact generated
        type: string

      output_type:
        description: type of the output artifact
        type: string

      output_description:
        description: description of the output artifact
        type: string

      min_price:
        description: minimum price to keep
        type: float

      max_price:
        description: maximum price to keep
        type: float

    command: >-
      python run.py --input_artifact {input_artifact}
      --output_artifact {output_artifact}
      --output_type {output_type}
      --output_description {output_description}
      --min_price {min_price}
      --max_price {max_price}
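
For local debugging, a component like this can also be launched on its own through the MLflow Projects API; a hedged sketch (parameter values are illustrative, not taken from this commit):

import mlflow

# Run the basic_cleaning entry point directly, outside the full pipeline.
# env_manager="conda" asks MLflow 2.x to build the environment from conda.yml.
mlflow.run(
    "src/basic_cleaning",
    entry_point="main",
    parameters={
        "input_artifact": "sample.csv:latest",
        "output_artifact": "clean_sample.csv",
        "output_type": "clean_sample",
        "output_description": "Data with outliers and null values removed",
        "min_price": 10,
        "max_price": 350,
    },
    env_manager="conda",
)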
9 changes: 9 additions & 0 deletions src/basic_cleaning/conda.yml
@@ -0,0 +1,9 @@
name: basic_cleaning
channels:
  - conda-forge
  - defaults
dependencies:
  - pip=23.3.1
  - pip:
      - mlflow==2.8.1
      - wandb==0.16.0
107 changes: 107 additions & 0 deletions src/basic_cleaning/run.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python
"""
Performs basic cleaning on the data and saves the result to Weights & Biases
"""
import argparse
import logging
import os

import pandas as pd
import wandb


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()


def go(args):

    run = wandb.init(job_type="basic_cleaning")
    run.config.update(args)

    # Download the input artifact. This also logs that this script is using
    # this particular version of the artifact
    logger.info("Downloading artifact")
    artifact_local_path = run.use_artifact(args.input_artifact).file()

    df = pd.read_csv(artifact_local_path)

    # Report which columns contain nulls, then drop rows with any nulls
    logger.info("Dropping duplicates and identifying nulls")
    logger.info("Columns with null values: %s", list(df.columns[df.isnull().any()]))
    df.dropna(inplace=True)

    df = df.drop_duplicates().reset_index(drop=True)

    # Drop price outliers outside the configured [min_price, max_price] band
    logger.info("Dropping rows with price outside [%s, %s]", args.min_price, args.max_price)
    df = df[(df['price'] >= args.min_price) & (df['price'] <= args.max_price)]

    filename = args.output_artifact

    df.to_csv(filename, index=False)

    artifact = wandb.Artifact(
        filename,
        type=args.output_type,
        description=args.output_description,
    )
    artifact.add_file(filename)

    logger.info("Logging artifact")
    run.log_artifact(artifact)

    # Wait for the upload to finish before deleting the local file
    artifact.wait()
    os.remove(filename)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This step cleans the data")

    parser.add_argument(
        "--input_artifact",
        type=str,
        help="name of the input artifact",
        required=True
    )

    parser.add_argument(
        "--output_artifact",
        type=str,
        help="name of the artifact generated",
        required=True
    )

    parser.add_argument(
        "--output_type",
        type=str,
        help="type of the output artifact",
        required=True
    )

    parser.add_argument(
        "--output_description",
        type=str,
        help="description of the output artifact",
        required=False
    )

    parser.add_argument(
        "--min_price",
        type=float,
        help="minimum price to keep",
        required=True
    )

    parser.add_argument(
        "--max_price",
        type=float,
        help="maximum price to keep",
        required=True
    )

    args = parser.parse_args()

    go(args)
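
As a quick sanity check of the cleaning order (nulls first, then duplicates, then the price band), a toy example with invented data:

import pandas as pd

# Toy frame: one null row, one exact duplicate, one price outlier.
df = pd.DataFrame({
    "name": ["cozy room", "loft", None, "loft", "penthouse"],
    "price": [60, 120, 80, 120, 99999],
})

df.dropna(inplace=True)                           # drops the null-name row
df = df.drop_duplicates().reset_index(drop=True)  # drops the second "loft"
df = df[(df["price"] >= 10) & (df["price"] <= 350)]  # drops the outlier

assert list(df["price"]) == [60, 120]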
4 changes: 2 additions & 2 deletions src/data_check/conda.yml
@@ -5,8 +5,8 @@ channels:
 dependencies:
   - python=3.10.0
   - pandas=2.1.3
-  - pytest=6.2.2
-  - scipy=1.5.2
+  - pytest=8.3.2
+  - scipy=1.13.1
   - pip=23.3.1
   - pip:
       - mlflow==2.8.1
9 changes: 6 additions & 3 deletions src/data_check/test_data.py
@@ -60,6 +60,9 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_threshold: float):
     assert scipy.stats.entropy(dist1, dist2, base=2) < kl_threshold
 
 
-########################################################
-# Implement here test_row_count and test_price_range  #
-########################################################
+def test_row_count(data: pd.DataFrame):
+    assert 15000 < data.shape[0] < 1000000
+
+
+def test_price_range(data: pd.DataFrame, min_price: float, max_price: float):
+    assert data['price'].between(min_price, max_price).all()
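
These tests receive data, ref_data, kl_threshold, min_price and max_price as pytest fixtures defined in the step's conftest.py, which is not part of this commit. A sketch of what such fixtures typically look like in the course starter (details assumed):

import pandas as pd
import pytest
import wandb


def pytest_addoption(parser):
    # Mirrors the parameters main.py passes to the data_check step
    parser.addoption("--csv", action="store")
    parser.addoption("--min_price", action="store")
    parser.addoption("--max_price", action="store")


@pytest.fixture(scope="session")
def data(request):
    run = wandb.init(job_type="data_tests", resume=True)
    path = run.use_artifact(request.config.option.csv).file()
    return pd.read_csv(path)


@pytest.fixture(scope="session")
def min_price(request):
    return float(request.config.option.min_price)


@pytest.fixture(scope="session")
def max_price(request):
    return float(request.config.option.max_price)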