-
Notifications
You must be signed in to change notification settings - Fork 661
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
upload regression project rental prices
- Loading branch information
Daniele Ceriali
authored and
Daniele Ceriali
committed
Aug 26, 2024
1 parent
3e57919
commit 9dd7237
Showing
9 changed files
with
1,248 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# MLflow project definition for the basic_cleaning pipeline step.
name: basic_cleaning
conda_env: conda.yml

entry_points:
  main:
    parameters:

      input_artifact:
        description: name of input artifact
        type: string

      output_artifact:
        description: name of the artifact generated
        type: string

      output_type:
        description: type of the output
        type: string

      output_description:
        description: description of the output
        type: string

      min_price:
        description: minimum price to keep (rows with a lower price are dropped)
        type: float

      max_price:
        description: maximum price to keep (rows with a higher price are dropped)
        type: float

    # The folded scalar (>-) already joins these lines with single spaces, so
    # no shell line-continuation backslashes are needed; a trailing backslash
    # here would end up escaping the joining space in the final command.
    command: >-
      python run.py --input_artifact {input_artifact}
      --output_artifact {output_artifact}
      --output_type {output_type}
      --output_description {output_description}
      --min_price {min_price}
      --max_price {max_price}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Conda environment for the basic_cleaning step.
name: basic_cleaning
channels:
  - conda-forge
  - defaults
dependencies:
  - pip=23.3.1
  # run.py imports pandas directly, so declare it explicitly instead of
  # relying on it being pulled in transitively by mlflow.
  - pandas=2.1.3
  - pip:
      - mlflow==2.8.1
      - wandb==0.16.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Performs basic cleaning on the data and saves the results in Weights & Biases | ||
""" | ||
import argparse | ||
import logging | ||
import wandb | ||
import pandas as pd | ||
import os | ||
|
||
|
||
# Configure logging once at import time: INFO level, each message prefixed
# with a timestamp. getLogger() with no name returns the root logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()
|
||
|
||
def go(args):
    """Clean the input dataset and log the result as a W&B artifact.

    The step downloads ``args.input_artifact``, drops every row containing a
    null, drops duplicate rows, keeps only the rows whose ``price`` lies in
    the inclusive range ``[args.min_price, args.max_price]`` and uploads the
    resulting CSV as a new artifact.

    Args:
        args: Parsed command-line namespace providing ``input_artifact``,
            ``output_artifact``, ``output_type``, ``output_description``,
            ``min_price`` and ``max_price``.
    """
    run = wandb.init(job_type="basic_cleaning")
    run.config.update(args)

    # Fetching the artifact through the run also records that this script
    # used this particular version of the artifact.
    logger.info("Downloading artifact")
    artifact_local_path = run.use_artifact(args.input_artifact).file()

    df = pd.read_csv(artifact_local_path)

    # Report which columns contain nulls, then drop incomplete and duplicate rows.
    logger.info("Dropping duplicates and identify nulls")
    null_columns = df.columns[df.isnull().any()]
    logger.info("Columns containing nulls: %s", list(null_columns))
    df = df.dropna()
    df = df.drop_duplicates().reset_index(drop=True)

    # Keep only the rows whose price is inside the requested inclusive range.
    logger.info("Dropping min max prices")
    df = df[(df["price"] >= args.min_price) & (df["price"] <= args.max_price)]

    # The cleaned data is written to a local file named after the artifact
    # so that it can be attached to it.
    filename = args.output_artifact
    df.to_csv(filename, index=False)

    try:
        artifact = wandb.Artifact(
            filename,
            type=args.output_type,
            description=args.output_description,
        )
        artifact.add_file(filename)

        logger.info("Logging artifact")
        run.log_artifact(artifact)
    finally:
        # Remove the local copy even when creating or logging the artifact fails.
        os.remove(filename)
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the step parameters and run the cleaning.
    parser = argparse.ArgumentParser(description="This step cleans the data")

    parser.add_argument(
        "--input_artifact",
        type=str,
        help="name of input artifact",
        required=True,
    )

    parser.add_argument(
        "--output_artifact",
        type=str,
        help="name of the artifact generated",
        required=True,
    )

    parser.add_argument(
        "--output_type",
        type=str,
        help="type of the output",
        required=True,
    )

    parser.add_argument(
        "--output_description",
        type=str,
        help="description of the output",
        required=False,
    )

    parser.add_argument(
        "--min_price",
        type=float,
        help="min price",
        required=True,
    )

    parser.add_argument(
        "--max_price",
        type=float,
        help="max price",
        required=True,
    )

    args = parser.parse_args()

    # An inverted range would silently produce and upload an empty dataset,
    # so reject it up front with a usage error.
    if args.min_price > args.max_price:
        parser.error("--min_price must not be greater than --max_price")

    go(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.