-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path00_data_preparation.py
36 lines (31 loc) · 1.23 KB
/
00_data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import logging
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split
def main(
    *,
    input_csv: str = "data/Train.csv",
    output_dir: str = "data/dataset",
    label_column: str = "labels",
    random_state: int = 0,
) -> None:
    """Split a labelled CSV into train/val/test sets and save them as Parquet.

    The CSV is read with the pyarrow engine and split twice with
    ``train_test_split``: first into train/test, then the remaining train
    part into train/val.  Both splits are stratified on ``label_column`` and
    use scikit-learn's default split proportion (no ``test_size`` is passed).
    For every split the feature columns and the label column are written to
    separate files named ``X_<split>.parquet`` and ``y_<split>.parquet``.

    Calling ``main()`` with no arguments reads ``data/Train.csv`` and writes
    into ``data/dataset``.

    Args:
        input_csv: Path of the CSV file to read.
        output_dir: Directory that receives the six Parquet files; it is
            created (including parents) when it does not exist.
        label_column: Name of the column used as the target and as the
            stratification key.
        random_state: Seed passed to both ``train_test_split`` calls.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
    logging.info("Start")
    df = pd.read_csv(input_csv, engine="pyarrow")

    logging.info("Train val test split")
    train, test = train_test_split(
        df, random_state=random_state, stratify=df[label_column]
    )
    # The validation set is carved out of the training part only, so the
    # test set stays untouched by the second split.
    train, val = train_test_split(
        train, random_state=random_state, stratify=train[label_column]
    )

    logging.info("Output")
    out_dir = pathlib.Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    splits = (("train", train), ("val", val), ("test", test))
    for split_name, frame in splits:
        features = frame.drop(columns=[label_column])
        # Wrap the label Series in a DataFrame: Series has no to_parquet.
        target = pd.DataFrame(frame[label_column])
        features.to_parquet(out_dir / f"X_{split_name}.parquet")
        target.to_parquet(out_dir / f"y_{split_name}.parquet")
    logging.info("End")
# Run the data preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()