-
Notifications
You must be signed in to change notification settings - Fork 0
/
linear_regression_v3.py
104 lines (83 loc) · 3.85 KB
/
linear_regression_v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
import warnings
import tqdm
import pickle
# Load the feature-engineered dataset that every model below is trained from.
df = pd.read_csv("featured_combined_data.csv")

# Distinct SKU and store identifiers present in the data.
sku_ids, store_ids = (df[column].unique() for column in ('sku_id', 'store_id'))

# Result containers: test-set MSE, test-set R-squared, and the fitted model.
mse_dict, r2_dict, models_dict = {}, {}, {}

# Progress bar sized to the number of SKUs (the outer loop).
outer_pbar = tqdm.tqdm(desc="SKU Progress", total=len(sku_ids))
# Fit and evaluate one Ridge-on-polynomial-features model per (SKU, store)
# pair. Pairs with too little data are skipped with a message.

# Hyperparameters are identical for every pair, so define them once.
degree = 2  # Degree of polynomial features
alpha = 0.1  # Regularization strength

# Iterate the SKUs directly and advance the existing `outer_pbar` by hand;
# wrapping `sku_ids` in a second tqdm would draw a duplicate "SKU Progress"
# bar while `outer_pbar` sat at 0 forever.
for sku_id in sku_ids:
    # Narrow to this SKU once so the inner loop only has to filter by store.
    sku_data = df[df['sku_id'] == sku_id]

    for store_id in tqdm.tqdm(store_ids, desc="Store Progress", leave=False):
        # Filter data for the current SKU and store
        sku_store_data = sku_data[sku_data['store_id'] == store_id].copy()

        # Check if there are enough samples for the SKU-store combination
        if len(sku_store_data) < 2:
            tqdm.tqdm.write(f"Skipping SKU {sku_id}, Store {store_id} due to insufficient samples.")
            continue

        # Split the data into training and testing sets.
        # Features are every column except 'date' and the target 'sales_amount'.
        X_train, X_test, y_train, y_test = train_test_split(
            sku_store_data.drop(['date', 'sales_amount'], axis=1),
            sku_store_data['sales_amount'],
            test_size=0.2,
            random_state=42
        )

        # cross_val_score below uses cv=5, so require at least 5 training rows.
        if len(X_train) < 5:  # Adjust the threshold as needed
            tqdm.tqdm.write(f"Skipping cross-validation for SKU {sku_id}, Store {store_id} due to insufficient training samples.")
            continue

        # Create a pipeline with PolynomialFeatures and Ridge regression
        model = make_pipeline(
            StandardScaler(),
            PolynomialFeatures(degree),
            Ridge(alpha=alpha)
        )

        # Perform cross-validation to evaluate the model
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        # Taking the negative because cross_val_score returns negative MSE
        cv_mse = -cv_scores.mean()

        # Fit the model on the training data
        model.fit(X_train, y_train)

        # Store the model in the dictionary
        models_dict[(sku_id, store_id)] = model

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Evaluate model performance
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results in dictionaries
        mse_dict[(sku_id, store_id)] = mse
        r2_dict[(sku_id, store_id)] = r2

        # Print results for the current SKU and store
        tqdm.tqdm.write(f"SKU {sku_id}, Store {store_id}:")
        tqdm.tqdm.write(f"Mean Squared Error: {mse}")
        tqdm.tqdm.write(f"R-squared: {r2}")
        tqdm.tqdm.write(f"Cross-validated Mean Squared Error: {cv_mse}")
        tqdm.tqdm.write("")

    # One SKU finished (all of its stores were trained or skipped).
    outer_pbar.update(1)
# Close the progress bar for the outer loop
outer_pbar.close()

# Overall model evaluation (average across SKU-store combinations)
print("Overall Model Evaluation:")
if mse_dict:
    avg_mse = sum(mse_dict.values()) / len(mse_dict)
    avg_r2 = sum(r2_dict.values()) / len(r2_dict)
    print(f"Average Mean Squared Error: {avg_mse}")
    print(f"Average R-squared: {avg_r2}")
else:
    # Every combination was skipped: there is nothing to average, and
    # dividing by len(mse_dict) == 0 would raise ZeroDivisionError.
    avg_mse = avg_r2 = float("nan")
    print("No SKU-store combination had enough samples to train a model.")

# Save the dictionary to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
filename = 'full_models_dict.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(models_dict, model_file)