import random
from datetime import timedelta

import numpy as np
import pandas as pd


def _balance(
    s: pd.Series,
    gain: float = 0.0,
) -> pd.Series:
    """
    Calculates a roughly balanced log lambda.

    Parameters
    ----------
    s
        Series of log-lambda values.
    gain
        Demand-increase ratio for lower-demand products in a couple.

    Returns
    -------
    Series with roughly balanced log lambda.
    """
    return s + (s.max() - s) * gain
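
# Worked example for _balance (hypothetical values, not part of the
# pipeline): with gain=0.5 each value moves halfway toward the series
# maximum, so _balance(pd.Series([1.0, 3.0, 5.0]), gain=0.5) yields
# [3.0, 4.0, 5.0].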


def simulate_coupled_demand(
    df: pd.DataFrame,
    n_couples: int,
    max_products: int = 3,
) -> pd.DataFrame:
    """
    Simulates coupled-demand effects.

    In many cases, products such as bread and jam are bought together.

    Parameters
    ----------
    df
        Dataframe with P_ID, L_ID, and LOG_LAMBDA columns.
    n_couples
        Number of couples to simulate.
    max_products
        Maximum number of products included in one couple.

    Returns
    -------
    Dataframe with the coupled-demand effect added to the LOG_LAMBDA column.
    """
    p_id = df['P_ID'].unique()
    df['C_ID'] = -1  # -1 marks products that belong to no couple
    # assign each couple a disjoint set of 2..max_products products
    for c_id in range(n_couples):
        prods = np.random.randint(low=2, high=max_products + 1)
        couple = np.random.choice(p_id, size=prods, replace=False)
        mask = df['P_ID'].isin(couple)
        df.loc[mask, 'C_ID'] = c_id
        p_id = list(set(p_id) - set(couple))  # do not allow overlapping couples
    # add the coupled-demand effect; note the gain is drawn once and
    # shared by all couples
    mask = df['C_ID'] != -1
    df.loc[mask, 'LOG_LAMBDA'] = df.loc[mask].groupby(
        ["C_ID", "L_ID"], group_keys=False)['LOG_LAMBDA'].apply(
        _balance, np.random.uniform(0.0, 0.5))
    df.drop(columns=['C_ID'], inplace=True)
    return df
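
# Example call (assuming a frame with P_ID, L_ID, and LOG_LAMBDA columns):
#   df = simulate_coupled_demand(df, n_couples=5, max_products=3)
# Couples are disjoint, so n_couples * max_products should not exceed the
# number of distinct products, or np.random.choice will raise.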


def drop_zeros(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Drops rows with zero sales."""
    return df.loc[df['SALES'] != 0]


def add_anomalies(
    df: pd.DataFrame,
    prob_huge: float = 0.0,
    prob_neg: float = 0.0,
) -> pd.DataFrame:
    """
    Randomly turns a small share of sales values huge or negative.
    """
    n = len(df)
    anomaly_list = [1] * n  # a multiplier of 1 leaves a row untouched
    for i in range(n):
        if random.random() < prob_huge:
            anomaly_list[i] = random.randint(1000, 10000)  # huge outlier
        elif random.random() < prob_neg:
            anomaly_list[i] = random.randint(-10, -1)  # negative outlier
    df['SALES'] = df['SALES'] * anomaly_list
    return df
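
# Example call: add_anomalies(df, prob_huge=0.001, prob_neg=0.001) multiplies
# roughly 0.1% of the SALES values by a large positive factor and another
# ~0.1% by a small negative one, leaving the remaining rows unchanged.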


def restrict_pl_ranges(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Simulates NPI (new product introduction) and discontinuation by cutting
    random start and end dates per product and per product-location pair.
    """
    # products
    df_cut_dates = df.groupby(['P_ID']).DATE.max().reset_index()
    df_cut_dates['START_DATE'] = [df_cut_dates.DATE.max() - timedelta(days=random.randint(1, 5000))
                                  for _ in range(len(df_cut_dates))]
    df_cut_dates['END_DATE'] = [df_cut_dates.DATE.max() + timedelta(days=9000) - timedelta(days=random.randint(1, 10000))
                                for _ in range(len(df_cut_dates))]
    df = df.merge(df_cut_dates[['P_ID', 'START_DATE', 'END_DATE']], on=['P_ID'], how='left')
    df = df[(df.DATE > df.START_DATE) & (df.DATE < df.END_DATE)]
    df = df.drop(columns=['START_DATE', 'END_DATE'])
    # product-location combinations
    df_cut_dates = df.groupby(['P_ID', 'L_ID']).DATE.max().reset_index()
    df_cut_dates['START_DATE'] = [df_cut_dates.DATE.max() - timedelta(days=random.randint(1, 5000))
                                  for _ in range(len(df_cut_dates))]
    df_cut_dates['END_DATE'] = [df_cut_dates.DATE.max() + timedelta(days=9000) - timedelta(days=random.randint(1, 10000))
                                for _ in range(len(df_cut_dates))]
    df = df.merge(df_cut_dates[['P_ID', 'L_ID', 'START_DATE', 'END_DATE']], on=['P_ID', 'L_ID'], how='left')
    df = df[(df.DATE > df.START_DATE) & (df.DATE < df.END_DATE)]
    df = df.drop(columns=['START_DATE', 'END_DATE'])
    return df
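

if __name__ == "__main__":
    # Minimal end-to-end sketch of this module (illustrative only): the
    # column layout below (P_ID, L_ID, DATE, LOG_LAMBDA, SALES) is assumed
    # from the functions above, not prescribed anywhere in this file.
    random.seed(0)
    np.random.seed(0)
    dates = pd.date_range("2015-01-01", periods=200, freq="D")
    demo = pd.DataFrame(
        [{"P_ID": p, "L_ID": l, "DATE": d}
         for p in range(6) for l in range(2) for d in dates]
    )
    demo["LOG_LAMBDA"] = np.random.normal(0.0, 1.0, size=len(demo))
    demo["SALES"] = np.random.poisson(np.exp(demo["LOG_LAMBDA"]))

    demo = simulate_coupled_demand(demo, n_couples=2)
    demo = add_anomalies(demo, prob_huge=0.01, prob_neg=0.01)
    demo = restrict_pl_ranges(demo)
    demo = drop_zeros(demo)
    print(demo.head())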