-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions2.py
82 lines (69 loc) · 2.65 KB
/
functions2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import statsmodels.formula.api as smf
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Starting from an intercept-only model, repeatedly add the single
    remaining column that yields the highest adjusted R-squared, as long as
    that score is strictly better than the score of the current selection.
    The summary of the final model is printed before it is returned.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response
    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared

    Raises:
    -------
    KeyError: if `response` is not a column of `data`
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    # Baseline of 0.0: a predictor is only accepted if it brings the
    # adjusted R-squared above zero.
    current_score = 0.0
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(
                response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        # Tuples compare by score first, then by candidate name, so ties are
        # resolved the same way regardless of set iteration order.
        best_new_score, best_candidate = max(scores_with_candidates)
        # Stop as soon as there is no strict improvement. Written as
        # "not >" so that an equal score (which used to loop forever, since
        # nothing was removed from `remaining`) and a NaN score both stop.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    # Appending the intercept as a regular term keeps the formula well formed
    # ("y ~ 1") even when no predictor was selected.
    formula = "{} ~ {}".format(response, ' + '.join(selected + ['1']))
    model = smf.ols(formula, data).fit()
    print(model.summary())
    return model
def backward_selected(data, response, threshold=0.05):
    """Linear model designed by backward selection.

    Starting from a model with every numeric column of `data` (except the
    response) as a predictor, repeatedly drop the predictor with the largest
    p-value while that p-value exceeds `threshold`, refitting after each
    removal. Progress and the summary of the final model are printed.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response
    response: string, name of response column in data
    threshold: float, a predictor is removed while its p-value is strictly
               greater than this value (default 0.05)

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by backward selection
           evaluated by parameters p-value
           (intercept-only if every predictor was removed or none existed)
    """
    # NOTE(review): _get_numeric_data is a private pandas accessor; kept so
    # the set of candidate columns stays exactly what it was before.
    remaining = set(data._get_numeric_data().columns)
    remaining.discard(response)
    while True:
        # sorted() makes the term order (and thus the printed output)
        # reproducible; the intercept is appended as a regular term so the
        # formula stays valid ("y ~ 1") once no predictor is left.
        formula = "{} ~ {}".format(
            response, ' + '.join(sorted(remaining) + ['1']))
        print('_______________________________')
        print(formula)
        model = smf.ols(formula, data).fit()
        if not remaining:
            # Nothing left to test: the intercept-only fit is final.
            break
        # Skip the first entry, which this code relies on being the
        # intercept, so that the intercept itself is never a candidate.
        pvalues = model.pvalues.iloc[1:]
        worst_pvalue = pvalues.max()
        # Compare the p-value itself with the threshold. The previous
        # `values.any() > 0.05` compared a boolean, which is true for any
        # non-zero p-value, so every predictor ended up being removed.
        # "not >" also stops when the p-values are all NaN.
        if not worst_pvalue > threshold:
            break
        worst_name = pvalues.idxmax()
        print('remove', worst_name, '(p-value :', round(worst_pvalue, 3), ')')
        remaining.remove(worst_name)
    print('is the final model !')
    print('')
    print(model.summary())
    return model