-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHousingPrice_categoricalVariables.py
186 lines (104 loc) · 4.34 KB
/
HousingPrice_categoricalVariables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the training and test tables.
X = pd.read_csv(r'train.csv')
X_test = pd.read_csv(r'test.csv')
# Discard rows that have no target, then separate the target from the predictors.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])
# Columns with at least one missing value in the training data are removed
# from both the training and the test tables.
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)
# Hold out 20% of the training rows as a validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)
# In[4]:
# The cells below are notebook display expressions: each one computes a value
# that an interactive session would render; when the file runs as a plain
# script the results are discarded.
# First rows of the training features.
X_train.head()
# In[5]:
# Summary of the object-dtype (categorical) columns.
X_train.describe(include='object')
# In[6]:
# Summary with pandas' default column selection.
X_train.describe()
# In[7]:
# Distinct values of 'Condition2' in the training split ...
X_train['Condition2'].unique()
# In[8]:
# ... and in the validation split, for comparison with the training values.
X_valid['Condition2'].unique()
# In[2]:
# A categorical column of the validation split may contain values that never
# occur in the training split; an encoder fitted on the training column cannot
# transform those values. The simplest remedy is to drop such columns.
#   object_cols     - all object-dtype (categorical) columns
#   good_label_cols - categorical columns that can be safely encoded
#   bad_label_cols  - categorical columns that will be dropped
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
# Only categories unseen during fitting are a problem, so it is enough that
# the validation values are a subset of the training values; requiring the two
# sets to be equal would discard columns that encode without any issue.
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]
# Built by filtering (not by set difference) so the order follows the
# DataFrame columns and the printed list is reproducible between runs.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]
# In[20]:
print(good_label_cols)
print(bad_label_cols)
# In[3]:
# Remove the categorical columns that cannot be encoded consistently.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)
# In[4]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every remaining categorical column: the encoder is refitted
# on the training values of the column, then applied to the validation values.
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
# In[5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Random forest used for the encoding experiments in this script.
# The absolute-error split criterion is spelled 'absolute_error'; the former
# alias 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2.
myModel = RandomForestRegressor(n_estimators=300, random_state=0,
                                criterion='absolute_error', max_depth=13)
# Fit on the label-encoded training features and report the mean absolute
# error on the held-out validation rows.
myModel.fit(label_X_train, y_train)
print('MAE is %d' % mean_absolute_error(y_valid, myModel.predict(label_X_valid)))
# In[6]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
# In[7]:
# Start from the frames without the un-encodable columns so the numeric
# features are kept, then replace only the categorical columns with their
# ordinal codes (fitted on the training rows, applied to the validation rows).
# Assigning the encoder output to the whole variable would leave just an array
# of the categorical columns and silently drop every numeric feature.
olabel_X_train = X_train.drop(bad_label_cols, axis=1)
olabel_X_valid = X_valid.drop(bad_label_cols, axis=1)
olabel_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
olabel_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
# In[8]:
# Score the ordinal-encoded features (not the label-encoded frames again).
myModel.fit(olabel_X_train, y_train)
print('MAE is %d' % mean_absolute_error(y_valid, myModel.predict(olabel_X_valid)))
# In[9]:
# Count the distinct values in each categorical column.
object_nunique = [X_train[col].nunique() for col in object_cols]
d = {col: count for col, count in zip(object_cols, object_nunique)}
# (column, distinct-value count) pairs, fewest distinct values first.
sorted(d.items(), key=lambda item: item[1])
# In[10]:
# Summary of the categorical columns.
X_train.describe(include='object')
# In[12]:
# Only categorical columns are candidates for one-hot encoding. Scanning all
# columns would also pick up numeric columns with few distinct values, which
# would then be encoded AND kept as-is in the numeric part of the frame.
low_cardinality_col = [col for col in object_cols
                       if X_train[col].nunique() < 10]
# Categorical columns with 10 or more distinct values; they are not encoded
# and disappear when all object columns are dropped below.
high_cardinality_col = set(object_cols) - set(low_cardinality_col)
# In[15]:
from sklearn.preprocessing import OneHotEncoder
# The keyword that requests dense output differs between scikit-learn releases
# (`sparse` in older ones, `sparse_output` in newer ones), so keep the default
# sparse result and densify it explicitly with .toarray().
oneHotEncoder = OneHotEncoder(handle_unknown='ignore')
# New frames holding the one-hot encoding of the low-cardinality columns.
OH_cols_train = pd.DataFrame(
    oneHotEncoder.fit_transform(X_train[low_cardinality_col]).toarray())
OH_cols_valid = pd.DataFrame(
    oneHotEncoder.transform(X_valid[low_cardinality_col]).toarray())
# The encoder output carries no row labels; restore the original index so the
# concatenation below aligns rows correctly.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove every categorical column from the train and validation features.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Join the numeric features with the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
# The one-hot columns are named by integers while the numeric ones are named
# by strings; recent scikit-learn rejects mixed-type feature names, so make
# every column name a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
# In[17]:
# Refit the forest on the one-hot encoded features and report the mean
# absolute error on the validation rows.
myModel.fit(OH_X_train, y_train)
OH_valid_predictions = myModel.predict(OH_X_valid)
print('MAE is %d' % mean_absolute_error(y_valid, OH_valid_predictions))
# In[ ]: