-
Notifications
You must be signed in to change notification settings - Fork 0
/
final.py
391 lines (311 loc) · 16.3 KB
/
final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import seaborn as sns
import re
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
#import data with user descriptiomns/review
df_130= pd.read_csv('winemag-data-130k-v2.csv')
# #drop twiter info, and designtation
# test = df_130[df_130['country']=='US']
# test_drop = test.drop(columns=['designation','taster_name','taster_twitter_handle'])
# test_drop.isna().sum()
#creating new dataframe with just US data
def df_us ():
states=df_130[df_130['country']=='US']
states['province'].unique()
return states
states= df_us()
#creating new dataframe with countries outside fo the US data
def df_other ():
other_countries=df_130[df_130['country']!='US']
other_countries['province'].unique()
return other_countries
other_countries = df_other()
#if Region_2 had NaN values, put in values from Region_1
def update_region_2(region,df):
df.loc[df['region_2'].isnull(),'region_2'] = df[region]
return df
#updating states region 2
states= update_region_2('region_1',states)
#updating other countries region 2
other_countries=update_region_2('province', other_countries)
# states.loc[states['region_2'].isnull(),'region_2'] = states['region_1']
#rename df province columns to stastes and dropping desingation, twitter name and handles
def drop(data):
data=data.rename(columns={'province': 'state'})
data=data.drop(columns='designation')
data=data.drop(columns=['taster_name','taster_twitter_handle'])
data=data.dropna()
return data
#renaming states to df and dropping/renaming columns
df=drop(states)
#renaming other_countries to df_other and dropping/renaming columns
df_other = drop(other_countries)
#rename df province columns to stastes and dropping desingation, twitter name and handles
# df=df.rename(columns={'province': 'state'})
# df=df.drop(columns='designation')
# df=df.drop(columns=['taster_name','taster_twitter_handle'])
# df
#mapping state names and assigning numerical values
# creating function to to map state names & assinging numerical values
def map_state(data):
gle = LabelEncoder()
state_labels = gle.fit_transform(data['state'])
state_mappings = {index: label for index, label in enumerate(gle.classes_)}
data['state_label'] = state_labels
return data
#mapping state names and assigning numerical values for US
df= map_state(df)
#mapping state names and assigning numerical values for other countries
df_other= map_state(df_other)
# one hot encoder for state labels
def ohe(data):
gle = LabelEncoder()
state_ohe = OneHotEncoder()
state_labels = gle.fit_transform(data['state'])
state_feature_arr = state_ohe.fit_transform(data[['state_label']]).toarray()
state_feature_labels = list(gle.classes_)
state_features = pd.DataFrame(state_feature_arr, columns=state_feature_labels)
return state_features
#mapping OHE for US states
df_ohe=ohe(df)
#mapping OHE for other counries states
df_other_ohe= ohe(df_other)
#creating new variety column in dataframe to drop words after " - " in variety column aka wine types
def drop_dash_wine (data):
data['new_v2']= data.variety.map(lambda x: x[0: x.find('-')] if '-' in x else x).str.lower()
data['new_v2'].unique()
return data
#dropping second wine name from US df
df= drop_dash_wine(df)
#dropping second wine name from other_country df
df_other= drop_dash_wine(df_other)
#creating wine category dictionary
wine_cat = {'red_bone_dry_bitter' : ['tannat', 'nebbiolo','sagrantino'],'red_bone_dry_savory':['cabernet franc','chianti', 'petit verdot','petite verdot','bordeaux','meritage', 'tempranillo','tempranillo blend','tinto fino','tinta de toro', 'french mourvedre', 'aglianico', 'barbera', 'montepulcaiano'], 'red_dry_herb':
['montepulciano','sangiovese','carmenere','cabernet france','cabernet sauvignon','cabernet'],'red_dry_floral':['gamay','mencía','mencia','valpolicella','rhone_blend','mourvèdre','rhône','beaujolais','burgundy','syrah','trincadeira','pinot noir','pinot nero'],'red_dry_spice':['dolcetto','garnacha', 'bonarda','amorone della valpolicella','negroamaro','nerello mascalese','supertuscans',
'merlot', 'alfrocheiro','alcicante bouschet'],'red_dry_fruit_vanilla':['shiraz','monastrell','malbec',"nero d'avola",'petite sirah','primitivo','zinfandel','grenache','g-s-m','g','touriga nacional'],'red_semi-sweet':['sangiovese grosso','lambrusco','brachetto'],'sweet':['port','banyuls','maury'],
'white_bone_dry': ['pinot grigio','pinot blanc','albarino','garganega','dry furmit','gavi','muscadet','melon','muscat','muskat','chablis','grenache blanc','macabeo','vinho verde','grillo','arinto'],'white_dry_herb':['sauvignon blanc','friulano','fumé blanc','fume blanc','sauvignon gris','sauvignon',
'verdejo','grüner veltliner','verdicchio','colombard'],'white_dry_tarte':['vermentino','turbiana','vernaccia','chenin blanc','torrontés'],'white_dry_sweet':['fiano','albariño','chardonnay','marsanne','roussanne','semillon','trebbiano'], 'white_dry_floral':['viura','pinot gris','sémillon','pinot blanc','pinot bianco','viognier','dry riesling'],'white_off_dry':[ 'riesling',
'johannisberg riesling','chenin blanc','torrontés','müller-thurgau'],'white_semi_sweet':['moscato','gewürztraminer'],'white_sweet':['sauternes','tokaji'],'white_very_sweet':['white port', 'moscatel dessert wine', 'passito wines','vin santo'],'red_blend':['red blend', 'corvina, rondinella, molinara'],'white_blend':['white blend'], 'rose':
['rose','rosé','rosato'],'sparkling':['champagne','prosecco','cava','sparkling blend','sparkling wine','champagne blend','glera']}
wine_cat_2 = {'red_dry' : ['tannat', 'nebbiolo','sagrantino'],'red_dry':['cabernet franc','chianti', 'petit verdot','petite verdot','bordeaux','meritage', 'tempranillo','tempranillo blend','tinto fino','tinta de toro', 'french mourvedre', 'aglianico', 'barbera', 'montepulcaiano'], 'red_dry':
['montepulciano','sangiovese','carmenere','cabernet france','cabernet sauvignon','cabernet'],'red_dry':['gamay','mencía','mencia','valpolicella','rhone_blend','mourvèdre','rhône','beaujolais','burgundy','syrah','trincadeira','pinot noir','pinot nero'],'red_dry':['dolcetto','garnacha', 'bonarda','amorone della valpolicella','negroamaro','nerello mascalese','supertuscans',
'merlot', 'alfrocheiro','alcicante bouschet'],'red_dry':['shiraz','monastrell','malbec',"nero d'avola",'petite sirah','primitivo','zinfandel','grenache','g-s-m','g','touriga nacional'],'red_dry':['sangiovese grosso','lambrusco','brachetto'],'red_sweet':['port','banyuls','maury'],
'white_dry': ['pinot grigio','pinot blanc','albarino','garganega','dry furmit','gavi','muscadet','melon','muscat','muskat','chablis','grenache blanc','macabeo','vinho verde','grillo','arinto'],'white_dry':['sauvignon blanc','friulano','fumé blanc','fume blanc','sauvignon gris','sauvignon',
'verdejo','grüner veltliner','verdicchio','colombard'],'white_dry':['vermentino','turbiana','vernaccia','chenin blanc','torrontés'],'white_dry':['fiano','albariño','chardonnay','marsanne','roussanne','semillon','trebbiano'], 'white_dry':['viura','pinot gris','sémillon','pinot blanc','pinot bianco',
'viognier','dry riesling'],'white_dry':[ 'riesling','johannisberg riesling','chenin blanc','torrontés','müller-thurgau'],'white_sweet':['moscato','gewürztraminer'],'white_sweet':['sauternes','tokaji'],'white_sweet':['white port', 'moscatel dessert wine', 'passito wines','vin santo'],'red_dry':['red blend', 'corvina, rondinella, molinara'],
'white_dry':['white blend'], 'rose':['rose','rosé','rosato'],'sparkling':['champagne','prosecco','cava','sparkling blend','sparkling wine','champagne blend','glera']}
# creating funciton to make new dictionary "updae",turned all values into keys ex: {wine_type: wine_category}
def update_dict (dict):
update= {}
for i in list(dict.keys()):
for wine in wine_cat[i]:
update[wine]=i
return update
#creating update dictionary to add new categiries
update = update_dict(wine_cat)
# function to create a dateframe from wine category/update dictionary
def wine_cat_df(dict):
df_1= pd.DataFrame.from_dict(dict, orient='index', columns= ['category'])
df_1['name']= df_1.index
df_1['category1']=df_1['category']
df_1=df_1.drop(columns= 'category')
return df_1
df_1 = wine_cat_df(update)
#SUPER IMPORTANT
#function for mapping update dictionary to DF to make new coloumn for wine categories
def mapping (data, dict):
data['category'] = data['new_v2'].map(dict,np.nan)
return data
#mapping US wines
df = mapping (df, update)
#mapping other wines
df_other =mapping (df_other, update)
#for testing reasons, counting columns
def test_nan_categories (data):
test_1 = data[data['category'].isnull()]
unique_element, count_element=np.unique(test_1['new_v2'], return_counts=True)
left_overs= pd.DataFrame(unique_element, count_element)
pd.set_option('display.max_rows',125)
return left_overs
#count column items:
def test_categories_counts (data, c_name):
unique_element, count_element=np.unique(data[c_name], return_counts=True)
dframe= pd.DataFrame(unique_element, count_element)
return dframe
#function to create final dataframe with dropping 10000 wines that we could not categorize
def drp_na_cat(data):
final_list = data.dropna()
return final_list
#final US list with droped NaN values from new categories
final_list = drp_na_cat(df)
#final US list with droped NaN values from new categories
final_list_other = drp_na_cat(df_other)
#function to label categories using encorder
def cat_coder(data):
category_le = LabelEncoder()
category_labels = category_le.fit_transform(data['category'])
data['Gen_Label'] = category_labels
return data
#adding numerical category to US list
final_list =cat_coder(final_list)
#adding numerical category to other list
final_list_other =cat_coder(final_list_other)
#function for one hot encoder categories
def ohe_category(data):
category_le = LabelEncoder()
cat_ohe = OneHotEncoder()
category_labels = category_le.fit_transform(data['category'])
cat_feature_arr = cat_ohe.fit_transform(data[['Gen_Label']]).toarray()
cat_feature_labels = list(category_le.classes_)
cat_features = pd.DataFrame(cat_feature_arr,columns=cat_feature_labels)
return cat_features
#cat OHE us
cat_ohe_us =ohe_category(final_list )
#cat OHE other_country
cat_ohe_other =ohe_category(final_list_other)
#
# vectorizing text function
def vectorize_text(text):
vectorizer1 = TfidfVectorizer(stop_words = 'english')
vector1 = vectorizer1.fit_transform(text)
return vector1, vectorizer1
#vectorize text using final_list dataframe (all dropped columns and new updated wine categories)-US
vector_vectorizer = vectorize_text(final_list['description'])
#vectorize text using final_list dataframe (all dropped columns and new updated wine categories)-oth
vector_vectorizer_other = vectorize_text(final_list_other['description'])
#creating vectorized array for words
def get_vector_df(text):
vector_vectorizer = vectorize_text(text)
vector_df = pd.DataFrame(vector_vectorizer[0].toarray(), columns = vector_vectorizer[1].get_feature_names())
return vector_df
#assisgning funciton to get_vect variable- US
get_vect=get_vector_df(final_list['description'])
#assisgning funciton to get_vect variable-other
get_vect_other=get_vector_df(final_list_other['description'])
#creating new DF for the vectorized words
vect_df =pd.DataFrame(get_vect)
#returning sparse vector object
def sparse_vec(vectorizer):
sparse_vector = vector_vectorizer[0]
return sparse_vector
#sparse vector for US
sparse_vector =sparse_vec(vector_vectorizer)
#sparse vector for other countries
sparse_vector_other =sparse_vec(vector_vectorizer_other)
# return vectorizer
# vectorizer = get_vect[1]
#creating new dataframe to include proper index column to match the vect_df index, in order to map each data frame to one another
def create_index(data):
update_index =data.reset_index(drop= True)
return update_index
#creating new index for final_listn data frame to match vectorized df - US
df_10= create_index(final_list)
#creating new index for final_listn data frame to match vectorized df - other
df_10_other= create_index(final_list_other)
#creating dicitonary to rename df_10 columns so we dont have any overlap in vect_df columns when mapping. all columns end in _1
{'Unnamed: 0': 'old_index_1', 'country':'country_1', 'description':'description_1', 'points':'points_1', 'price':'price_1', 'state':'state_1','region_1':'region_1_1', 'region_2':'region_2_1', 'title':'title_1', 'variety':'variety_1', 'winery':'winery_1',
'state_label':'state_label_1','new_v2':'new_v2_1', 'category':'category_1', 'Gen_Label':'cat_label_1'}
#renaming df_10 columns to end in _1
def column_name_update (data):
new_columns =data.rename(columns={'Unnamed: 0': 'old_index_1', 'country':'country_1', 'description':'description_1', 'points':'points_1', 'price':'price_1', 'state':'state_1','region_1':'region_1_1', 'region_2':'region_2_1', 'title':'title_1', 'variety':'variety_1',
'winery':'winery_1','state_label':'state_label_1','new_v2':'new_v2_1', 'category':'category_1', 'Gen_Label':'cat_label_1'})
return new_columns
#updateing US df_10 columns to end in 1
df_10 = column_name_update (df_10)
#updateing other df_10_other columns to end in 1
df_10_other = column_name_update (df_10_other)
#adding column to extract year
def add_vintage(dataframe):
year_list = [re.findall(r'[0-9]{4}', item) for item in dataframe['title_1']]
for item in year_list:
if item == []:
item.append('0')
year_list = [year[0] for year in year_list]
dataframe['vintage_1'] = year_list
return dataframe
# making new column for US list
df_10= add_vintage(df_10)
#making new column for other countries list
df_10_other= add_vintage(df_10_other)
#creating list to have both df_10 and vect_df in order to join
def create_frame( data, vect):
frame = [data,vect]
return frame
#US frame
frame= create_frame(df_10,get_vect)
#other frame
frame_other= create_frame(df_10_other,get_vect_other)
#
#creating new data frame concating DF_10 and vect_df. Use Axis =1 in order to join them by columns
def wine_vect_words (frame_list):
wine_df_final= pd.concat(frame_list, axis=1)
return wine_df_final
#create US vector list
final_wine=wine_vect_words(frame)
#create other vector list
final_wine_other=wine_vect_words(frame_other)
# import plotly
# from plotly.offline import iplot, init_notebook_mode
# init_notebook_mode(connected=True)
# scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
# [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
#
#
#
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')
#
# for col in df.columns:
# df[col] = df[col].astype(str)
#
# scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
# [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
#
# # df_10['text'] = df_10['state_1']
#
# df_states = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')
# df_states = pd.DataFrame(df_states['state'])
# state_numbers = df_10['state_1'].value_counts()
#
# dict_state= state_numbers.to_dict()
# df_states['counts']=df_states['state'].map(dict_state)
# df_states.loc[4, 'counts'] = 35683
#
# data = [ dict(
# type='choropleth',
# colorscale = scl,
# autocolorscale = False,
# locations = df['code'],
# z = df_states['counts'],
# locationmode = 'USA-states',
# # text = df['text'],
# marker = dict(
# line = dict (
# color = 'rgb(255,255,255)',
# width = 2
# )
# ),
# colorbar = dict(
# title = "# Wines"
# )
# ) ]
#
# layout = dict(
# title = 'Wine Concentration by State<br>(Hover for breakdown)',
# geo = dict(
# scope='usa',
# projection=dict( type='albers usa' ),
# showlakes = True,
# lakecolor = 'rgb(255, 255, 255)',
# ),
# )
#
# fig = dict( data=data, layout=layout )
#
# url = iplot( fig, filename='d3-cloropleth-map' )