-
Notifications
You must be signed in to change notification settings - Fork 0
/
weather_utils.py
153 lines (122 loc) · 5 KB
/
weather_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Utilities to retrieve weather data for the races"""
import datetime
import sys
import time
from collections import Counter

import numpy as np
import pandas as pd

import weather
# Columns appended to the races dataframe by ``search_weather``.
_WEATHER_COLUMNS = ['min_temp', 'max_temp', 'uv_index', 'weather_desc']


def _missing_weather():
    """Return a weather record for a race with no usable weather data."""
    return {'min_temp': np.nan, 'max_temp': np.nan, 'uv_index': np.nan,
            'weather_desc': None}


def _parse_race_date(date_string):
    """Return ``(day, month, year)`` parsed from a race date string.

    The string is split on ``'.'`` and fields 1, 2 and 3 are read as day,
    month and year; field 0 is ignored (presumably a weekday prefix such as
    ``'Sat. 12.05.2012'`` -- TODO confirm against the races file).
    """
    fields = date_string.split('.')
    return int(fields[1]), int(fields[2]), int(fields[3])


def _summarize_descriptions(hourly):
    """Pick one weather description for the day from the hourly data.

    When every distinct description occurs the same number of times (which
    includes the case of a single distinct description) there is no clear
    winner, so the description stored under the ``'1200'`` key is used.
    Otherwise the most frequent description is returned.

    Raises whatever the lookup raises (``KeyError``, ``IndexError``...) when
    ``hourly`` is empty or has no ``'1200'`` entry; the caller treats that as
    "no weather available".
    """
    # Only ``keys()`` and item access are assumed on ``hourly``: its concrete
    # type is decided by ``weather.extract_weather_info``.
    counts = Counter(hourly[hour]['description'] for hour in hourly.keys())
    ranked = counts.most_common()
    if len({cnt for _, cnt in ranked}) == 1:
        # Tie between all descriptions: fall back to the '1200' entry
        return hourly['1200']['description']
    return ranked[0][0]


def search_weather(races_df, api_key):
    """Build a dataset with weather information given a dataset containing
    information about races.

    For every race the weather is looked up from the ``Place`` and ``Date``
    columns. Races before 1st July 2008 are not looked up at all, and races
    whose weather response cannot be interpreted are kept as well: in both
    cases the weather columns are filled with missing values.

    Parameters
    ----------
    races_df : pandas.DataFrame
        dataframe containing the races information; must provide the
        ``Date`` and ``Place`` columns
    api_key : string
        API key for historical weather service

    Returns
    -------
    new_dataframe : pandas.DataFrame
        the new dataframe containing previous information and weather
        information (``min_temp``, ``max_temp``, ``uv_index`` and
        ``weather_desc`` columns)
    """
    # One record per race, in row order; the frame is built once at the end
    # instead of being enlarged cell by cell.
    records = []
    for _, row in races_df.iterrows():
        day, month, year = _parse_race_date(row['Date'])
        city = row['Place']

        # If date too old (< 1st July 2008), no weather available
        if (year, month) < (2008, 7):
            records.append(_missing_weather())
            continue

        # Lookup weather
        weather_json = weather.lookup_weather(city, day, month, year, api_key)

        # Extract weather info. This is deliberately best-effort and
        # all-or-nothing: any failure while interpreting the response leaves
        # the whole row without weather rather than aborting the run.
        try:
            min_temp, max_temp, uv_index, hourly = \
                weather.extract_weather_info(weather_json)
            record = {'min_temp': min_temp,
                      'max_temp': max_temp,
                      'uv_index': uv_index,
                      'weather_desc': _summarize_descriptions(hourly)}
        except Exception:
            record = _missing_weather()
        records.append(record)

    # Reuse the races index so both frames line up row for row
    weather_df = pd.DataFrame(records, index=races_df.index,
                              columns=_WEATHER_COLUMNS)

    # Merge dataframes
    new_dataframe = pd.concat([races_df, weather_df], axis=1)
    return new_dataframe
# Store new dataframe on CSV file (with timestamp in name)
# new_dataframe.to_csv('races_weather_{}.csv'.format(int(time.time())))
def build_weather_dataset(races_filename, api_key, api_calls_limit=450):
    """Search the weather for all races and add information in a new dataframe.

    This function is needed because of the API limitations: the races are
    processed in chunks of at most ``api_calls_limit`` rows, with a 25 hour
    pause between two consecutive chunks. Each processed chunk is saved to
    ``races-information-<chunk number>.csv`` in the current directory, and
    the complete result is written to
    ``../datasets/races-information-weather.csv``.

    Parameters
    ----------
    races_filename : string
        the file containing information about the races
    api_key : string
        API key for historical weather service
    api_calls_limit : int, optional
        maximum number of races processed between two pauses (default 450)

    Returns
    -------
    nothing

    Raises
    ------
    ValueError
        if ``api_calls_limit`` is smaller than 1
    """
    if api_calls_limit < 1:
        raise ValueError("api_calls_limit must be a positive integer")

    # Load data
    races_df = pd.read_csv(races_filename, index_col=0)

    # Split data given API limitation. Slicing by position guarantees that no
    # chunk exceeds the limit (rounding the ratio could round down and yield
    # oversized chunks, or zero chunks for small files).
    num_rows = races_df.shape[0]
    chunks = [races_df.iloc[start:start + api_calls_limit]
              for start in range(0, num_rows, api_calls_limit)]
    if not chunks:
        # Empty input: still run once so the output files get written
        chunks = [races_df]
    num_chunks = len(chunks)
    print("Data divided in {} chunks to fit API limit".format(num_chunks))

    # For each chunk, get new dataframe and keep it for the final merge
    processed = []
    for chunk_num, chunk in enumerate(chunks, start=1):
        print("Processing chunk {}".format(chunk_num))

        # Search for weather information
        chunk_new = search_weather(chunk, api_key)
        processed.append(chunk_new)

        # Save intermediary step
        chunk_new.to_csv("races-information-{}.csv".format(chunk_num))

        # Wait a bit more than 24 hours, except after the last chunk
        if chunk_num != num_chunks:
            print("Entering sleep mode : see you tomorrow !")
            delta = datetime.timedelta(hours=25)
            time.sleep(delta.total_seconds())

    # Output final dataframe to CSV file
    print("Exporting to CSV file")
    final_df = pd.concat(processed, axis=0)
    final_df.to_csv('../datasets/races-information-weather.csv')
if __name__ == '__main__':
    # Command-line entry point: expects the races CSV file and the API key
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        build_weather_dataset(cli_args[0], cli_args[1])
    else:
        # Not enough arguments: show usage and exit with an error status
        print("Usage : `python weather_utils.py <races_file.csv> <api_key>`")
        sys.exit(1)