-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathultrapollution_noaa_parser.py
132 lines (85 loc) · 2.91 KB
/
ultrapollution_noaa_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# This script parses NOAA data and creates weather variables
# Importing required modules
import pandas as pd
import glob
# Declaring username + directory
username = ''
direc = 'C:/Users/' + username + '/Documents/Data/ultrapollution/noaa_data/'

# Creating a list of the files to parse
to_parse = glob.glob(direc + '*')

# Data storage: every parsed row appends exactly one entry to each list,
# so the same position in all seven lists refers to the same row
stations = []
lats = []
longs = []
dates = []
dewps = []
preps = []
temps = []


def parse_noaa_text(dat):
    """Split the raw text of one downloaded file into per-row field tuples.

    The text is consumed left to right with a moving cursor. Rows are
    separated by the literal two-character sequence "backslash, n" (not a
    real newline character), and fields by commas. The first row is
    treated as a header and discarded.

    NOTE(review): the slice offsets presume every field is wrapped in one
    pair of quote characters and that no field contains a comma -- confirm
    against a sample download.

    Parameters
    ----------
    dat : str
        Full contents of one file.

    Returns
    -------
    list of tuple
        One ``(station, latitude, longitude, date, dewp, prep, temp)``
        tuple per row, all values as unconverted strings. ``dewp``,
        ``prep`` and ``temp`` are ``None`` when the field has nothing
        between its two wrapping characters.
    """
    rows = []
    # Drop the header row, its two-character separator, and the one
    # character that opens the first field of the first data row
    idx = dat.find('\\n')
    dat = dat[idx+3:]
    while len(dat) > 10:
        # First field: its opening character was already consumed, so
        # only the closing one has to be trimmed
        idx = dat.find(',')
        station = dat[:idx-1]
        dat = dat[idx+1:]
        # Remaining fields: trim one wrapping character on each side
        idx = dat.find(',')
        lat = dat[1:idx-1]
        dat = dat[idx+1:]
        idx = dat.find(',')
        lon = dat[1:idx-1]
        dat = dat[idx+1:]
        # The fourth field is skipped without being stored
        idx = dat.find(',')
        dat = dat[idx+1:]
        idx = dat.find(',')
        date = dat[1:idx-1]
        dat = dat[idx+1:]
        # A field holding only its two wrapping characters puts the comma
        # at position 2, which fails this test and is recorded as missing
        idx = dat.find(',')
        dewp = dat[1:idx-1] if idx-1 > 1 else None
        dat = dat[idx+1:]
        idx = dat.find(',')
        prep = dat[1:idx-1] if idx-1 > 1 else None
        dat = dat[idx+1:]
        # The last field of a row ends at the row separator, not a comma
        idx = dat.find('\\n')
        temp = dat[1:idx-1] if idx-1 > 1 else None
        rows.append((station, lat, lon, date, dewp, prep, temp))
        # Step over the separator and the next row's opening character
        dat = dat[idx+3:]
    return rows


# Parsing the files
for file_number, f in enumerate(to_parse, start=1):
    # Status update
    print(f'Parsing file {file_number} of {len(to_parse)}.......')
    # Get the text; the context manager closes the file even if read() fails
    with open(f) as tmp:
        dat = tmp.read()
    # Parse the text
    for station, lat, lon, date, dewp, prep, temp in parse_noaa_text(dat):
        stations.append(station)
        lats.append(lat)
        longs.append(lon)
        dates.append(date)
        dewps.append(dewp)
        preps.append(prep)
        temps.append(temp)
# Cleaning the data lists
def bad_spaces_go_bye_bye(x):
    """Remove leading space characters from a parsed field value.

    Parameters
    ----------
    x : str or None
        A raw field value, or ``None`` for a missing value.

    Returns
    -------
    str or None
        ``x`` without its leading spaces; ``None`` is passed through
        unchanged. Trailing spaces and other whitespace are kept.
    """
    if x is None:
        return None
    # lstrip(' ') strips only the space character and, unlike repeatedly
    # testing x[0], cannot raise IndexError on an empty or all-space string
    return x.lstrip(' ')
# Strip leading spaces from the three optional fields (None stays None).
# Only preps_clean is used in the dataframe below; dewps_clean and
# temps_clean are computed but not written out.
dewps_clean = [bad_spaces_go_bye_bye(x) for x in dewps]
preps_clean = [bad_spaces_go_bye_bye(x) for x in preps]
temps_clean = [bad_spaces_go_bye_bye(x) for x in temps]

# Make a dataframe
s = pd.Series(stations, name = 'Station')
d = pd.Series(dates, name = 'Date')
la = pd.Series(lats, name = 'Latitude')
lo = pd.Series(longs, name = 'Longitude')
p = pd.Series(preps_clean, name = 'Precipitation')
df = pd.concat([s,d,la,lo,p], axis = 1)

# Remove some bad entries: keep only rows whose station id begins with 'US'.
# The mask is built with an explicit bool dtype so that an empty frame is
# still filtered by rows and keeps its columns.
keep = pd.Series([str(station).startswith('US') for station in df.Station], dtype = bool)
df = df[keep].reset_index(drop = True)

# Save to file; direc[:-10] drops the trailing 'noaa_data/' (10 characters)
# so the output lands in the sibling 'ultra_data' directory
df.to_csv(direc[:-10] + 'ultra_data/NOAA.csv', index = False)