#!/usr/bin/python
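"""Export spreadsheets of Airbnb survey data collected by the
airbnb-data-collection project (forked from tomslee/airbnb-data-collection).

Writes either raw listing data (one Excel worksheet or csv file per survey)
or, with --summary, a summary workbook of listings, reviews, hosts, and
neighborhoods for a named city.
"""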
import psycopg2 as pg
import pandas as pd
import argparse
import datetime as dt
import logging
from airbnb_config import ABConfig

# Set up logging
LOG_LEVEL = logging.INFO
LOG_FORMAT = '%(levelname)-8s%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)

DEFAULT_START_DATE = '2013-05-02'

def survey_df(ab_config, city, start_date):
    """Return a DataFrame of completed surveys for a city since start_date."""
    sql_survey_ids = """
        select survey_id, survey_date, comment
        from survey s, search_area sa
        where s.search_area_id = sa.search_area_id
        and sa.name = %(city)s
        and s.survey_date > %(start_date)s
        and s.status = 1
        order by survey_id
        """
    conn = ab_config.connect()
    df = pd.read_sql(sql_survey_ids, conn,
                     params={"city": city, "start_date": start_date})
    conn.close()
    return df

def city_view_name(ab_config, city):
    """Return the listing view name ('listing_' + abbreviation) for a city."""
    sql_abbrev = """
        select abbreviation from search_area
        where name = %s
        """
    conn = ab_config.connect()
    cur = conn.cursor()
    cur.execute(sql_abbrev, (city, ))
    view_name = 'listing_' + cur.fetchone()[0]
    cur.close()
    conn.close()
    return view_name

def total_listings(ab_config, city_view):
    sql = """select s.survey_id "Survey",
        survey_date "Date", count(*) "Listings"
        from {city_view} r join survey s
        on r.survey_id = s.survey_id
        group by 1, 2
        order by 1
        """.format(city_view=city_view)
    conn = ab_config.connect()
    df = pd.read_sql(sql, conn)
    conn.close()
    return df

def by_room_type(ab_config, city_view):
    sql = """select s.survey_id "Survey",
        survey_date "Date", room_type "Room Type",
        count(*) "Listings", sum(reviews) "Reviews",
        sum(reviews * price) "Relative Income"
        from {city_view} r join survey s
        on r.survey_id = s.survey_id
        where room_type is not null
        group by 1, 2, 3
        order by 1
        """.format(city_view=city_view)
    conn = ab_config.connect()
    df = pd.read_sql(sql, conn)
    conn.close()
    return df.pivot(index="Date", columns="Room Type")

def by_host_type(ab_config, city_view):
    """Aggregate listings by single-listing and multi-listing hosts.

    The inner query counts listings and reviews per host per survey; the
    middle query groups hosts by their listing count; the outer query
    classifies hosts as 'Single' or 'Multi' and sums the totals.
    """
    sql = """
        select survey_id "Survey",
        survey_date "Date",
        case when listings_for_host = 1
        then 'Single' else 'Multi'
        end "Host Type",
        sum(hosts) "Hosts", sum(listings) "Listings", sum(reviews) "Reviews"
        from (
            select survey_id, survey_date,
            listings_for_host, count(*) hosts,
            sum(listings_for_host) listings, sum(reviews) reviews
            from (
                select s.survey_id survey_id, survey_date,
                host_id, count(*) listings_for_host,
                sum(reviews) reviews
                from {city_view} r join survey s
                on r.survey_id = s.survey_id
                group by s.survey_id, survey_date, host_id
            ) T1
            group by 1, 2, 3
        ) T2
        group by 1, 2, 3
        """.format(city_view=city_view)
    conn = ab_config.connect()
    df = pd.read_sql(sql, conn)
    conn.close()
    df = df.pivot(index="Date", columns="Host Type")
    return df

def by_neighborhood(ab_config, city_view):
    sql = """select
        s.survey_id, survey_date "Date", neighborhood "Neighborhood",
        count(*) "Listings", sum(reviews) "Reviews"
        from {city_view} r join survey s
        on r.survey_id = s.survey_id
        group by 1, 2, 3
        """.format(city_view=city_view)
    conn = ab_config.connect()
    df = pd.read_sql(sql, conn)
    conn.close()
    df = df.pivot(index="Date", columns="Neighborhood")
    return df

def export_city_summary(ab_config, city, project, start_date):
    """Write a summary workbook: listings, reviews, hosts, and neighborhoods."""
    logging.info(" ---- Exporting summary spreadsheet" +
                 " for " + city +
                 " using project " + project)
    city_bar = city.replace(" ", "_").lower()
    today = dt.date.today().isoformat()
    xlsxfile = ("./{project}/slee_{project}_{city_bar}_summary_{today}.xlsx"
                ).format(project=project, city_bar=city_bar, today=today)
    writer = pd.ExcelWriter(xlsxfile, engine="xlsxwriter")
    df = survey_df(ab_config, city, start_date)
    city_view = city_view_name(ab_config, city)
    logging.info("Total listings...")
    df = total_listings(ab_config, city_view)
    df.to_excel(writer, sheet_name="Total Listings", index=False)
    logging.info("Listings by room type...")
    df = by_room_type(ab_config, city_view)
    df["Listings"].to_excel(writer,
                            sheet_name="Listings by room type", index=True)
    df["Reviews"].to_excel(writer,
                           sheet_name="Reviews by room type", index=True)
    logging.info("Listings by host type...")
    df = by_host_type(ab_config, city_view)
    df["Hosts"].to_excel(writer,
                         sheet_name="Hosts by host type", index=True)
    df["Listings"].to_excel(writer,
                            sheet_name="Listings by host type", index=True)
    df["Reviews"].to_excel(writer,
                           sheet_name="Reviews by host type", index=True)
    logging.info("Listings by neighborhood...")
    df = by_neighborhood(ab_config, city_view)
    df["Listings"].to_excel(writer,
                            sheet_name="Listings by Neighborhood", index=True)
    df["Reviews"].to_excel(writer,
                           sheet_name="Reviews by Neighborhood", index=True)
    logging.info("Saving " + xlsxfile)
    writer.save()

def export_city_data(ab_config, city, project, format, start_date):
    """Write raw listing data: one csv file or Excel worksheet per survey."""
    logging.info(" ---- Exporting " + format +
                 " for " + city +
                 " using project " + project)
    df = survey_df(ab_config, city, start_date)
    survey_ids = df["survey_id"].tolist()
    survey_dates = df["survey_date"].tolist()
    logging.info(" ---- Surveys: " + ', '.join(str(id) for id in survey_ids))
    conn = ab_config.connect()
    # The project determines which table or view holds the listings
    if project == "gis":
        city_view = city_view_name(ab_config, city)
        sql = """
        select room_id, host_id, room_type,
        borough, neighborhood,
        reviews, overall_satisfaction,
        accommodates, bedrooms, bathrooms,
        price, minstay,
        latitude, longitude,
        last_modified as collected
        from {city_view}
        where survey_id = %(survey_id)s
        order by room_id
        """.format(city_view=city_view)
    elif project == "hvs":
        sql = """
        select room_id, host_id, room_type,
        borough, neighborhood,
        reviews, overall_satisfaction,
        accommodates, bedrooms, bathrooms,
        price, minstay,
        latitude, longitude,
        last_modified as collected
        from hvs.listing
        where survey_id = %(survey_id)s
        order by room_id
        """
    else:
        sql = """
        select room_id, host_id, room_type,
        city, neighborhood,
        reviews, overall_satisfaction,
        accommodates, bedrooms, bathrooms,
        price, minstay,
        latitude, longitude,
        last_modified as collected
        from survey_room(%(survey_id)s)
        order by room_id
        """
    city_bar = city.replace(" ", "_").lower()
    if format == "csv":
        # One csv file per survey
        for survey_id, survey_date in \
                zip(survey_ids, survey_dates):
            csvfile = ("./{project}/ts_{city_bar}_{survey_date}.csv").format(
                project=project, city_bar=city_bar,
                survey_date=str(survey_date))
            csvfile = csvfile.lower()
            df = pd.read_sql(sql, conn,
                             params={"survey_id": int(survey_id)}
                             )
            logging.info("CSV export: survey " +
                         str(survey_id) + " to " + csvfile)
            df.to_csv(csvfile)
            # default encoding is 'utf-8' on Python 3
    else:
        # One Excel worksheet per survey
        today = dt.date.today().isoformat()
        xlsxfile = ("./{project}/slee_{project}_{city_bar}_{today}.xlsx"
                    ).format(project=project, city_bar=city_bar, today=today)
        writer = pd.ExcelWriter(xlsxfile, engine="xlsxwriter")
        logging.info("Spreadsheet name: " + xlsxfile)
        # read surveys
        for survey_id, survey_date in \
                zip(survey_ids, survey_dates):
            logging.info("Survey " + str(survey_id) + " for " + city)
            df = pd.read_sql(sql, conn,
                             params={"survey_id": int(survey_id)}
                             )
            if len(df) > 0:
                logging.info("Survey " + str(survey_id) +
                             ": to Excel worksheet")
                df.to_excel(writer, sheet_name=str(survey_date))
            else:
                logging.info("Survey " + str(survey_id) +
                             " not in production project: ignoring")
        # neighborhood summaries
        if project == "gis":
            sql = "select to_char(survey_date, 'YYYY-MM-DD') as survey_date,"
            sql += " neighborhood, count(*) as listings from"
            sql += " " + city_view + " li,"
            sql += " survey s"
            sql += " where li.survey_id = s.survey_id"
            sql += " and s.survey_date > %(start_date)s"
            sql += " group by survey_date, neighborhood order by 3 desc"
            try:
                df = pd.read_sql(sql, conn, params={"start_date": start_date})
                if len(df.index) > 0:
                    logging.info("Exporting listings for " + city)
                    dfnb = df.pivot(index='neighborhood', columns='survey_date',
                                    values='listings')
                    dfnb = dfnb.fillna(0)
                    dfnb.to_excel(writer, sheet_name="Listings by neighborhood")
            except pg.InternalError:
                # Miami has no neighborhoods
                pass
            except pd.io.sql.DatabaseError:
                # Miami has no neighborhoods
                pass
            sql = "select to_char(survey_date, 'YYYY-MM-DD') as survey_date,"
            sql += " neighborhood, sum(reviews) as visits from"
            sql += " " + city_view + " li,"
            sql += " survey s"
            sql += " where li.survey_id = s.survey_id"
            sql += " and s.survey_date > %(start_date)s"
            sql += " group by survey_date, neighborhood order by 3 desc"
            try:
                df = pd.read_sql(sql, conn, params={"start_date": start_date})
                if len(df.index) > 0:
                    logging.info("Exporting visits for " + city)
                    dfnb = df.pivot(index='neighborhood', columns='survey_date',
                                    values='visits')
                    dfnb = dfnb.fillna(0)
                    dfnb.to_excel(writer, sheet_name="Visits by neighborhood")
            except pg.InternalError:
                # Miami has no neighborhoods
                pass
            except pd.io.sql.DatabaseError:
                pass
        logging.info("Saving " + xlsxfile)
        writer.save()
    conn.close()

def main():
    parser = argparse.ArgumentParser(
        description="Create a spreadsheet of surveys from a city")
    parser.add_argument("-cfg", "--config_file",
                        metavar="config_file", action="store", default=None,
                        help="""explicitly set configuration file, instead of
                        using the default <username>.config""")
    parser.add_argument('-c', '--city',
                        metavar='city', action='store',
                        help="""set the city""")
    parser.add_argument('-p', '--project',
                        metavar='project', action='store', default="public",
                        help="""the project determines the table or view: public
                        for room, gis for listing_city, default public""")
    parser.add_argument('-f', '--format',
                        metavar='format', action='store', default="xlsx",
                        help="""output format (xlsx or csv), default xlsx""")
    parser.add_argument('-s', '--summary',
                        action='store_true', default=False,
                        help="create a summary spreadsheet instead of raw data")
    parser.add_argument('-sd', '--start_date',
                        metavar="start_date", action='store',
                        default=DEFAULT_START_DATE,
                        help="only include surveys after this date, default " +
                        DEFAULT_START_DATE)
    args = parser.parse_args()
    ab_config = ABConfig(args)

    if args.city:
        if args.summary:
            export_city_summary(ab_config, args.city, args.project.lower(),
                                args.start_date)
        else:
            export_city_data(ab_config, args.city, args.project.lower(),
                             args.format, args.start_date)
    else:
        parser.print_help()

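
# Example invocations ("City Name" is a placeholder and must match a
# search_area entry in the database; connection settings come from the
# config file, default <username>.config):
#   python export_spreadsheet.py -c "City Name" -f csv
#   python export_spreadsheet.py -c "City Name" -p gis -s -sd 2016-01-01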
if __name__ == "__main__":
    main()