-
Notifications
You must be signed in to change notification settings - Fork 7
/
dump.py
77 lines (67 loc) · 2.45 KB
/
dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sqlaload as sl
import csv
from common import *
log = logging.getLogger('dump')
def convert_value(value):
if value is None:
value = u''
elif not isinstance(value, unicode):
value = unicode(value)
return value.encode('utf-8')
def sources_metadata(engine):
sources = {}
log.info("Building sources index...")
source_table = sl.get_table(engine, 'source')
for source in sl.all(engine, source_table):
data = {
'SourceDatasetName': source.get('package_name'),
'SourceDatasetID': source.get('package_id'),
'SourceDatasetTitle': source.get('package_title'),
'SourcePublisherName': source.get('publisher_name'),
'SourcePublisherTitle': source.get('publisher_title'),
'SourceID': source.get('resource_id'),
'SourceURL': source.get('url'),
'SourceFormat': source.get('format'),
}
sources[source['resource_id']] = data
return sources
def generate_all():
engine = db_connect()
spending = sl.get_table(engine, 'spending')
sources = sources_metadata(engine)
signatures = set()
for row in sl.find(engine, spending, valid=True):
if row['signature'] in signatures:
continue
signatures.add(row['signature'])
if not row['resource_id'] in sources:
continue
row.update(sources[row['resource_id']])
row.pop('valid', True)
row.pop('row_id', True)
row.pop('resource_id', True)
row.pop('resource_hash', True)
row['RecordETLID'] = row.pop('id', None)
row['RecordSignature'] = row.pop('signature', None)
row['SourceSheetID'] = row.pop('sheet_id', None)
yield row
def dump_all(filename):
writer = None
fh = open(filename, 'wb')
for i, row in enumerate(generate_all()):
if writer is None:
field_names = row.keys()
writer = csv.DictWriter(fh, field_names)
try:
# Python 2.7/3.2+ only
writer.writeheader()
except AttributeError:
# Python 2.6 etc
writer.writerow(dict(zip(field_names, field_names)))
writer.writerow(dict((k, convert_value(v)) for k,v in row.items()))
if i % 1000 == 0:
log.info("Writing: %s...", i)
log.info("Finished: %s records exported to %s", i, filename)
fh.close()
if __name__ == '__main__':
dump_all('spending.csv')