-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcouchdb-loader.py
executable file
·103 lines (79 loc) · 1.94 KB
/
couchdb-loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python
# encoding: utf-8
"""
Simple script to import geonames data into couchdb
"""
import sys
import os
import shutil
import tempfile
from couchdb import Server, ResourceNotFound
from uuid import uuid4
import time
# Wall-clock reference taken at startup; the summary printed at the end of the
# script reports the time elapsed since this moment.
start = time.time()

# Tab-separated input file, given relative to the current working directory.
data_file_path = "data/test_large.txt"

# Names for the columns of each input line, in file order.  The loader pairs
# them positionally with the tab-separated values of a line, and they become
# the keys of the stored document.
field_keys = (
    "geonameid name asciiname alternatenames "
    "lat long "
    "feature_class feature_code "
    "country_code cc2 "
    "admin1 admin2 admin3 admin4 "
    "population elevation gtopo30 "
    "timezone last_updated"
).split()
class GeonamesLoader(object):
    """Bulk-load tab-separated geonames records into a CouchDB database.

    On construction the loader creates a temporary directory, passes it to
    the ``couchdb`` ``Server`` client as its ``cache`` argument, and opens
    the ``geonames`` database on ``http://localhost:5984/`` (creating the
    database if it does not exist yet).  The temporary directory is removed
    by :meth:`close`, which is also invoked from ``__del__``.
    """

    # Running count of documents handed to CouchDB.  Kept as a class-level
    # default so the counter has a starting value on every instance.
    total_inserts = 0

    # Captured at class-definition time: ``__del__`` may run during
    # interpreter shutdown, when the module-level ``shutil`` name can no
    # longer be relied upon.
    _rmtree = staticmethod(shutil.rmtree)

    def __init__(self):
        uri = 'http://localhost:5984/'
        self.cache_dir = tempfile.mkdtemp(prefix='couchdb')
        self.server = Server(uri, cache=self.cache_dir)
        try:
            self.db = self.server['geonames']
        except ResourceNotFound:
            self.db = self.server.create('geonames')

    def load_data(self, data_file_path, field_keys, max_docs_per_insert=1000):
        """Read a tab-separated file and insert one document per line.

        Each line is decoded as UTF-8, stripped of trailing whitespace and
        split on tabs.  The values are paired positionally with
        ``field_keys``; blank values are dropped and a random ``_id`` is
        assigned.  Lines that yield no non-blank field at all are skipped
        instead of being stored as empty documents.

        :param data_file_path: path of the tab-separated input file.
        :param field_keys: column names, in the order the columns appear.
        :param max_docs_per_insert: number of documents sent per bulk
            request; every flush except possibly the last carries exactly
            this many documents.
        """
        batch = []
        # Read bytes and decode explicitly so the behaviour does not depend
        # on the platform default encoding; ``with`` closes the file even
        # when decoding or the insert raises.
        with open(data_file_path, 'rb') as data_file:
            for raw_line in data_file:
                values = raw_line.decode('utf8').rstrip().split("\t")
                # Build the document without blank values directly rather
                # than deleting from a dict while iterating over it.
                doc = {key: value
                       for key, value in zip(field_keys, values)
                       if value}
                if not doc:
                    continue
                doc['_id'] = uuid4().hex
                batch.append(doc)
                if len(batch) >= max_docs_per_insert:
                    self.bulk_insert_docs(batch)
                    batch = []
        # Flush whatever is left over (a no-op when the batch is empty).
        self.bulk_insert_docs(batch)

    def bulk_insert_docs(self, docs):
        """Send ``docs`` to the database in one request and update the tally.

        Does nothing for an empty list.

        NOTE(review): the return value of ``self.db.update`` is ignored, so
        per-document failures (if the client reports them there) are not
        detected and every submitted document is counted -- confirm against
        the couchdb client documentation.
        """
        num_docs = len(docs)
        if not num_docs:
            return
        self.db.update(docs)
        print("inserted %d docs" % num_docs)
        self.total_inserts = self.total_inserts + num_docs

    def close(self):
        """Remove the temporary cache directory; safe to call repeatedly."""
        # ``getattr`` covers instances whose ``__init__`` never got as far as
        # creating the directory.
        cache_dir = getattr(self, 'cache_dir', None)
        if cache_dir is not None:
            self.cache_dir = None
            self._rmtree(cache_dir, ignore_errors=True)

    def __del__(self):
        self.close()
# Script body: build the loader, import the configured data file and report
# how many documents were sent and how long the whole run took.
geonames_loader = GeonamesLoader()
print("Loading data...")
geonames_loader.load_data(data_file_path, field_keys)
print("..... done.")
# One formatted string instead of comma-separated print items: the original
# combined explicit padding spaces with print's own separator, producing
# doubled spaces in the summary line.
print("Loaded %d docs in %.2f seconds." % (
    geonames_loader.total_inserts, time.time() - start))