-
Notifications
You must be signed in to change notification settings - Fork 4
/
geolabel.py
43 lines (42 loc) · 980 Bytes
/
geolabel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
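# Extracts every coordinate value found in a main snak of the knowledge base
# and writes it, together with a label for the item, to geolabel.txt.gz.
# Requires the knowledge base file kb.txt.gz.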
import ast
import gzip
def _parse_coordinate(raw, lineno):
    """Parse the dict literal that holds one coordinate value.

    raw is the object field of a kb line, still carrying its trailing
    '.' and newline; those two characters are dropped before parsing.
    lineno is only used to make the error message locatable.

    Raises ValueError when the text is not a plain Python literal.
    """
    try:
        # SECURITY: this used to be eval(), which would execute arbitrary
        # expressions found in the data file. literal_eval accepts only
        # literals (dicts, strings incl. the u'' prefix, numbers, None).
        return ast.literal_eval(raw[:-2])
    except (ValueError, TypeError, SyntaxError) as err:
        raise ValueError(
            'line %d: cannot parse coordinate value %r' % (lineno, raw)
        ) from err


def extract_geolabels(kb_path='kb.txt.gz', out_path='geolabel.txt.gz'):
    """Write one 'latitude longitude property item label' line per coordinate.

    Reads the gzip-compressed knowledge base at kb_path line by line. Lines
    starting with '#' are copied to the output unchanged; lines starting
    with a space, or with fewer than three space-separated fields, are
    skipped. For every statement whose property starts with 'P' and whose
    value starts with "{u'latitude':", a line is written to the
    gzip-compressed file at out_path. The label is the English label of the
    item if one was seen, otherwise the most recently seen label of any
    language (with its language prefix), otherwise the item id itself.

    Labels are only known if they appear before the coordinate statement of
    the same item, since the file is processed in a single pass.

    Returns a (lines_read, results_written) tuple. Raises ValueError if a
    coordinate value cannot be parsed.
    """
    linecount = 0
    count = 0
    item = ''
    enlabel = ''
    label = ''
    # surrogateescape lets bytes that are not valid UTF-8 pass through
    # unchanged; newline='\n' disables newline translation. The input is
    # opened first so a missing kb file does not truncate existing output.
    with gzip.open(kb_path, 'rt', encoding='utf-8',
                   errors='surrogateescape', newline='\n') as kb:
        with gzip.open(out_path, 'wt', encoding='utf-8',
                       errors='surrogateescape', newline='\n') as output:
            for line in kb:
                linecount += 1
                # Progress indicator: one number per million lines read.
                if linecount % 1000000 == 0:
                    print(linecount // 1000000)
                if line.startswith('#'):
                    output.write(line)
                    continue
                if line.startswith(' '):
                    continue
                parts = line.split(' ', 2)
                if len(parts) != 3:
                    continue
                s, p, o = parts
                if s != item:
                    # A new item begins: forget the previous item's labels.
                    item = s
                    enlabel = ''
                    label = ''
                if p == 'label':
                    # The value looks like '{xx:text} .\n'; the last four
                    # characters ('} .\n') are cut off.
                    if o.startswith('{en:'):
                        enlabel = o[4:-4]
                    # NOTE(review): assumes this fallback is recorded for
                    # every label line, not only English ones - confirm
                    # against the original (indentation was lost).
                    label = o[1:-4]
                if not p.startswith('P'):
                    continue
                if not o.startswith("{u'latitude':"):
                    continue
                coord = _parse_coordinate(o, linecount)
                if enlabel != '':
                    label = enlabel
                if label == '':
                    # No label seen for this item: fall back to its id and
                    # report it on stdout.
                    # NOTE(review): assumes the print belongs to this
                    # branch - confirm against the original layout.
                    label = s
                    print(s)
                output.write(
                    str(coord['latitude']) + ' ' + str(coord['longitude'])
                    + ' ' + p + ' ' + s + ' ' + label + "\n"
                )
                count += 1
    print(linecount, 'lines')
    print(count, 'results')
    return linecount, count


if __name__ == '__main__':
    extract_geolabels()