-
Notifications
You must be signed in to change notification settings - Fork 4
/
countstatements.py
52 lines (47 loc) · 1.3 KB
/
countstatements.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
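# Counts the lines, statements, labels, descriptions, aliases, sitelinks,
# and the distinct subjects and objects found in the kb dump.
# Needs kb.txt.gz in the working directory; the results go to stats.txt.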
import gzip
# Print a progress tick on stdout every this many input lines.
PROGRESS_INTERVAL = 1000000

# Predicates that are tallied on their own and never counted as statements.
TERM_PREDICATES = ('label', 'description', 'alias', 'link')


def count_statements(lines, output):
    """Tally a kb dump and write the report to *output*.

    Each line is expected to look like ``<subject> <predicate> <rest>``,
    separated by single spaces.  Lines starting with ``#`` are copied to
    *output* unchanged.  Lines starting with a space, lines with fewer than
    three fields, and lines whose subject does not start with ``Q`` are
    counted as lines but otherwise ignored.

    For the remaining lines:

    * the predicates ``label``, ``description``, ``alias`` and ``link`` each
      bump their own counter;
    * a predicate starting with ``P`` counts as one statement, and its
      subject is added to the set of distinct subjects;
    * when the rest of such a line starts with ``Q`` it is added to the set
      of distinct objects.

    After the scan, one ``<number> <name>`` line per statistic is written to
    *output*, after any copied ``#`` lines.

    Args:
        lines: iterable of text lines (for example an open file).
        output: object with a ``write(str)`` method.

    Returns:
        A dict with the keys ``lines``, ``statements``, ``labels``,
        ``descriptions``, ``aliases``, ``sitelinks``, ``subjects`` and
        ``objects`` mapped to the numbers that were written.
    """
    linecount = 0
    statements = 0
    term_counts = dict.fromkeys(TERM_PREDICATES, 0)
    subjects = set()
    objects = set()

    for line in lines:
        linecount += 1
        if linecount % PROGRESS_INTERVAL == 0:
            # Single-argument form: prints the same under Python 2 and 3.
            print(linecount // PROGRESS_INTERVAL)

        if line.startswith('#'):
            output.write(line)
            continue
        if line.startswith(' '):
            continue

        parts = line.split(' ', 2)
        if len(parts) != 3:
            continue
        subject, predicate, value = parts
        if not subject.startswith('Q'):
            continue

        if predicate in term_counts:
            # None of these names starts with 'P', so they are not statements.
            term_counts[predicate] += 1
            continue
        if not predicate.startswith('P'):
            continue

        subjects.add(subject)
        statements += 1
        # `value` is the whole remainder of the line (line ending included),
        # so objects are distinguished by that full remainder.
        if value.startswith('Q'):
            objects.add(value)

    summary = (
        (linecount, 'lines'),
        (statements, 'statements'),
        (term_counts['label'], 'labels'),
        (term_counts['description'], 'descriptions'),
        (term_counts['alias'], 'aliases'),
        (term_counts['link'], 'sitelinks'),
        (len(subjects), 'subjects'),
        (len(objects), 'objects'),
    )
    for number, name in summary:
        output.write('%d %s\n' % (number, name))
    # NOTE: a 'nodes' figure (subjects that also occur as objects) was
    # commented out in the original script and is still not reported.
    return dict((name, number) for number, name in summary)


def main():
    """Read kb.txt.gz from the working directory and write stats.txt."""
    # NOTE(review): the original relied on Python 2 (`file()`, the print
    # statement) and compares lines against str literals.  Under Python 3,
    # gzip.open() yields bytes by default, so the dump would have to be
    # opened in text mode ('rt') there.
    # The input is opened first so that a missing dump does not truncate an
    # existing stats.txt; both handles are closed on exit, even on error.
    with gzip.open('kb.txt.gz') as kb, open('stats.txt', 'w') as output:
        count_statements(kb, output)


if __name__ == '__main__':
    main()