-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpure_words.py
executable file
·68 lines (51 loc) · 1.8 KB
/
pure_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
# -*- coding: utf-8 -*-
import io
import re
import sys
from collections import defaultdict
# Matches a number- or date-like token at the start of the text: a run of
# digits, optionally followed by digit groups separated by ',', '.' or '/',
# and an optional trailing '%' (e.g. "1,000.5", "12/05/2010", "50%").
# The '.' separator is escaped so that it matches only a literal dot; left
# unescaped it matched any character and made "12a34" look like one number.
number = re.compile(r'\d+((,|\.|/)\d*)*(%)?')
# Very strict match of English, German and Turkish words: the whole string
# must be built only from ASCII letters and the German/Turkish letters listed
# in the character class.
# NOTE: without a unicode prefix this is a byte-string pattern on Python 2,
# so there the class is compared byte by byte against UTF-8 encoded input.
alphabet = re.compile(r'^[a-zA-ZßäöüÄÖÜçÇşŞğĞıİ]+$')
def load(freq_file):
    """Load a word-frequency table from a UTF-8 encoded text file.

    Each line is split on whitespace; the first field is taken as the word
    and the second as its integer count.  Any further fields are ignored.
    Lines with fewer than two fields, or whose count is not an integer, are
    skipped so that one bad line does not discard the rest of the file.

    Args:
        freq_file: Path of the frequency file to read.

    Returns:
        A ``defaultdict(int)`` mapping each word (as text) to its count.
        Loading is best effort: if the file cannot be opened, read or
        decoded, whatever was loaded up to that point (possibly nothing)
        is returned instead of raising.
    """
    freqs = defaultdict(int)
    try:
        # io.open decodes while reading on both Python 2 and Python 3, and
        # the with-block guarantees the handle is closed.
        with io.open(freq_file, encoding='utf-8') as handle:
            for line in handle:
                fields = line.split()
                if len(fields) < 2:
                    continue
                try:
                    count = int(fields[1])
                except ValueError:
                    continue
                freqs[fields[0]] = count
    except (IOError, OSError, UnicodeDecodeError):
        # Deliberate best effort: a missing or unreadable file yields the
        # entries collected so far rather than aborting the caller.
        pass
    return freqs
# Whole-string match of English, German and Turkish letters, compiled as a
# text (unicode) pattern so that every letter is compared as one character
# instead of as separate UTF-8 bytes.
_WORD_RE = re.compile(u'^[a-zA-ZßäöüÄÖÜçÇşŞğĞıİ]+$')


def is_word(text):
    """Tell whether `text` consists only of English/German/Turkish letters.

    Args:
        text: The candidate word, as text.  UTF-8 encoded bytes are also
            accepted and decoded first.

    Returns:
        A regex match object (truthy) when every character of `text` is an
        ASCII letter or one of the German/Turkish letters in the pattern,
        otherwise ``None``.  The empty string is not a word.

    Raises:
        UnicodeDecodeError: If `text` is bytes that are not valid UTF-8.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Matching decoded text avoids accepting unrelated characters that merely
    # share UTF-8 bytes with the allowed letters.
    return _WORD_RE.match(text)
def _top_words(freq, top):
    """Return the set of the `top` highest-count words of mapping `freq`."""
    ranked = sorted(freq.items(), key=lambda item: item[1], reverse=True)
    return set(word for word, _ in ranked[:top])


def _write_words(path, words):
    """Write `words` in sorted order to `path`, one per line, as UTF-8."""
    # The with-block closes the file even if a write fails part-way.
    with io.open(path, 'w', encoding='utf-8') as out:
        for word in sorted(words):
            out.write(word + u'\n')


def read_freq(top = 10000):
    """Build German and Turkish word lists free of frequent English words.

    Loads the frequency tables ``freq_en.txt``, ``freq_de.txt`` and
    ``freq_tr.txt``, keeps the `top` highest-count words of each, removes
    from the German and Turkish sets every word that is also in the English
    set, and keeps only the remaining words accepted by ``is_word``.  The
    results are written, sorted and one word per line, to ``dict_de.txt``
    and ``dict_tr.txt``.

    Args:
        top: How many of the most frequent words of each language to
            consider.

    Returns:
        None.  The two dictionary files are the only output.
    """
    en_set = _top_words(load('freq_en.txt'), top)
    de_set = _top_words(load('freq_de.txt'), top)
    tr_set = _top_words(load('freq_tr.txt'), top)

    # Words shared with the English top list are dropped so that the
    # remaining ones are specific to German or Turkish.
    de_pure = [word for word in de_set - en_set if is_word(word)]
    tr_pure = [word for word in tr_set - en_set if is_word(word)]

    _write_words('dict_de.txt', de_pure)
    _write_words('dict_tr.txt', tr_pure)
if __name__ == '__main__':
    # Script entry point.  With exactly one command-line argument, that
    # argument is parsed as the number of top words to keep; with any other
    # argument count the script falls back to 100000 (which is larger than
    # read_freq's own default of 10000).
    cli_args = sys.argv[1:]
    top = int(cli_args[0]) if len(cli_args) == 1 else 100000
    read_freq(top)