check.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tweepy
import time
import json
import re
import sys
from pymongo import MongoClient
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer, split_contractions
import pexpect
import subprocess
# NOTE:
# title() for morphological analysis, lower() for dictionary lookup
# ALWAYS: decode().lower().encode()
# encapsulate the lower() and title() handling
# filter @username, #topic and url
filter_pattern = re.compile(r'(@|#|https?:)\S*')
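# e.g. filter_pattern.sub(' ', u'RT @user check this #tag http://t.co/abc')
# keeps only 'RT', 'check' and 'this'; each filtered token is replaced by a single space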
# try to rule out numbers, punctuation, proper nouns, guesses, abbreviations and odd compositions
# use regexes to catch them all
# for german morphs
pattern1 = re.compile(r'_|<\+PUNCT>|<\+CARD>|<\+SYMBOL>|<\+NPROP>|<GUESSER>|<\^ABBR>')
pattern2 = re.compile(r'<NN>|<V>|<SUFF>|<VPART>') # check compound words against the dictionary, since the morphological analysis alone is too loose
# for turkish morphs
pattern3 = re.compile(r'\*UNKNOWN\*|\+Punct|\+Punc|\+Num')
# pattern3 = re.compile(r'\*UNKNOWN\*|\+Punct|\+Num')
def read_dict(dict_file):
    d = set()
    with open(dict_file) as f:
        for line in f:
            d.add(line.strip())
    return d
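# the dictionary files (dict_de.txt / dict_tr.txt in __main__) are assumed to contain one
# lowercase, UTF-8 encoded word form per line, matching the encoded lookups below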
class Checker:
"""
for the speed of morphological analyzer, it has to work in batch,
it analyzes e.g. every 1000 tweets in one go, then reloads the analyzer
"""
def __init__(self, source_db, target_db, de_dict_file, tr_dict_file, policy):
self.client = MongoClient()
self.source_db = self.client[source_db]
self.target_db = self.client[target_db]
self.de_dict = read_dict(de_dict_file)
self.tr_dict = read_dict(tr_dict_file)
self.policy = policy
# generator for reading tweets from db
def tweet_stream(self):
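        # each tweet document is expected to carry at least 'text', 'tweet_id' and 'user_id';
        # the 'indexed' flag set below marks tweets that have already been processed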
for tweet in self.source_db['tweets'].find():
            # mark the tweet as indexed (checked)
self.source_db['tweets'].update({'_id': tweet['_id']}, {'$set': {'indexed': True}}, upsert = True)
yield (tweet['text'], tweet['tweet_id'], tweet['user_id'])
def check(self):
batch_num = 1
size = 10000
# for each batch
for batch in self.batch(self.tweet_stream(), size):
print batch_num * size
batch_num += 1
words, counts = self.tokenize(batch)
trs = self.check_tr(words)
des = self.check_de(words)
i = 0
# ans = []
for ((text, tid, uid), count) in zip(batch, counts):
tr = trs[i: i + count] # [True, False, False, True]
de = des[i: i + count] # [False, True, False, True]
ws = words[i: i + count] # [tr, de, xx, tr]
i += count
de_list = [w for (w, d, t) in zip(ws, de, tr) if d and not t]
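                # heuristic: log the tweet if it has at least one word analyzed as German but not
                # Turkish, and at least twice as many Turkish-looking tokens as non-Turkish ones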
if de_list and tr.count(True) >= tr.count(False) * 2:
print zip(ws, tr)
self.log(text, tid, uid, de_list)
# for debugging
def check_single(self, text):
text = text.decode('utf-8')
words, counts = self.tokenize([(text, 'tid', 'uid')])
print words
# print counts
tr = self.check_tr(words)
print tr
de = self.check_de(words)
print de
de_list = [w for (w, d, t) in zip(words, de, tr) if d and not t]
print de_list
if de_list and tr.count(True) >= 5:
# self.log(text, tid, uid, de_list)
            print 'found one'
def log(self, text, tid, uid, de_list):
print text.encode('utf-8')
print '[' + ', '.join(de_list) + ']'
################
# log the tweet
self.target_db['tweets'].insert({'tweet_id': tid,\
'user_id': uid,\
'text': text,\
'words': de_list})
# log the user
self.target_db['users'].update({'user_id': uid},\
{'$inc': {'count': 1}}, upsert = True)
# log the german words
for word in de_list:
self.target_db['words'].update({'word': word},\
{'$inc': {'count': 1}}, upsert = True)
def batch(self, stream, size):
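        # group the stream into lists of `size` tweets; a final, smaller batch is yielded as well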
out = []
i = 0
for (text, tid, uid) in stream:
out.append((text, tid, uid))
i += 1
if i == size:
yield out
out = []
i = 0
if out:
yield out
    # DONE
    # input: decoded text
    # output: encoded word list (case is left unchanged here)
def tokenize(self, tweets):
"""
tokenize the text and filter the @username (and punctuation, smiley ...), leave only words
"""
counts = [] # [5, 12, 0, 3, ...] the counts of valid words for each tweet
words = [] # list of words
# out = '' # one-word-per-line string of the tokenized words for morph analysis
for (text, tid, uid) in tweets:
i = 0
text = filter_pattern.sub(' ', text)
for sent in split_multi(text):
for token in word_tokenizer(sent):
# words.append(token.lower().encode('utf-8', 'ignore'))
words.append(token.encode('utf-8', 'ignore'))
i += 1
counts.append(i)
return words, counts
    # input: list of words in the batch (encoded)
    # output: list of booleans, one per word, whether it is a Turkish word (by morph analysis or dictionary)
def check_tr(self, words):
"""
morphological analysis for turkish words
"""
input_str = '\n'.join(w.decode('utf-8').title().encode('utf-8') for w in words) + '\n'
# cmd = './bin/lookup -d -q -f bin/checker.script'
cmd = './bin/Morph-Pipeline/lookup -d -q -f bin/Morph-Pipeline/test-script.txt'
lookup = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
output = lookup.communicate(input=input_str)[0]
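        # the lookup output is assumed to be one blank-line-separated analysis block per input
        # word (hence the length assert below); unknown words are tagged *UNKNOWN*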
morphs = output.strip().split('\n\n')
# print morphs
# print len(words)
assert len(morphs) == len(words)
        # a word counts as Turkish if its analysis block carries no unknown/punctuation/number tags,
        # no matter how many analyses it has
# morph_ans = map(lambda x: not x.endswith('*UNKNOWN*'), morphs)
        morph_ans = [self.is_tr_word(m) for m in morphs]
dict_ans = [w in self.tr_dict for w in words]
return [any(pair) for pair in zip(morph_ans, dict_ans)]
def check_de(self, words):
"""
morphological analysis for german words, exclude punctuation and numbers
"""
input_str = '\n'.join(w.decode('utf-8').title().encode('utf-8') for w in words) + '\n'
cmd = './bin/_run_smor.sh 2> /dev/null'
lookup = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
output = lookup.communicate(input=input_str)[0]
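        # as in check_tr, one blank-line-separated analysis block per input word is assumed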
morphs = output.strip().split('\n\n')
assert len(morphs) == len(words)
        morph_ans = [self.is_de_word(w, m) for (w, m) in zip(words, morphs)]
return morph_ans
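    # policy 1 (loose): reject words tagged as punctuation/number/proper noun/guess, fall back to a
    #   dictionary lookup for compound-like analyses (<NN>/<V>/<SUFF>/<VPART>), otherwise accept;
    # policy >= 2 (strict): reject on those tags, otherwise require a dictionary hit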
def is_de_word(self, word, morph_str):
if self.policy == 1:
for line in morph_str.split('\n'):
if pattern1.search(line):
return False
elif pattern2.search(line):
return (word.decode('utf-8').lower().encode('utf-8') in self.de_dict)
return True
elif self.policy >= 2:
if pattern1.search(morph_str):
return False
else:
return word.decode('utf-8').lower().encode('utf-8') in self.de_dict
    def is_tr_word(self, morph_str):
        # accept unless the analysis contains *UNKNOWN*, punctuation or number tags
        return pattern3.search(morph_str) is None
class TextChecker(Checker):
def __init__(self, source_file, target_db, de_dict_file, tr_dict_file, policy):
self.client = MongoClient()
self.target_db = self.client[target_db]
self.de_dict = read_dict(de_dict_file)
self.tr_dict = read_dict(tr_dict_file)
self.source_file = source_file
self.policy = policy
def tweet_stream(self):
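        # each input line is assumed to look like '<prefix>,<tweet_id>,<user_id>,<text>';
        # the text field may itself contain commas, hence the maxsplit of 3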
for line in open(self.source_file):
try:
items = line.strip().split(',', 3)
tid = items[1]
uid = items[2]
text = items[3]
yield (text.decode('utf-8', 'ignore'), tid, uid)
            except IndexError:
                # skip lines that do not split into the expected four fields
                continue
if __name__ == '__main__':
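    # usage: python check.py <source_db> <target_db> [policy]   (policy defaults to 2)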
source_db = sys.argv[1]
target_db = sys.argv[2]
if len(sys.argv) == 4:
policy = int(sys.argv[3]) # 1 for loose, 2 for strict, 3 for super strict
else:
policy = 2
checker = Checker(source_db, target_db, 'dict_de.txt', 'dict_tr.txt', policy)
checker.check()
# source_file = sys.argv[1]
# target_db = sys.argv[2]
# if len(sys.argv) == 4:
# policy = int(sys.argv[3]) # 1 for loose, 2 for strict, 3 for super strict
# else:
# policy = 2
# checker = TextChecker(source_file, target_db, 'dict_de.txt', 'dict_tr.txt', policy)
# checker.check()