buildIndex_tokenize.py
import os
import sys
import time
from bs4 import BeautifulSoup
import nltk
import multiprocessing
import logging
#from multiprocessing.dummy import Pool as ThreadPool
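
# worker_tokenize processes one corpus file: BeautifulSoup pulls out every
# <doc> element, the H3/H4 headlines and <text> body are concatenated, and
# NLTK sentence tokenization, word tokenization and Porter stemming turn the
# text into one tab-separated line of terms per sentence in the output file.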
def worker_tokenize(file_path):
    global tokenize_dir
    global stemmer
    print 'Begin: ' + file_path
    # Flatten the file path into a single output filename, e.g. a/b/c -> a_b_c.
    filename = '_'.join(file_path.split('/'))
    fin = open(file_path, 'r')
    fout = open(tokenize_dir + filename, 'wt')
    doc_count = 0
    soup = BeautifulSoup(fin, 'lxml')
    for doc_i in soup.find_all('doc'):
        fout.write('<doc>\n')
        fout.write('<docno> %s </docno>\n' % (doc_i.docno.text.strip()))
        doc_count += 1
        # Collect the headline (H3/H4) and the <text> body of the document.
        doc_text = ''
        if doc_i.h3 is not None:
            doc_text += doc_i.h3.get_text().strip() + '.\n'
        if doc_i.h4 is not None:
            doc_text += doc_i.h4.get_text().strip() + '.\n'
        for text_in_doc in doc_i.select('text'):
            doc_text += text_in_doc.get_text()
        fout.write('<doctext>\n')
        # Write one tab-separated line of stemmed terms per sentence.
        for sent in nltk.sent_tokenize(doc_text):
            word_list = nltk.word_tokenize(sent)
            # Keep only lowercased tokens that start with a letter or digit.
            word_list = [word.lower() for word in word_list
                         if (word[0].isdigit() or word[0].isalpha())]
            term_list = []
            for i in range(len(word_list)):
                term_i = stemmer.stem(word_list[i])
                if term_i != '':
                    term_list.append(term_i)
            try:
                fout.write('%s\n' % ('\t'.join(term_list)))
            except:
                # Fall back to writing terms one by one, skipping any term
                # that cannot be written (e.g. an encoding error).
                for i in range(len(term_list)):
                    try:
                        fout.write('%s' % (term_list[i]))
                        if i < len(term_list) - 1:
                            fout.write('\t')
                    except:
                        continue
                fout.write('\n')
        fout.write('</doctext>\n')
        fout.write('</doc>\n')
    fin.close()
    fout.close()
    print 'end: %s, %d' % (file_path, doc_count)
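
### driver: turn on multiprocessing logging, set the corpus/output paths and
### the stemmer shared by the workers, then fan the files out over a pool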
logger = multiprocessing.log_to_stderr()
logger.setLevel(logging.INFO)
logger.warning('doomed')
global root_dir
root_dir = '/home/zxz/Documents/ir/Robust2004/corpus/'
global tokenize_dir
tokenize_dir = '/home/zxz/Documents/ir/Robust2004/result/tokenize/'
global stemmer
stemmer = nltk.stem.porter.PorterStemmer()
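# Note: nltk.sent_tokenize/word_tokenize need the NLTK 'punkt' tokenizer data
# (nltk.download('punkt')). With the default fork start method on Linux, the
# worker processes inherit tokenize_dir and stemmer from these module-level
# assignments.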
### get all file paths in the corpus
list_file = []
list_dir = os.walk(root_dir)
for root, dirs, files in list_dir:
    for f in files:
        list_file.append(os.path.join(root, f))
sys.stdout.write('There are #%d files to be processed.\n' % (len(list_file)))
### for each file, tokenize plus Porter stemming
count = 0
start_time = time.time()
# Make the pool of worker processes.
pool = multiprocessing.Pool(processes=6)
result = []
# Submit each corpus file to the pool asynchronously; the returned
# AsyncResult objects are not kept, so worker exceptions are discarded.
for file_i in list_file:
    pool.apply_async(worker_tokenize, (file_i, ))
#pool.map(worker_tokenize, list_file)
# Close the pool and wait for the work to finish.
pool.close()
pool.join()
end_time = time.time()
print 'Total time: %f' % (end_time - start_time)
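# Optional variant (sketch, not used above): keep the AsyncResult handles so
# that an exception inside worker_tokenize is re-raised in the parent instead
# of being dropped:
#   results = [pool.apply_async(worker_tokenize, (f, )) for f in list_file]
#   pool.close(); pool.join()
#   for r in results:
#       r.get()   # re-raises any exception from the corresponding worker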