process_wiki.py
import os
import re
import json
import argparse
import subprocess
from tqdm import tqdm
from multiprocessing import Pool
from unicode_tr import unicode_tr
from trtokenizer.tr_tokenizer import SentenceTokenizer

# Global variables, set in main() and read by the worker processes
tokenize_fn = None
lower_fn = None
# Punctuation characters to strip from sentences (raw string avoids
# the invalid escape sequences of the plain-string form)
chars_to_remove_regex = r'[#$%&()*+,\-./:;<=>?@\[\]^_{|}~!"\\]'
# Apostrophe variants to normalize to a plain ASCII apostrophe
apostrophes = "[’`´ʹʻʼʽʿˈ]"
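# Note: unicode_tr is used because Turkish casing is locale-specific;
# e.g. unicode_tr("İstanbul").lower() yields "istanbul", whereas the
# dotted capital "İ" would otherwise lowercase to "i" plus a combining dot.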

def parse_sentences_wiki_json_file(fpath):
    """Parses & cleans sentences from a Wikipedia file in JSON format

    Args:
        fpath (str): Path to an extracted Wikipedia file

    Returns:
        sentences (List[str]): List of cleaned sentences
    """
    # Load the article text from each JSON line
    with open(fpath) as fp:
        texts = [json.loads(line.strip())['text'] for line in fp]

    # Split paragraphs into sentences and clean each one
    sentences = []
    for text in texts:
        for sent in tokenize_fn(text):
            # Lowercase the sentence
            sent = lower_fn(sent)
            # Remove pre-defined chars
            sent = re.sub(chars_to_remove_regex, "", sent)
            # Unify apostrophes
            sent = re.sub(apostrophes, "'", sent)
            # Collapse repeated whitespace
            sent = re.sub(r"\s+", " ", sent)
            # Keep non-empty sentences only
            if len(sent) > 0:
                sentences.append(sent)
    return sentences
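
# Minimal standalone sketch (illustrative): parse a single extracted file.
# Assumes the globals above are set as main() does for Turkish, and that
# "extract/AA/wiki_00" is a hypothetical wikiextractor output path.
#
#   tokenize_fn = SentenceTokenizer().tokenize
#   lower_fn = lambda x: unicode_tr(x).lower()
#   print(parse_sentences_wiki_json_file("extract/AA/wiki_00")[:5])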

def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description="Process extracted Wikipedia files")
    parser.add_argument(
        "--wiki_dump", type=str, required=True, help="Path to a Wikipedia dump"
    )
    parser.add_argument(
        "--output", type=str, required=True,
        help="Output directory for the extracted files and the sentences file"
    )
    parser.add_argument(
        "--language_id", type=str, default="tr", help="Language id of the Wikipedia dump"
    )
    parser.add_argument(
        "--processes", type=int, default=8, help="Number of processes to use"
    )
    args = parser.parse_args()

    global tokenize_fn
    global lower_fn
    # Sentence tokenizer and lowercasing functions for the language
    if args.language_id == 'tr':
        sentence_tokenizer = SentenceTokenizer()
        tokenize_fn = sentence_tokenizer.tokenize
        lower_fn = lambda x: unicode_tr(x).lower()
    else:
        raise NotImplementedError("Language id '{}' is not supported!".format(args.language_id))

    # Run wikiextractor on the dump; it writes JSON-lines files under extract_dir
    extract_dir = os.path.join(args.output, "extract")
    subargs = [
        "wikiextractor", args.wiki_dump,
        "-o", extract_dir,
        "--no-templates",
        "--json",
        "--processes", str(args.processes)
    ]
    subprocess.check_call(subargs)

    # Paths of the extracted wiki files
    filepaths = [os.path.join(root, filename)
                 for root, dirnames, filenames in os.walk(extract_dir)
                 for filename in filenames]

    # Load wiki files and parse sentences in parallel
    with Pool(args.processes) as pool:
        sentences = []
        for sentences_pool in tqdm(pool.imap(parse_sentences_wiki_json_file, filepaths),
                                   total=len(filepaths)):
            sentences += sentences_pool
    print("Number of extracted sentences: {}".format(len(sentences)))

    # Save all parsed sentences, one per line
    with open(os.path.join(args.output, 'sentences.txt'), 'w') as fp:
        for sent in sentences:
            fp.write('{}\n'.format(sent))

if __name__ == "__main__":
    main()
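
# Example invocation (the dump filename below is illustrative, not prescribed):
#   python process_wiki.py \
#       --wiki_dump trwiki-latest-pages-articles.xml.bz2 \
#       --output data/trwiki \
#       --processes 8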