#!/usr/bin/env python
"""wordfilt.py: Library/utility for preparing text for word frequency analysis.
Usage: ./wordfilt.py [options] <file>
Options:
-h --help Show this help
"""
import re
import nltk
import unittest
from docopt import docopt
class WordMapper:
def __init__(self, *args):
"""
Create a WordMapper from one or more text files expressing word
mappings. Arguments are file names. Each line in each file
expresses an equivalence class after the mapping. For example,
the line:
be was is were are
will cause "was", "is", "were", and "are" to be mapped to "be".
"""
        # TODO: Consider caching the map with marshal.
        self.mapping = dict()
        for fn in args:
            with open(fn) as mapfile:
                map_specs = (l.split() for l in mapfile.read().splitlines())
                # Build a dict mapping each equivalent word to the first.
                for spec in map_specs:
                    if len(spec) >= 2:  # No singletons or empty lines.
                        for word in spec:
                            if word in self.mapping:
                                raise ValueError(
                                    "Duplicate word {}".format(word)
                                )
                            else:
                                self.mapping[word] = spec[0]
def map(self,word):
word = self.regex_subs(word)
if word in self.mapping:
return self.mapping[word]
else:
return word
def regex_subs(self,word):
word = re.sub("'s$",'',word) # killing trailing 's should be safe
word = re.sub("'$",'',word) # killing trailing ' should also be safe
return word
def norm_capitalization(sent):
"""
Normalize the capitalization of each word in a sentence, uncapping
the first word if only the first letter is capitalized, and attempting
to detect ALL CAPS with heuristics.
Input
=====
sent : string
Return
======
unicode string
"""
def uncap_titlecase_match(m):
"""Function for passing to re.sub. Lowercase a titlecase Word."""
word = m.group(0)
if re.sub(r'[^A-Za-z]','',word) not in {'I'}:
return word.lower()
else:
return word
def uncap_allcaps_match(m):
phrase = m.group(0)
return phrase.lower()
    # Uncap Studly words at the beginning of a sentence, or following " or '.
sent = re.sub(r'((^|[\'">])[A-Z]([a-z-]+| ))',uncap_titlecase_match,sent)
# Uncap n-grams of words in ALL CAPS, for n>=2.
sent = re.sub(r'[A-Z]+( [A-Z]+)+',uncap_allcaps_match,sent)
return sent
def clean_text(text):
    text = re.sub(r'&gt;','>',text) # quotes
    text = re.sub(r'&nbsp;',' ',text) # I don't know where these come from.
text = re.sub(r'https?://[^\s]+|[^\s]+\.[^\s]{2,3}','',text) # URLs
return text
if __name__ == "__main__":
args = docopt(__doc__)
with open(args['<file>']) as infile:
text = infile.read()
text = clean_text(text)
sents = nltk.sent_tokenize(text)
text = "\n".join(norm_capitalization(s) for s in sents)
print(text)
################################### TESTS #####################################
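# A sketched test for WordMapper, added for illustration rather than taken from
# the original suite; the temporary mapping file mirrors the example in the
# __init__ docstring.
class Test_WordMapper(unittest.TestCase):
    def test_basic_mapping(self):
        import os
        import tempfile
        with tempfile.NamedTemporaryFile('w', suffix='.map', delete=False) as f:
            f.write("be was is were are\n")
            fn = f.name
        try:
            mapper = WordMapper(fn)
            self.assertEqual(mapper.map("was"), "be")     # mapped to the class head
            self.assertEqual(mapper.map("be"), "be")      # the head maps to itself
            self.assertEqual(mapper.map("dog's"), "dog")  # regex_subs strips the trailing 's
            self.assertEqual(mapper.map("cat"), "cat")    # unmapped words pass through
        finally:
            os.remove(fn)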
class Test_norm_capitalization(unittest.TestCase):
def test_fixedstrings(self):
fixedstrings = [
"my machine runs Debian.",
"there are four CPUs.",
"CPU is an acronym here.",
"P99 shouldn't change either",
"bob says, 'SNAFU is an acryonym too.'",
"T.J. rides the bus.",
"I yam what I yam.",
"he said, \"I yam what I yam.\""
]
for s in fixedstrings:
self.assertEqual(norm_capitalization(s),s)
def test_sent_begin(self):
inputs = [
"This is a sentence.",
"Ten-der.",
"Supple."
]
expected = [
"this is a sentence.",
"ten-der.",
"supple."
]
for i,e in zip(inputs,expected):
self.assertEqual(norm_capitalization(i),e)
def test_quote_begin(self):
inputs = [
"bob said, 'This is a sentence.'",
"tj. said, \"Ten-der.\"",
"'Supple,' Joe ejaculated.",
"A man is not an island."
]
expected = [
"bob said, 'this is a sentence.'",
"tj. said, \"ten-der.\"",
"'supple,' Joe ejaculated.",
"a man is not an island."
]
for i,e in zip(inputs,expected):
self.assertEqual(norm_capitalization(i),e)
def test_allcaps(self):
inputs = [
"WHAT THE FUCK?",
"We live in the USA.",
"This is ABSOLUTELY RIDICULOUS.",
"My computer has 4 GiB of RAM."
]
expected = [
"what the fuck?",
"we live in the USA.",
"this is absolutely ridiculous.",
"my computer has 4 GiB of RAM."
]
for i,e in zip(inputs,expected):
self.assertEqual(norm_capitalization(i),e)
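# A sketched test for clean_text, also not part of the original suite; it
# exercises the URL-stripping regex and the '&gt;' substitution as they read
# above. re.sub leaves a double space where a URL token is removed.
class Test_clean_text(unittest.TestCase):
    def test_urls(self):
        self.assertEqual(clean_text("see http://example.com for details"),
                         "see  for details")
        self.assertEqual(clean_text("visit example.com today"),
                         "visit  today")
    def test_entities(self):
        self.assertEqual(clean_text("&gt; quoted text"), "> quoted text")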