-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcleaning.py
333 lines (269 loc) · 10.2 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
"""
This file contains functions for cleaning movie scripts (or short stories)
input directory: m_scripts or short_stories
output directory: m_scripts_clean or short_stories_clean
The code in this file cleans one movie script file at a time. It takes each
input movie script file from the folder `m_scripts` and outputs a new file
to the folder `m_scripts_clean`.
It expands contractions like "didn't" (to "did not"), and replaces
exclusively Unicode symbols by their closest ASCII analogues (e.g., curly
quotes are replaced by straight quotes).
It uses the software SpaCy to break up the movie script into separate
sentences, and returns a file with only one sentence per line.
For the case of movie scripts (but not for short stories), it also tries to
distinguish between dialog lines and narration lines. In many but not all
movie scripts, the dialog lines are indented with respect to the narration
lines. In the case of Pixar/Disney, they don't indent dialog. In cases where
the movie script indents, the MM software gives the option of throwing away
all the dialog lines and keeping only the narration ones. Folders ending in
`_rd` are for remove dialog files.
Occasionally in this file, we use regex (via the Python module `re`).
Here is a nice reference on `re`.
https://www.datacamp.com/tutorial/python-regular-expression-tutorial
ChatGPT is also very good at answering regex questions.
"""
import re
import os
# sentence splitting with NLTK
# from nltk.tokenize import sent_tokenize
import collections as co
from globals import *
from unidecode import unidecode
import contractions
from utils import *
# sentence splitting with spacy
import spacy
nlp = spacy.load('en_core_web_sm')
def expand_contractions(line):
    """
    This auxiliary method replaces all contractions in the string `line` by
    expansions thereof (e.g., replaces "didn't" by "did not".)

    The string is split on whitespace, each word is passed to
    `contractions.fix`, and the results are re-joined with single blank
    spaces. As a consequence, leading and trailing white space (including
    the newline) is dropped and inner runs of white space collapse to one
    blank space.

    Parameters
    ----------
    line: str

    Returns
    -------
    str

    """
    return ' '.join(contractions.fix(word) for word in line.split())
def clean_one_m_script(in_dir,
                       out_dir,
                       file_name,
                       remove_dialog=False):
    """
    This method reads a file called `file_name` in the `in_dir` directory
    and creates a clean version, with one sentence per line, in the
    `out_dir` directory. in_dir and out_dir can be the same, but this will
    overwrite the files.

    The steps are, in order:

    1. transliterate to ASCII with `unidecode` and expand contractions,
    keeping each line's leading white space (its indentation).

    2. drop empty lines and everything from a line that reads "THE END" or
    "END" onwards.

    3. remove parenthetical remarks and underscores, replace tabs by 12
    blank spaces, and replace a period that precedes a number by a dash.

    4. add a missing period at the end of a line whenever the indentation
    changes on the next line, and at the end of the last line.

    5. drop lines that do not contain at least 2 lower case letters.

    6. split the indentations into a most frequent group (assumed to be
    dialog) and a second group (assumed to be narration), and drop lines
    whose indentation is in neither group. If both groups exist and
    `remove_dialog` is True, keep only the narration group.

    7. join what is left, split it into sentences with SpaCy, and write the
    sentences longer than one character.

    If no line survives, an empty output file is written.

    Parameters
    ----------
    in_dir: str
    out_dir: str
    file_name: str
    remove_dialog: bool
        True iff dialog part of the movie script is removed, leaving only
        the narration part.

    Returns
    -------
    None

    """
    print('fetching %s' % file_name)

    def count_leading_wh_sp(str0):
        # wh_sp = white space
        if not str0:
            return 0
        return len(str0) - len(str0.lstrip())

    inpath = in_dir + "/" + file_name
    outpath = out_dir + "/" + file_name

    with open(inpath, "r", encoding='utf-8') as f:
        raw_lines = f.readlines()

    # Replace exclusively unicode characters by ascii analogues (e.g.,
    # replace curly quotes by straight ones) so don't have to use
    # encoding="utf-8" as a parameter in open() from here on.
    # Then expand contractions. expand_contractions() splits on white space
    # and re-joins with single blanks, which would erase the indentation
    # that the dialog/narration analysis below depends on, so the leading
    # white space is saved and put back.
    # Finally, strip trailing (i.e., right) white space and newline. If
    # this results in an empty line, remove it.
    lines = []
    for line in raw_lines:
        line = unidecode(line)
        lead = line[:count_leading_wh_sp(line)]
        line = (lead + expand_contractions(line)).rstrip()
        if line:
            lines.append(line)

    # remove everything after and including THE END
    for pos, line in enumerate(lines):
        if line.strip() in ("THE END", "END"):
            lines = lines[:pos]
            break

    # regex for parenthetical remarks
    pattern_paren = re.compile(r'\[(.*?)\]|\((.*?)\)|\{(.*?)\}')
    # regex for period followed by white spaces + number
    pattern_period = re.compile(r"\.(?=\s*\d)")

    # Substitutions. If subs results in empty line, remove it.
    new_lines = []
    for line in lines:
        # remove parenthetical remarks
        line = pattern_paren.sub("", line)
        # remove the underscore, which is not
        # considered a punctuation mark.
        line = line.replace("_", "")
        # Replace tabs by 12 blank spaces
        line = line.replace("\t", " " * 12)
        # replace period by dash if period followed by number
        line = pattern_period.sub("-", line)
        # A removed parenthetical can leave only white space, or a
        # trailing blank; strip it so that such a line counts as empty and
        # so that the period test below sees the last real character.
        line = line.rstrip()
        if line:
            new_lines.append(line)
    lines = new_lines

    # Add missing periods for transitions from dialog to narration or vice
    # versa, and at the end of the last line. Appending a period does not
    # change the leading white space, so the indents can be computed once.
    # Nothing is indexed if `lines` is empty.
    terminators = (".", "!", "?")
    line_indents = [count_leading_wh_sp(line) for line in lines]
    last = len(lines) - 1
    for i, line in enumerate(lines):
        at_transition = (i == last) or \
                        (line_indents[i] != line_indents[i + 1])
        if at_transition and not line.endswith(terminators):
            lines[i] = line + "."

    # Regex for string that contains at least 2 lower case letters
    # Found cases where line was just "is."
    pattern_lc = re.compile(r'^(.*[a-z]){2,}.*$')
    # Reject lines that don't contain at least 2 lower case letters string.
    # This gets rid of scene directions and character invocations.
    lines = [line for line in lines if pattern_lc.search(line)]

    # Counter returns dictionary mapping item to its number of repetitions
    wh_sp_counter = co.Counter(count_leading_wh_sp(line) for line in lines)
    sum_reps = sum(wh_sp_counter.values())
    # indents sorted from most to least frequent
    indents = sorted(wh_sp_counter,
                     key=wh_sp_counter.get,
                     reverse=True)
    indent_prob_dist = {indent: round(wh_sp_counter[indent] / sum_reps, 3)
                        for indent in indents}
    print("indent prob dist =", [(indent, indent_prob_dist[indent]) \
                                 for indent in indents[0:4]])

    # likely dialog indents
    # most probable indent = indents[0]
    dial_indents = [indent for indent in indents if \
                    abs(indent - indents[0]) <= 3 and \
                    indent_prob_dist[indent] >= .01]
    ndial_indents = [indent for indent in indents \
                     if indent not in dial_indents]
    # likely narration indents
    narr_indents = [indent for indent in ndial_indents if \
                    abs(indent - ndial_indents[0]) <= 3 and \
                    indent_prob_dist[indent] >= .01]
    print("dialog indents=", dial_indents)
    print("narration indents=", narr_indents)

    # Keep only lines whose indentation is in the dialog or the narration
    # group. If only one group exists, there is no difference in
    # indentation between narr and dial, so remove_dialog has no effect.
    # The kept lines are stripped of white space rather than sliced at a
    # fixed column, because a fixed-width slice cuts text off any line
    # that is indented less than the slice width.
    kept_indents = set(dial_indents + narr_indents)
    has_both_groups = bool(dial_indents) and bool(narr_indents)
    new_lines = []
    for line in lines:
        indent = count_leading_wh_sp(line)
        if indent not in kept_indents:
            continue
        if has_both_groups and remove_dialog and \
                indent not in narr_indents:
            continue
        new_lines.append(line.strip())

    # join lines to create new script
    script = ' '.join(new_lines)

    # split script into sentences with spacy
    # remove single character sentences
    sentences = [sent.text for sent in nlp(script).sents
                 if len(sent.text) > 1]
    with open(outpath, "w") as f:
        f.writelines(sentence + "\n" for sentence in sentences)
def clean_batch_of_m_scripts(
        in_dir, out_dir,
        batch_file_names,
        remove_dialog=False):
    """
    This method calls the method `clean_one_m_script` once for each file
    name in the list of file names `batch_file_names`. Before each call, it
    prints the 1-based position of that file name in the list returned by
    `my_listdir(in_dir)`.

    Every name in `batch_file_names` must be in the list returned by
    `my_listdir(in_dir)`; this is checked with an assert.

    Parameters
    ----------
    in_dir: str
    out_dir: str
    batch_file_names: list[str]
    remove_dialog: bool

    Returns
    -------
    None

    """
    available_names = my_listdir(in_dir)
    assert set(batch_file_names) <= set(available_names)
    for name in batch_file_names:
        position = available_names.index(name) + 1
        print('%i.' % position)
        clean_one_m_script(in_dir,
                           out_dir,
                           name,
                           remove_dialog=remove_dialog)
if __name__ == "__main__":
    from globals import *


    def main1():
        # Clean the first 3 files listed in the short stories folder,
        # with dialog removal off.
        source_dir = "short_stories"
        target_dir = "short_stories_clean"
        first_three = my_listdir(source_dir)[0:3]
        clean_batch_of_m_scripts(
            source_dir, target_dir,
            first_three,
            remove_dialog=False)


    def main2():
        # Clean a single movie script with dialog removal on. The output
        # folder depends on whether dialog is removed.
        remove_dialog = True
        target_dir = CLEAN_RD_DIR if remove_dialog else CLEAN_DIR
        clean_one_m_script(
            in_dir=M_SCRIPTS_DIR,
            out_dir=target_dir,
            file_name="up.txt",
            remove_dialog=remove_dialog)


    def main3():
        # Clean a hand-picked batch of movie scripts with dialog removal
        # off. The output folder depends on whether dialog is removed.
        remove_dialog = False
        # batch_file_names=my_listdir(M_SCRIPTS_DIR)
        chosen_scripts = ["toy-story.txt", "up.txt", "wall-e.txt"]
        target_dir = CLEAN_RD_DIR if remove_dialog else CLEAN_DIR
        clean_batch_of_m_scripts(
            in_dir=M_SCRIPTS_DIR,
            out_dir=target_dir,
            batch_file_names=chosen_scripts,
            remove_dialog=remove_dialog)


    # main1()
    # main2()
    main3()