-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofanity_filter.py
81 lines (68 loc) · 2.8 KB
/
profanity_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from better_profanity.better_profanity import Profanity
from better_profanity.constants import ALLOWED_CHARACTERS
from better_profanity.utils import any_next_words_form_swear_word
class ProfanityFilter(Profanity):
    '''
    Custom class to return more detailed censoring than the one provided by
    better_profanity.

    Censored words keep their first character and their length; see
    get_replacement_for_swear_word.
    '''

    def __init__(self, *args, **kwargs):
        '''Forward any constructor arguments to Profanity unchanged.'''
        super().__init__(*args, **kwargs)

    def _hide_swear_words(self, text, censor_char):
        '''
        Return text with the detected swear words censored.

        Copied the _hide_swear_words function from better_profanity but calls the
        get_replacement_for_swear_word in this class instead of their utils function.

        text: the string to scan.
        censor_char: the string used for every masked character.
        '''
        start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)

        # If there are no words in the text, return the raw text without parsing
        if start_idx_of_next_word >= len(text) - 1:
            return text

        # Output fragments are collected here and joined once at the end,
        # instead of growing a string with repeated concatenation.
        pieces = []

        # Left strip the text, to avoid inaccurate parsing
        if start_idx_of_next_word > 0:
            pieces.append(text[:start_idx_of_next_word])
            text = text[start_idx_of_next_word:]

        cur_word = ""
        skip_index = -1
        next_words_indices = []

        # Splitting each word in the text to compare with censored words
        for index, char in enumerate(text):
            # Characters already consumed by a multi-word match are skipped
            if index < skip_index:
                continue
            if char in ALLOWED_CHARACTERS:
                cur_word += char
                continue

            # Skip continuous non-allowed characters
            if cur_word.strip() == "":
                pieces.append(char)
                cur_word = ""
                continue

            # Iterate the next words combined with the current one
            # to check if it forms a swear word
            next_words_indices = self._update_next_words_indices(
                text, next_words_indices, index
            )
            contains_swear_word, end_index = any_next_words_form_swear_word(
                cur_word, next_words_indices, self.CENSOR_WORDSET
            )
            if contains_swear_word:
                # NOTE(review): only the current word is masked here, while every
                # character before end_index is then skipped and never written to
                # the output, so the rest of a matched phrase is dropped.
                # Confirm this is the intended result.
                cur_word = self.get_replacement_for_swear_word(cur_word, censor_char)
                skip_index = end_index
                char = ""
                next_words_indices = []

            # If the current word is a swear word
            if cur_word.lower() in self.CENSOR_WORDSET:
                cur_word = self.get_replacement_for_swear_word(cur_word, censor_char)

            pieces.append(cur_word + char)
            cur_word = ""

        # Final check: flush the word still pending when the text ends
        if cur_word != "" and skip_index < len(text) - 1:
            if cur_word.lower() in self.CENSOR_WORDSET:
                cur_word = self.get_replacement_for_swear_word(cur_word, censor_char)
            pieces.append(cur_word)

        return "".join(pieces)

    def get_replacement_for_swear_word(self, cur_word, censor_char):
        '''
        Censors cur_word by replacing all but the first character with the censor_char.

        Returns an empty string when cur_word is empty.
        '''
        # Slicing instead of indexing keeps an empty word from raising IndexError;
        # a non-positive repeat count yields an empty mask.
        return cur_word[:1] + censor_char * (len(cur_word) - 1)
def censor(text):
    '''
    Censor text using profanity filter.

    The ProfanityFilter is created and its word list loaded on the first call
    only; later calls reuse that instance instead of rebuilding it every time.

    text: the value to censor.
    Returns whatever ProfanityFilter.censor returns for text.
    '''
    # The shared instance is kept as an attribute on this function so the
    # word list is not reloaded on every call.
    pf = getattr(censor, "_filter", None)
    if pf is None:
        pf = ProfanityFilter()
        pf.load_censor_words()
        censor._filter = pf
    return pf.censor(text)