profanity_filter.py (forked from luistovar/sara)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module that provides a class that filters profanities
"""
__author__ = "leoluk"
__modifiedby__ = "iorch"
__version__ = '0.0.2'
import os
import random
import re
import sys
import unicodedata

from bs4 import BeautifulSoup

# Make the local word list importable when this file is run from another directory.
sys.path.append(os.path.dirname(__file__))
import malas_palabras

my_list = malas_palabras.__una_palabra__

class ProfanitiesFilter(object):

    def __init__(self, filterlist, ignore_case=True, replacements="$@%-?!",
                 complete=True, inside_words=False):
        """
        Initialize the profanity filter.

        filterlist   -- list of regular expressions matching forbidden words
        ignore_case  -- ignore capitalization when matching
        replacements -- characters used to replace a forbidden word
        complete     -- replace the whole word, or keep its first and last characters
        inside_words -- also match forbidden words embedded inside other words
        """
        self.badwords = filterlist
        self.ignore_case = ignore_case
        self.replacements = replacements
        self.complete = complete
        self.inside_words = inside_words
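    # Illustrative use of the constructor options, with a hypothetical
    # one-word filter list (not the real malas_palabras entries):
    #
    #     ProfanitiesFilter([u"pinche"], replacements="-").clean(u"que pinche dia")
    #     # -> u"que ------ dia"
    #     ProfanitiesFilter([u"pinche"], replacements="-",
    #                       complete=False).clean(u"que pinche dia")
    #     # -> u"que p----e dia"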
    def remove_accents(self, input_str):
        """Strip accents by decomposing to NFKD and dropping combining marks."""
        nkfd_form = unicodedata.normalize('NFKD', input_str)
        return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
    def review_words(self, raw_text):
        """Normalize raw text: strip markup, map a couple of troublesome
        characters, remove accents and lowercase every word."""
        review_text = BeautifulSoup(raw_text, "html.parser").get_text()
        # Strip a leading "<word><digit>@" prefix, if present.
        letters_only = re.sub(r"^(\w+)[0-9]@", " ", review_text)
        # Lowercase each matched word.
        callback = lambda pat: pat.group(0).lower()
        # Map a couple of troublesome characters before stripping accents.
        iac = re.sub(u"Ă", u"í", letters_only)
        ene = re.sub(u"Ñ", u"ñ", iac)
        no_accents = self.remove_accents(ene)
        meaningful_words = re.sub(r"(\w+)", callback, no_accents).split()
        return u" ".join(meaningful_words)
    def profanity_score(self, my_string0):
        """Return the number of distinct offending tokens in the string.

        The bad words are the regular expressions in self.badwords
        (in this project, the list defined in malas_palabras.py).
        """
        compiled_bw = {}
        isabadword = {}
        my_string = self.review_words(my_string0)
        for j in my_string.split(' '):
            isabadword[j] = 0
            for i in self.badwords:
                i = self.remove_accents(i)
                compiled_bw[i] = re.compile(i)
                if isabadword[j] == 0:
                    # A token counts as a bad word if substituting the
                    # pattern actually changed it.
                    replacement = compiled_bw[i].sub(self.__replacer, j)
                    isabadword[j] = 0 if replacement == j else 1
                else:
                    # Token already flagged; skip the remaining patterns.
                    continue
        print(isabadword)  # debug output: per-token flags
        return sum(isabadword.values())
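    # Sketch of the expected behaviour with a hypothetical one-word filter
    # list (not the real malas_palabras entries):
    #
    #     ProfanitiesFilter([u"maldito"]).profanity_score(u"ese maldito ruido")
    #     # -> 1 (also prints the per-token debug dictionary)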
    def _make_clean_word(self, length):
        """
        Generates a random replacement string of a given length
        using the chars in self.replacements.
        """
        return ''.join([random.choice(self.replacements) for i in
                        range(length)])
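    # For example, _make_clean_word(4) with replacements="-" always returns
    # "----"; with the default "$@%-?!" it returns a random string such as "$?%@".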
    def __replacer(self, match):
        """Return the replacement text for a single matched forbidden word."""
        value = match.group()
        if self.complete:
            return self._make_clean_word(len(value))
        else:
            return value[0] + self._make_clean_word(len(value) - 2) + value[-1]
    def clean(self, text):
        """Cleans a string from profanity."""
        regexp_insidewords = {
            True: r'(%s)',
            False: r'\b(%s)\b',
        }
        regexp = (regexp_insidewords[self.inside_words] %
                  '|'.join(self.badwords))
        r = re.compile(regexp, re.IGNORECASE if self.ignore_case else 0)
        return r.sub(self.__replacer, text)
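
# Sketch of how inside_words affects clean(), with a hypothetical one-word
# filter list (not the real malas_palabras entries):
#
#     f = ProfanitiesFilter([u"pinche"], replacements="-")
#     f.clean(u"pinche y pinchecito")   # -> u"------ y pinchecito"
#     f.inside_words = True
#     f.clean(u"pinche y pinchecito")   # -> u"------ y ------cito"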

if __name__ == '__main__':
    f = ProfanitiesFilter(my_list, replacements="-")
    example = u"I am doing pendejadas chingonas encabronadas porque ese maldito coño de funcionario me hizo emputar, " \
              u"y es un pinche maricón mirón de mierda."
    print(f.clean(example))
    # Offending words are replaced entirely with runs of "-".
    f.inside_words = True
    print(f.clean(example))
    # Offending words are replaced even when they appear inside other words.
    f.complete = False
    print(f.clean(example))
    # Only the interior of each offending word is replaced; the first and
    # last characters are kept.
    print(f.profanity_score(example))
    # Number of offending tokens found in the example sentence.
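
    # A minimal additional sketch using a small explicit word list, so the
    # expected output does not depend on the contents of
    # malas_palabras.__una_palabra__ (the words below are just examples).
    g = ProfanitiesFilter([u"pinche", u"maldito"], replacements="-", complete=False)
    print(g.clean(u"ese maldito pinche ruido"))
    # Prints "ese m-----o p----e ruido"
    print(g.profanity_score(u"ese maldito pinche ruido"))
    # Prints 2 (after the debug dictionary from profanity_score)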