-
Notifications
You must be signed in to change notification settings - Fork 0
/
Preprocessor.py
107 lines (88 loc) · 2.99 KB
/
Preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from nltk.stem import PorterStemmer # Porter Stemmer
import re # Python regular expressions
# from nltk.stem.snowball import SnowballStemmer
class Preprocessor(object):
    """Basic toolkit for text preprocessing.

    Bundles tokenization, stemming, lowercasing and stopword filtering.

    Fields
    ------
    stopwords : Set type
        The set with all stopwords (replaced per instance by loadStopwords)
    porter : Object of type PorterStemmer
        The Porter stemmer utility, shared by all instances
    snws : Object of type SnowballStemmer, or None
        The Snowball stemmer utility; None until stemWordSnowball first needs it
    """

    stopwords = set()  # A set keeps the membership test in isNotAStopword cheap
    porter = PorterStemmer()
    snws = None  # Created on demand so unused Snowball support costs nothing

    # Compiled once for the whole class instead of being looked up on every call.
    _tokenSplitter = re.compile(r'(?!\&\b)\W+')

    def __init__(self):
        """Constructor of Class Preprocessor.

        Loads the stopword list from its default location.
        """
        self.loadStopwords()

    def tokenize(self, string):
        r"""Splits parameter 'string' and returns a list of the tokens.

        The regular expression used is (?!\&\b)\W+ which splits the string on
        every run of non alphanumeric characters (\W+).
        An exception to this is when a "&" is directly followed by a word
        character (?!\&\b), e.g. AT&T, P&G, etc.
        These kinds of words are not split.

        Empty strings, which the split produces when the input starts or ends
        with a separator (or is empty), are not tokens and are left out.

        Parameters
        ----------
        string : String type
            A sentence to be split

        Returns
        -------
        tokens : List of strings
            A list containing all non-empty tokens
        """
        return [token for token in self._tokenSplitter.split(string) if token]

    def stemWordPorter(self, word):
        """Stems the given word using the Porter Stemmer library

        Parameters
        ----------
        word : String type
            A word to be stemmed

        Returns
        -------
        stemmedWord : String type
            The stemmed version of the given word
        """
        return self.porter.stem(word)

    def stemWordSnowball(self, word):
        """Stems the given word using the Snowball Stemmer library

        The stemmer is created the first time this method is called and then
        reused. A stemmer assigned to 'snws' beforehand is used as is.

        Parameters
        ----------
        word : String type
            A word to be stemmed

        Returns
        -------
        stemmedWord : String type
            The stemmed version of the given word
        """
        if self.snws is None:
            # Imported here because the module-level import is commented out;
            # without this the attribute lookup below raised AttributeError.
            from nltk.stem.snowball import SnowballStemmer

            # NOTE(review): language assumed to be English, in line with the
            # Porter stemmer used alongside it - confirm against the corpus.
            self.snws = SnowballStemmer('english')
        return self.snws.stem(word)

    def toLowerCase(self, string):
        """Receives a string and returns it with all letters in lower case

        Parameters
        ----------
        string : String type
            The text to be lowercased

        Returns
        -------
        lowercaseString : String type
            The lowercase version of the given string
        """
        return string.lower()

    def isNotAStopword(self, word):
        """Determines whether a word is not a stopword

        Parameters
        ----------
        word : String type
            A word to be checked

        Returns
        -------
        isNotStopword : Boolean type
            Returns True if the given word is not a stopword, otherwise False
        """
        return word not in self.stopwords

    def loadStopwords(self, path='data/stopwords.txt'):
        """Loads all stopword terms from file and saves them to a set structure

        Each line of the file becomes one entry of the set. The set is stored
        on the instance, replacing whatever 'stopwords' held before.

        Parameters
        ----------
        path : String type
            Location of the stopword file, one term per line. Defaults to
            'data/stopwords.txt', resolved against the current working
            directory.
        """
        with open(path) as stopWordFile:
            self.stopwords = set(stopWordFile.read().splitlines())