preprocess.py
import hazm
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources the English pipeline depends on.
nltk.download("wordnet")
nltk.download("stopwords")

class Preprocess:
    def fit(self, data, lang):
        # Split the corpus by language and run the matching cleaning
        # pipeline. .copy() keeps pandas from raising
        # SettingWithCopyWarning when the text column is reassigned.
        fa_data = data[data["lang"] == "fa"].copy()
        en_data = data[data["lang"] == "en"].copy()
        if lang == "fa":
            self.data = self.clean_fa(fa_data)
        elif lang == "en":
            self.data = self.clean_en(en_data)
    def clean_fa(self, data):
        data.text = self.fa_normalize(data.text)
        data.text = self.tokenizer(data.text)
        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        stop_words = hazm.stopwords_list()
        alphabet = set("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی")
        data.text = data.apply(
            lambda row: self.stemLemmaStopWord(
                stemmer, lemmatizer, stop_words, alphabet, row.text
            ),
            axis=1,
        )
        return data
    def clean_en(self, data):
        data.text = self.en_normalize(data.text)
        data.text = self.tokenizer(data.text)
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        # Named "stop_words" so the imported nltk stopwords module is
        # not shadowed inside this scope.
        stop_words = set(stopwords.words("english"))
        alphabet = set("abcdefghijklmnopqrstuvwxyz")
        data.text = data.apply(
            lambda row: self.stemLemmaStopWord(
                stemmer, lemmatizer, stop_words, alphabet, row.text
            ),
            axis=1,
        )
        return data
    def tokenizer(self, text):
        # Plain whitespace tokenization; the normalizers have already
        # collapsed all whitespace runs to single spaces.
        return text.str.split(" ")
    def stemLemmaStopWord(self, stemmer, lemmatizer, stop_words, alphabet, tokens):
        final_tokens = []
        for token in tokens:
            # Lemmatize first, then stem the lemma.
            stemmed_token = stemmer.stem(lemmatizer.lemmatize(token))
            # hazm's Lemmatizer renders verbs as "past#present"
            # (e.g. "رفت#رو"); keep only the past stem.
            if "#" in stemmed_token:
                stemmed_token = stemmed_token.split("#")[0]
            # Drop stop words, empty strings, and bare single letters.
            if (
                token not in stop_words
                and stemmed_token not in stop_words
                and token != ""
                and stemmed_token not in alphabet
            ):
                final_tokens.append(stemmed_token)
        return final_tokens
    def fa_normalize(self, text):
        # Unify Arabic-presentation variants with their Persian forms.
        text = text.replace(to_replace=r"[ئيی]", value="ی", regex=True)
        text = text.replace(to_replace=r"ك", value="ک", regex=True)
        text = text.replace(to_replace=r"ؤ", value="و", regex=True)
        text = text.replace(to_replace=r"ة", value="ه", regex=True)
        text = text.replace(to_replace=r"[إأآا]", value="ا", regex=True)
        # Replace every character outside the Persian alphabet with a space.
        text = text.replace(
            to_replace=r"[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", value=" ", regex=True
        )
        # Collapse runs of a repeated character to a single occurrence.
        text = text.replace(to_replace=r"(.)\1+", value=r"\1", regex=True)
        # Collapse whitespace runs to single spaces.
        text = text.replace(to_replace=r"[^\S\n\t]+", value=" ", regex=True)
        return text
    def en_normalize(self, text):
        # Strip @mentions and URLs before lowercasing.
        text = text.replace(to_replace=r"@([A-Za-z0-9_]+)", value="", regex=True)
        text = text.replace(to_replace=r"http([^\s\\]+)", value="", regex=True)
        text = text.str.lower()
        # Replace every non-letter with a space.
        text = text.replace(to_replace=r"[^a-z]", value=" ", regex=True)
        # Collapse runs of a repeated character to a single occurrence.
        text = text.replace(to_replace=r"(.)\1+", value=r"\1", regex=True)
        # Collapse whitespace runs to single spaces.
        text = text.replace(to_replace=r"[^\S\n\t]+", value=" ", regex=True)
        return text
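

# --- Usage sketch ---
# A minimal, hypothetical example of driving the class above. The
# DataFrame contents and sample texts here are assumptions, not part of
# the original module; fit() only requires "text" and "lang" columns.
if __name__ == "__main__":
    df = pd.DataFrame(
        {
            "text": [
                "Check this out @user http://example.com sooo coool!!!",
                "nltk and pandas make preprocessing easy",
            ],
            "lang": ["en", "en"],
        }
    )
    pre = Preprocess()
    pre.fit(df, lang="en")
    # Each row's text is now a list of cleaned, stemmed tokens.
    print(pre.data.text.tolist())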