-
Notifications
You must be signed in to change notification settings - Fork 1
/
yapp_classifier.py
108 lines (89 loc) · 3.89 KB
/
yapp_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pickle
import re
import tensorflow as tf
from nltk.corpus import stopwords
class YappClassifier:
    """Score short texts ("yapps") as negative/positive with a saved Keras model.

    The constructor loads four resources: the NLTK English stopword list, a
    pickled tokenizer (must provide ``texts_to_sequences``), a model loadable by
    ``tf.keras.models.load_model`` and a pickled stemmer (must provide
    ``stem``).  Text is cleaned, tokenized and padded before prediction.
    """

    # Patterns used by ``clean_yapp``; compiled once instead of on every call.
    _MULTI_SPACE_RE = re.compile(r' +')
    _MENTION_RE = re.compile(r'@\w+')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
    _SINGLE_CHAR_RE = re.compile(r'\b\w\b')

    def __init__(self, model_path: str, tokenizer_path: str, stemmer_path: str,
                 max_length: int = 20, truncating_type: str = 'post',
                 padding_type: str = 'post'):
        """Load the stopwords, tokenizer, model and stemmer.

        Args:
            model_path: Path passed to ``tf.keras.models.load_model``.
            tokenizer_path: Path to the pickled tokenizer.
            stemmer_path: Path to the pickled stemmer.
            max_length: ``maxlen`` forwarded to ``pad_sequences``.
            truncating_type: ``truncating`` forwarded to ``pad_sequences``.
            padding_type: ``padding`` forwarded to ``pad_sequences``.

        Raises:
            FileNotFoundError: A tokenizer, model or stemmer file is missing.
            ValueError: The tokenizer or stemmer file cannot be unpickled.
            RuntimeError: Any other failure while loading a resource.
        """
        self.max_length = max_length
        self.truncating_type = truncating_type
        self.padding_type = padding_type

        try:
            # Load the stopwords
            self.stop_words = stopwords.words("english")
        except Exception as e:
            raise RuntimeError(f"Error loading stopwords: {e}") from e

        # Load the tokenizer
        self.tokenizer = self._load_pickle(tokenizer_path, "tokenizer")

        try:
            # Load the model
            self.model = tf.keras.models.load_model(model_path)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Model file not found at path: {model_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading model: {e}") from e

        # Load the stemmer (errors now report stemmer_path, not tokenizer_path)
        self.stemmer = self._load_pickle(stemmer_path, "stemmer")

    @staticmethod
    def _load_pickle(path: str, label: str):
        """Unpickle the object at *path*; *label* names the resource in errors.

        Raises:
            FileNotFoundError: *path* does not exist.
            ValueError: The file content is not a valid pickle.
            RuntimeError: Any other failure while reading or unpickling.
        """
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at files from a trusted source.
            with open(path, 'rb') as handle:
                return pickle.load(handle)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"{label.capitalize()} file not found at path: {path}") from e
        except pickle.UnpicklingError as e:
            raise ValueError(
                f"Error unpickling {label} file at path: {path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {label}: {e}") from e

    def clean_yapp(self, yapp: str) -> str:
        """Normalize one text: lowercase, strip noise, drop stopwords, stem.

        The step order is kept exactly as originally written, since the
        tokenizer/model were presumably fitted on text cleaned this way
        (TODO confirm against the training pipeline).

        Raises:
            ValueError: Any failure while cleaning (e.g. *yapp* is not a str).
        """
        try:
            # Casefolding and remove surrounding spaces
            output = yapp.lower().strip()
            # Remove extra spaces in between
            output = self._MULTI_SPACE_RE.sub(' ', output)
            # Remove @username
            output = self._MENTION_RE.sub(' ', output)
            # Remove punctuations (everything that is not an ASCII letter)
            output = self._NON_LETTER_RE.sub(' ', output)
            # Remove all words with only one char in it
            output = self._SINGLE_CHAR_RE.sub('', output)
            # Remove stopwords and apply stemming; split() also collapses the
            # runs of spaces left behind by the substitutions above.
            return ' '.join(self.stemmer.stem(word)
                            for word in output.split()
                            if word not in self.stop_words)
        except Exception as e:
            raise ValueError(f"Error cleaning text: {e}") from e

    def preprocess_yapps(self, yapps: list[str]):
        """Clean, tokenize and pad a list of texts for the model.

        Returns:
            The result of ``pad_sequences`` applied to the tokenized texts,
            using the instance's ``max_length``, ``padding_type`` and
            ``truncating_type``.

        Raises:
            ValueError: Any failure during cleaning, tokenizing or padding.
        """
        try:
            cleaned_yapps = [self.clean_yapp(yapp) for yapp in yapps]
            sequences = self.tokenizer.texts_to_sequences(cleaned_yapps)
            return tf.keras.preprocessing.sequence.pad_sequences(
                sequences, maxlen=self.max_length,
                padding=self.padding_type, truncating=self.truncating_type)
        except Exception as e:
            raise ValueError(f"Error preprocessing texts: {e}") from e

    def classify_yapp(self, yapp: str) -> dict:
        """Return ``{"negative": ..., "positive": ...}`` confidences for one text.

        The first value of the flattened model output is used as the positive
        confidence (assumes a single sigmoid-style output in [0, 1] — TODO
        confirm against the model); the negative confidence is its complement.

        Raises:
            RuntimeError: Any failure during preprocessing or prediction.
        """
        try:
            preprocessed_yapp = self.preprocess_yapps([yapp])
            prediction = self.model.predict(preprocessed_yapp).flatten()
            # Use the raw score directly: computing 1 - (1 - p) rounds very
            # small scores to exactly 0.
            positive_confidence = prediction[0]
            return {
                "negative": 1 - positive_confidence,
                "positive": positive_confidence,
            }
        except Exception as e:
            raise RuntimeError(f"Error classifying yapp: {e}") from e