model.py
import re

import emoji
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Concatenate, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download the NLTK resources used below (tokenizers, lemmatizer data, stopwords, POS tagger)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Load data from CSV file
data = pd.read_csv('reddit_posts_data.csv')
# Data Preprocessing
# Data Cleaning
def clean_text(text):
    # Remove punctuation and other non-word characters
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse runs of whitespace (including line breaks) into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove emoji and other characters outside the Basic Multilingual Plane
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    return text
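# Illustrative behavior (sample string is made up):
#   clean_text("Great   stock!!! 🚀 #AI")  ->  "Great stock AI"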
data['post_text'] = data['post_text'].apply(clean_text)
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text)
    # Lemmatize each token (WordNetLemmatizer defaults to noun lemmatization)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)
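# Illustrative behavior (sample string is made up):
#   lemmatize_text("stocks were rising")  ->  "stock were rising"
#   (only noun forms change, since no POS tag is passed to the lemmatizer)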
data['post_text'] = data['post_text'].apply(lemmatize_text)
# POS Tagging
def pos_tagging(text):
    tokens = word_tokenize(text)
    # Tag each token with a Penn Treebank part-of-speech tag
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens
# Apply POS tagging
data['post_text_pos'] = data['post_text'].apply(pos_tagging)
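# Illustrative behavior: pos_tagging("stocks rise") returns pairs such as
#   [('stocks', 'NNS'), ('rise', 'VBP')]  (exact tags depend on the tagger model)
# Note: 'post_text_pos' is stored as a column but is not fed into the model below.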
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['post_text'])
X_seq = tokenizer.texts_to_sequences(data['post_text'])
X_pad = pad_sequences(X_seq, maxlen=100)
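# Illustrative behavior (word indices are hypothetical, assigned by the Tokenizer):
#   texts_to_sequences(["good stock"]) might yield [[12, 7]];
#   pad_sequences then left-pads each sequence with zeros to length 100.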
# Stopwords removal
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    tokens = word_tokenize(text)
    # Drop common English stopwords (case-insensitive)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)
data['post_text'] = data['post_text'].apply(remove_stopwords)
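# Illustrative behavior: remove_stopwords("this is a great stock") -> "great stock"
# Note: the Tokenizer above was fitted before this step, so X_pad still contains
# stopwords; removal here only affects the text used for structural features.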
# Define the function to extract structural features
def extract_structural_features(text):
    # Basic length/count features
    message_length = len(text)
    num_tokens = len(word_tokenize(text))
    num_hashtags = text.count('#')
    num_emails = len(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text))
    num_urls = text.count('http://') + text.count('https://')
    num_periods = text.count('.')
    num_commas = text.count(',')
    num_digits = sum(c.isdigit() for c in text)
    num_sentences = len(sent_tokenize(text))
    num_mentioned_users = text.count('@')
    num_uppercase = sum(c.isupper() for c in text)
    num_question_marks = text.count('?')
    num_exclamation_marks = text.count('!')
    # Count distinct emoji via their demojized :name: codes
    emojis = set(re.findall(r'\:[\w]+\:', emoji.demojize(text)))
    num_emoticons = len(emojis)
    num_dollar_symbols = text.count('$')
    # Count characters outside the whitelist below (whitespace is not
    # whitelisted, so spaces count as "other symbols" too)
    num_other_symbols = len([char for char in text
                             if char not in '"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#@.://,?!' + ''.join(emojis)])
    # Return the 16 features as a list
    return [message_length, num_tokens, num_hashtags, num_emails, num_urls,
            num_periods, num_commas, num_digits, num_sentences, num_mentioned_users,
            num_uppercase, num_question_marks, num_exclamation_marks, num_emoticons,
            num_dollar_symbols, num_other_symbols]
# Apply the function to extract structural features and create new columns
data['structural_features'] = data['post_text'].apply(extract_structural_features)
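# Note: 'post_text' has already been cleaned (punctuation and emoji removed),
# lemmatized, and stripped of stopwords by this point, so punctuation- and
# URL-based counts are mostly zero here; 'structural_features' is also never
# fed into the model below.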
# Convert labels to numerical values
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
data['opinion'] = data['opinion'].map(label_dict)
# Min-max normalize the 'post_upvotes' column to the [0, 1] range
data['post_upvotes'] = (data['post_upvotes'] - data['post_upvotes'].min()) / (data['post_upvotes'].max() - data['post_upvotes'].min())
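# E.g., raw upvote counts [0, 5, 10] map to [0.0, 0.5, 1.0] under min-max scaling.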
# Define the model architecture
input_content = Input(shape=(100,), name='content_input')
input_upvotes = Input(shape=(1,), name='upvotes_input')
embedding = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=100)(input_content)
conv_layer = Conv1D(128, 5, activation='relu')(embedding)
pooling_layer_content = GlobalMaxPooling1D()(conv_layer)
# Upvotes input
dense_upvotes = Dense(32, activation='relu')(input_upvotes)
# Concatenate content and upvotes
concatenated_inputs = Concatenate()([pooling_layer_content, dense_upvotes])
# Additional fully connected layers
dense = Dense(128, activation='relu')(concatenated_inputs)
output = Dense(3, activation='softmax')(dense) # Output
# Define the model
model = Model(inputs=[input_content, input_upvotes], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
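# Optional: print a layer-by-layer overview of the two-input model
model.summary()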
# Split the dataset into training and testing sets for X_pad
X_train_pad, X_test_pad, y_train, y_test = train_test_split(X_pad, data['opinion'].values, test_size=0.2, random_state=42)
# Split 'post_upvotes' with the same random_state so rows stay aligned with X_pad;
# reshape to (n, 1) to match the upvotes Input(shape=(1,))
X_train_upvotes, X_test_upvotes, _, _ = train_test_split(data['post_upvotes'].values.reshape(-1, 1), data['opinion'].values, test_size=0.2, random_state=42)
# Pair the padded sequences with the upvote feature for the two-input model
X_train = [X_train_pad, X_train_upvotes]
X_test = [X_test_pad, X_test_upvotes]
# Train the model
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test)
)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
# Simple interactive prediction loop
while True:
    # User input
    user_input = input("Enter a post text (type 'exit' to quit): ")
    # Check if user wants to exit
    if user_input.lower() == 'exit':
        print("Exiting...")
        break
    # Preprocess user input the same way the training sequences were built
    cleaned_input = clean_text(user_input)
    lemmatized_input = lemmatize_text(cleaned_input)
    tokenized_input = tokenizer.texts_to_sequences([lemmatized_input])
    padded_input = pad_sequences(tokenized_input, maxlen=100)
    # Structural features are computed here but not used by the model
    structural_features = extract_structural_features(cleaned_input)
    structural_features = np.array(structural_features).reshape(1, -1)
    # Get user input upvotes (expected to already be min-max normalized)
    user_upvotes = float(input("Enter the number of upvotes (normalized): "))
    # Pair the padded sequence with the upvote feature, matching the model's two inputs
    user_input_pad = [padded_input, np.array([[user_upvotes]])]
    # Predict sentiment
    predictions = model.predict(user_input_pad)
    # Get the sentiment probabilities
    negative_prob = predictions[0][0]
    neutral_prob = predictions[0][1]
    positive_prob = predictions[0][2]
    # Print results
    print("\nSentiment Probabilities:")
    print(f"Negative: {negative_prob:.2f}")
    print(f"Neutral: {neutral_prob:.2f}")
    print(f"Positive: {positive_prob:.2f}")
    print()