-
Notifications
You must be signed in to change notification settings - Fork 0
/
src.py
489 lines (415 loc) · 19.5 KB
/
src.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt #Plotting properties
import seaborn as sns #Plotting properties
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer #Data transformation
from sklearn.model_selection import train_test_split #Data testing
from sklearn.linear_model import LogisticRegression #Prediction Model
from sklearn.metrics import confusion_matrix, accuracy_score #Comparison between real and predicted
from sklearn.preprocessing import LabelEncoder #Variable encoding and decoding for XGBoost
import re #Regular expressions
import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
def read_data(path):
df = pd.read_csv(path, header=None)
df.columns = ['id','information','type','text']
#Text transformation
df["lower"]=df.text.str.lower() #lowercase
df["lower"]=[str(data) for data in df.lower] #converting all to string
df["lower"]=df.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex
data = df
return data
import pandas as pd
def dataset(datasize):
    """Assemble the combined Twitter sentiment DataFrame at a requested size.

    Parameters
    ----------
    datasize : int or str
        'All'      -> the full train+validation data, unsampled.
        60000      -> 15,000 rows per non-Irrelevant class plus every
                      'Irrelevant' row.
        other int  -> datasize // 4 rows sampled from each of the four classes.

    Returns
    -------
    pandas.DataFrame
        The sampled frame; its class proportions are printed as a side effect.
    """
    train_data = read_data('./Dataset/twitter_training.csv')
    val_data = read_data('./Dataset/twitter_validation.csv')
    all_data = pd.concat([train_data, val_data], ignore_index=True)
    if datasize == 'All':
        sampled_df = all_data
    elif datasize == 60000:
        # Keep every 'Irrelevant' row; sample 15k from each remaining class.
        irrelevant = all_data[all_data['type'] == 'Irrelevant']
        others = all_data[all_data['type'] != 'Irrelevant']
        pieces = [group.sample(15000, random_state=42)
                  for _, group in others.groupby('type')]
        sampled_df = pd.concat(pieces + [irrelevant]).reset_index(drop=True)
    else:
        # Balanced sample: an equal quarter of datasize from each class.
        share = datasize // 4
        pieces = [group.sample(n=share, random_state=42)
                  for _, group in all_data.groupby('type')]
        sampled_df = pd.concat(pieces).reset_index(drop=True)
    # Report the resulting class balance.
    print(sampled_df['type'].value_counts() / len(sampled_df))
    return sampled_df
def bert_data(data):
    """Split data 80/10/10, tokenize with BERT, and build tf.data datasets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'lower' text column and a 'type' label column.

    Returns
    -------
    tuple
        (train_dataset, val_dataset, test_dataset, test_labels): three
        tf.data.Dataset of (tokenized-feature dict, int32 label) plus the
        encoded test labels as a NumPy array.
    """
    # 80% train, then the remaining 20% split evenly into val/test.
    train_df, rem_df = train_test_split(data, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(rem_df, test_size=0.5, random_state=42)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _tokenize(frame):
        # Fixed-length (128) BERT features from the cleaned text column.
        return tokenizer(list(frame["lower"].astype(str)),
                         truncation=True,
                         padding=True,
                         max_length=128)

    train_encodings = _tokenize(train_df)
    val_encodings = _tokenize(val_df)
    test_encodings = _tokenize(test_df)
    # BUG FIX: fit the label encoder on the training labels only and reuse the
    # SAME mapping for val/test. Re-fitting per split (the old code) can assign
    # different integer ids when a split happens to be missing a class, which
    # silently corrupts validation/test metrics.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_df['type'].values)
    val_labels = label_encoder.transform(val_df['type'].values)
    test_labels = label_encoder.transform(test_df['type'].values)
    # Create TensorFlow datasets from the tokenized data.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        train_labels.astype(np.int32)
    ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        val_labels.astype(np.int32)
    ))
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        test_labels.astype(np.int32)
    ))
    print('Data is processed and splitted')
    return train_dataset, val_dataset, test_dataset, test_labels
def bert_model(train_dataset, val_dataset, test_dataset):
    """Fine-tune a pretrained BERT classifier on the training dataset.

    Parameters
    ----------
    train_dataset, val_dataset : tf.data.Dataset
        Unbatched datasets of (feature dict, int label); batched to 32 here.
    test_dataset : tf.data.Dataset
        Unused; kept in the signature for call-site compatibility.

    Returns
    -------
    tuple
        (fitted model, Keras History from training).
    """
    # Load the pre-trained BERT model with a 4-class head.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    # Logits output -> from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    # BUG FIX: the datasets are batched explicitly, so batch_size= must NOT
    # also be passed to fit() — Keras rejects batch_size for dataset inputs.
    history = model.fit(train_dataset.batch(32), epochs=5,
                        validation_data=val_dataset.batch(32))
    return model, history
def train_val_plot(history):
    """Save a two-panel accuracy/loss training-curve figure for a Keras History."""
    curves = history.history
    acc, val_acc = curves['accuracy'], curves['val_accuracy']
    loss, val_loss = curves['loss'], curves['val_loss']
    plt.figure(figsize=(8, 8))
    # Top panel: accuracy per epoch.
    plt.subplot(2, 1, 1)
    plt.plot(acc, label='Training Accuracy')
    plt.plot(val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.ylabel('Accuracy')
    plt.ylim([min(plt.ylim()), 1])
    plt.title('Training and Validation Accuracy 60k data')
    # Bottom panel: loss per epoch.
    plt.subplot(2, 1, 2)
    plt.plot(loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss 60k data')
    plt.xlabel('epoch')
    plt.savefig('BERT Train Results 60k data.png')
    print('Train curve for bert model is saved')
def plot_consusion_matrix(model, test_dataset, test_labels):
    """Predict on the test set, save a confusion-matrix heatmap, print accuracy.

    Note: the function name's 'consusion' typo is preserved because callers
    depend on it.
    """
    logits = model.predict(test_dataset.batch(32)).logits
    predicted = np.argmax(logits, axis=1)
    actual = test_labels.astype(np.int32)
    matrix = confusion_matrix(actual, predicted)
    # Render the matrix as an annotated heatmap and persist it.
    plt.figure(figsize=(8, 5))
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('BERT Confusion Matrix 60k Data')
    plt.savefig('BERT Confusion Matrix 60k Data.jpeg')
    print('Confusion Matrix for bert model is Stored')
    print("Accuracy Score:", accuracy_score(actual, predicted))
def data_split(data):
    """Vectorize the 'lower' column with 1-4-gram counts and make a 90/10 split.

    Returns
    -------
    tuple
        (X_train_bow, y_train_bow, X_test_bow, y_test_bow): sparse count
        matrices plus the corresponding 'type' label Series.
    """
    vectorizer = CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 4))
    train_part, test_part = train_test_split(data, test_size=0.1, random_state=42)
    # Fit the n-gram vocabulary on the training split only; reuse it for test.
    features_train = vectorizer.fit_transform(train_part.lower)
    features_test = vectorizer.transform(test_part.lower)
    labels_train = train_part['type']
    labels_test = test_part['type']
    print('N-gram based data is also processed and splitted')
    return features_train, labels_train, features_test, labels_test
def log_reg(X_train_bow, y_train_bow, X_test_bow, y_test_bow ):
    """Train a liblinear logistic regression on BoW features.

    Prints the test accuracy (as a percentage) and returns the predictions.
    """
    classifier = LogisticRegression(C=10, solver="liblinear", max_iter=1500, verbose=True)
    classifier.fit(X_train_bow, y_train_bow)
    predictions = classifier.predict(X_test_bow)
    print("Accuracy: ", accuracy_score(y_test_bow, predictions) * 100)
    return predictions
def lr_confusion_matrix(y_test_bow, y_pred):
    """Save the LR confusion-matrix heatmap and print the recall-spread bias.

    Bias is defined as (max per-class recall - min per-class recall) divided
    by the max per-class recall, printed as a percentage.
    """
    matrix = confusion_matrix(y_test_bow, y_pred)
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('4-grams LRogistic Regression Confusion Matrix 60k Data')
    plt.savefig('4-grams LRogistic Regression Confusion Matrix 60k Data.jpeg')
    print('Confusion Matrix for LR is Stored')
    # Per-class recall: confusion_matrix rows and value_counts().sort_index()
    # are both ordered by sorted label, so the element-wise division lines up.
    per_class_recall = np.diag(matrix) / y_test_bow.value_counts().sort_index()
    spread = (per_class_recall.max() - per_class_recall.min()) / per_class_recall.max()
    print('{:.2%}'.format(spread))
def fourgrams_mlp(X_train_bow, y_train_bow, y_test_bow, X_test_bow):
    """Train a small MLP on BoW features; print test accuracy.

    Returns
    -------
    tuple
        (fitted classifier, test predictions). fit() returns the estimator
        itself, so the first element doubles as the training "history".
    """
    classifier = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                               solver='adam', max_iter=20, early_stopping=True,
                               verbose=True, n_iter_no_change=5)
    # Early stopping monitors an internal validation split of the train set.
    fitted = classifier.fit(X_train_bow, y_train_bow)
    predictions = fitted.predict(X_test_bow)
    print('Test accuracy:', accuracy_score(y_test_bow, predictions))
    return fitted, predictions
def fourgrams_mlp_train_val_plot(history):
    """Plot the MLP's validation-accuracy and training-loss curves, then save.

    Parameters
    ----------
    history : MLPClassifier
        Fitted estimator (MLPClassifier.fit returns self). Must have been
        trained with early_stopping=True so validation_scores_ exists.
    """
    val_acc = history.validation_scores_
    loss = history.loss_curve_
    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, 1)
    plt.plot(val_acc, 'b', label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.title('Validation Accuracy')
    plt.subplot(2, 1, 2)
    plt.plot(loss, 'r', label='Training Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.xlabel('epoch')
    # BUG FIX: save BEFORE show. plt.show() hands the figure to the GUI event
    # loop; once the window closes the canvas may be torn down, so calling
    # savefig afterwards (as the old code did) can write a blank image.
    plt.savefig('4-Grams MLP Train Results 60k data.png')
    plt.show()
    print('Train curve for MLP is saved')
def mlp_confusion_matrix(y_test_bow, y_pred):
    """Save the MLP confusion-matrix heatmap and print the recall-spread bias."""
    matrix = confusion_matrix(y_test_bow, y_pred)
    # Render the matrix as an annotated heatmap and persist it.
    plt.figure(figsize=(8, 5))
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('4-grams MLP Confusion Matrix 60k Data')
    plt.savefig('4-grams MLP Confusion Matrix 60k Data.jpeg')
    # Bias = relative spread of per-class recall (diagonal / true counts).
    per_class_recall = np.diag(matrix) / y_test_bow.value_counts().sort_index()
    spread = (per_class_recall.max() - per_class_recall.min()) / per_class_recall.max()
    print('{:.2%}'.format(spread))
    print('Confusion Matrix for MLP is Stored')
def bias_data(data):
    """Build deliberately imbalanced BERT datasets for the bias study (case 2).

    Per-label split sizes:
      Positive / Negative:  20,000 train, 100 test
      Irrelevant / Neutral: 2,500 train, 2,400 test
    The combined train set is further split 90/10 into train/val.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'lower' text column and a 'type' label column with at
        least the split sizes above per class.

    Returns
    -------
    tuple
        (train_dataset, val_dataset, test_dataset, test_labels): three
        tf.data.Dataset of (tokenized-feature dict, int32 label) plus the
        encoded test labels.
    """
    label_1_data = data[data['type'] == 'Positive']
    label_2_data = data[data['type'] == 'Negative']
    label_3_data = data[data['type'] == 'Irrelevant']
    label_4_data = data[data['type'] == 'Neutral']
    # Split each label subset into train/test with the fixed counts above.
    label_1_train, label_1_test = train_test_split(label_1_data, test_size=100, train_size=20000, random_state=42)
    label_2_train, label_2_test = train_test_split(label_2_data, test_size=100, train_size=20000, random_state=42)
    label_3_train, label_3_test = train_test_split(label_3_data, test_size=2400, train_size=2500, random_state=42)
    label_4_train, label_4_test = train_test_split(label_4_data, test_size=2400, train_size=2500, random_state=42)
    train_df = pd.concat([label_1_train, label_2_train, label_3_train, label_4_train])
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
    test_df = pd.concat([label_1_test, label_2_test, label_3_test, label_4_test])
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _tokenize(frame):
        # Fixed-length (128) BERT features from the cleaned text column.
        return tokenizer(list(frame["lower"].astype(str)),
                         truncation=True,
                         padding=True,
                         max_length=128)

    train_encodings = _tokenize(train_df)
    val_encodings = _tokenize(val_df)
    test_encodings = _tokenize(test_df)
    # BUG FIX: fit the label encoder on the training labels only and reuse the
    # SAME mapping for val/test. Re-fitting per split (the old code) can assign
    # different integer ids when a split is missing a class, corrupting metrics.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_df['type'].values)
    val_labels = label_encoder.transform(val_df['type'].values)
    test_labels = label_encoder.transform(test_df['type'].values)
    # Create TensorFlow datasets from the tokenized data.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        train_labels.astype(np.int32)
    ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        val_labels.astype(np.int32)
    ))
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        test_labels.astype(np.int32)
    ))
    print('Data is tokenized')
    return train_dataset, val_dataset, test_dataset, test_labels
def bias_bert_model(train_dataset, val_dataset, test_dataset):
    """Fine-tune a pretrained BERT classifier for the bias study.

    Parameters
    ----------
    train_dataset, val_dataset : tf.data.Dataset
        Unbatched datasets of (feature dict, int label); batched to 32 here.
    test_dataset : tf.data.Dataset
        Unused; kept in the signature for call-site compatibility.

    Returns
    -------
    tuple
        (fitted model, Keras History from training).
    """
    # Load the pre-trained BERT model with a 4-class head.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    # Logits output -> from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    # BUG FIX: the datasets are batched explicitly, so batch_size= must NOT
    # also be passed to fit() — Keras rejects batch_size for dataset inputs.
    history = model.fit(train_dataset.batch(32), epochs=5,
                        validation_data=val_dataset.batch(32))
    return model, history
def bias_train_val_plot(history):
    """Save accuracy/loss training curves for the 50k biased-data (case 2) run."""
    curves = history.history
    acc, val_acc = curves['accuracy'], curves['val_accuracy']
    loss, val_loss = curves['loss'], curves['val_loss']
    plt.figure(figsize=(8, 8))
    # Top panel: accuracy per epoch.
    plt.subplot(2, 1, 1)
    plt.plot(acc, label='Training Accuracy')
    plt.plot(val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.ylabel('Accuracy')
    plt.ylim([min(plt.ylim()), 1])
    plt.title('Training and Validation Accuracy 50k biased data case2')
    # Bottom panel: loss per epoch.
    plt.subplot(2, 1, 2)
    plt.plot(loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss 50k biased data case2')
    plt.xlabel('epoch')
    plt.savefig('BERT Train Results 50k biased data case2.png')
def bias_plot_consusion_matrix(model, test_dataset, test_labels):
    """Predict on the biased test set, save a confusion-matrix heatmap, print accuracy.

    Note: the function name's 'consusion' typo is preserved because callers
    depend on it.
    """
    logits = model.predict(test_dataset.batch(32)).logits
    predicted = np.argmax(logits, axis=1)
    actual = test_labels.astype(np.int32)
    matrix = confusion_matrix(actual, predicted)
    # Render the matrix as an annotated heatmap and persist it.
    plt.figure(figsize=(8, 5))
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('BERT Confusion Matrix 50k biased Data Case2')
    plt.savefig('BERT Confusion Matrix 50k biased Data Case2.jpeg')
    print('Confusion Matrix is Stored')
    print("Accuracy Score:", accuracy_score(actual, predicted))
def bias_data_split(data):
    """Build a deliberately imbalanced train/test BoW split (case 2 bias study).

    Train: 20k Positive + 20k Negative + 2.5k Irrelevant + 2.5k Neutral.
    Test:  100 / 100 / 2,400 / 2,400 respectively.

    Returns
    -------
    tuple
        (X_train_bow, y_train_bow, X_test_bow, y_test_bow).
    """
    # (train_size, test_size) per label, in the original concat order.
    split_sizes = {
        'Positive': (20000, 100),
        'Negative': (20000, 100),
        'Irrelevant': (2500, 2400),
        'Neutral': (2500, 2400),
    }
    train_parts, test_parts = [], []
    for label, (n_train, n_test) in split_sizes.items():
        subset = data[data['type'] == label]
        part_train, part_test = train_test_split(
            subset, test_size=n_test, train_size=n_train, random_state=42)
        train_parts.append(part_train)
        test_parts.append(part_test)
    train = pd.concat(train_parts)
    test = pd.concat(test_parts)
    # Fit the 1-4-gram vocabulary on the training split only.
    vectorizer = CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 4))
    X_train_bow = vectorizer.fit_transform(train.lower)
    X_test_bow = vectorizer.transform(test.lower)
    return X_train_bow, train['type'], X_test_bow, test['type']
def bias_log_reg(X_train_bow, y_train_bow, X_test_bow, y_test_bow ):
    """Train a liblinear logistic regression for the bias study.

    Prints the test accuracy (as a percentage) and returns the predictions.
    """
    classifier = LogisticRegression(C=10, solver="liblinear", max_iter=1500, verbose=True)
    classifier.fit(X_train_bow, y_train_bow)
    predictions = classifier.predict(X_test_bow)
    print("Accuracy: ", accuracy_score(y_test_bow, predictions) * 100)
    return predictions
def bias_lr_confusion_matrix(y_test_bow, y_pred):
    """Save the biased-LR confusion-matrix heatmap and print the recall-spread bias."""
    matrix = confusion_matrix(y_test_bow, y_pred)
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('4-grams LRogistic Regression Confusion Matrix 50k bias Case2 Data')
    plt.savefig('4-grams LRogistic Regression Confusion Matrix 50k bias Case2 Data.jpeg')
    print('Confusion Matrix is Stored')
    # Bias = relative spread of per-class recall (diagonal / true counts).
    per_class_recall = np.diag(matrix) / y_test_bow.value_counts().sort_index()
    spread = (per_class_recall.max() - per_class_recall.min()) / per_class_recall.max()
    print('{:.2%}'.format(spread))
def bias_fourgrams_mlp(X_train_bow, y_train_bow, y_test_bow, X_test_bow):
    """Train a small MLP on the biased BoW features; print test accuracy.

    Returns
    -------
    tuple
        (fitted classifier, test predictions). fit() returns the estimator
        itself, so the first element doubles as the training "history".
    """
    classifier = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                               solver='adam', max_iter=20, early_stopping=True,
                               verbose=True, n_iter_no_change=5)
    # Early stopping monitors an internal validation split of the train set.
    fitted = classifier.fit(X_train_bow, y_train_bow)
    predictions = fitted.predict(X_test_bow)
    print('Test accuracy:', accuracy_score(y_test_bow, predictions))
    return fitted, predictions
def bias_fourgrams_mlp_train_val_plot(history):
    """Plot the biased MLP's validation-accuracy and training-loss curves, then save.

    Parameters
    ----------
    history : MLPClassifier
        Fitted estimator (MLPClassifier.fit returns self). Must have been
        trained with early_stopping=True so validation_scores_ exists.
    """
    val_acc = history.validation_scores_
    loss = history.loss_curve_
    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, 1)
    plt.plot(val_acc, 'b', label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.title('Validation Accuracy')
    plt.subplot(2, 1, 2)
    plt.plot(loss, 'r', label='Training Loss')
    plt.legend(loc='upper right')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.xlabel('epoch')
    # BUG FIX: save BEFORE show. plt.show() hands the figure to the GUI event
    # loop; once the window closes the canvas may be torn down, so calling
    # savefig afterwards (as the old code did) can write a blank image.
    plt.savefig('4-Grams MLP Train Results 50k bias Case2 data.png')
    plt.show()
def bias_mlp_confusion_matrix(y_test_bow, y_pred):
    """Save the biased-MLP confusion-matrix heatmap and print the recall-spread bias."""
    matrix = confusion_matrix(y_test_bow, y_pred)
    # Render the matrix as an annotated heatmap and persist it.
    plt.figure(figsize=(8, 5))
    sns.heatmap(matrix, annot=True, fmt=',d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('4-grams MLP Confusion Matrix 50k bias Case2 Data')
    plt.savefig('4-grams MLP Confusion Matrix 50k bias Case2 Data.jpeg')
    # Bias = relative spread of per-class recall (diagonal / true counts).
    per_class_recall = np.diag(matrix) / y_test_bow.value_counts().sort_index()
    spread = (per_class_recall.max() - per_class_recall.min()) / per_class_recall.max()
    print('{:.2%}'.format(spread))
    print('Confusion Matrix is Stored')