data_preprocessing.py
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import KeyedVectors
import re
from utils import power_iteration
import pickle as pkl
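# External resources this script assumes are present (not bundled with the repo):
# the GoogleNews word2vec binary under pretrained-word2vec/, and the NLTK stopword
# and WordNet corpora. A minimal setup sketch for the NLTK data, using the standard
# nltk.download API (a no-op when the corpora are already installed):
import nltk
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)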
pretrained_model = KeyedVectors.load_word2vec_format('pretrained-word2vec/GoogleNews-vectors-negative300.bin', binary=True)
fake_data = pd.read_csv("raw-dataset/Fake.csv")
true_data = pd.read_csv("raw-dataset/True.csv")
fake_data["Label"] = 0
true_data["Label"] = 1
data = pd.concat([fake_data, true_data], axis=0, ignore_index=True)
# data = data.sample(frac = 1).reset_index(drop = True)
data.drop(["title", "subject", "date"], axis=1, inplace=True)
print(data)
# print(data.text)
# print(data["text"][0])
# print(len(data))
X = data.drop(["Label"], axis=1)
y = data["Label"]
num_docs = len(data)
feature_matrix = np.zeros((num_docs, 300))  # one 300-d graph embedding per document
label_matrix = np.zeros(num_docs)           # 0 = fake, 1 = true
stop_words = set(stopwords.words("english"))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()              # reuse a single lemmatizer instance
for i in range(num_docs):
    print('======> Preprocessing news ' + str(i) + '.')
    # print("Original text ==> ", X["text"][i])
    review = re.sub("[^a-zA-Z]", " ", X["text"][i])  # keep letters only
    review = review.lower()
    # print("Original text (cleaned) ==> ", review)
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
# print("Original text (tokens) ==> ", review)
# - Get valid tokens - #
num_nodes = 0
id_2_word = dict()
word_2_id = dict()
id_2_vec = dict()
appeared_nodes = set()
for word in review:
if word not in word_2_id.keys() and word not in appeared_nodes: # - find a new word - #
try:
vec = pretrained_model[word] # - 300 dimension - #
word_2_id[word] = num_nodes
id_2_word[num_nodes] = word
id_2_vec[num_nodes] = vec
num_nodes += 1
appeared_nodes.add(word)
except:
print('============> ' + word + ' could not be found in the Google pretrained model.')
appeared_nodes.add(word)
    # - Construct graph adjacency matrix (co-occurrence within a sliding window of size 3) - #
    adj_matrix = np.zeros((num_nodes, num_nodes))
    for j in range(len(review) - 2):
        word_x = review[j]
        word_y = review[j + 1]
        word_z = review[j + 2]
        if word_x in word_2_id and word_y in word_2_id:
            adj_matrix[word_2_id[word_x]][word_2_id[word_y]] = 1
            adj_matrix[word_2_id[word_y]][word_2_id[word_x]] = 1
        if word_x in word_2_id and word_z in word_2_id:
            adj_matrix[word_2_id[word_x]][word_2_id[word_z]] = 1
            adj_matrix[word_2_id[word_z]][word_2_id[word_x]] = 1
        if word_y in word_2_id and word_z in word_2_id:
            adj_matrix[word_2_id[word_y]][word_2_id[word_z]] = 1
            adj_matrix[word_2_id[word_z]][word_2_id[word_y]] = 1
    # - Get graph embedding - #
    h_matrix = np.zeros((num_nodes, 300))        # node feature matrix (one word vector per node)
    p_matrix = np.zeros((num_nodes, num_nodes))  # personalized PageRank scores per node
    for j in range(num_nodes):
        ppr = np.zeros((num_nodes, ))
        ppr[j] = 1  # restart distribution concentrated on node j
        ppr = power_iteration(ppr, adj_matrix)
        p_matrix[j, :] = ppr
        h_matrix[j, :] = id_2_vec[j]
    z_matrix = np.dot(p_matrix, h_matrix)  # PPR-weighted mixture of word vectors
    z_vec = np.sum(z_matrix, axis=0)       # pool into a single 300-d document vector
    print('======> Preprocessed news ' + str(i) + '.')
    # print(z_vec)
    # print(y[i])
    feature_matrix[i, :] = z_vec
    label_matrix[i] = y[i]
with open('preprocessed-dataset/feature_matrix.pkl', 'wb') as f:
    pkl.dump(feature_matrix, f)
with open('preprocessed-dataset/label_matrix.pkl', 'wb') as f:
    pkl.dump(label_matrix, f)
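
# For reference: `power_iteration` lives in utils.py and is not shown in this file.
# The sketch below is an assumption about what it computes, personalized PageRank
# with restart via power iteration; the name `_power_iteration_sketch` and the
# `damping`, `tol`, and `max_iter` parameters are illustrative, not taken from
# this repository.
def _power_iteration_sketch(restart, adj_matrix, damping=0.85, tol=1e-6, max_iter=100):
    degrees = adj_matrix.sum(axis=1, keepdims=True)
    degrees[degrees == 0] = 1.0          # guard isolated nodes against divide-by-zero
    transition = adj_matrix / degrees    # row-normalized transition matrix
    rank = restart.astype(float)
    for _ in range(max_iter):
        new_rank = damping * transition.T.dot(rank) + (1 - damping) * restart
        delta = np.abs(new_rank - rank).sum()  # L1 change between iterations
        rank = new_rank
        if delta < tol:
            break
    return rank

# Quick sanity check for the saved artifacts (expected shapes, assuming the
# dumps above succeeded):
# features = pkl.load(open('preprocessed-dataset/feature_matrix.pkl', 'rb'))
# labels = pkl.load(open('preprocessed-dataset/label_matrix.pkl', 'rb'))
# print(features.shape, labels.shape)  # (num_docs, 300) and (num_docs,)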