-
Notifications
You must be signed in to change notification settings - Fork 0
/
glove_model.py
142 lines (102 loc) · 5.12 KB
/
glove_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created on Tue July 16 13:24:27 2020
# @author: miranda (upquark00)
import numpy
import os
import pickle
# TODO: rename load_glove, save_glove etc. simply "load", "save"
class GloVe:
def __init__(self, filepath = 'data/glove_twitter_50d.txt'):
'''
Constructor for class GloVe.
Args:
filepath (string): path to the location of the downloaded file containing
pre-trained GloVe vectors. Must be .txt file type. These are available
at https://nlp.stanford.edu/projects/glove/.
Returns: None
'''
self.filepath = filepath
def make_glove_model(self):
'''
Loads GloVe pre-trained word vectors from a designated .txt file, returning
a dictionary called GloVe_dict. Each dictionary key is a single GloVe word
(token) as a string. Each dictionary value is a numpy array whose dimension
is (features,), i.e. (50,), containing a real-valued vector representation
of the corresponding key token.
Args: None
Returns:
GloVe_dict (dict): each key is a string representing a token (string)
contained in pre-trained GloVe model. "Token" signifies a single
word which may contain symbols but no spaces. Each corresponding
value is a numpy array containing the real-valued feature vector
of that word. Its size is (features,).
average_vector (numpy.ndarray): 1D array of size (features,), i.e. (50,).
Contains the arithmetic mean or "average" of word vectors in the
GloVe model. A reasonable substitute for the vectors of missing
words per the paper's original author.
'''
print("gloVe vectors loading . . .")
with open(self.filepath,'r', encoding='utf8') as foo:
GloVe_dict = {}
for line in foo:
splitLines = line.split()
word = splitLines[0]
wordEmbedding = numpy.array([float(value) for value in splitLines[1:]])
GloVe_dict[word] = wordEmbedding
# TODO: create a separate function that creates average_vector
# TODO: make this faster
# Get average of word vectors to be used for unseen words, per GloVe author
with open(self.filepath, 'r', encoding='utf8') as foo:
for i, line in enumerate(foo):
pass
n_vec = i + 1
hidden_dim = len(line.split(' ')) - 1
vecs = numpy.zeros((n_vec, hidden_dim), dtype=numpy.float32)
with open(self.filepath, 'r', encoding='utf8') as foo:
for i, line in enumerate(foo):
vecs[i] = numpy.array([float(n) for n in line.split(' ')[1:]], dtype=numpy.float32)
average_vector = numpy.mean(vecs, axis=0)
print(len(GloVe_dict),"gloVe vectors loaded.")
return GloVe_dict, average_vector
def save(self):
'''
Calls make_glove_model to create and save pickled versions of the GloVe
model dictionary and a calculated average_vector (used as a standin for
missing words).
Args: None
Returns:
saved_glove_model.pkl: serialized object containing glove_model
saved_avg_vec.pkl: a serialized object containing average_vector
'''
glove_model, average_vector = self.make_glove_model()
with open('saved_glove_model.pkl', 'wb') as f:
pickle.dump(glove_model, f)
with open('saved_avg_vec.pkl', 'wb') as f:
pickle.dump(average_vector, f)
def load(self):
'''
Loads GloVe vectors and computed average vector from separate .pkl files.
Args: None
Returns:
GloVe_dict (dict): each key is a string representing a token (string)
contained in pre-trained GloVe model. "Token" signifies a single
word which may contain symbols but no spaces. Each corresponding
value is a numpy array containing the real-valued feature vector
of that word. Its size is (features,).
average_vector (numpy.ndarray): 1D array of size (features,), i.e. (50,).
Contains the arithmetic mean or "average" of word vectors in the
GloVe model. A reasonable substitute for the vectors of missing
words per the paper's original author.
'''
with open('saved_glove_model.pkl', 'rb') as f:
GloVe_dict = pickle.load(f)
with open('saved_avg_vec.pkl', 'rb') as f:
average_vector = pickle.load(f)
return GloVe_dict, average_vector
def main():
GloVe_Model = GloVe()
if not os.path.isfile('saved_glove_model.pkl'):
GloVe_Model.save(filepath = 'data/glove_twitter_50d.txt' )
if __name__ == "__main__":
main()