-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataloader.py
131 lines (110 loc) · 5.03 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 25 02:16:41 2018
@author: Bruce
"""
from tqdm import tqdm
import torch
from gensim.models import Word2Vec
import numpy as np
import load_embeddings
# Pre-trained character-level Word2Vec model (100-D vectors), loaded once at
# import time. NOTE(review): this is an import-time side effect — importing the
# module fails if 'train20_embedding' is missing from the working directory.
word2vec_model = Word2Vec.load('train20_embedding')
# Maps each supported character (a-z plus space) to its class index for the
# one-hot target vectors built in load_data().
element_dict = {'a':0,'b':1,'c':2,'d':3,'e':4,'f':5,'g':6,'h':7,'i':8,'j':9,'k':10
,'l':11,'m':12,'n':13,'o':14,'p':15,'q':16,'r':17,'s':18,'t':19,
'u':20,'v':21,'w':22,'x':23,'y':24,'z':25,' ':26}
# Path to the 300-D "magic" character-embedding file used when
# load_data(..., embeddings='magic') is requested.
MAGIC_EMBEDDINGS_FILE = 'char-embeddings.txt'
def load_data(filename, n_data, chunk_length, embeddings='gensim'):
    """Loads the text data from the given file.
    Inputs:
        filename - Path to the text file containing the text corpus.
        n_data - Number of chunks to load
        chunk_length - Time steps per data point
        embeddings - 'magic' or 'gensim' - the choice of character embeddings.
            magic is 300D and gensim is 100D.
    Returns:
        X_train - List of tensors containing the training setences in embedded space.
        y_train - List of tensors containing one-hot labels for each sentence, shifted by
            1 to the right of X_train.
    Raises:
        ValueError - if embeddings is not 'gensim' or 'magic'.
    """
    if embeddings not in ('gensim', 'magic'):
        raise ValueError('embeddings must be either "gensim" or "magic"')
    # Pick the embedding table and its dimensionality together. The original
    # code hard-coded 100 in the reshape below, which silently produced
    # wrongly-shaped tensors for the 300-D 'magic' embeddings.
    if embeddings == 'magic':
        embed = load_embeddings.load(MAGIC_EMBEDDINGS_FILE)
        embed_dim = 300
    else:
        embed = word2vec_model
        embed_dim = 100
    X_train = []
    y_train = []
    n_chars = len(element_dict)  # loop-invariant; hoisted out of the chunk loop
    # 'with' guarantees the file handle is closed even if embedding lookup
    # raises (the original leaked the handle on any exception).
    with open(filename, 'r') as file:
        corpus = file.read()
    for i in tqdm(range(0, n_data * chunk_length, chunk_length)):
        input_word = corpus[i:i + chunk_length]
        # Embed each character of the chunk.
        # TODO: What's the point of replacing spaces with periods?
        word = [embed['.'] if ch == ' ' else embed[ch] for ch in input_word]
        X_train.append(torch.tensor(word).reshape(1, -1, embed_dim))
        # Construct the one-hot truth labels corresponding to the next character
        # in the sequence at each time step (the chunk shifted right by one).
        # NOTE(review): a character outside element_dict raises KeyError here,
        # same as the original — the corpus is assumed to be a-z plus space.
        y = torch.zeros(len(input_word), n_chars)
        for j, ch in enumerate(corpus[i + 1:i + chunk_length + 1]):
            y[j, element_dict[ch]] = 1
        y_train.append(y)
    return X_train, y_train
def load_normalized_data(filename, chunk_length, n_train, n_val=0, device=torch.device('cpu'), embeddings='gensim', shuffle=True):
    """Loads a version of the dataset that has been normalized.

    Normalization statistics (per-feature mean/std) are computed on the
    training split only and then applied to both splits.

    Inputs:
        filename - Name of a file containing rows of text, the last character of
            which will be used as the truth.
        chunk_length - Number of characters per training example
        n_train - Desired number of training sequences.
        n_val - Desired number of validation sequences.
        device - torch.device to put the tensors on
        embeddings - 'gensim' for 100-dimensional or 'magic' for 300-dimensional
        shuffle - Whether to shuffle the training data
    Returns:
        X_train - Training x tensor
        y_train - Training y tensor (one-hot)
        X_val - Validation data (empty list if n_val == 0)
        y_val - Validation labels (empty list if n_val == 0)
    """
    n_data = n_train + n_val
    X_train, y_train = load_data(filename, n_data, chunk_length, embeddings)
    # Split off the first n_val examples for validation.
    X_val, y_val = X_train[:n_val], y_train[:n_val]
    X_train, y_train = X_train[n_val:], y_train[n_val:]
    # Infer the embedding dimension from the data instead of hard-coding 100,
    # so the 300-D 'magic' embeddings also work.
    embed_dim = X_train[0].shape[-1]
    # Per-feature statistics over the training split only.
    flat = torch.cat(X_train, dim=1).reshape(-1, embed_dim).numpy()
    mean = torch.from_numpy(np.mean(flat, axis=0))  # hoisted: built once, not per example
    std = torch.from_numpy(np.std(flat, axis=0))
    # Normalize and move to the target device. y tensors already exist, so
    # .to(device) avoids the copy-and-warn of torch.tensor(existing_tensor).
    X_train = [((x - mean) / std).to(device) for x in X_train]
    X_val = [((x - mean) / std).to(device) for x in X_val]
    y_train = [y.to(device) for y in y_train]
    y_val = [y.to(device) for y in y_val]
    X_train = torch.stack(X_train, dim=0)
    y_train = torch.stack(y_train, dim=0)
    if n_val > 0:
        X_val = torch.stack(X_val, dim=0)
        y_val = torch.stack(y_val, dim=0)
    # Shuffle X_train and y_train with the same permutation.
    if shuffle:
        indices = np.random.permutation(len(X_train))
        X_train = X_train[indices]
        y_train = y_train[indices]
    return X_train, y_train, X_val, y_val
if __name__ == '__main__':
    # Bug fix: the original call load_data('train20.txt') omitted the required
    # n_data and chunk_length arguments and always raised TypeError.
    # chunk_length=20 is presumed from the 'train20' naming — TODO confirm.
    X_train, y_train = load_data('train20.txt', n_data=1000, chunk_length=20)