Commit f2003a1
flake8

1 parent 01747cc  commit f2003a1

14 files changed, +231 -157 lines changed

metrics/cluster_metrics.py

Lines changed: 0 additions & 3 deletions

@@ -1,12 +1,9 @@
 """
 cluster metrics: precision, recall, f1
 """
-import math
-import random
 from collections import Counter
 
 import scipy.optimize
-import numpy as np
 import torch
 
 

model/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -1 +0,0 @@
-

model/decoder.py

Lines changed: 5 additions & 3 deletions

@@ -1,9 +1,10 @@
 import torch as t
 import torch.nn as nn
-import torch.nn.functional as F
+
 
 class Decoder(nn.Module):
-    def __init__(self, latent_z_size, word_emb_size, word_vocab_size, decoder_rnn_size, decoder_num_layers, dropout=0.5):
+    def __init__(self, latent_z_size, word_emb_size, word_vocab_size, decoder_rnn_size,
+                 decoder_num_layers, dropout=0.5):
         super(Decoder, self).__init__()
         self.latent_z_size = latent_z_size
         self.word_vocab_size = word_vocab_size
@@ -27,7 +28,8 @@ def forward(self, decoder_input, latent_z):
         """
 
         [batch_size, seq_len, _] = decoder_input.size()
-        # decoder rnn is conditioned on context via additional bias = W_cond * z to every input token
+        # decoder rnn is conditioned on context via additional bias = W_cond * z to every input
+        # token
         latent_z = t.cat([latent_z] * seq_len, 1).view(batch_size, seq_len, self.latent_z_size)
         decoder_input = t.cat([decoder_input, latent_z], 2)
         rnn_out, _ = self.rnn(decoder_input)
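
Note: the comment being rewrapped above describes conditioning the decoder RNN on the latent vector by tiling it across time steps. A minimal standalone sketch of that tiling-and-concatenation step (tensor sizes are illustrative assumptions, not values from the repo):

    import torch as t

    batch_size, seq_len, emb_size, latent_z_size = 4, 7, 32, 16
    decoder_input = t.randn(batch_size, seq_len, emb_size)   # token embeddings
    latent_z = t.randn(batch_size, latent_z_size)            # one latent code per example

    # repeat z along the time axis so every input token sees the same latent code
    z_per_step = t.cat([latent_z] * seq_len, 1).view(batch_size, seq_len, latent_z_size)
    conditioned_input = t.cat([decoder_input, z_per_step], 2)
    assert conditioned_input.shape == (batch_size, seq_len, emb_size + latent_z_size)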

model/encoder.py

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+
 
 class Encoder(nn.Module):
     def __init__(self, word_emb_size, encoder_rnn_size, encoder_num_layers, dropout=0.5):
@@ -28,7 +28,8 @@ def forward(self, encoder_input, lengths):
             encoder_input, lengths, True)
         # Unfold rnn with zero initial state and get its final state from the last layer
         rnn_out, (_, final_state) = self.rnn(packed_words, None)
-        final_state = final_state.view(self.encoder_num_layers, 2, batch_size, self.encoder_rnn_size)[-1]
+        final_state = final_state.view(
+            self.encoder_num_layers, 2, batch_size, self.encoder_rnn_size)[-1]
         h_1, h_2 = final_state[0], final_state[1]
         final_state = torch.cat([h_1, h_2], 1)
         _, unperm_idx = perm_idx.sort(0)
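
For context, the reshape being wrapped here takes the (num_layers * num_directions, batch, hidden) state returned by nn.LSTM and keeps only the last layer's forward and backward halves before concatenating them. A small self-contained sketch of that generic pattern (layer and size values are illustrative, not from the repo):

    import torch
    import torch.nn as nn

    num_layers, hidden_size, batch_size, seq_len, emb_size = 2, 8, 3, 5, 6
    rnn = nn.LSTM(emb_size, hidden_size, num_layers=num_layers,
                  bidirectional=True, batch_first=True)
    x = torch.randn(batch_size, seq_len, emb_size)
    _, (_, final_state) = rnn(x)              # final_state: (num_layers * 2, batch, hidden)

    last_layer = final_state.view(num_layers, 2, batch_size, hidden_size)[-1]
    h_fwd, h_bwd = last_layer[0], last_layer[1]
    final_state = torch.cat([h_fwd, h_bwd], 1)    # (batch, 2 * hidden)
    assert final_state.shape == (batch_size, 2 * hidden_size)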

model/multiview_encoders.py

Lines changed: 22 additions & 9 deletions

@@ -8,9 +8,12 @@
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
+
 class MultiviewEncoders(nn.Module):
 
-    def __init__(self, vocab_size, num_layers, embedding_size, lstm_hidden_size, word_dropout, dropout, start_idx=2, end_idx=3, pad_idx=0):
+    def __init__(
+            self, vocab_size, num_layers, embedding_size, lstm_hidden_size, word_dropout, dropout,
+            start_idx=2, end_idx=3, pad_idx=0):
         super().__init__()
         self.pad_idx = pad_idx
         self.start_idx = start_idx  # for RNN autoencoder training
@@ -24,6 +27,7 @@ def __init__(self, vocab_size, num_layers, embedding_size, lstm_hidden_size, wor
         self.crit = nn.CrossEntropyLoss()
 
         self.embedder = nn.Embedding(vocab_size, embedding_size)
+
        def create_rnn(embedding_size, bidirectional=True):
            return nn.LSTM(
                embedding_size,
@@ -52,7 +56,9 @@ def get_encoder(self, encoder):
         }[encoder]
 
     @classmethod
-    def construct_from_embeddings(cls, embeddings, num_layers, embedding_size, lstm_hidden_size, word_dropout, dropout, vocab_size, start_idx=2, end_idx=3, pad_idx=0):
+    def construct_from_embeddings(
+            cls, embeddings, num_layers, embedding_size, lstm_hidden_size, word_dropout, dropout,
+            vocab_size, start_idx=2, end_idx=3, pad_idx=0):
         model = cls(
             num_layers=num_layers,
             embedding_size=embedding_size,
@@ -82,7 +88,8 @@ def decode(self, decoder_input, latent_z):
         embeddings = self.embedder(padded)
         embeddings = self.word_dropout(embeddings)
         [batch_size, seq_len, _] = embeddings.size()
-        # decoder rnn is conditioned on context via additional bias = W_cond * z to every input token
+        # decoder rnn is conditioned on context via additional bias = W_cond * z
+        # to every input token
         latent_z = t.cat([latent_z] * seq_len, 1).view(batch_size, seq_len, -1)
         embeddings = t.cat([embeddings, latent_z], 2)
         rnn = self.ae_decoder
@@ -130,7 +137,8 @@ def hierarchical_forward(self, input):
         _, (_, final_word_state) = self.view2_word_rnn(packed, None)
         _, unperm_idx = perm_idx.sort(0)
         final_word_state = final_word_state[:, unperm_idx]
-        final_word_state = final_word_state.view(self.num_layers, 2, batch_size*max_sent_len, self.lstm_hidden_size)[-1] \
+        final_word_state = final_word_state.view(
+            self.num_layers, 2, batch_size*max_sent_len, self.lstm_hidden_size)[-1] \
             .transpose(0, 1).contiguous() \
             .view(batch_size, max_sent_len, 2 * self.lstm_hidden_size)
 
@@ -140,7 +148,8 @@ def hierarchical_forward(self, input):
         _, (_, final_sent_state) = self.view2_sent_rnn(sent_packed, None)
         _, sent_unperm_idx = sent_perm_idx.sort(0)
         final_sent_state = final_sent_state[:, sent_unperm_idx]
-        final_sent_state = final_sent_state.view(self.num_layers, 2, batch_size, self.lstm_hidden_size)[-1] \
+        final_sent_state = final_sent_state.view(
+            self.num_layers, 2, batch_size, self.lstm_hidden_size)[-1] \
             .transpose(0, 1).contiguous() \
             .view(batch_size, 2 * self.lstm_hidden_size)
         return final_sent_state
@@ -169,13 +178,15 @@ def reconst_loss(self, gnd_utts, reconst):
         padded, lengths = pad_sentences(gnd_utts, pad_idx=self.pad_idx, rpad=self.end_idx)
         batch_size = len(lengths)
         crit = nn.CrossEntropyLoss()
-        loss += crit(reconst.view(batch_size * seq_len, vocab_size), padded.view(batch_size * seq_len))
+        loss += crit(
+            reconst.view(batch_size * seq_len, vocab_size), padded.view(batch_size * seq_len))
         _, argmax = reconst.max(dim=-1)
         correct = (argmax == padded)
         acc = correct.float().mean().item()
         return loss, acc
 
-def create_model_from_embeddings(glove_path, id_to_token, token_to_id):
+
+def from_embeddings(glove_path, id_to_token, token_to_id):
     vocab_size = len(token_to_id)
 
     # Load pre-trained GloVe vectors
@@ -184,7 +195,8 @@ def create_model_from_embeddings(glove_path, id_to_token, token_to_id):
     print('loading glove')
     for line in open(glove_path):
         parts = line.strip().split()
-        if len(parts) % 100 != 1: continue
+        if len(parts) % 100 != 1:
+            continue
         word = parts[0]
         if word not in token_to_id:
             continue
@@ -213,7 +225,8 @@ def create_model_from_embeddings(glove_path, id_to_token, token_to_id):
         vocab_size=vocab_size
     )
     model.to(device)
-    return id_to_token, token_to_id, vocab_size, word_emb_size, model
+    return id_to_token, token_to_id, vocab_size, word_emb_size, model
+
 
 def load_model(model_path):
     with open(model_path, 'rb') as f:
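
As background for the GloVe hunks above: the loader reads lines of the form "word v1 ... vN", skips malformed or multi-word entries, and copies vectors only for in-vocabulary words. A rough standalone sketch of that loading loop (the 300-dimension size, the file path, and the toy token_to_id are assumptions for illustration only):

    import numpy as np
    import torch

    word_emb_size = 300
    glove_path = 'glove.840B.300d.txt'            # placeholder path
    token_to_id = {'the': 4, 'hello': 5}          # toy vocabulary with reserved ids 0-3

    vectors = np.zeros((len(token_to_id) + 4, word_emb_size), dtype=np.float32)
    for line in open(glove_path):
        parts = line.strip().split()
        if len(parts) != word_emb_size + 1:       # skip malformed / multi-word entries
            continue
        word = parts[0]
        if word not in token_to_id:
            continue
        vectors[token_to_id[word]] = np.asarray(parts[1:], dtype=np.float32)

    embeddings = torch.from_numpy(vectors)        # usable with nn.Embedding.from_pretrained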

model/utils.py

Lines changed: 11 additions & 6 deletions

@@ -1,9 +1,12 @@
-import os, time, pprint
 import torch
 
+
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-def pad_sentences(sents, lpad=None, rpad=None, reverse=False, pad_idx=0, max_sent_len=100):
+
+def pad_sentences(
+        sents, lpad=None, rpad=None, reverse=False, pad_idx=0,
+        max_sent_len=100):
     sentences = []
     max_len = 0
     for i in range(len(sents)):
@@ -24,12 +27,15 @@ def pad_sentences(sents, lpad=None, rpad=None, reverse=False, pad_idx=0, max_sen
     for i in range(len(sentences)):
         lengths.append(len(sentences[i]))
         sentences[i] = sentences[i] + [pad_idx]*(max_len - len(sentences[i]))
-    return torch.LongTensor(sentences).to(device), torch.LongTensor(lengths).to(device)
+    return (torch.LongTensor(sentences).to(device),
+            torch.LongTensor(lengths).to(device))
+
 
 def pad_paragraphs(paras, pad_idx=0):
     sentences, lengths = [], []
     max_len = 0
-    for para in paras: max_len = max(max_len, len(para))
+    for para in paras:
+        max_len = max(max_len, len(para))
     for para in paras:
         for sent in para:
             sentences.append(sent[:])
@@ -39,12 +45,11 @@ def pad_paragraphs(paras, pad_idx=0):
     ret_sents, sent_lens = pad_sentences(sentences, pad_idx=pad_idx)
     return ret_sents, sent_lens, torch.LongTensor(lengths).to(device), max_len
 
+
 def euclidean_metric(a, b):
     n = a.shape[0]
     m = b.shape[0]
     a = a.unsqueeze(1).expand(n, m, -1)
     b = b.unsqueeze(0).expand(n, m, -1)
    logits = -((a - b)**2).sum(dim=2)
    return logits
-
-
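
The euclidean_metric shown in the last hunk returns, for every row of a against every row of b, the negative squared Euclidean distance, so larger values mean closer points. A quick usage sketch with toy tensors (the inputs are made up for illustration):

    import torch

    a = torch.tensor([[0., 0.], [3., 3.]])     # 2 query points
    b = torch.tensor([[0., 0.], [3., 4.]])     # 2 reference points

    n, m = a.shape[0], b.shape[0]
    logits = -((a.unsqueeze(1).expand(n, m, -1)
                - b.unsqueeze(0).expand(n, m, -1)) ** 2).sum(dim=2)
    # logits[i, j] = -||a[i] - b[j]||^2, e.g. logits[0, 1] == -25.0
    assert logits.argmax(dim=1).tolist() == [0, 1]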

pretrain.py

Lines changed: 4 additions & 1 deletion

@@ -1,9 +1,9 @@
 import copy
-import torch
 import numpy as np
 
 from train import BATCH_SIZE, AE_BATCH_SIZE
 
+
 def pretrain_qt(dataset, perm_idx, expressions, train=True):
     """
     for each pair of utterances:
@@ -48,9 +48,11 @@ def pretrain_qt(dataset, perm_idx, expressions, train=True):
         optimizer.step()
     return total_loss, total_acc / len(qt_ex)
 
+
 def after_pretrain_qt(model):
     model.view2_word_rnn = copy.deepcopy(model.view1_word_rnn)
 
+
 def pretrain_ae(dataset, perm_idx, expressions, train=True):
     """
     uses v1 encoder to encode all utterances in both view1 and view2
@@ -84,6 +86,7 @@ def pretrain_ae(dataset, perm_idx, expressions, train=True):
     total_acc = total_acc / len(utterances)
     return total_loss, total_acc
 
+
 def after_pretrain_ae(model):
     # we'll use the view1 encoder for both view 1 and view 2
     model.view2_word_rnn = copy.deepcopy(model.view1_word_rnn)

proc_data.py

Lines changed: 9 additions & 7 deletions

@@ -9,17 +9,18 @@
 START = "__START__"
 END = "__END__"
 
-class Dataset(Dataset):
 
-    def __init__(self, fname, view1_col='view1_col', view2_col='view2_col', label_col='cluster_id', tokenized=True, max_sent=10, train_ratio=.9):
+
+class Dataset(Dataset):
+    def __init__(self, fname, view1_col='view1_col', view2_col='view2_col', label_col='cluster_id',
+                 tokenized=True, max_sent=10, train_ratio=.9):
         """
         Args:
            fname: str, training data file
            view1_col: str, the column corresponding to view 1 input
            view2_col: str, the column corresponding to view 2 input
            label_col: str, the column corresponding to label
         """
-
+
         def tokens_to_idices(tokens):
             token_idices = []
             for token in tokens:
@@ -28,7 +29,7 @@ def tokens_to_idices(tokens):
                 id_to_token.append(token)
                 token_idices.append(token_to_id[token])
             return token_idices
-
+
         id_to_token = [PAD, UNK, START, END]
         token_to_id = {PAD: 0, UNK: 1, START: 2, END: 3}
         id_to_label = [UNK]
@@ -43,14 +44,15 @@ def tokens_to_idices(tokens):
         for row in reader:
             view1_text, view2_text = row[view1_col], row[view2_col]
             label = row[label_col]
-            if 'UNK' == label: label = UNK
+            if 'UNK' == label:
+                label = UNK
             if '<cust_' not in view1_text:
                 view2_sents = sent_tokenize(view2_text.lower())
             else:
                 view2_sents = view2_text.split("> <")
                 for i in range(len(view2_sents) - 1):
-                    view2_sents[i] = view2_sents[i] + '>'
-                    view2_sents[i+1] = '<' + view2_sents[i]
+                    view2_sents[i] = view2_sents[i] + '>'
+                    view2_sents[i+1] = '<' + view2_sents[i]
             v1_utts.append(view1_text)
             if not tokenized:
                 v1_tokens = word_tokenize(view1_text.lower())
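
The tokens_to_idices helper visible in the second hunk grows the vocabulary on the fly: an unseen token gets the next free id and is appended to id_to_token, then every token is mapped to its id. The same pattern in isolation (a hedged reconstruction; the PAD/UNK string values and the assignment into token_to_id are assumptions, since only part of the function appears in the diff):

    id_to_token = ['__PAD__', '__UNK__', '__START__', '__END__']
    token_to_id = {'__PAD__': 0, '__UNK__': 1, '__START__': 2, '__END__': 3}

    def tokens_to_idices(tokens):
        token_idices = []
        for token in tokens:
            if token not in token_to_id:
                token_to_id[token] = len(id_to_token)   # assign the next free id
                id_to_token.append(token)
            token_idices.append(token_to_id[token])
        return token_idices

    print(tokens_to_idices(['hello', 'world', 'hello']))   # [4, 5, 4]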
