
Commit be25ba1 (0 parents)

Shivangi Mahto authored and committed

Multi-timescale LSTM LM ICLR

48 files changed: +15803 additions, 0 deletions

Bins_vs_location_of_abl_chunk.png (62.9 KB)

Bins_vs_timescale_PTB.png (62.9 KB)

Bins_vs_timescale_Wiki.png (65.6 KB)

Plot estimated timescale.ipynb

Lines changed: 346 additions & 0 deletions
Large diffs are not rendered by default.

ReadMe.md

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@

# Code for training a multi-timescale (MTS) language model.

## Required dependencies: Python 3.6 or above, NumPy, SciPy, and PyTorch 1.7.0 or above with CUDA version 10.1

## Example script to train and evaluate a standard and an MTS LM on the PTB dataset:

### bash run.sh

## Detailed description:

### 1. To download PTB/WIKI data: bash getdata.sh

### 2. model_mts.py defines the multi-timescale language model.

### 3. To train a multi-timescale model, use train_mts.py as follows:

#### On PTB data

python train_mts.py --batch_size 20 --data data/penn --dropouti 0.4 --dropouth 0.25 --seed 141 --epoch 1000 --save train_mts.pt

#### On Wiki data

python train_mts.py --data data/wikitext-2 --dropouth 0.2 --seed 1882 --epoch 1000 --save train_mts.pt

### 4. To evaluate the model on the test set, including different word-frequency bins and a bootstrap test set:

#### Trained LM on PTB data:

python model_evaluation.py --model_name train_mts.pt --data data/penn/

#### Trained LM on Wiki data:

python model_evaluation.py --model_name train_mts.pt --data data/wikitext-2/
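
Step 4 above reports perplexity broken out by word-frequency bins; the actual binning lives in model_evaluation.py, which is not rendered here. Purely as an illustration, and not necessarily the repo's exact scheme, such bins could be derived from the token Counter that data.py's Dictionary maintains (see that file below):

```python
# Illustrative sketch only: bin the vocabulary by corpus frequency.
# The bin count (5) and the use of data.Corpus are assumptions, not taken
# from model_evaluation.py. The Counter accumulates over train/valid/test
# as tokenized by data.py.
import numpy as np
from data import Corpus

corpus = Corpus('data/penn')
counts = np.array([corpus.dictionary.counter[i]
                   for i in range(len(corpus.dictionary))])

order = np.argsort(-counts)          # vocabulary ids, most frequent first
bins = np.array_split(order, 5)      # 5 roughly equal-sized frequency bins
for b, ids in enumerate(bins):
    print(f"bin {b}: {len(ids)} word types, {counts[ids].sum()} corpus tokens")
```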

data.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@

import os
import torch

from collections import Counter


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        token_id = self.word2idx[word]
        self.counter[token_id] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids
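
Each split returned by Corpus is a single 1-D LongTensor of token ids. A small usage sketch follows; the batchify helper mirrors the usual language-model convention of folding the stream into parallel columns, and is an assumption here rather than necessarily the exact helper train_mts.py uses:

```python
import torch
from data import Corpus

def batchify(data, bsz):
    # Keep only a whole number of batches, then reshape so that each of the
    # bsz columns is a contiguous slice of the original token stream.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()

corpus = Corpus('data/penn')              # expects train.txt / valid.txt / test.txt
print(len(corpus.dictionary))             # vocabulary size
train_data = batchify(corpus.train, 20)   # shape: (tokens_per_column, 20)
print(train_data.shape)
```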

embed_regularize.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@

import numpy as np

import torch

def embedded_dropout(embed, words, dropout=0.1, scale=None):
    if dropout:
        mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout)
        masked_embed_weight = mask * embed.weight
    else:
        masked_embed_weight = embed.weight
    if scale:
        masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

    padding_idx = embed.padding_idx
    if padding_idx is None:
        padding_idx = -1

    X = torch.nn.functional.embedding(words, masked_embed_weight,
                                      padding_idx, embed.max_norm, embed.norm_type,
                                      embed.scale_grad_by_freq, embed.sparse
                                      )
    return X

if __name__ == '__main__':
    V = 50
    h = 4
    bptt = 10
    batch_size = 2

    embed = torch.nn.Embedding(V, h)

    words = np.random.random_integers(low=0, high=V-1, size=(batch_size, bptt))
    words = torch.LongTensor(words)

    origX = embed(words)
    X = embedded_dropout(embed, words)

    print(origX)
    print(X)
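
The __main__ block above only prints the masked embeddings. As a rough sketch of how embedded_dropout would typically sit inside a model's forward pass (TinyLM, its layer sizes, and the 0.1 rate are hypothetical and are not this repo's model_mts.py):

```python
import torch
import torch.nn as nn
from embed_regularize import embedded_dropout

class TinyLM(nn.Module):
    def __init__(self, ntoken=50, ninp=8, nhid=16):
        super().__init__()
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid)
        self.decoder = nn.Linear(nhid, ntoken)

    def forward(self, words):
        # Drop whole word embeddings (rows of the embedding matrix), but only
        # while training; at eval time the embeddings are used unmasked.
        emb = embedded_dropout(self.encoder, words,
                               dropout=0.1 if self.training else 0)
        output, _ = self.rnn(emb)
        return self.decoder(output)

model = TinyLM()
tokens = torch.randint(0, 50, (10, 2))    # (seq_len, batch) of token ids
print(model(tokens).shape)                # torch.Size([10, 2, 50])
```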
