diff --git a/.gitignore b/.gitignore
index 4df5d12f1..00f14b6d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ keras-sign/sign-language
 **/.ipynb_checkpoints
 **/glove*
 **/aclImdb/
+.vscode
\ No newline at end of file
diff --git a/README.md b/README.md
index 4a2f1e623..71d6f9155 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ These are specific bite-sized projects to learn an aspect of deep learning, star
 | Predict the weather with an RNN | [projects/6-rnn-timeseries](https://github.com/lukas/ml-class/tree/master/projects/6-rnn-timeseries) | [Recurrent Neural Networks](https://www.youtube.com/watch?v=8lbGjKhrJOo) |
 | Build a text generator | [projects/7-text-generation](https://github.com/lukas/ml-class/tree/master/projects/7-text-generation) | [Text Generation using LSTMs and GRUs](https://www.youtube.com/watch?v=4F69m3krMHw) |
 | Build a sentiment classifier on Amazon reviews. | [projects/8-text-classification](https://github.com/lukas/ml-class/tree/master/projects/8-text-classification) | [Text Classification using CNNs](https://www.youtube.com/watch?v=8YsZXTpFRO0) |
-| | | [Hybrid LSTM/CNNs](https://www.youtube.com/watch?v=NysY9FN9Uac) | 
+| | | [Hybrid LSTM/CNNs](https://www.youtube.com/watch?v=NysY9FN9Uac) |
 | | | [Seq2seq Models](https://www.youtube.com/watch?v=MqugtGD605k) |
 | | | [Transfer Learning](https://www.youtube.com/watch?v=vbhEnEbj3JM) |
 | | | [One Shot Learning](https://www.youtube.com/watch?v=H4MPIWX6ftE) |
diff --git a/projects/10-seq2seq/train.py b/projects/10-seq2seq/train.py
new file mode 100644
index 000000000..9bab94834
--- /dev/null
+++ b/projects/10-seq2seq/train.py
@@ -0,0 +1,136 @@
+from keras.models import Sequential
+from keras.layers import LSTM, TimeDistributed, RepeatVector, Dense
+import numpy as np
+import wandb
+from wandb.keras import WandbCallback
+
+wandb.init()
+config = wandb.config
+
+class CharacterTable(object):
+    """Given a set of characters:
+    + Encode them to a one hot integer representation
+    + Decode the one hot integer representation to their character output
+    + Decode a vector of probabilities to their character output
+    """
+    def __init__(self, chars):
+        """Initialize character table.
+        # Arguments
+            chars: Characters that can appear in the input.
+        """
+        self.chars = sorted(set(chars))
+        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
+        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
+
+    def encode(self, C, num_rows):
+        """One hot encode given string C.
+        # Arguments
+            num_rows: Number of rows in the returned one hot encoding. This is
+                used to keep the # of rows for each data the same.
+        """
+        x = np.zeros((num_rows, len(self.chars)))
+        for i, c in enumerate(C):
+            x[i, self.char_indices[c]] = 1
+        return x
+
+    def decode(self, x, calc_argmax=True):
+        if calc_argmax:
+            x = x.argmax(axis=-1)
+        return ''.join(self.indices_char[i] for i in x)
+
+# Parameters for the model and dataset.
+config.training_size = 50000
+config.digits = 5
+config.hidden_size = 128
+config.batch_size = 128
+
+# Maximum length of input is 'int - int' (e.g., '345-678'). Maximum length of
+# int is DIGITS.
+maxlen = config.digits + 1 + config.digits
+
+# All the numbers, the plus and minus signs, and space for padding.
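+# The minus sign shows up both in the questions ('123-45') and in negative
+# answers, so it has to be in the vocabulary alongside the digits and padding.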
+chars = '0123456789+- '
+ctable = CharacterTable(chars)
+
+questions = []
+expected = []
+seen = set()
+print('Generating data...')
+while len(questions) < config.training_size:
+    f = lambda: int(''.join(np.random.choice(list('0123456789'))
+                    for i in range(np.random.randint(1, config.digits + 1))))
+    a, b = f(), f()
+    # Skip any subtraction questions we've already seen. The operands are
+    # stored as a sorted tuple, so a-b and b-a count as the same question.
+    key = tuple(sorted((a, b)))
+    if key in seen:
+        continue
+    seen.add(key)
+    # Pad the data with spaces such that it is always MAXLEN.
+    q = '{}-{}'.format(a, b)
+    query = q + ' ' * (maxlen - len(q))
+    ans = str(a - b)
+    # Answers can be of maximum size DIGITS + 1.
+    ans += ' ' * (config.digits + 1 - len(ans))
+
+    questions.append(query)
+    expected.append(ans)
+
+print('Total subtraction questions:', len(questions))
+
+print('Vectorization...')
+x = np.zeros((len(questions), maxlen, len(chars)), dtype=bool)
+y = np.zeros((len(questions), config.digits + 1, len(chars)), dtype=bool)
+for i, sentence in enumerate(questions):
+    x[i] = ctable.encode(sentence, maxlen)
+for i, sentence in enumerate(expected):
+    y[i] = ctable.encode(sentence, config.digits + 1)
+
+# Shuffle (x, y) in unison as the later parts of x will almost all be larger
+# digits.
+indices = np.arange(len(y))
+np.random.shuffle(indices)
+x = x[indices]
+y = y[indices]
+
+# Explicitly set apart 10% for validation data that we never train over.
+split_at = len(x) - len(x) // 10
+(x_train, x_val) = x[:split_at], x[split_at:]
+(y_train, y_val) = y[:split_at], y[split_at:]
+
+model = Sequential()
+model.add(LSTM(config.hidden_size, input_shape=(maxlen, len(chars))))
+model.add(RepeatVector(config.digits + 1))
+model.add(LSTM(config.hidden_size, return_sequences=True))
+model.add(TimeDistributed(Dense(len(chars), activation='softmax')))
+model.compile(loss='categorical_crossentropy',
+              optimizer='adam',
+              metrics=['accuracy'])
+model.summary()
+
+# Train the model each generation and show predictions against the validation
+# dataset.
+for iteration in range(1, 200):
+    print()
+    print('-' * 50)
+    print('Iteration', iteration)
+    model.fit(x_train, y_train,
+              batch_size=config.batch_size,
+              epochs=1,
+              validation_data=(x_val, y_val), callbacks=[WandbCallback()])
+    # Select 10 samples from the validation set at random so we can visualize
+    # errors.
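+    # rowx and rowy below are still one-hot, so decode() argmaxes them back to
+    # strings; predict_classes() already returns indices, hence calc_argmax=False.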
+    for i in range(10):
+        ind = np.random.randint(0, len(x_val))
+        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
+        preds = model.predict_classes(rowx, verbose=0)
+        q = ctable.decode(rowx[0])
+        correct = ctable.decode(rowy[0])
+        guess = ctable.decode(preds[0], calc_argmax=False)
+        print('Q', q, end=' ')
+        print('T', correct, end=' ')
+        if correct == guess:
+            print('☑', end=' ')
+        else:
+            print('☒', end=' ')
+        print(guess)
\ No newline at end of file
diff --git a/projects/9-lstm-classifier/download-imdb.py b/projects/9-lstm-classifier/download-imdb.py
new file mode 100644
index 000000000..509a3cfc2
--- /dev/null
+++ b/projects/9-lstm-classifier/download-imdb.py
@@ -0,0 +1,29 @@
+import os
+import shutil
+import sys
+import tempfile
+import urllib.request
+
+
+IMDB_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+OUTPUT_NAME = "aclImdb"
+
+def main():
+    download_and_extract_archive()
+
+
+def download_and_extract_archive():
+    if os.path.exists(OUTPUT_NAME):
+        print("Imdb dataset download target exists at " + OUTPUT_NAME)
+    else:
+        with urllib.request.urlopen(IMDB_URL) as response:
+            with tempfile.NamedTemporaryFile() as temp_archive:
+                temp_archive.write(response.read())
+                temp_archive.flush()
+                shutil.unpack_archive(temp_archive.name, extract_dir=".", format="gztar")
+
+    return
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/projects/9-lstm-classifier/imdb-lstm.py b/projects/9-lstm-classifier/imdb-lstm.py
new file mode 100644
index 000000000..c7b008cca
--- /dev/null
+++ b/projects/9-lstm-classifier/imdb-lstm.py
@@ -0,0 +1,50 @@
+from keras.api.preprocessing import sequence
+from keras.api.models import Sequential
+from keras.api.layers import Dense, Dropout, Activation
+from keras.api.layers import Embedding, LSTM, Bidirectional
+from keras.api.layers import Conv1D, Flatten
+import wandb
+from wandb.integration.keras import WandbCallback
+import imdb
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+wandb.init()
+config = wandb.config
+
+# set parameters:
+config.vocab_size = 1000
+config.maxlen = 300
+config.batch_size = 32
+config.embedding_dims = 50
+config.filters = 10
+config.kernel_size = 3
+config.hidden_dims = 10
+config.epochs = 10
+
+(X_train, y_train), (X_test, y_test) = imdb.load_imdb()
+
+tokenizer = Tokenizer(num_words=config.vocab_size)
+tokenizer.fit_on_texts(X_train)
+X_train = tokenizer.texts_to_sequences(X_train)
+X_test = tokenizer.texts_to_sequences(X_test)
+
+X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
+X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
+
+model = Sequential()
+model.add(Embedding(config.vocab_size,
+                    config.embedding_dims,
+                    input_length=config.maxlen))
+model.add(LSTM(config.hidden_dims, activation="sigmoid"))
+model.add(Dense(1, activation='sigmoid'))
+model.compile(loss='binary_crossentropy',
+              optimizer='rmsprop',
+              metrics=['accuracy'])
+
+model.fit(X_train, y_train,
+          batch_size=config.batch_size,
+          epochs=config.epochs,
+          validation_data=(X_test, y_test), callbacks=[WandbCallback()])
\ No newline at end of file
diff --git a/projects/9-lstm-classifier/imdb.py b/projects/9-lstm-classifier/imdb.py
new file mode 100644
index 000000000..3b9be98af
--- /dev/null
+++ b/projects/9-lstm-classifier/imdb.py
@@ -0,0 +1,32 @@
+import numpy as np
+import os
+
+sep = os.path.sep
+
+def load_imdb():
+    X_train = []
+    y_train = []
+
+    path = os.path.join('aclImdb', 'train', 'pos', '')
+    X_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
+    y_train.extend([1 for _ in range(12500)])
+
+    path = os.path.join('aclImdb', 'train', 'neg', '')
+    X_train.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
+    y_train.extend([0 for _ in range(12500)])
+
+    X_test = []
+    y_test = []
+
+    path = os.path.join('aclImdb', 'test', 'pos', '')
+    X_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
+    y_test.extend([1 for _ in range(12500)])
+
+    path = os.path.join('aclImdb', 'test', 'neg', '')
+    X_test.extend([open(path + f).read() for f in os.listdir(path) if f.endswith('.txt')])
+    y_test.extend([0 for _ in range(12500)])
+
+    y_train = np.array(y_train, dtype=np.int32)
+    y_test = np.array(y_test, dtype=np.int32)
+
+    return (X_train, y_train), (X_test, y_test)
\ No newline at end of file