diff --git a/CapGenerator/eval_model.py b/CapGenerator/eval_model.py index 490f3df..3036724 100644 --- a/CapGenerator/eval_model.py +++ b/CapGenerator/eval_model.py @@ -33,7 +33,7 @@ def extract_features(filename): return feature # generate a description for an image -def generate_desc(model, tokenizer, photo, index_word, max_length, beam_size=10): +def generate_desc(model, tokenizer, photo, index_word, max_length, beam_size=5): captions = [['startseq', 0.0]] # seed the generation process @@ -91,7 +91,19 @@ def evaluate_model(model, descriptions, photos, tokenizer, index_word, max_lengt print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))) print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))) - +def eval_test_set(model, descriptions, photos, tokenizer, index_word, max_length): + actual, predicted = list(), list() + # step over the whole set + for key, desc_list in descriptions.items(): + # generate description + yhat = generate_desc(model, tokenizer, photos[key], index_word, max_length)[0] + # store actual and predicted + references = [d.split() for d in desc_list] + actual.append(references) + # Use best caption + predicted.append(yhat[0].split()) + predicted = sorted(predicted) + actual = [x for _,x in sorted(zip(actual,predicted))] if __name__ == '__main__': @@ -111,7 +123,7 @@ def evaluate_model(model, descriptions, photos, tokenizer, index_word, max_lengt if args.model: filename = args.model else: - filename = 'models/model-ep005-loss3.454-val_loss3.872.h5' + filename = 'models/model-ep005-loss3.504-val_loss3.893.h5' model = load_model(filename) if args.image: diff --git a/CapGenerator/generate_model.py b/CapGenerator/generate_model.py index b8632db..52538c7 100644 --- a/CapGenerator/generate_model.py +++ b/CapGenerator/generate_model.py @@ -1,4 +1,5 @@ import numpy as np +import tensorflow as tf from pickle import load from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences @@ -14,8 +15,13 @@ from keras.layers import TimeDistributed from keras.layers import concatenate from keras.callbacks import ModelCheckpoint +from keras.optimizers import Adam -EMBEDDING_DIM = 128 +EMBEDDING_DIM = 256 + +lstm_layers = 2 +dropout_rate = 0.2 +learning_rate = 0.001 # convert a dictionary of clean descriptions to a list of descriptions def to_lines(descriptions): @@ -41,7 +47,7 @@ def max_length(descriptions): def create_sequences(tokenizer, max_length, desc_list, photo): vocab_size = len(tokenizer.word_index) + 1 - X1, X2, y = list(), list(), list() + X1, X2, y = [], [], [] # walk through each description for the image for desc in desc_list: # encode the sequence @@ -61,14 +67,51 @@ def create_sequences(tokenizer, max_length, desc_list, photo): return np.array(X1), np.array(X2), np.array(y) # data generator, intended to be used in a call to model.fit_generator() -def data_generator(descriptions, photos, tokenizer, max_length): +def data_generator(descriptions, photos, tokenizer, max_length, n_step = 1): # loop for ever over images while 1: - for key, desc_list in descriptions.items(): - # retrieve the photo feature - photo = photos[key][0] - in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo) - yield [[in_img, in_seq], out_word] + # loop over photo identifiers in the dataset + keys = list(descriptions.keys()) + for i in range(0, len(keys), n_step): + Ximages, XSeq, y = list(), list(),list() + for j in range(i, min(len(keys), i+n_step)): + image_id = keys[j] + # 
retrieve the photo feature + photo = photos[image_id][0] + desc_list = descriptions[image_id] + in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo) + for k in range(len(in_img)): + Ximages.append(in_img[k]) + XSeq.append(in_seq[k]) + y.append(out_word[k]) + yield [[np.array(Ximages), np.array(XSeq)], np.array(y)] + +def categorical_crossentropy_from_logits(y_true, y_pred): + y_true = y_true[:, :-1, :] # Discard the last timestep + y_pred = y_pred[:, :-1, :] # Discard the last timestep + loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, + logits=y_pred) + return loss + +def categorical_accuracy_with_variable_timestep(y_true, y_pred): + y_true = y_true[:, :-1, :] # Discard the last timestep + y_pred = y_pred[:, :-1, :] # Discard the last timestep + + # Flatten the timestep dimension + shape = tf.shape(y_true) + y_true = tf.reshape(y_true, [-1, shape[-1]]) + y_pred = tf.reshape(y_pred, [-1, shape[-1]]) + + # Discard rows that are all zeros as they represent padding words. + is_zero_y_true = tf.equal(y_true, 0) + is_zero_row_y_true = tf.reduce_all(is_zero_y_true, axis=-1) + y_true = tf.boolean_mask(y_true, ~is_zero_row_y_true) + y_pred = tf.boolean_mask(y_pred, ~is_zero_row_y_true) + + accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_true, axis=1), + tf.argmax(y_pred, axis=1)), + dtype=tf.float32)) + return accuracy # define the captioning model def define_model(vocab_size, max_length): @@ -80,14 +123,12 @@ def define_model(vocab_size, max_length): # embedding inputs2 = Input(shape=(max_length,)) - emb2 = Embedding(vocab_size, 256, mask_zero=True)(inputs2) - emb3 = LSTM(256, return_sequences=True)(emb2) - emb4 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(emb3) + emb2 = Embedding(vocab_size, EMBEDDING_DIM, mask_zero=True)(inputs2) # merge inputs - merged = concatenate([fe3, emb4]) + merged = concatenate([fe3, emb2]) # language model (decoder) - lm2 = LSTM(1000)(merged) + lm2 = LSTM(500, return_sequences=False)(merged) #lm3 = Dense(500, activation='relu')(lm2) outputs = Dense(vocab_size, activation='softmax')(lm2) diff --git a/Devel.ipynb b/Devel.ipynb index d856f8d..76111da 100644 --- a/Devel.ipynb +++ b/Devel.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -32,15 +32,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "SyntaxError", - "evalue": "invalid syntax (, line 11)", + "evalue": "invalid syntax (, line 11)", "output_type": "error", "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m11\u001b[0m\n\u001b[0;31m for key Pin descriptions.keys():\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m11\u001b[0m\n\u001b[0;31m for key Pin descriptions.keys():\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" ] } ], @@ -132,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -155,9 +155,15 @@ "from IPython.display import SVG\n", "from keras.utils.vis_utils import model_to_dot\n", "\n", - "EMBEDDING_DIM = 128\n", + "from keras.optimizers import Adam\n", "\n", 
"\n", + "EMBEDDING_DIM = 128\n", + "lstm_layers = 3\n", + "dropout_rate = 0.22\n", + "learning_rate = 0.001\n", + "\n", + "# define the captioning model\n", "def define_model(vocab_size, max_length):\n", " # feature extractor (encoder)\n", " inputs1 = Input(shape=(4096, ))\n", @@ -168,77 +174,53 @@ " # embedding\n", " inputs2 = Input(shape=(max_length, ))\n", " emb2 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)\n", - " emb3 = LSTM(256, return_sequences=True)(emb2)\n", - " emb4 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(emb3)\n", "\n", " # merge inputs\n", - " merged = concatenate([fe3, emb4])\n", + " merged = concatenate([fe3, emb2])\n", " # language model (decoder)\n", - " lm2 = LSTM(1000)(merged)\n", - " #lm3 = Dense(500, activation='relu')(lm2)\n", - " outputs = Dense(vocab_size, activation='softmax')(lm2)\n", + " input_ = merged\n", + " for _ in range(lstm_layers):\n", + " input_ = BatchNormalization()(input_)\n", + " lstm_out = LSTM(\n", + " 300,\n", + " return_sequences=True,\n", + " dropout=dropout_rate,\n", + " recurrent_dropout=dropout_rate)(input_)\n", + " input_ = lstm_out\n", + " outputs = Dense(vocab_size, activation='softmax')(lstm_out)\n", "\n", " # tie it together [image, seq] [word]\n", " model = Model(inputs=[inputs1, inputs2], outputs=outputs)\n", " model.compile(\n", " loss='categorical_crossentropy',\n", - " optimizer='adam',\n", + " optimizer=Adam(lr=learning_rate),\n", " metrics=['accuracy'])\n", " print(model.summary())\n", - " plot_model(model, show_shapes=True, to_file='plot.png')\n", + " plot_model(model, show_shapes=True, to_file='model.png')\n", " return model" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "input_11 (InputLayer) (None, 4096) 0 \n", - "__________________________________________________________________________________________________\n", - "input_12 (InputLayer) (None, 34) 0 \n", - "__________________________________________________________________________________________________\n", - "dropout_2 (Dropout) (None, 4096) 0 input_11[0][0] \n", - "__________________________________________________________________________________________________\n", - "embedding_4 (Embedding) (None, 34, 256) 51200 input_12[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_4 (Dense) (None, 128) 524416 dropout_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "lstm_3 (LSTM) (None, 34, 256) 525312 embedding_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "repeat_vector_2 (RepeatVector) (None, 34, 128) 0 dense_4[0][0] \n", - "__________________________________________________________________________________________________\n", - "time_distributed_2 (TimeDistrib (None, 34, 128) 32896 lstm_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 34, 256) 0 repeat_vector_2[0][0] \n", - " time_distributed_2[0][0] \n", - 
"__________________________________________________________________________________________________\n", - "lstm_4 (LSTM) (None, 1000) 5028000 concatenate_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense_6 (Dense) (None, 200) 200200 lstm_4[0][0] \n", - "==================================================================================================\n", - "Total params: 6,362,024\n", - "Trainable params: 6,362,024\n", - "Non-trainable params: 0\n", - "__________________________________________________________________________________________________\n", - "None\n" - ] - } - ], + "outputs": [], "source": [ "model = define_model(200, 34)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(SVG(model_to_dot(model, show_shapes=True, show_layer_names=True).create(prog='dot', format='svg')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -288,32 +270,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "generating vocab_history model v5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:14: UserWarning: Update your `Embedding` call to the Keras 2 API: `Embedding(output_dim=100, input_dim=2187, input_length=1, embeddings_initializer=\"glorot_uniform\")`\n", - " \n", - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:17: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n", - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/keras/legacy/layers.py:465: UserWarning: The `Merge` layer is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n", - " name=name)\n", - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:21: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n", - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:30: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. `add`, `concatenate`, etc.\n", - "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:31: UserWarning: The `merge` function is deprecated and will be removed after 08/2017. Use instead layers from `keras.layers.merge`, e.g. 
`add`, `concatenate`, etc.\n",
-      "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:39: UserWarning: Update your `Model` call to the Keras 2 API: `Model(inputs=[\n",
-      "... [text of the rendered model-graph SVG omitted] ..."
-     ],
-     "text/plain": [
-      ""
-     ]
-    },
-    "execution_count": 26,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
+   "outputs": [],
    "source": [
     "SVG(model_to_dot(model, show_shapes=True, show_layer_names=True).create(prog='dot', format='svg'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1082,25 +580,9 @@
   },
   {
    "cell_type": "code",
-    "execution_count": 31,
+    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "ValueError",
-     "evalue": "Operands could not be broadcast together with shapes (512, 1) (128, 1)",
-     "output_type": "error",
-     "traceback": [
-      "... [ANSI-escaped traceback from the NIC(16, 1024) call omitted; it ends in the ValueError above] ..."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "model = NIC(16, 1024)\n",
     "plot_model(model, '../images/NIC.png')"
    ]
   },
@@ -1176,6 +658,257 @@
     "    feature = model.predict(image, verbose=0)\n",
     "    return feature"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from keras.layers import Embedding, Input\n",
+    "from keras.layers import BatchNormalization, Dense, RepeatVector\n",
+    "from keras.applications.vgg16 import VGG16\n",
+    "\n",
+    "# The top layer is the last layer\n",
+    "image_model = VGG16(weights='imagenet')\n",
+    "# Fix the weights\n",
+    "for layer in image_model.layers:\n",
+    "    layer.trainable = False\n",
+    "\n",
+    "embedding_size = 300\n",
+    "dense_input = BatchNormalization(axis=-1)(image_model.output)\n",
+    "image_dense = Dense(units=embedding_size)(dense_input) # FC layer\n",
+    "# Add a timestep dimension to match LSTM's input size\n",
+    "image_embedding = RepeatVector(1)(image_dense)\n",
+    "image_input = image_model.input\n",
+    "\n",
+    "vocab_size = 2536\n",
+    "embedding_size = 300\n",
+    "\n",
+    "sentence_input = Input(shape=[34])\n",
+    "word_embedding = Embedding(\n",
+    "    input_dim=vocab_size, output_dim=embedding_size)(sentence_input)\n",
+    "\n",
+    "from keras.layers import (BatchNormalization, Concatenate, Dense, LSTM,\n",
+    "                          TimeDistributed)\n",
+    "from keras.models import Model\n",
+    "from keras.optimizers import Adam\n",
+    "\n",
+    "sequence_input = Concatenate(axis=1)([image_embedding, word_embedding])\n",
+    "\n",
+    "learning_rate = 0.00051\n",
+    "lstm_output_size = 300\n",
+    "vocab_size = 2536\n",
+    "lstm_layers = 3\n",
+    "dropout_rate = 0.22\n",
+    "input_ = sequence_input\n",
+    "\n",
+    "for _ in range(lstm_layers):\n",
+    "    input_ = 
BatchNormalization(axis=-1)(input_)\n", + " lstm_out = LSTM(\n", + " units=lstm_output_size,\n", + " return_sequences=True,\n", + " dropout=dropout_rate,\n", + " recurrent_dropout=dropout_rate)(input_)\n", + " input_ = lstm_out\n", + "sequence_output = TimeDistributed(Dense(units=vocab_size))(lstm_out)\n", + "\n", + "model = Model(inputs=[image_input, sentence_input], outputs=sequence_output)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SVG(model_to_dot(model, show_shapes=True, show_layer_names=True).create(prog='dot', format='svg'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] [-i IMAGE] [-m MODEL]\n", + "ipykernel_launcher.py: error: unrecognized arguments: -f /Users/Divyansh/Library/Jupyter/runtime/kernel-421658b4-8964-4bee-84bf-e58e86f851f8.json\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Divyansh/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2870: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", + " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" + ] + } + ], + "source": [ + "from pickle import load\n", + "import numpy as np\n", + "from keras.preprocessing.sequence import pad_sequences\n", + "from keras.applications.vgg16 import VGG16\n", + "from keras.preprocessing.image import load_img\n", + "from keras.preprocessing.image import img_to_array\n", + "from keras.applications.vgg16 import preprocess_input\n", + "from keras.models import Model\n", + "from keras.models import load_model\n", + "from nltk.translate.bleu_score import corpus_bleu\n", + "\n", + "from CapGenerator import load_data as ld\n", + "from CapGenerator import generate_model as gen\n", + "import argparse\n", + "\n", + "# extract features from each photo in the directory\n", + "def extract_features(filename):\n", + " # load the model\n", + " model = VGG16()\n", + " # re-structure the model\n", + " model.layers.pop()\n", + " model = Model(inputs=model.inputs, outputs=model.layers[-1].output)\n", + " # load the photo\n", + " image = load_img(filename, target_size=(224, 224))\n", + " # convert the image pixels to a numpy array\n", + " image = img_to_array(image)\n", + " # reshape data for the model\n", + " image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))\n", + " # prepare the image for the VGG model\n", + " image = preprocess_input(image)\n", + " # get features\n", + " feature = model.predict(image, verbose=0)\n", + " return feature\n", + "\n", + "# generate a description for an image\n", + "def generate_desc(model, tokenizer, photo, index_word, max_length, beam_size=5):\n", + "\n", + " captions = [['startseq', 0.0]]\n", + " # seed the generation process\n", + " in_text = 'startseq'\n", + " # iterate over the whole length of the sequence\n", + " for i in range(max_length):\n", + " all_caps = []\n", + " # expand each current candidate\n", + " for cap in captions:\n", + " sentence, score = cap\n", + " # if final word is 'end' token, just add the current caption\n", + " if sentence.split()[-1] == 'endseq':\n", + " all_caps.append(cap)\n", + " continue\n", + " # 
integer encode input sequence\n", + " sequence = tokenizer.texts_to_sequences([sentence])[0]\n", + " # pad input\n", + " sequence = pad_sequences([sequence], maxlen=max_length)\n", + " # predict next words\n", + " y_pred = model.predict([photo,sequence], verbose=0)[0]\n", + " # convert probability to integer\n", + " yhats = np.argsort(y_pred)[-beam_size:]\n", + "\n", + " for j in yhats:\n", + " # map integer to word\n", + " word = index_word.get(j)\n", + " # stop if we cannot map the word\n", + " if word is None:\n", + " continue\n", + " # Add word to caption, and generate log prob\n", + " caption = [sentence + ' ' + word, score + np.log(y_pred[j])]\n", + " all_caps.append(caption)\n", + "\n", + " # order all candidates by score\n", + " ordered = sorted(all_caps, key=lambda tup:tup[1], reverse=True)\n", + " captions = ordered[:beam_size]\n", + "\n", + " return captions\n", + "\n", + "# evaluate the skill of the model\n", + "def evaluate_model(model, descriptions, photos, tokenizer, index_word, max_length):\n", + " actual, predicted = list(), list()\n", + " # step over the whole set\n", + " for key, desc_list in descriptions.items():\n", + " # generate description\n", + " yhat = generate_desc(model, tokenizer, photos[key], index_word, max_length)[0]\n", + " # store actual and predicted\n", + " references = [d.split() for d in desc_list]\n", + " actual.append(references)\n", + " # Use best caption\n", + " predicted.append(yhat[0].split())\n", + " # calculate BLEU score\n", + " print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))\n", + " print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))\n", + " print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))\n", + " print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))\n", + "\n", + "def eval_test_set(model, descriptions, photos, tokenizer, index_word, max_length):\n", + " actual, predicted = list(), list()\n", + " # step over the whole set\n", + " for key, desc_list in descriptions.items():\n", + " # generate description\n", + " yhat = generate_desc(model, tokenizer, photos[key], index_word, max_length)[0]\n", + " # store actual and predicted\n", + " references = [d.split() for d in desc_list]\n", + " actual.append(references)\n", + " # Use best caption\n", + " predicted.append(yhat[0].split())\n", + " predicted = sorted(predicted)\n", + " actual = [x for _,x in sorted(zip(actual,predicted))]\n", + "\n", + "if __name__ == '__main__':\n", + "\n", + " parser = argparse.ArgumentParser(description='Generate image captions')\n", + " parser.add_argument(\"-i\", \"--image\", help=\"Input image path\")\n", + " parser.add_argument(\"-m\", \"--model\", help=\"model checkpoint\")\n", + " args = parser.parse_args()\n", + "\n", + "\n", + " # load the tokenizer\n", + " tokenizer = load(open('models/tokenizer.pkl', 'rb'))\n", + " index_word = load(open('models/index_word.pkl', 'rb'))\n", + " # pre-define the max sequence length (from training)\n", + " max_length = 34\n", + "\n", + " # load the model\n", + " if args.model:\n", + " filename = args.model\n", + " else:\n", + " filename = 'models/model-ep005-loss3.504-val_loss3.893.h5'\n", + " model = load_model(filename)\n", + "\n", + " if args.image:\n", + " # load and prepare the photograph\n", + " photo = extract_features(args.image)\n", + " # generate description\n", + " captions = generate_desc(model, tokenizer, photo, index_word, max_length)\n", + " for cap in captions:\n", + " # remove start and end 
tokens\n", + " seq = cap[0].split()[1:-1]\n", + " desc = ' '.join(seq)\n", + " print('{} [log prob: {:1.2f}]'.format(desc,cap[1]))\n", + " else:\n", + " # load test set\n", + " test_features, test_descriptions = ld.prepare_dataset('test')[1]\n", + "\n", + " # evaluate model\n", + " evaluate_model(model, test_descriptions, test_features, tokenizer, index_word, max_length)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1183,18 +916,6 @@ "display_name": "Anaconda", "language": "python", "name": "anaconda" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" } }, "nbformat": 4, diff --git a/model.png b/model.png index 76fbb3e..e4cfa13 100644 Binary files a/model.png and b/model.png differ diff --git a/people.jpg b/people.jpg new file mode 100644 index 0000000..dc19ff0 Binary files /dev/null and b/people.jpg differ diff --git a/ski.jpg b/ski.jpg new file mode 100644 index 0000000..c171b11 Binary files /dev/null and b/ski.jpg differ diff --git a/worker.jpg b/worker.jpg new file mode 100644 index 0000000..17d1228 Binary files /dev/null and b/worker.jpg differ
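
Note on the new loss/metric helpers: `categorical_crossentropy_from_logits` and `categorical_accuracy_with_variable_timestep` are added to `generate_model.py` but are not used by `define_model`, which still ends in a softmax and compiles with `'categorical_crossentropy'`. Both helpers expect per-timestep one-hot targets and raw (pre-softmax) logits, i.e. a `TimeDistributed(Dense(vocab_size))` output like the sequence model sketched in `Devel.ipynb`. The snippet below is a minimal, self-contained sketch of how they might be wired up; the toy model, the shapes, and the training call are illustrative assumptions, not part of this change.

```python
import numpy as np
from keras.layers import Dense, Input, LSTM, TimeDistributed
from keras.models import Model
from keras.optimizers import Adam

from CapGenerator.generate_model import (
    categorical_accuracy_with_variable_timestep,
    categorical_crossentropy_from_logits,
)

vocab_size, max_length, feat_dim = 2536, 34, 300  # illustrative sizes

# Toy stand-in for the notebook's sequence model: inputs are already-embedded
# timesteps; the last layer emits per-timestep logits (note: no softmax).
seq_in = Input(shape=(max_length, feat_dim))
lstm_out = LSTM(300, return_sequences=True)(seq_in)
logits = TimeDistributed(Dense(vocab_size))(lstm_out)
toy_model = Model(inputs=seq_in, outputs=logits)

toy_model.compile(loss=categorical_crossentropy_from_logits,
                  optimizer=Adam(lr=0.001),
                  metrics=[categorical_accuracy_with_variable_timestep])

# Targets are one-hot with shape (batch, timesteps, vocab); padded timesteps
# are all-zero rows, which the accuracy metric masks out.
x = np.random.rand(4, max_length, feat_dim)
y = np.zeros((4, max_length, vocab_size))
y[:, :5, 1] = 1.0  # pretend every caption is five copies of word index 1
toy_model.fit(x, y, batch_size=2, epochs=1, verbose=0)
```

By contrast, the updated `data_generator` in this diff still yields a single one-hot next word per sample (2-D targets) for the softmax model built by `define_model`, so it would not be paired with these timestep-masked helpers as-is.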