diff --git a/notebooks/text_models/labs/rnn_encoder_decoder.ipynb b/notebooks/text_models/labs/rnn_encoder_decoder.ipynb index 8fa4ee26..6b9dbcd1 100644 --- a/notebooks/text_models/labs/rnn_encoder_decoder.ipynb +++ b/notebooks/text_models/labs/rnn_encoder_decoder.ipynb @@ -20,15 +20,6 @@ "At last, we'll benchmark our results using the industry standard BLEU score." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install nltk" - ] - }, { "cell_type": "code", "execution_count": null, @@ -51,7 +42,7 @@ "import pickle\n", "import sys\n", "\n", - "import nltk\n", + "import evaluate\n", "import numpy as np\n", "import pandas as pd\n", "import tensorflow as tf\n", @@ -59,6 +50,7 @@ "from sklearn.model_selection import train_test_split\n", "from tensorflow.keras.layers import GRU, Dense, Embedding, Input\n", "from tensorflow.keras.models import Model, load_model\n", + "from tqdm import tqdm\n", "\n", "print(tf.__version__)" ] @@ -724,7 +716,7 @@ "source": [ "## Implementing the translation (or decoding) function\n", "\n", - "We can't just use model.predict(), because we don't know all the inputs we used during training. We only know the encoder_input (source language) but not the decoder_input (target language), which is what we want to predict (i.e., the translation of the source language)!\n", + "We can't just use model(), because we don't know all the inputs we used during training. We only know the encoder_input (source language) but not the decoder_input (target language), which is what we want to predict (i.e., the translation of the source language)!\n", "\n", "We do however know the first token of the decoder input, which is the `<start>` token. So using this plus the state of the encoder RNN, we can predict the next token.
We will then use that token as the second token of the decoder input, and continue like this until we predict the `<end>` token, or we reach some defined max length.\n", "\n", @@ -764,8 +756,8 @@ "outputs": [], "source": [ "if LOAD_CHECKPOINT:\n", - " encoder_model = load_model(os.path.join(MODEL_PATH, 'encoder_model.h5'))\n", - " decoder_model = load_model(os.path.join(MODEL_PATH, 'decoder_model.h5'))\n", + " encoder_model = load_model(os.path.join(MODEL_PATH, 'encoder_model'))\n", + " decoder_model = load_model(os.path.join(MODEL_PATH, 'decoder_model'))\n", "\n", "else:\n", " encoder_model = # TODO\n", @@ -814,7 +806,7 @@ " Returns translated sentences\n", " \"\"\"\n", " # Encode the input as state vectors.\n", - " states_value = encoder_model.predict(input_seqs)\n", + " states_value = encoder_model(input_seqs)\n", "\n", " # Populate the first character of target sequence with the start character.\n", " batch_size = input_seqs.shape[0]\n", @@ -824,7 +816,7 @@ "\n", " for i in range(max_decode_length):\n", "\n", - " output_tokens, decoder_state = decoder_model.predict(\n", + " output_tokens, decoder_state = decoder_model(\n", " [target_seq, states_value])\n", "\n", " # Sample a token\n", @@ -905,9 +897,9 @@ "### Exercise 9\n", "\n", "Save\n", - "* `model` to disk as the file `model.h5`\n", - "* `encoder_model` to disk as the file `encoder_model.h5`\n", - "* `decoder_model` to disk as the file `decoder_model.h5`\n" + "* `model` to disk as `model`\n", + "* `encoder_model` to disk as `encoder_model`\n", + "* `decoder_model` to disk as `decoder_model`\n" ] }, { @@ -951,7 +943,7 @@ "\n", "It still is imperfect, since it gives no credit to synonyms and so human evaluation is still best when feasible. However BLEU is commonly considered the best among bad options for an automated metric.\n", "\n", - "The NLTK framework has an implementation that we will use.\n", + "The Hugging Face evaluate framework has an implementation that we will use.\n", "\n", "We can't calculate BLEU during training, because at that time the correct decoder input is used. Instead we'll calculate it now.\n", "\n", @@ -964,13 +956,16 @@ "metadata": {}, "outputs": [], "source": [ - "def bleu_1(reference, candidate):\n", - " reference = list(filter(lambda x: x != \"\", reference)) # remove padding\n", - " candidate = list(filter(lambda x: x != \"\", candidate)) # remove padding\n", - " smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1\n", - " return nltk.translate.bleu_score.sentence_bleu(\n", - " reference, candidate, (1,), smoothing_function\n", - " )" + "def postprocess(sentence):\n", + " filtered = list(filter(lambda x: x != \"\" and x != \"<end>\", sentence))\n", + " return \" \".join(filtered)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's now compute the `bleu_1` and `bleu_4` scores for all the sentence pairs in the eval set. The next cell takes around 1 minute to run (8 minutes for the full dataset), the bulk of which is decoding the sentences in the validation set. Please wait until it completes."
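Before running the full evaluation, it may help to see the Hugging Face `evaluate` BLEU metric in isolation. The following is a minimal sketch on a made-up sentence pair (not the lab's data); it assumes only the `bleu.compute(predictions=..., references=..., max_order=...)` call that the notebook itself uses:

```python
# Minimal sketch: Hugging Face `evaluate` BLEU on a toy sentence pair.
# `max_order` caps the n-gram size, so max_order=1 gives BLEU-1 and
# max_order=4 gives the standard BLEU-4.
import evaluate

bleu = evaluate.load("bleu")

candidate = ["it is cold here"]          # one prediction string per example
reference = [["it is very cold here"]]  # a list of reference strings per example

bleu_1 = bleu.compute(predictions=candidate, references=reference, max_order=1)
bleu_4 = bleu.compute(predictions=candidate, references=reference, max_order=4)
print(bleu_1["bleu"], bleu_4["bleu"])  # BLEU-4 is 0.0 here: no 4-gram matches
```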
] }, { @@ -979,22 +974,30 @@ "metadata": {}, "outputs": [], "source": [ - "def bleu_4(reference, candidate):\n", - " reference = list(filter(lambda x: x != \"\", reference)) # remove padding\n", - " candidate = list(filter(lambda x: x != \"\", candidate)) # remove padding\n", - " smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1\n", - " return nltk.translate.bleu_score.sentence_bleu(\n", - " reference, candidate, (0.25, 0.25, 0.25, 0.25), smoothing_function\n", - " )" + "NUM_EVALUATE = 1000 # `len(input_tensor_val)` for full eval.\n", + "\n", + "reference = []\n", + "candidate = []\n", + "\n", + "\n", + "for idx in tqdm(range(NUM_EVALUATE)):\n", + " reference_sentence = utils_preproc.int2word(\n", + " targ_lang, target_tensor_val[idx][1:]\n", + " )\n", + "\n", + " decoded_sentence = decode_sequences(\n", + " input_tensor_val[idx : idx + 1], targ_lang, max_length_targ\n", + " )[0]\n", + "\n", + " candidate.append(postprocess(decoded_sentence))\n", + " reference.append([postprocess(reference_sentence)])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 10\n", - "\n", - "Let's now average the `bleu_1` and `bleu_4` scores for all the sentence pairs in the eval set. The next cell takes some time to run, the bulk of which is decoding the 6000 sentences in the validation set. Please wait unitl completes." + "### Check the score" ] }, { @@ -1003,47 +1006,27 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "num_examples = len(input_tensor_val)\n", - "bleu_1_total = 0\n", - "bleu_4_total = 0\n", - "\n", - "\n", - "for idx in range(num_examples):\n", - " reference_sentence = utils_preproc.int2word(\n", - " targ_lang, target_tensor_val[idx][1:])\n", - "\n", - " decoded_sentence = decode_sequences(\n", - " input_tensor_val[idx:idx+1], targ_lang, max_length_targ)[0]\n", - "\n", - " bleu_1_total += # TODO\n", - " bleu_4_total += # TODO\n", - "\n", - "print('BLEU 1: {}'.format(bleu_1_total/num_examples))\n", - "print('BLEU 4: {}'.format(bleu_4_total/num_examples))" + "bleu = evaluate.load(\"bleu\")\n", + "bleu_1 = bleu.compute(predictions=candidate, references=reference, max_order=1)\n", + "bleu_4 = bleu.compute(predictions=candidate, references=reference, max_order=4)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bleu_1[\"bleu\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Results\n", - "\n", - "**Hyperparameters**\n", - "\n", - "- Batch_Size: 64\n", - "- Optimizer: adam\n", - "- Embed_dim: 256\n", - "- GRU Units: 1024\n", - "- Train Examples: 24,000\n", - "- Epochs: 10\n", - "- Hardware: P100 GPU\n", - "\n", - "**Performance**\n", - "- Training Time: 5min \n", - "- Cross-entropy loss: train: 0.0722 - val: 0.9062\n", - "- BLEU 1: 0.2519574312515255\n", - "- BLEU 4: 0.04589972764144636" + "bleu_4[\"bleu\"]" ] }, { diff --git a/notebooks/text_models/solutions/rnn_encoder_decoder.ipynb b/notebooks/text_models/solutions/rnn_encoder_decoder.ipynb index 756e57f8..2e01dbf2 100644 --- a/notebooks/text_models/solutions/rnn_encoder_decoder.ipynb +++ b/notebooks/text_models/solutions/rnn_encoder_decoder.ipynb @@ -23,16 +23,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pip install nltk" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import 
os\n", @@ -51,7 +44,7 @@ "import pickle\n", "import sys\n", "\n", - "import nltk\n", + "import evaluate\n", "import numpy as np\n", "import pandas as pd\n", "import tensorflow as tf\n", @@ -59,6 +52,7 @@ "from sklearn.model_selection import train_test_split\n", "from tensorflow.keras.layers import GRU, Dense, Embedding, Input\n", "from tensorflow.keras.models import Model, load_model\n", + "from tqdm import tqdm\n", "\n", "print(tf.__version__)" ] @@ -737,7 +731,7 @@ "source": [ "## Implementing the translation (or decoding) function\n", "\n", - "We can't just use model.predict(), because we don't know all the inputs we used during training. We only know the encoder_input (source language) but not the decoder_input (target language), which is what we want to predict (i.e., the translation of the source language)!\n", + "We can't just use model(), because we don't know all the inputs we used during training. We only know the encoder_input (source language) but not the decoder_input (target language), which is what we want to predict (i.e., the translation of the source language)!\n", "\n", "We do however know the first token of the decoder input, which is the `<start>` token. So using this plus the state of the encoder RNN, we can predict the next token. We will then use that token as the second token of the decoder input, and continue like this until we predict the `<end>` token, or we reach some defined max length.\n", "\n", @@ -768,8 +762,8 @@ "outputs": [], "source": [ "if LOAD_CHECKPOINT:\n", - " encoder_model = load_model(os.path.join(MODEL_PATH, \"encoder_model.h5\"))\n", - " decoder_model = load_model(os.path.join(MODEL_PATH, \"decoder_model.h5\"))\n", + " encoder_model = load_model(os.path.join(MODEL_PATH, \"encoder_model\"))\n", + " decoder_model = load_model(os.path.join(MODEL_PATH, \"decoder_model\"))\n", "\n", "else:\n", " # TODO 3a\n", @@ -823,7 +817,7 @@ " Returns translated sentences\n", " \"\"\"\n", " # Encode the input as state vectors.\n", - " states_value = encoder_model.predict(input_seqs)\n", + " states_value = encoder_model(input_seqs)\n", "\n", " # Populate the first character of target sequence with the start character.\n", " batch_size = input_seqs.shape[0]\n", @@ -833,9 +827,7 @@ "\n", " # TODO 4: Sampling loop\n", " for i in range(max_decode_length):\n", - " output_tokens, decoder_state = decoder_model.predict(\n", - " [target_seq, states_value]\n", - " )\n", + " output_tokens, decoder_state = decoder_model([target_seq, states_value])\n", "\n", " # Sample a token\n", " sampled_token_index = np.argmax(output_tokens[:, -1, :], axis=-1)\n", @@ -925,9 +917,9 @@ " os.makedirs(MODEL_PATH, exist_ok=True)\n", "\n", " # TODO 3b\n", - " model.save(os.path.join(MODEL_PATH, \"model.h5\"))\n", - " encoder_model.save(os.path.join(MODEL_PATH, \"encoder_model.h5\"))\n", - " decoder_model.save(os.path.join(MODEL_PATH, \"decoder_model.h5\"))\n", + " model.save(os.path.join(MODEL_PATH, \"model\"))\n", + " encoder_model.save(os.path.join(MODEL_PATH, \"encoder_model\"))\n", + " decoder_model.save(os.path.join(MODEL_PATH, \"decoder_model\"))\n", "\n", " with open(os.path.join(MODEL_PATH, \"encoder_tokenizer.pkl\"), \"wb\") as fp:\n", " pickle.dump(inp_lang, fp)\n", @@ -959,7 +951,7 @@ "\n", "It still is imperfect, since it gives no credit to synonyms and so human evaluation is still best when feasible.
However BLEU is commonly considered the best among bad options for an automated metric.\n", "\n", - "The NLTK framework has an implementation that we will use.\n", + "The Hugging Face evaluate framework has an implementation that we will use.\n", "\n", "We can't calculate BLEU during training, because at that time the correct decoder input is used. Instead we'll calculate it now.\n", "\n", @@ -969,38 +961,21 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def bleu_1(reference, candidate):\n", - " reference = list(filter(lambda x: x != \"\", reference)) # remove padding\n", - " candidate = list(filter(lambda x: x != \"\", candidate)) # remove padding\n", - " smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1\n", - " return nltk.translate.bleu_score.sentence_bleu(\n", - " reference, candidate, (1,), smoothing_function\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { "tags": [] }, "outputs": [], "source": [ - "def bleu_4(reference, candidate):\n", - " reference = list(filter(lambda x: x != \"\", reference)) # remove padding\n", - " candidate = list(filter(lambda x: x != \"\", candidate)) # remove padding\n", - " smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1\n", - " return nltk.translate.bleu_score.sentence_bleu(\n", - " reference, candidate, (0.25, 0.25, 0.25, 0.25), smoothing_function\n", - " )" + "def postprocess(sentence):\n", + " filtered = list(filter(lambda x: x != \"\" and x != \"<end>\", sentence))\n", + " return \" \".join(filtered)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now average the `bleu_1` and `bleu_4` scores for all the sentence pairs in the eval set. The next cell takes some time to run, the bulk of which is decoding the 6000 sentences in the validation set. Please wait unitl completes." + "Let's now compute the `bleu_1` and `bleu_4` scores for all the sentence pairs in the eval set. The next cell takes around 1 minute to run (8 minutes for the full dataset), the bulk of which is decoding the sentences in the validation set. Please wait until it completes."
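The exact tokens that `postprocess` filters out are an inference: the sketch below assumes padding comes through `int2word` as empty strings and that sentences carry an `<end>` marker, as in the decoding loop above. A quick self-contained check of the intended behavior:

```python
# Self-contained check of the postprocess helper. Assumption: padding shows
# up as "" (empty string) and sentences end with an "<end>" marker; adjust
# the filtered tokens if your tokenizer uses different markers.
def postprocess(sentence):
    filtered = list(filter(lambda x: x != "" and x != "<end>", sentence))
    return " ".join(filtered)

assert postprocess(["she", "is", "reading", "<end>", "", ""]) == "she is reading"
```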
] }, { @@ -1009,14 +984,12 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "num_examples = len(input_tensor_val)\n", - "bleu_1_total = 0\n", - "bleu_4_total = 0\n", + "NUM_EVALUATE = 1000 # `len(input_tensor_val)` for full eval.\n", "\n", + "reference = []\n", + "candidate = []\n", "\n", - "for idx in range(num_examples):\n", - " # TODO 5\n", + "for idx in tqdm(range(NUM_EVALUATE)):\n", " reference_sentence = utils_preproc.int2word(\n", " targ_lang, target_tensor_val[idx][1:]\n", " )\n", @@ -1025,34 +998,50 @@ " input_tensor_val[idx : idx + 1], targ_lang, max_length_targ\n", " )[0]\n", "\n", - " bleu_1_total += bleu_1(reference_sentence, decoded_sentence)\n", - " bleu_4_total += bleu_4(reference_sentence, decoded_sentence)\n", - "\n", - "print(f\"BLEU 1: {bleu_1_total / num_examples}\")\n", - "print(f\"BLEU 4: {bleu_4_total / num_examples}\")" + " candidate.append(postprocess(decoded_sentence))\n", + " reference.append([postprocess(reference_sentence)])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Results\n", - "\n", - "**Hyperparameters**\n", - "\n", - "- Batch_Size: 64\n", - "- Optimizer: adam\n", - "- Embed_dim: 256\n", - "- GRU Units: 1024\n", - "- Train Examples: 24,000\n", - "- Epochs: 10\n", - "- Hardware: P100 GPU\n", - "\n", - "**Performance**\n", - "- Training Time: 5min \n", - "- Cross-entropy loss: train: 0.0722 - val: 0.9062\n", - "- BLEU 1: 0.2519574312515255\n", - "- BLEU 4: 0.04589972764144636" + "### Check the score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bleu = evaluate.load(\"bleu\")\n", + "bleu_1 = bleu.compute(predictions=candidate, references=reference, max_order=1)\n", + "bleu_4 = bleu.compute(predictions=candidate, references=reference, max_order=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bleu_1[\"bleu\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "bleu_4[\"bleu\"]" ] }, {
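A side note on the checkpoint changes above: dropping the `.h5` suffix switches `model.save()` from a single HDF5 file to TensorFlow's SavedModel directory format, the tf.keras default in TF 2.x when no suffix is given. Below is a minimal sketch of that round trip, using a placeholder model and a hypothetical `checkpoints` path rather than the lab's `MODEL_PATH`:

```python
# Minimal sketch: saving/loading in SavedModel format with tf.keras (TF 2.x).
# A path without a ".h5" suffix produces a SavedModel directory; load_model
# reads it back the same way. The tiny model and "checkpoints" path are
# placeholders, not the lab's actual model or MODEL_PATH.
import os

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, load_model

os.makedirs("checkpoints", exist_ok=True)

inputs = Input(shape=(4,))
model = Model(inputs, Dense(2)(inputs))

model.save(os.path.join("checkpoints", "model"))  # writes a directory, not one file
restored = load_model(os.path.join("checkpoints", "model"))
print(restored(tf.ones((1, 4))))
```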