Add Japanese language dependencies #1511

Open

wants to merge 51 commits into master

Commits (51)
4e49666
Added Japanese dependencies to mlserver-huggingface
jbauer2718 Dec 12, 2023
e6562e1
Added unit test for Japanese server
jbauer2718 Dec 12, 2023
4414ccf
linted
jbauer2718 Dec 12, 2023
399d83b
Addressed style comments; made dependencies optional
jbauer2718 Dec 14, 2023
854784f
Updated README.md and unit test
jbauer2718 Dec 15, 2023
c6f1a4f
Updated docs
jbauer2718 Dec 15, 2023
f75ee9e
Working test in docs
jbauer2718 Dec 15, 2023
936fadf
Generalized test to a list of cases
jbauer2718 Dec 16, 2023
5b3a258
updated docs
jbauer2718 Dec 16, 2023
c30f2ad
Fixed notebook cell outputs
jbauer2718 Dec 16, 2023
339527f
Fixed notebook cell outputs
jbauer2718 Dec 16, 2023
f44656d
Refactored test with pytest.pymark.parameterize. Used a smaller test …
jbauer2718 Dec 26, 2023
73c60a7
Finished an inline comment I forgot to fully type out
jbauer2718 Dec 26, 2023
2715c2b
Updated lock and docs
jbauer2718 Jan 16, 2024
2a9652b
Removed .bak.ipynb
jbauer2718 Jan 16, 2024
8992881
Updated README.ipynb to match README.md
jbauer2718 Jan 16, 2024
d1ae9b2
Synced with main branch and resolved conflicts
jbauer2718 Apr 12, 2024
e4596e0
rebuilt lock files with same version of poetry
jbauer2718 Apr 12, 2024
aa3ea7b
build(deps-dev): bump pytest-cases from 3.8.4 to 3.8.5 (#1691)
dependabot[bot] Apr 16, 2024
ed6ef38
build(deps): bump scikit-learn in /runtimes/sklearn (#1693)
dependabot[bot] Apr 16, 2024
40fcaf5
build(deps-dev): bump pytorch-lightning in /runtimes/mlflow (#1694)
dependabot[bot] Apr 16, 2024
6639484
build(deps): bump sqlparse from 0.4.4 to 0.5.0 in /runtimes/mlflow (#…
dependabot[bot] Apr 16, 2024
dc93eba
build(deps-dev): bump sqlparse from 0.4.4 to 0.5.0 (#1697)
dependabot[bot] Apr 16, 2024
c6a9c5d
build(deps): bump optimum from 1.17.1 to 1.18.1 in /runtimes/huggingf…
dependabot[bot] Apr 16, 2024
28e4e75
build(deps): bump joblib from 1.3.2 to 1.4.0 in /runtimes/sklearn (#1…
dependabot[bot] Apr 16, 2024
c9ed9ba
build(deps): bump pandas from 2.2.1 to 2.2.2 in /runtimes/lightgbm (#…
dependabot[bot] Apr 16, 2024
b1e7c14
Fixed merge conflict in README
jbauer2718 Apr 16, 2024
7f21db5
Fixed merge conflict in lock
jbauer2718 Apr 16, 2024
6df3dbe
Rebased and fixed lock
jbauer2718 Apr 16, 2024
32d16e8
Added unit test for Japanese server
jbauer2718 Dec 12, 2023
f6a3901
linted
jbauer2718 Dec 12, 2023
47bd2da
Addressed style comments; made dependencies optional
jbauer2718 Dec 14, 2023
188b369
Updated README.md and unit test
jbauer2718 Dec 15, 2023
7fa5d0c
Updated docs
jbauer2718 Dec 15, 2023
d755990
Working test in docs
jbauer2718 Dec 15, 2023
bf4d412
Generalized test to a list of cases
jbauer2718 Dec 16, 2023
6174e00
updated docs
jbauer2718 Dec 16, 2023
5f8c8e3
Fixed notebook cell outputs
jbauer2718 Dec 16, 2023
3fa63e0
Fixed notebook cell outputs
jbauer2718 Dec 16, 2023
aafa4d1
Refactored test with pytest.pymark.parameterize. Used a smaller test …
jbauer2718 Dec 26, 2023
549f244
Finished an inline comment I forgot to fully type out
jbauer2718 Dec 26, 2023
0d61961
Updated lock and docs
jbauer2718 Jan 16, 2024
827329f
Removed .bak.ipynb
jbauer2718 Jan 16, 2024
c9372a5
Updated README.ipynb to match README.md
jbauer2718 Jan 16, 2024
e137c05
rebuilt lock files with same version of poetry
jbauer2718 Apr 12, 2024
d9211bf
Fixed merge conflict in README
jbauer2718 Apr 16, 2024
4a9c1c6
Fixed merge conflict in lock
jbauer2718 Apr 16, 2024
4c63c86
Fixed merge conflicts
jbauer2718 Apr 16, 2024
85bddf1
Update README.md
jbauer2718 Apr 16, 2024
616d3a9
removed files for bert-japanese
jbauer2718 Apr 16, 2024
cf4ec15
Removed model setting
jbauer2718 Apr 16, 2024
100 changes: 90 additions & 10 deletions docs/examples/huggingface/README.ipynb
@@ -23,6 +23,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install requests\n",
"# Import required dependencies\n",
"import requests"
]
@@ -437,6 +438,93 @@
").json()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f1d4b24a-4c09-4743-a086-6f8b143711ad",
"metadata": {},
"source": [
"### Masked Language Modeling (Optional Japanese Language Example)\n",
"\n",
"We can also serve a masked language model. In the following example, we also build the `huggingface` runtime with the `-E japanese` flag to enable support for Japanese tokenizers. For example, after running the normal project build from the root directory with `make install-dev`, we can install the optional Japanese dependencies in dev mode:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "63e368ee-a5ef-44b7-aab8-cafd30ab227a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting ./model-settings.json\n"
]
}
],
"source": [
"%%writefile ./model-settings.json\n",
"{\n",
" \"name\": \"transformer\",\n",
" \"implementation\": \"mlserver_huggingface.runtime.HuggingFaceRuntime\",\n",
" \"parameters\": {\n",
" \"extra\": {\n",
" \"task\": \"fill-mask\",\n",
" \"pretrained_model\": \"cl-tohoku/bert-base-japanese\",\n",
" \"pretrained_tokenizer\": \"cl-tohoku/bert-base-japanese\"\n",
" }\n",
" }"
]
},
{
"cell_type": "markdown",
"id": "72b05244-f52f-41c0-a1bc-a5d11d1c75a0",
"metadata": {},
"source": [
"Using the shell to start mlserver like so,\n",
"\n",
"```shell\n",
"mlserver start .\n",
"```\n",
"we can pass inferences like this. Note the `[MASK]` token. The mask token can be different for different models, so check the HuggingFace model config for special tokens."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b619c852-f35d-4506-91e1-0a1e3bcb7b8b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'実際 に 空 が 見える の か?'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from mlserver_huggingface.codecs import HuggingfaceRequestCodec\n",
"import json\n",
"\n",
"# Test sentence: Is the sky really [MASK]?\n",
"test_sentence = \"実際に空が[MASK]のか?\"\n",
"# [MASK] = visible\n",
"expected_output = \"見える\"\n",
"\n",
"inference_request = HuggingfaceRequestCodec.encode_request(\n",
" {\"inputs\": [test_sentence]},\n",
" use_bytes=False,\n",
")\n",
"json.dumps(inference_request.dict())\n",
"response = requests.post(\"http://localhost:8080/v2/models/transformer/infer\", json=inference_request.dict()).json()\n",
"json.loads(response['outputs'][0]['data'][0])[\"sequence\"]"
]
},
{
"cell_type": "markdown",
"id": "fe6655d9",
@@ -714,19 +802,11 @@
" report \\\n",
" -type=text"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ddcb458",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -740,7 +820,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.8"
"version": "3.9.5"
}
},
"nbformat": 4,
62 changes: 54 additions & 8 deletions docs/examples/huggingface/README.md
@@ -259,21 +259,67 @@
requests.post(
"http://localhost:8080/v2/models/transformer/infer", json=inference_request
).json()
```
```
{'model_name': 'transformer',
'id': '835eabbd-daeb-4423-a64f-a7c4d7c60a9b',
'parameters': {},
'outputs': [{'name': 'output',
'shape': [1, 1],
'datatype': 'BYTES',
'parameters': {'content_type': 'hg_jsonlist'},
'data': ['{"label": "NEGATIVE", "score": 0.9996137022972107}']}]}
```
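
Because the `hg_jsonlist` content type returns each prediction as a JSON-encoded string inside `data`, the payload needs one extra decoding step. A minimal sketch, assuming the `requests.post(...)` call above is assigned to a `response` variable:

```python
import json

# Each entry in "data" is a JSON string; decode the first (and only) prediction.
prediction = json.loads(response["outputs"][0]["data"][0])
print(prediction["label"], prediction["score"])  # e.g. NEGATIVE 0.9996...
```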

### Masked Language Modeling (Optional Japanese Language Example)

We can also serve a masked language model. This example requires the `huggingface` runtime to be installed with the optional `-E japanese` extra, which enables support for Japanese tokenizers. After running the normal project build from the root directory with `make install-dev`, we can install the optional Japanese dependencies in dev mode with

`poetry install -E japanese`

from the `./runtimes/huggingface` directory.
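
As a quick check that the extra took effect, the snippet below tries to import the Japanese tokenizer packages. The package names (`fugashi`, `unidic_lite`) are assumptions about what the `japanese` extra pulls in, so adjust them to match the actual `pyproject.toml`:

```python
import importlib

# Hypothetical package names for the optional Japanese tokenizer dependencies.
for package in ("fugashi", "unidic_lite"):
    try:
        importlib.import_module(package)
        print(f"{package}: available")
    except ImportError:
        print(f"{package}: missing -- re-run `poetry install -E japanese`")
```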

```python
%%writefile ./model-settings.json
{
"name": "model",
"implementation": "mlserver_huggingface.runtime.HuggingFaceRuntime",
"parameters": {
"extra": {
"task": "fill-mask",
"pretrained_model": "cl-tohoku/bert-base-japanese",
"pretrained_tokenizer": "cl-tohoku/bert-base-japanese"
}
}
}
```
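
Before starting the server, we can optionally sanity-check the model and tokenizer outside MLServer. This is only a sketch using the `transformers` pipeline API directly, and it assumes the optional Japanese dependencies above are installed:

```python
from transformers import pipeline

# Load the same fill-mask model and tokenizer referenced in model-settings.json.
fill_mask = pipeline(
    "fill-mask",
    model="cl-tohoku/bert-base-japanese",
    tokenizer="cl-tohoku/bert-base-japanese",
)

# Print the highest-scoring completion for the masked sentence.
print(fill_mask("実際に空が[MASK]のか?")[0]["sequence"])
```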
Using the shell, start MLServer like so:
```shell
mlserver start .
```
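
The model can take a little while to download and load, so it helps to poll the readiness endpoint before sending requests. The sketch below assumes MLServer's default HTTP port (8080) and the standard V2 readiness route for the `transformer` model:

```python
import time

import requests


def wait_until_ready(url="http://localhost:8080/v2/models/transformer/ready", timeout=60.0):
    """Poll the readiness endpoint until it returns 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url).status_code == 200:
                return True
        except requests.ConnectionError:
            pass  # server not accepting connections yet
        time.sleep(1)
    return False


print("model ready:", wait_until_ready())
```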
We can then send an inference request like the one below. Note the `[MASK]` token: the mask token differs between models, so check the Hugging Face model config for its special tokens.
```python
from mlserver_huggingface.codecs import HuggingfaceRequestCodec
import json
import requests

# Test sentence: Is the sky really [MASK]?
test_sentence = "実際に空が[MASK]のか?"
# [MASK] = visible
expected_output = "見える"

inference_request = HuggingfaceRequestCodec.encode_request(
{"inputs": [test_sentence]},
use_bytes=False,
)
# Send the request to the running MLServer instance and decode the top prediction
response = requests.post(
    "http://localhost:8080/v2/models/transformer/infer", json=inference_request.dict()
).json()
json.loads(response["outputs"][0]["data"][0])["sequence"]
```
```
Response:
{'model_name': 'transformer', 'id': '9e966d8d-b43d-4ab4-8d47-90e367196233', 'parameters': {}, 'outputs': [{'name': 'output', 'shape': [5, 1], 'datatype': 'BYTES', 'parameters': {'content_type': 'hg_jsonlist'}, 'data': ['{"score": 0.3277095854282379, "token": 11819, "token_str": "\\u3042\\u308b", "sequence": "\\u5b9f\\u969b \\u306b \\u7a7a \\u304c \\u3042\\u308b \\u306e \\u304b?"}', '{"score": 0.10271108895540237, "token": 14656, "token_str": "\\u898b\\u3048\\u308b", "sequence": "\\u5b9f\\u969b \\u306b \\u7a7a \\u304c \\u898b\\u3048\\u308b \\u306e \\u304b?"}', '{"score": 0.08325661718845367, "token": 11835, "token_str": "\\u306a\\u3044", "sequence": "\\u5b9f\\u969b \\u306b \\u7a7a \\u304c \\u306a\\u3044 \\u306e \\u304b?"}', '{"score": 0.036131054162979126, "token": 18413, "token_str": "\\u6b63\\u3057\\u3044", "sequence": "\\u5b9f\\u969b \\u306b \\u7a7a \\u304c \\u6b63\\u3057\\u3044 \\u306e \\u304b?"}', '{"score": 0.029351236298680305, "token": 11820, "token_str": "\\u3044\\u308b", "sequence": "\\u5b9f\\u969b \\u306b \\u7a7a \\u304c \\u3044\\u308b \\u306e \\u304b?"}']}]}
Data:
{'score': 0.3277095854282379, 'token': 11819, 'token_str': 'ある', 'sequence': '実際 に 空 が ある の か?'}
```
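
To inspect more than the top sequence, we can decode the full candidate list from the same response. A short sketch reusing the `response`, `expected_output`, and `json` names defined above:

```python
# Decode every candidate returned for the [MASK] position and rank them by score.
candidates = [json.loads(item) for item in response["outputs"][0]["data"]]
for candidate in sorted(candidates, key=lambda c: c["score"], reverse=True):
    print(f'{candidate["token_str"]}\t{candidate["score"]:.4f}\t{candidate["sequence"]}')

# Check whether the expected completion appears among the candidates.
print(expected_output in {c["token_str"] for c in candidates})
```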
## GPU Acceleration

We can also evaluate GPU acceleration by comparing inference speed on CPU vs. GPU using the following parameters