From aa17e0a941ad3413b7b73098050c3152b80c6f97 Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sat, 21 Dec 2024 22:59:53 +0000
Subject: [PATCH 1/7] updated constrained decoding doc

---
 docs/backend/openai_api_completions.ipynb | 123 ++++++++++++++++++++--
 1 file changed, 114 insertions(+), 9 deletions(-)
diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 067a046885..50b2764445 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -36,9 +36,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><br><br>                    NOTE: Typically, the server runs in a separate terminal.<br>                    In this notebook, we run the server and notebook code together, so their outputs are combined.<br>                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br>                    </strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-12-21 21:57:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=2048, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=512968292, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=8, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, 
triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n",
+      "[2024-12-21 21:57:37 TP0] Init torch distributed begin.\n",
+      "[2024-12-21 21:57:37 TP0] Load weight begin. avail mem=21.87 GB\n",
+      "[2024-12-21 21:57:41 TP0] Using model weights format ['*.safetensors']\n"
+     ]
+    }
+   ],
    "source": [
     "from sglang.utils import (\n",
     "    execute_shell_command,\n",
@@ -69,9 +92,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: ChatCompletion(id='1306d8dd0eb14493bbfd0c8b6f029435', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **France** - **Paris**\\n2. **Japan** - **Tokyo**\\n3. **Australia** - **Canberra**', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1734818316, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=18, total_tokens=57, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "import openai\n",
     "\n",
@@ -102,9 +138,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. Expansion: Rome expanded its territories through conquest, creating the largest empire the world had ever seen, spanning from Britain to Egypt and from Spain to Syria.<br>2. Law and Governance: Rome developed a system of laws, the Twelve Tables, which became the foundation of modern law. The Roman Republic and later the Roman Empire also established a system of governance that lasted for centuries.<br>3. Architecture and Engineering: Rome built impressive structures such as the Colosseum, Pantheon, and aqueducts, showcasing its engineering and architectural skills.<br>4. Language and Literature: Latin became the language of</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:31<01:33, 31.20s/it]\n",
+      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:53<00:52, 26.12s/it]\n",
+      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:54<00:14, 14.31s/it]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:24<00:00, 20.57s/it]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:24<00:00, 21.05s/it]\n",
+      "\n",
+      "[2024-12-21 22:38:13 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=7.39 GB\n",
+      "[2024-12-21 22:38:15 TP0] Memory pool end. avail mem=1.51 GB\n",
+      "[2024-12-21 22:39:27 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
+      "100%|██████████| 4/4 [02:37<00:00, 39.47s/it] \n",
+      "[2024-12-21 22:42:27 TP0] Capture cuda graph end. Time elapsed: 179.72 s\n",
+      "[2024-12-21 22:42:31 TP0] max_total_num_tokens=39026, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
+      "[2024-12-21 22:42:34] INFO:     Started server process [495767]\n",
+      "[2024-12-21 22:42:34] INFO:     Waiting for application startup.\n",
+      "[2024-12-21 22:42:34] INFO:     Application startup complete.\n",
+      "[2024-12-21 22:42:34] ERROR:    [Errno 98] error while attempting to bind on address ('0.0.0.0', 30000): address already in use\n",
+      "[2024-12-21 22:42:34] INFO:     Waiting for application shutdown.\n",
+      "[2024-12-21 22:42:34] INFO:     Application shutdown complete.\n",
+      "[2024-12-21 22:42:40] The server is fired up and ready to roll!\n",
+      "/home/shuai/miniconda3/envs/sglang/lib/python3.10/multiprocessing/resource_tracker.py:104: UserWarning: resource_tracker: process died unexpectedly, relaunching.  Some resources might leak.\n",
+      "  warnings.warn('resource_tracker: process died unexpectedly, '\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/shuai/miniconda3/envs/sglang/lib/python3.10/multiprocessing/resource_tracker.py\", line 209, in main\n",
+      "    cache[rtype].remove(name)\n",
+      "KeyError: '/mp-cr7f7zak'\n"
+     ]
+    }
+   ],
    "source": [
     "response = client.chat.completions.create(\n",
     "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -223,14 +304,32 @@
     "## Structured decoding (JSON, Regex)\n",
     "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n",
     "\n",
+    "By default, SGlang uses outlines for structured decoding. To enable Xgrammar (which offers better performance and supports JSON but not regex patterns), add `--grammar-backend xgrammar` when launching the server:\n",
+    "\n",
+    "```bash\n",
+    "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --grammar-backend xgrammar\n",
+    "```\n",
+    "\n",
     "### JSON"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'client' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 14\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mjson\u001b[39;00m\n\u001b[1;32m      3\u001b[0m json_schema \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mdumps(\n\u001b[1;32m      4\u001b[0m     {\n\u001b[1;32m      5\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobject\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     11\u001b[0m     }\n\u001b[1;32m     12\u001b[0m )\n\u001b[0;32m---> 14\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m     15\u001b[0m     model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta-llama/Meta-Llama-3.1-8B-Instruct\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     16\u001b[0m     messages\u001b[38;5;241m=\u001b[39m[\n\u001b[1;32m     17\u001b[0m         {\n\u001b[1;32m     18\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     19\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGive me the information of the capital of France in the JSON format.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     20\u001b[0m         },\n\u001b[1;32m     21\u001b[0m     ],\n\u001b[1;32m     22\u001b[0m     temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m     23\u001b[0m     max_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m128\u001b[39m,\n\u001b[1;32m     24\u001b[0m     response_format\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m     25\u001b[0m  
       \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     26\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfoo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mschema\u001b[39m\u001b[38;5;124m\"\u001b[39m: json\u001b[38;5;241m.\u001b[39mloads(json_schema)},\n\u001b[1;32m     27\u001b[0m     },\n\u001b[1;32m     28\u001b[0m )\n\u001b[1;32m     30\u001b[0m print_highlight(response\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'client' is not defined"
+     ]
+    }
+   ],
    "source": [
     "import json\n",
     "\n",
@@ -585,6 +684,11 @@
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "sglang",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
@@ -594,7 +698,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3"
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
   }
  },
  "nbformat": 4,

From 88db12eeb32621d5fdf484907a667668d69ba00c Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sat, 21 Dec 2024 23:20:20 +0000
Subject: [PATCH 2/7] rm ipynb outputs

---
 docs/backend/openai_api_completions.ipynb | 109 ++--------------------
 1 file changed, 8 insertions(+), 101 deletions(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 50b2764445..61f58eb695 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -36,32 +36,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'><br><br>                    NOTE: Typically, the server runs in a separate terminal.<br>                    In this notebook, we run the server and notebook code together, so their outputs are combined.<br>                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br>                    </strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2024-12-21 21:57:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=2048, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=512968292, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=8, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, 
triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n",
-      "[2024-12-21 21:57:37 TP0] Init torch distributed begin.\n",
-      "[2024-12-21 21:57:37 TP0] Load weight begin. avail mem=21.87 GB\n",
-      "[2024-12-21 21:57:41 TP0] Using model weights format ['*.safetensors']\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sglang.utils import (\n",
     "    execute_shell_command,\n",
@@ -92,22 +69,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Response: ChatCompletion(id='1306d8dd0eb14493bbfd0c8b6f029435', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **France** - **Paris**\\n2. **Japan** - **Tokyo**\\n3. **Australia** - **Canberra**', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1734818316, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=39, prompt_tokens=18, total_tokens=57, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import openai\n",
     "\n",
@@ -138,54 +102,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. Expansion: Rome expanded its territories through conquest, creating the largest empire the world had ever seen, spanning from Britain to Egypt and from Spain to Syria.<br>2. Law and Governance: Rome developed a system of laws, the Twelve Tables, which became the foundation of modern law. The Roman Republic and later the Roman Empire also established a system of governance that lasted for centuries.<br>3. Architecture and Engineering: Rome built impressive structures such as the Colosseum, Pantheon, and aqueducts, showcasing its engineering and architectural skills.<br>4. Language and Literature: Latin became the language of</strong>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
-      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:31<01:33, 31.20s/it]\n",
-      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:53<00:52, 26.12s/it]\n",
-      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:54<00:14, 14.31s/it]\n",
-      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:24<00:00, 20.57s/it]\n",
-      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:24<00:00, 21.05s/it]\n",
-      "\n",
-      "[2024-12-21 22:38:13 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=7.39 GB\n",
-      "[2024-12-21 22:38:15 TP0] Memory pool end. avail mem=1.51 GB\n",
-      "[2024-12-21 22:39:27 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
-      "100%|██████████| 4/4 [02:37<00:00, 39.47s/it] \n",
-      "[2024-12-21 22:42:27 TP0] Capture cuda graph end. Time elapsed: 179.72 s\n",
-      "[2024-12-21 22:42:31 TP0] max_total_num_tokens=39026, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
-      "[2024-12-21 22:42:34] INFO:     Started server process [495767]\n",
-      "[2024-12-21 22:42:34] INFO:     Waiting for application startup.\n",
-      "[2024-12-21 22:42:34] INFO:     Application startup complete.\n",
-      "[2024-12-21 22:42:34] ERROR:    [Errno 98] error while attempting to bind on address ('0.0.0.0', 30000): address already in use\n",
-      "[2024-12-21 22:42:34] INFO:     Waiting for application shutdown.\n",
-      "[2024-12-21 22:42:34] INFO:     Application shutdown complete.\n",
-      "[2024-12-21 22:42:40] The server is fired up and ready to roll!\n",
-      "/home/shuai/miniconda3/envs/sglang/lib/python3.10/multiprocessing/resource_tracker.py:104: UserWarning: resource_tracker: process died unexpectedly, relaunching.  Some resources might leak.\n",
-      "  warnings.warn('resource_tracker: process died unexpectedly, '\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/home/shuai/miniconda3/envs/sglang/lib/python3.10/multiprocessing/resource_tracker.py\", line 209, in main\n",
-      "    cache[rtype].remove(name)\n",
-      "KeyError: '/mp-cr7f7zak'\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "response = client.chat.completions.create(\n",
     "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
@@ -315,21 +234,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'client' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[1], line 14\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mjson\u001b[39;00m\n\u001b[1;32m      3\u001b[0m json_schema \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mdumps(\n\u001b[1;32m      4\u001b[0m     {\n\u001b[1;32m      5\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobject\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     11\u001b[0m     }\n\u001b[1;32m     12\u001b[0m )\n\u001b[0;32m---> 14\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m     15\u001b[0m     model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta-llama/Meta-Llama-3.1-8B-Instruct\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     16\u001b[0m     messages\u001b[38;5;241m=\u001b[39m[\n\u001b[1;32m     17\u001b[0m         {\n\u001b[1;32m     18\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrole\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     19\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGive me the information of the capital of France in the JSON format.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     20\u001b[0m         },\n\u001b[1;32m     21\u001b[0m     ],\n\u001b[1;32m     22\u001b[0m     temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m     23\u001b[0m     max_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m128\u001b[39m,\n\u001b[1;32m     24\u001b[0m     response_format\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m     25\u001b[0m  
       \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     26\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjson_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfoo\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mschema\u001b[39m\u001b[38;5;124m\"\u001b[39m: json\u001b[38;5;241m.\u001b[39mloads(json_schema)},\n\u001b[1;32m     27\u001b[0m     },\n\u001b[1;32m     28\u001b[0m )\n\u001b[1;32m     30\u001b[0m print_highlight(response\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'client' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import json\n",
     "\n",

From 48d58aacb698e91c221ffcd3e6c675357c031bf3 Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sun, 22 Dec 2024 13:59:59 +0000
Subject: [PATCH 3/7] updated docs

---
 docs/backend/openai_api_completions.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 61f58eb695..369e950b78 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -223,7 +223,7 @@
     "## Structured decoding (JSON, Regex)\n",
     "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n",
     "\n",
-    "By default, SGlang uses outlines for structured decoding. To enable Xgrammar (which offers better performance and supports JSON but not regex patterns), add `--grammar-backend xgrammar` when launching the server:\n",
+    "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it:\n",
     "\n",
     "```bash\n",
     "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --grammar-backend xgrammar\n",

From f26eb4f340c2e11b95b51330f7806a3e94bc3521 Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sun, 22 Dec 2024 14:04:25 +0000
Subject: [PATCH 4/7] updated docs

---
 docs/backend/openai_api_completions.ipynb | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 369e950b78..fba7682c5d 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -223,10 +223,11 @@
     "## Structured decoding (JSON, Regex)\n",
     "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n",
     "\n",
-    "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it:\n",
+    "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it, add the `--grammar-backend xgrammar` when launching the server:\n",
     "\n",
     "```bash\n",
-    "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --grammar-backend xgrammar\n",
+    "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct \\\n",
+    "--grammar-backend xgrammar\n",
     "```\n",
     "\n",
     "### JSON"

From 0eaf40105554f29ad6965c922476fd67474c25de Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sun, 22 Dec 2024 14:15:53 +0000
Subject: [PATCH 5/7] remove ipynb outputs

---
 docs/backend/openai_api_completions.ipynb | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index fba7682c5d..6cb5d884e5 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -592,11 +592,6 @@
   }
  ],
  "metadata": {
-  "kernelspec": {
-   "display_name": "sglang",
-   "language": "python",
-   "name": "python3"
-  },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
@@ -606,8 +601,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "pygments_lexer": "ipython3"
   }
  },
  "nbformat": 4,

From f9d7349e544e16154cef5f2fbfcc25f9a782d700 Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sun, 22 Dec 2024 18:59:26 +0000
Subject: [PATCH 6/7] Update doc

---
 docs/backend/openai_api_completions.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 6cb5d884e5..9c756e0217 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -221,13 +221,13 @@
    "metadata": {},
    "source": [
     "## Structured decoding (JSON, Regex)\n",
-    "You can specify a JSON schema or a regular expression to constrain the model output. The model output will be guaranteed to follow the given constraints.\n",
+    "You can define a JSON schema or regular expression to constrain the model's output, which depends on the grammar backend.\n",
     "\n",
-    "SGlang supports two grammar backends: outlines (default) and Xgrammar. Xgrammar offers better JSON decoding performance but does not support regex patterns. To enable it, add the `--grammar-backend xgrammar` when launching the server:\n",
+    "SGlang has two backends: outlines (default) and Xgrammar. Xgrammar enhances JSON decoding performance but does not support regular expressions. To use Xgrammar, add the `--grammar-backend xgrammar` when launching the server:\n",
     "\n",
     "```bash\n",
-    "python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct \\\n",
-    "--grammar-backend xgrammar\n",
+    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "--port 30000 --host 0.0.0.0 --grammar-backend xgrammar\n",
     "```\n",
     "\n",
     "### JSON"

From a00ecb18abd7c5136fa6670595b445fdc2c8399c Mon Sep 17 00:00:00 2001
From: shuaills <shishuaiuoe@gmail.com>
Date: Sun, 22 Dec 2024 23:35:41 +0000
Subject: [PATCH 7/7] update doc

---
 docs/backend/openai_api_completions.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb
index 9c756e0217..9340f953f1 100644
--- a/docs/backend/openai_api_completions.ipynb
+++ b/docs/backend/openai_api_completions.ipynb
@@ -221,7 +221,7 @@
    "metadata": {},
    "source": [
     "## Structured decoding (JSON, Regex)\n",
-    "You can define a JSON schema or regular expression to constrain the model's output, which depends on the grammar backend.\n",
+    "You can define a JSON schema or regular expression to constrain the model's output. The model output is guaranteed to follow the given constraints; which constraint types are supported depends on the grammar backend.\n",
     "\n",
     "SGlang has two backends: outlines (default) and Xgrammar. Xgrammar enhances JSON decoding performance but does not support regular expressions. To use Xgrammar, add the `--grammar-backend xgrammar` when launching the server:\n",
     "\n",