diff --git a/docs/en/_static/css/config_generator.css b/docs/en/_static/css/config_generator.css new file mode 100644 index 0000000000..2c41719332 --- /dev/null +++ b/docs/en/_static/css/config_generator.css @@ -0,0 +1,216 @@ +/* ================================================================ + LMDeploy Interactive Configuration Generator – pill-bar layout + Matches the SGLang Cookbook segmented-control style + ================================================================ */ + +/* Wrapper: full content width, no card background */ +.cg-wrapper { + width: 100%; + margin: 0; + padding: 0; +} + +/* ── Dimension row ─────────────────────────────────────────────── */ +.cg-row { + margin-bottom: 1.25em; +} + +.cg-label { + font-weight: 600; + font-size: 0.95em; + margin-bottom: 0.4em; + color: var(--pst-color-text-base, #24292e); +} + +/* ── Pill bar (segmented control) ──────────────────────────────── */ +.cg-pill-bar { + display: flex; + flex-wrap: wrap; + width: 100%; + border: 1px solid var(--pst-color-border, #d1d5db); + border-radius: 6px; + overflow: hidden; + background: var(--pst-color-surface, #ffffff); +} + +.cg-pill { + flex: 1 1 0; + min-width: 0; + padding: 0.55em 0.4em; + margin: 0; + border: none; + border-right: 1px solid var(--pst-color-border, #d1d5db); + background: transparent; + color: var(--pst-color-text-base, #24292e); + font-size: 0.88em; + font-weight: 500; + cursor: pointer; + text-align: center; + transition: background 0.15s ease, color 0.15s ease; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + line-height: 1.4; +} + +.cg-pill:last-child { + border-right: none; +} + +.cg-pill:hover { + background: rgba(3, 102, 214, 0.08); +} + +.cg-pill.active { + background: #0366d6; + color: #ffffff; + font-weight: 600; +} + +.cg-pill.active:hover { + background: #0256c2; +} + +/* ── Command output section ────────────────────────────────────── */ +.cg-command-section { + margin-top: 1.5em; +} + +.cg-command-label { + 
font-weight: 600; + font-size: 0.95em; + margin-bottom: 0.4em; + color: var(--pst-color-text-base, #24292e); +} + +.cg-command-box { + background: #1e1e1e; + border-radius: 6px; + padding: 1em 1em 1em 1.2em; + position: relative; + width: 100%; + box-sizing: border-box; +} + +.cg-command-box pre { + margin: 0; + padding: 0; + padding-right: 4.5em; /* space for the copy button */ + background: transparent; + overflow-x: auto; +} + +.cg-command-box code { + color: #d4d4d4; + font-family: 'SFMono-Regular', 'Consolas', 'Liberation Mono', 'Menlo', monospace; + font-size: 0.88em; + line-height: 1.6; + white-space: pre; +} + +.cg-copy-btn { + position: absolute; + top: 0.6em; + right: 0.6em; + background: #0366d6; + color: #ffffff; + border: none; + border-radius: 4px; + padding: 0.35em 0.9em; + cursor: pointer; + font-size: 0.8em; + font-weight: 500; + transition: background 0.2s ease; +} + +.cg-copy-btn:hover { + background: #0256c2; +} + +.cg-copy-btn:active { + background: #014a9e; +} + +/* ── Responsive: stack pills on narrow screens ─────────────────── */ +@media (max-width: 640px) { + .cg-pill-bar { + flex-direction: column; + } + + .cg-pill { + border-right: none; + border-bottom: 1px solid var(--pst-color-border, #d1d5db); + } + + .cg-pill:last-child { + border-bottom: none; + } +} + +/* ── Dark-mode overrides (sphinx-book-theme data-theme) ────────── */ +html[data-theme="dark"] .cg-label, +html[data-theme="dark"] .cg-command-label { + color: #f0f6fc; +} + +html[data-theme="dark"] .cg-pill-bar { + border-color: #30363d; + background: #161b22; +} + +html[data-theme="dark"] .cg-pill { + color: #c9d1d9; + border-color: #30363d; +} + +html[data-theme="dark"] .cg-pill:hover { + background: rgba(88, 166, 255, 0.12); +} + +html[data-theme="dark"] .cg-pill.active { + background: #1f6feb; + color: #ffffff; +} + +html[data-theme="dark"] .cg-command-box { + background: #0d1117; +} + +html[data-theme="dark"] .cg-command-box code { + color: #c9d1d9; +} + +/* Also handle 
prefers-color-scheme for themes without data-theme */ +@media (prefers-color-scheme: dark) { + .cg-label, + .cg-command-label { + color: #f0f6fc; + } + + .cg-pill-bar { + border-color: #30363d; + background: #161b22; + } + + .cg-pill { + color: #c9d1d9; + border-color: #30363d; + } + + .cg-pill:hover { + background: rgba(88, 166, 255, 0.12); + } + + .cg-pill.active { + background: #1f6feb; + color: #ffffff; + } + + .cg-command-box { + background: #0d1117; + } + + .cg-command-box code { + color: #c9d1d9; + } +} diff --git a/docs/en/_static/js/config_generator.js b/docs/en/_static/js/config_generator.js new file mode 100644 index 0000000000..e81745a19a --- /dev/null +++ b/docs/en/_static/js/config_generator.js @@ -0,0 +1,165 @@ +// LMDeploy Interactive Configuration Generator — Generic Engine +// Model-specific configurations are loaded from js/models/*.js via +// the window.LMDeployModelConfigs global registry. +(function() { + 'use strict'; + + function initConfigGenerator() { + var container = document.getElementById('lmdeploy-config-generator'); + if (!container) return; + + // ── Read model config from registry ────────────────────────── + var configKey = container.getAttribute('data-model-config') || 'qwen3'; + var configs = window.LMDeployModelConfigs || {}; + var config = configs[configKey]; + if (!config) { + container.textContent = 'Unknown model config: ' + configKey + + '. 
Available: ' + Object.keys(configs).join(', '); + return; + } + + // ── TP estimation (generic) ───────────────────────────────── + function getRecommendedTP(sel) { + var mem = (config.gpuMem || {})[sel.hardware] || 80; + var need = (config.modelMem || {})[sel.model_size] || 16; + if (sel.quantization === 'awq' || sel.quantization === 'gptq') { + need *= 0.3; + } else if (sel.quantization === 'fp8') { + need *= 0.55; + } + var tp = 1; + while (tp * mem < need * 1.15 && tp < 8) { + tp *= 2; + } + return tp; + } + + // ── Generate command ──────────────────────────────────────── + function generateCommand() { + var sel = {}; + container.querySelectorAll('.cg-pill-bar').forEach(function(bar) { + var key = bar.getAttribute('data-key'); + var active = bar.querySelector('.cg-pill.active'); + if (active) sel[key] = active.getAttribute('data-value'); + }); + + var modelPath = config.buildModelPath(sel); + var tp = getRecommendedTP(sel); + var parts = ['lmdeploy serve api_server ' + modelPath]; + + if (tp > 1) parts.push('--tp ' + tp); + + var extraFlags = config.buildExtraFlags ? 
config.buildExtraFlags(sel) : []; + parts = parts.concat(extraFlags); + + if (parts.length <= 2) return parts.join(' '); + return parts[0] + ' \\\n' + + parts.slice(1).map(function(p) { return ' ' + p; }).join(' \\\n'); + } + + // ── Update command display ────────────────────────────────── + function updateCommand() { + var el = container.querySelector('.cg-generated-command'); + if (el) el.textContent = generateCommand(); + } + + // ── Render a single dimension row ─────────────────────────── + function renderDimension(dim) { + var row = document.createElement('div'); + row.className = 'cg-row'; + + var label = document.createElement('div'); + label.className = 'cg-label'; + label.textContent = dim.label; + row.appendChild(label); + + var bar = document.createElement('div'); + bar.className = 'cg-pill-bar'; + bar.setAttribute('data-key', dim.key); + + dim.options.forEach(function(opt) { + var pill = document.createElement('button'); + pill.className = 'cg-pill'; + pill.setAttribute('data-value', opt.value); + pill.textContent = opt.label; + if (opt.value === dim.default) pill.classList.add('active'); + + pill.addEventListener('click', function() { + bar.querySelectorAll('.cg-pill').forEach(function(p) { + p.classList.remove('active'); + }); + pill.classList.add('active'); + updateCommand(); + }); + + bar.appendChild(pill); + }); + + row.appendChild(bar); + return row; + } + + // ── Build the full UI ─────────────────────────────────────── + var wrapper = document.createElement('div'); + wrapper.className = 'cg-wrapper'; + + config.dimensions.forEach(function(dim) { + wrapper.appendChild(renderDimension(dim)); + }); + + // Command output section + var cmdSection = document.createElement('div'); + cmdSection.className = 'cg-command-section'; + + var cmdLabel = document.createElement('div'); + cmdLabel.className = 'cg-command-label'; + cmdLabel.textContent = 'Generated Command'; + cmdSection.appendChild(cmdLabel); + + var cmdBox = document.createElement('div'); + 
cmdBox.className = 'cg-command-box'; + + var pre = document.createElement('pre'); + var code = document.createElement('code'); + code.className = 'cg-generated-command'; + pre.appendChild(code); + cmdBox.appendChild(pre); + + var copyBtn = document.createElement('button'); + copyBtn.className = 'cg-copy-btn'; + copyBtn.textContent = 'Copy'; + copyBtn.addEventListener('click', function() { + var text = code.textContent; + navigator.clipboard.writeText(text).then(function() { + copyBtn.textContent = 'Copied!'; + setTimeout(function() { copyBtn.textContent = 'Copy'; }, 2000); + }).catch(function() { + // Fallback for older browsers + var ta = document.createElement('textarea'); + ta.value = text; + ta.style.position = 'fixed'; + ta.style.left = '-9999px'; + document.body.appendChild(ta); + ta.select(); + document.execCommand('copy'); + document.body.removeChild(ta); + copyBtn.textContent = 'Copied!'; + setTimeout(function() { copyBtn.textContent = 'Copy'; }, 2000); + }); + }); + cmdBox.appendChild(copyBtn); + + cmdSection.appendChild(cmdBox); + wrapper.appendChild(cmdSection); + + container.appendChild(wrapper); + updateCommand(); + } + + // Initialize when DOM is ready + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', initConfigGenerator); + } else { + initConfigGenerator(); + } +})(); diff --git a/docs/en/_static/js/models/deepseek.js b/docs/en/_static/js/models/deepseek.js new file mode 100644 index 0000000000..5c386d83ff --- /dev/null +++ b/docs/en/_static/js/models/deepseek.js @@ -0,0 +1,68 @@ +// models/deepseek.js — DeepSeek model configuration for LMDeploy Config Generator +(function() { + 'use strict'; + window.LMDeployModelConfigs = window.LMDeployModelConfigs || {}; + + window.LMDeployModelConfigs['deepseek'] = { + name: 'DeepSeek', + + dimensions: [ + { + key: 'hardware', label: 'Hardware Platform', default: 'H800', + options: [ + { value: 'A100', label: 'A100(80G)' }, + { value: 'H800', label: 'H800(80G)' }, + { 
value: 'H200', label: 'H200(140G)' }
+        ]
+      },
+      {
+        key: 'model_size', label: 'Model Version', default: 'V3',
+        options: [
+          { value: 'V2-Lite', label: 'V2 Lite (16B)' },
+          { value: 'V2', label: 'V2 (236B)' },
+          { value: 'V2.5', label: 'V2.5 (236B)' },
+          { value: 'V3', label: 'V3 (685B)' },
+          { value: 'V3.2', label: 'V3.2 (685B)' }
+        ]
+      },
+      {
+        key: 'quantization', label: 'Quantization', default: 'auto',
+        options: [
+          { value: 'auto', label: 'Auto (BF16)' }
+        ]
+      },
+      {
+        key: 'reasoning_parser', label: 'Reasoning Parser', default: 'disabled',
+        options: [
+          { value: 'disabled', label: 'Disabled' },
+          { value: 'enabled', label: 'Enabled' }
+        ]
+      }
+    ],
+
+    gpuMem: { 'A100': 80, 'H800': 80, 'H200': 140 },
+
+    modelMem: {
+      'V2-Lite': 32, 'V2': 440, 'V2.5': 440,
+      'V3': 1300, 'V3.2': 1300
+    },
+
+    buildModelPath: function(sel) {
+      var map = {
+        'V2-Lite': 'deepseek-ai/DeepSeek-V2-Lite-Chat',
+        'V2': 'deepseek-ai/DeepSeek-V2-Chat',
+        'V2.5': 'deepseek-ai/DeepSeek-V2.5',
+        'V3': 'deepseek-ai/DeepSeek-V3',
+        'V3.2': 'deepseek-ai/DeepSeek-V3.2-Exp'
+      };
+      return map[sel.model_size] || 'deepseek-ai/DeepSeek-V3';
+    },
+
+    buildExtraFlags: function(sel) {
+      var flags = [];
+      flags.push('--backend pytorch');
+      if (sel.reasoning_parser === 'enabled') flags.push('--reasoning-parser deepseek-r1');
+      return flags;
+    }
+  };
+})();
diff --git a/docs/en/_static/js/models/glm4.js b/docs/en/_static/js/models/glm4.js
new file mode 100644
index 0000000000..719a795b42
--- /dev/null
+++ b/docs/en/_static/js/models/glm4.js
@@ -0,0 +1,70 @@
+// models/glm4.js — GLM-4 model configuration for LMDeploy Config Generator
+(function() {
+  'use strict';
+  window.LMDeployModelConfigs = window.LMDeployModelConfigs || {};
+
+  window.LMDeployModelConfigs['glm4'] = {
+    name: 'GLM-4',
+
+    dimensions: [
+      {
+        key: 'hardware', label: 'Hardware Platform', default: 'A100',
+        options: [
+          { value: 'A100', label: 'A100(80G)' },
+          { value: 'H800', label: 'H800(80G)' },
+          { value: 'H200', label: 'H200(140G)' 
} + ] + }, + { + key: 'model_size', label: 'Model Version', default: 'GLM-4-9B', + options: [ + { value: 'GLM-4-9B', label: 'GLM-4 (9B)' }, + { value: 'GLM-4-0414-9B', label: 'GLM-4-0414 (9B)' }, + { value: 'GLM-4.5-355B', label: 'GLM-4.5 (355B)' }, + { value: 'GLM-4.5-Air-106B', label: 'GLM-4.5-Air (106B)' }, + { value: 'GLM-4.7-Flash-30B', label: 'GLM-4.7-Flash (30B)' }, + { value: 'GLM-5-754B', label: 'GLM-5 (754B)' } + ] + }, + { + key: 'quantization', label: 'Quantization', default: 'auto', + options: [ + { value: 'auto', label: 'Auto (BF16)' }, + { value: 'awq', label: 'AWQ (W4A16)' } + ] + }, + { + key: 'category', label: 'Categories', default: 'chat', + options: [ + { value: 'chat', label: 'Chat' } + ] + } + ], + + gpuMem: { 'A100': 80, 'H800': 80, 'H200': 140 }, + + modelMem: { + 'GLM-4-9B': 18, 'GLM-4-0414-9B': 18, + 'GLM-4.5-355B': 700, 'GLM-4.5-Air-106B': 212, + 'GLM-4.7-Flash-30B': 60, 'GLM-5-754B': 1400 + }, + + buildModelPath: function(sel) { + var map = { + 'GLM-4-9B': 'THUDM/glm-4-9b-chat', + 'GLM-4-0414-9B': 'THUDM/GLM-4-0414-9B-Chat', + 'GLM-4.5-355B': 'THUDM/GLM-4.5-355B-Chat', + 'GLM-4.5-Air-106B': 'THUDM/GLM-4.5-Air-106B-Chat', + 'GLM-4.7-Flash-30B': 'THUDM/GLM-4.7-Flash-30B', + 'GLM-5-754B': 'THUDM/GLM-5-754B' + }; + return map[sel.model_size] || 'THUDM/glm-4-9b-chat'; + }, + + buildExtraFlags: function(sel) { + var flags = []; + if (sel.quantization === 'awq') flags.push('--model-format awq'); + return flags; + } + }; +})(); diff --git a/docs/en/_static/js/models/internlm.js b/docs/en/_static/js/models/internlm.js new file mode 100644 index 0000000000..b02b368568 --- /dev/null +++ b/docs/en/_static/js/models/internlm.js @@ -0,0 +1,93 @@ +// models/internlm.js — InternLM model configuration for LMDeploy Config Generator +(function() { + 'use strict'; + window.LMDeployModelConfigs = window.LMDeployModelConfigs || {}; + + window.LMDeployModelConfigs['internlm'] = { + name: 'InternLM', + + dimensions: [ + { + key: 'hardware', label: 'Hardware 
Platform', default: 'A100', + options: [ + { value: 'A100', label: 'A100(80G)' }, + { value: 'H800', label: 'H800(80G)' }, + { value: 'H200', label: 'H200(140G)' } + ] + }, + { + key: 'model_size', label: 'Model Version', default: 'InternLM3-8B', + options: [ + { value: 'InternLM2-7B', label: 'InternLM2 (7B)' }, + { value: 'InternLM2-20B', label: 'InternLM2 (20B)' }, + { value: 'InternLM2.5-7B', label: 'InternLM2.5 (7B)' }, + { value: 'InternLM3-8B', label: 'InternLM3 (8B)' } + ] + }, + { + key: 'quantization', label: 'Quantization', default: 'auto', + options: [ + { value: 'auto', label: 'Auto (BF16)' }, + { value: 'awq', label: 'AWQ (W4A16)' }, + { value: 'kv8', label: 'KV Cache INT8' } + ] + }, + { + key: 'category', label: 'Categories', default: 'chat', + options: [ + { value: 'base', label: 'Base' }, + { value: 'chat', label: 'Chat' } + ] + }, + { + key: 'reasoning_parser', label: 'Reasoning Parser', default: 'disabled', + options: [ + { value: 'disabled', label: 'Disabled' }, + { value: 'enabled', label: 'Enabled' } + ] + }, + { + key: 'tool_call_parser', label: 'Tool Call Parser', default: 'disabled', + options: [ + { value: 'disabled', label: 'Disabled' }, + { value: 'enabled', label: 'Enabled' } + ] + } + ], + + gpuMem: { 'A100': 80, 'H800': 80, 'H200': 140 }, + + modelMem: { + 'InternLM2-7B': 14, 'InternLM2-20B': 40, + 'InternLM2.5-7B': 14, 'InternLM3-8B': 16 + }, + + buildModelPath: function(sel) { + var chatMap = { + 'InternLM2-7B': 'internlm/internlm2-chat-7b', + 'InternLM2-20B': 'internlm/internlm2-chat-20b', + 'InternLM2.5-7B': 'internlm/internlm2_5-7b-chat', + 'InternLM3-8B': 'internlm/internlm3-8b-instruct' + }; + var baseMap = { + 'InternLM2-7B': 'internlm/internlm2-7b', + 'InternLM2-20B': 'internlm/internlm2-20b', + 'InternLM2.5-7B': 'internlm/internlm2_5-7b', + 'InternLM3-8B': 'internlm/internlm3-8b' + }; + if (sel.category === 'base') { + return baseMap[sel.model_size] || 'internlm/internlm3-8b'; + } + return chatMap[sel.model_size] || 
'internlm/internlm3-8b-instruct'; + }, + + buildExtraFlags: function(sel) { + var flags = []; + if (sel.quantization === 'awq') flags.push('--model-format awq'); + else if (sel.quantization === 'kv8') flags.push('--quant-policy 8'); + if (sel.reasoning_parser === 'enabled') flags.push('--reasoning-parser intern-s1'); + if (sel.tool_call_parser === 'enabled') flags.push('--tool-call-parser internlm'); + return flags; + } + }; +})(); diff --git a/docs/en/_static/js/models/qwen3.js b/docs/en/_static/js/models/qwen3.js new file mode 100644 index 0000000000..f167658933 --- /dev/null +++ b/docs/en/_static/js/models/qwen3.js @@ -0,0 +1,90 @@ +// models/qwen3.js — Qwen3 model configuration for LMDeploy Config Generator +(function() { + 'use strict'; + window.LMDeployModelConfigs = window.LMDeployModelConfigs || {}; + + window.LMDeployModelConfigs['qwen3'] = { + name: 'Qwen3', + + dimensions: [ + { + key: 'hardware', label: 'Hardware Platform', default: 'A100', + options: [ + { value: 'A100', label: 'A100(80G)' }, + { value: 'H800', label: 'H800(80G)' }, + { value: 'H200', label: 'H200(140G)' }, + { value: 'V100', label: 'V100(32G)' } + ] + }, + { + key: 'model_size', label: 'Model Size', default: '8B', + options: [ + { value: '235B-A22B', label: '235B MoE' }, + { value: '30B-A3B', label: '30B MoE' }, + { value: '32B', label: '32B' }, + { value: '14B', label: '14B' }, + { value: '8B', label: '8B' }, + { value: '4B', label: '4B' }, + { value: '1.7B', label: '1.7B' }, + { value: '0.6B', label: '0.6B' } + ] + }, + { + key: 'quantization', label: 'Quantization', default: 'auto', + options: [ + { value: 'auto', label: 'Auto' }, + { value: 'awq', label: 'AWQ (W4A16)' }, + { value: 'gptq', label: 'GPTQ (W4A16)' }, + { value: 'fp8', label: 'FP8' } + ] + }, + { + key: 'category', label: 'Categories', default: 'instruct', + options: [ + { value: 'base', label: 'Base' }, + { value: 'instruct', label: 'Instruct' }, + { value: 'thinking', label: 'Thinking' } + ] + }, + { + key: 
'reasoning_parser', label: 'Reasoning Parser', default: 'disabled', + options: [ + { value: 'disabled', label: 'Disabled' }, + { value: 'enabled', label: 'Enabled' } + ] + }, + { + key: 'tool_call_parser', label: 'Tool Call Parser', default: 'disabled', + options: [ + { value: 'disabled', label: 'Disabled' }, + { value: 'enabled', label: 'Enabled' } + ] + } + ], + + // GPU memory (GB) for TP estimation + gpuMem: { 'A100': 80, 'H800': 80, 'H200': 140, 'V100': 32 }, + + // Approximate BF16 model weight memory (GB) + modelMem: { + '235B-A22B': 440, '30B-A3B': 60, '32B': 64, + '14B': 28, '8B': 16, '4B': 8, '1.7B': 4, '0.6B': 2 + }, + + buildModelPath: function(sel) { + var base = 'Qwen/Qwen3-' + sel.model_size; + if (sel.category === 'instruct') base += '-Instruct'; + else if (sel.category === 'thinking') base += '-Thinking'; + return base; + }, + + buildExtraFlags: function(sel) { + var flags = []; + if (sel.quantization === 'awq') flags.push('--model-format awq'); + else if (sel.quantization === 'gptq') flags.push('--model-format gptq'); + if (sel.reasoning_parser === 'enabled') flags.push('--reasoning-parser qwen-qwq'); + if (sel.tool_call_parser === 'enabled') flags.push('--tool-call-parser qwen3'); + return flags; + } + }; +})(); diff --git a/docs/en/best_practice/deepseek/deepseek.md b/docs/en/best_practice/deepseek/deepseek.md new file mode 100644 index 0000000000..dfb238cc30 --- /dev/null +++ b/docs/en/best_practice/deepseek/deepseek.md @@ -0,0 +1,59 @@ +# DeepSeek + +## 1. Model Introduction + +DeepSeek is a series of powerful open-source large language models developed by DeepSeek AI. The series features Mixture-of-Experts (MoE) architecture for efficient inference with massive parameter counts. 
+ +| Model | Parameters | Architecture | +| :--------------: | :--------: | :----------: | +| DeepSeek-V2-Lite | 16B | MoE | +| DeepSeek-V2 | 236B | MoE | +| DeepSeek-V2.5 | 236B | MoE | +| DeepSeek-V3 | 685B | MoE | +| DeepSeek-V3.2 | 685B | MoE | + +Key features: + +- **MoE Architecture**: Efficient inference through sparse activation of expert modules. +- **Large-scale Models**: Up to 685B total parameters with efficient activated parameter counts. +- **Strong Reasoning**: Deep reasoning capabilities, especially with DeepSeek-R1 reasoning mode. + +For more details, please refer to the [DeepSeek GitHub Repository](https://github.com/deepseek-ai). + +## 2. Model Deployment + +### 2.1 Basic Configuration + +DeepSeek models are supported by LMDeploy with the PyTorch backend. Use the interactive generator below to create your deployment command. + +**Interactive Command Generator**: + +```{raw} html +
+``` + +### 2.2 Configuration Tips + +- **Backend**: DeepSeek models use the PyTorch backend (`--backend pytorch`). +- **Tensor Parallelism (`--tp`)**: DeepSeek-V3 (685B) requires at least 8×80G GPUs. Smaller models like V2-Lite (16B) can run on a single GPU. +- **Session Length (`--session-len`)**: Set explicitly to conserve memory, e.g., `--session-len 32768`. +- **Cache Management (`--cache-max-entry-count`)**: Lower this value if you encounter OOM errors. + +## 3. Model Invocation + +### 3.1 Basic Usage + +For basic API usage, please refer to: + +- [OpenAI Compatible Server](../../llm/api_server.md) +- [Pipeline (Offline Inference)](../../llm/pipeline.md) + +### 3.2 Reasoning Parser + +DeepSeek models support reasoning mode via the DeepSeek-R1 reasoning parser: + +```shell +lmdeploy serve api_server deepseek-ai/DeepSeek-V3 --backend pytorch --reasoning-parser deepseek-r1 +``` + +For detailed usage and examples, see [Reasoning Outputs](../../llm/api_server_reasoning.md). diff --git a/docs/en/best_practice/deepseek/index.rst b/docs/en/best_practice/deepseek/index.rst new file mode 100644 index 0000000000..f7306fcc8c --- /dev/null +++ b/docs/en/best_practice/deepseek/index.rst @@ -0,0 +1,7 @@ +DeepSeek +======================== + +.. toctree:: + :maxdepth: 1 + + deepseek.md diff --git a/docs/en/best_practice/glm/glm4.md b/docs/en/best_practice/glm/glm4.md new file mode 100644 index 0000000000..01cefbf693 --- /dev/null +++ b/docs/en/best_practice/glm/glm4.md @@ -0,0 +1,51 @@ +# GLM-4 + +## 1. Model Introduction + +GLM-4 is a series of large language models developed by Tsinghua University (THUDM). The series spans from compact 9B models to large-scale 754B models, offering strong multilingual and reasoning capabilities. 
+ +| Model | Parameters | Architecture | +| :-----------: | :--------: | :----------: | +| GLM-4 | 9B | Dense | +| GLM-4-0414 | 9B | Dense | +| GLM-4.5 | 355B | MoE | +| GLM-4.5-Air | 106B | MoE | +| GLM-4.7-Flash | 30B | Dense | +| GLM-5 | 754B | MoE | + +Key features: + +- **Scalable Architecture**: From 9B dense to 754B MoE models. +- **Strong Multilingual Support**: Excellent Chinese and English capabilities. +- **Tool Calling**: Built-in function calling support. +- **Vision-Language Models**: GLM-4V variants available for multimodal tasks. + +For more details, please refer to the [GLM GitHub Repository](https://github.com/THUDM). + +## 2. Model Deployment + +### 2.1 Basic Configuration + +GLM-4 models are supported by LMDeploy with both TurboMind (9B models) and PyTorch backends. Use the interactive generator below to create your deployment command. + +**Interactive Command Generator**: + +```{raw} html +
+``` + +### 2.2 Configuration Tips + +- **Backend Selection**: GLM-4 (9B) works with both TurboMind and PyTorch backends. Larger models (GLM-4.5, GLM-5) require the PyTorch backend. +- **Tensor Parallelism (`--tp`)**: GLM-4 (9B) can run on a single 80G GPU. GLM-4.5 (355B) requires multi-GPU setups. +- **Quantization**: AWQ quantization is supported for GLM-4 (9B) models on TurboMind backend. +- **Session Length (`--session-len`)**: Set explicitly to conserve memory, e.g., `--session-len 32768`. + +## 3. Model Invocation + +### 3.1 Basic Usage + +For basic API usage, please refer to: + +- [OpenAI Compatible Server](../../llm/api_server.md) +- [Pipeline (Offline Inference)](../../llm/pipeline.md) diff --git a/docs/en/best_practice/glm/index.rst b/docs/en/best_practice/glm/index.rst new file mode 100644 index 0000000000..e7e0452b77 --- /dev/null +++ b/docs/en/best_practice/glm/index.rst @@ -0,0 +1,7 @@ +GLM +======================== + +.. toctree:: + :maxdepth: 1 + + glm4.md diff --git a/docs/en/best_practice/internlm/index.rst b/docs/en/best_practice/internlm/index.rst new file mode 100644 index 0000000000..131e8f2e39 --- /dev/null +++ b/docs/en/best_practice/internlm/index.rst @@ -0,0 +1,7 @@ +InternLM +======================== + +.. toctree:: + :maxdepth: 1 + + internlm.md diff --git a/docs/en/best_practice/internlm/internlm.md b/docs/en/best_practice/internlm/internlm.md new file mode 100644 index 0000000000..407c0d4f25 --- /dev/null +++ b/docs/en/best_practice/internlm/internlm.md @@ -0,0 +1,69 @@ +# InternLM + +## 1. Model Introduction + +InternLM is a series of large language models developed by Shanghai AI Laboratory and SenseTime. The series spans multiple generations with progressive improvements in reasoning, code generation, and tool usage. 
+ +| Model | Parameters | Architecture | +| :------------: | :--------: | :----------: | +| InternLM2-7B | 7B | Dense | +| InternLM2-20B | 20B | Dense | +| InternLM2.5-7B | 7B | Dense | +| InternLM3-8B | 8B | Dense | + +Key features: + +- **Strong Reasoning**: Excellent performance on reasoning and math benchmarks. +- **Tool Calling**: Built-in function calling and agent capabilities. +- **Code Generation**: Strong code generation and understanding abilities. +- **Long Context**: Support for extended context windows. + +For more details, please refer to the [InternLM GitHub Repository](https://github.com/InternLM/InternLM). + +## 2. Model Deployment + +### 2.1 Basic Configuration + +InternLM models are fully supported by LMDeploy with both TurboMind and PyTorch backends. Use the interactive generator below to create your deployment command. + +**Interactive Command Generator**: + +```{raw} html +
+``` + +### 2.2 Configuration Tips + +- **Backend Selection**: TurboMind is the default high-performance backend. Use PyTorch backend (`--backend pytorch`) for broader compatibility. +- **Tensor Parallelism (`--tp`)**: InternLM2-20B may require 2 GPUs for BF16 inference. Smaller models (7B/8B) fit on a single GPU. +- **Quantization**: AWQ quantization (`--model-format awq`) and KV cache INT8 (`--quant-policy 8`) are supported. +- **Session Length (`--session-len`)**: Set explicitly to conserve memory, e.g., `--session-len 32768`. + +## 3. Model Invocation + +### 3.1 Basic Usage + +For basic API usage, please refer to: + +- [OpenAI Compatible Server](../../llm/api_server.md) +- [Pipeline (Offline Inference)](../../llm/pipeline.md) + +### 3.2 Reasoning Parser + +InternLM models support reasoning mode via the `intern-s1` reasoning parser: + +```shell +lmdeploy serve api_server internlm/internlm3-8b-instruct --reasoning-parser intern-s1 +``` + +For detailed usage and examples, see [Reasoning Outputs](../../llm/api_server_reasoning.md). + +### 3.3 Tool Calling + +InternLM supports tool calling capabilities. Enable the tool call parser: + +```shell +lmdeploy serve api_server internlm/internlm3-8b-instruct --tool-call-parser internlm +``` + +For detailed usage and examples, see [Tools](../../llm/api_server_tools.md). diff --git a/docs/en/best_practice/qwen/index.rst b/docs/en/best_practice/qwen/index.rst new file mode 100644 index 0000000000..2313cd58c3 --- /dev/null +++ b/docs/en/best_practice/qwen/index.rst @@ -0,0 +1,7 @@ +Qwen +======================== + +.. toctree:: + :maxdepth: 1 + + qwen3.md diff --git a/docs/en/best_practice/qwen/qwen3.md b/docs/en/best_practice/qwen/qwen3.md new file mode 100644 index 0000000000..faf96e771b --- /dev/null +++ b/docs/en/best_practice/qwen/qwen3.md @@ -0,0 +1,77 @@ +# Qwen3 + +## 1. 
Model Introduction + +Qwen3 is the latest generation of large language models in the Qwen series developed by Alibaba, offering significant improvements in instruction following, reasoning, multilingual understanding, and tool usage. + +The Qwen3 series provides models in both **Dense** and **MoE** (Mixture-of-Experts) architectures: + +| Model | Type | Parameters | Active Parameters | +| :-------------: | :---: | :--------: | :---------------: | +| Qwen3-0.6B | Dense | 0.6B | 0.6B | +| Qwen3-1.7B | Dense | 1.7B | 1.7B | +| Qwen3-4B | Dense | 4B | 4B | +| Qwen3-8B | Dense | 8B | 8B | +| Qwen3-14B | Dense | 14B | 14B | +| Qwen3-32B | Dense | 32B | 32B | +| Qwen3-30B-A3B | MoE | 30B | 3B | +| Qwen3-235B-A22B | MoE | 235B | 22B | + +Key features: + +- **Extended context length**: Up to 256K tokens for long-context understanding and reasoning. +- **Flexible deployment**: Available in Base, Instruct, and Thinking editions. +- **Tool calling**: Built-in support for function calling and agent workflows. +- **Multilingual**: Broad multilingual knowledge coverage. + +For more details, please refer to the [Qwen3 GitHub Repository](https://github.com/QwenLM/Qwen3). + +## 2. Model Deployment + +### 2.1 Basic Configuration + +The Qwen3 series is fully supported by LMDeploy with both TurboMind and PyTorch backends. Recommended launch configurations vary by hardware and model size. + +**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model size, quantization method, and capabilities. + +```{raw} html +
+``` + +### 2.2 Configuration Tips + +- **Backend Selection**: TurboMind is the default high-performance backend. Use PyTorch backend (`--backend pytorch`) for broader model format compatibility and features like LoRA adapters. +- **Tensor Parallelism (`--tp`)**: Set based on model size and available GPUs. Larger models (32B+) typically require multi-GPU setups. +- **KV Cache Memory (`--cache-max-entry-count`)**: Controls the percentage of free GPU memory used for KV cache (default: 0.8). Lower this value if you encounter OOM errors. +- **Session Length (`--session-len`)**: Defaults to model's max length. Set explicitly to conserve memory, e.g., `--session-len 32768`. +- **Prefix Caching (`--enable-prefix-caching`)**: Enables automatic prefix caching for improved throughput when serving repeated prompt patterns. +- **Quantization**: LMDeploy supports AWQ 4-bit (`--model-format awq`) and KV cache quantization (`--quant-policy 4` or `--quant-policy 8`) for Qwen3 models. + +## 3. Model Invocation + +### 3.1 Basic Usage + +For basic API usage, please refer to: + +- [OpenAI Compatible Server](../../llm/api_server.md) +- [Pipeline (Offline Inference)](../../llm/pipeline.md) + +### 3.2 Reasoning Parser + +Qwen3 Thinking models support reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections: + +```shell +lmdeploy serve api_server Qwen/Qwen3-32B-Thinking --reasoning-parser qwen-qwq +``` + +For detailed usage and examples, see [Reasoning Outputs](../../llm/api_server_reasoning.md). + +### 3.3 Tool Calling + +Qwen3 supports tool calling capabilities. Enable the tool call parser: + +```shell +lmdeploy serve api_server Qwen/Qwen3-32B-Instruct --tool-call-parser qwen3 +``` + +For detailed usage and examples, see [Tools](../../llm/api_server_tools.md). 
diff --git a/docs/en/conf.py b/docs/en/conf.py index 94ca2a4def..6209c82bf3 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -174,7 +174,14 @@ def metrics(): # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] -html_css_files = ['css/readthedocs.css'] +html_css_files = ['css/readthedocs.css', 'css/config_generator.css'] +html_js_files = [ + 'js/models/qwen3.js', + 'js/models/deepseek.js', + 'js/models/glm4.js', + 'js/models/internlm.js', + 'js/config_generator.js', +] # Enable ::: for my_st myst_enable_extensions = [ diff --git a/docs/en/index.rst b/docs/en/index.rst index d1f78ecece..bc6b2cf420 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -51,6 +51,16 @@ Documentation supported_models/supported_models.md supported_models/reward_models.md +.. _best_practice: +.. toctree:: + :maxdepth: 3 + :caption: Deployment Best Practice + + best_practice/qwen/index + best_practice/deepseek/index + best_practice/glm/index + best_practice/internlm/index + .. _llm_deployment: .. toctree:: :maxdepth: 1