Improve turbomind's prefix cache #3332

Closed
lvhan028 wants to merge 43 commits from the improve-tm-prefix-cache branch
Commits
43 commits
750aaa8
add log
lvhan028 Mar 17, 2025
8886124
Merge branch 'main' into improve-tm-prefix-cache
lvhan028 Mar 18, 2025
7b4304a
refactor tm prefix caching
lvhan028 Mar 24, 2025
8be44f8
refactor tm prefix cache
lvhan028 Mar 25, 2025
dfdde01
Merge branch 'dev' into improve-tm-prefix-cache
lvhan028 Mar 25, 2025
fda1e25
fix linting
lvhan028 Mar 25, 2025
a4ffe41
fix linting
lvhan028 Mar 25, 2025
acf4092
combine Get&Create
lvhan028 Mar 27, 2025
a2352d1
update
lvhan028 Mar 27, 2025
1e940df
clear blocks
lvhan028 Mar 27, 2025
533941d
INFO log to DEBUG log
lvhan028 Mar 28, 2025
91d1412
refactor chat.py
lvhan028 Mar 28, 2025
ce08974
unlock the unmatched blocks when id is reused
lvhan028 Mar 28, 2025
3891782
merge main
lvhan028 Mar 31, 2025
9c3ebc8
remove start_flag and end_flag from tm csrc
lvhan028 Mar 31, 2025
d41683a
update output_logits
lvhan028 Apr 1, 2025
70399b4
update
lvhan028 Apr 1, 2025
1b99728
update
lvhan028 Apr 2, 2025
c5a2962
fix api_client
lvhan028 Apr 2, 2025
499b709
remove interactive chat API
lvhan028 Apr 3, 2025
617d317
fix build error on windows platform
lvhan028 Apr 3, 2025
50e56e2
fix chat
lvhan028 Apr 3, 2025
38ea2ae
update generate.ps1
lvhan028 Apr 3, 2025
e1489a5
fix clang-format error
lvhan028 Apr 3, 2025
9d1df28
fix clang-format error
lvhan028 Apr 3, 2025
e2a0c7a
fix vlm chat error
lvhan028 Apr 4, 2025
604b101
merge main
lvhan028 Apr 4, 2025
5e34425
fix get_logits
lvhan028 Apr 4, 2025
1cbdf5a
remove killing from tm csrc
lvhan028 Apr 4, 2025
afd531d
fix clang-format
lvhan028 Apr 6, 2025
3dc9ffa
update
lvhan028 Apr 7, 2025
14eb22a
enable_prefix_caching defaults to True
lvhan028 Apr 7, 2025
7e13a18
merge pt chat.py and tm chat.py
lvhan028 Apr 8, 2025
22cf302
remove pt chat.py and tm chat.py
lvhan028 Apr 8, 2025
8531df8
update
lvhan028 Apr 9, 2025
3ddec13
Merge branch 'default-prefix-cache' into improve-tm-prefix-cache
lvhan028 Apr 9, 2025
f3ef0d4
fix
lvhan028 Apr 9, 2025
87dfbb9
update
lvhan028 Apr 9, 2025
6fd0f56
merge main and resove the conflicts
lvhan028 May 12, 2025
61f2f0a
update
lvhan028 May 12, 2025
89b593f
Merge branch 'main' into improve-tm-prefix-cache
lvhan028 Jul 10, 2025
8785b5a
rollback autotset
lvhan028 Jul 10, 2025
a10b2d6
rollback api_server
lvhan028 Jul 11, 2025
6 changes: 3 additions & 3 deletions benchmark/profile_generation.py
@@ -329,7 +329,7 @@ def parse_args():
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)
rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group)
dtype_act = ArgumentHelper.dtype(pt_group)

@@ -395,7 +395,7 @@ def main():
session_len=session_len,
rope_scaling_factor=args.rope_scaling_factor,
tp=args.tp,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
dtype=args.dtype,
)
elif args.backend == 'pytorch':
@@ -405,7 +405,7 @@ def main():
session_len=session_len,
tp=args.tp,
eager_mode=args.eager_mode,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
dtype=args.dtype,
)
gen_config = GenerationConfig(top_k=args.top_k,
6 changes: 3 additions & 3 deletions benchmark/profile_pipeline_api.py
@@ -258,7 +258,7 @@ def parse_args():
tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)

# turbomind engine args
tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -290,7 +290,7 @@ def main():
quant_policy=args.quant_policy,
num_tokens_per_iter=args.num_tokens_per_iter,
max_prefill_iters=args.max_prefill_iters,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
communicator=args.communicator,
)
elif args.backend == 'pytorch':
@@ -301,7 +301,7 @@ def main():
tp=args.tp,
thread_safe=False,
eager_mode=args.eager_mode,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
)

engine = Engine(args.model_path, engine_config, csv=args.csv)
18 changes: 8 additions & 10 deletions benchmark/profile_throughput.py
@@ -148,8 +148,8 @@ def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, Tu
self.tm_model = tm_model
self.pbar = None

- async def _inference(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int,
- stream_output: bool, skip_tokenize: bool, skip_detokenize: bool, concurrency: int):
+ async def _inference(self, req_queue: Queue, temperature: float, top_p: float, top_k: int, stream_output: bool,
+ skip_tokenize: bool, skip_detokenize: bool):
model_inst = self.tm_model.create_instance()
sess: Session = None
for prompt, _, output_seqlen, cancel_after, sess in iter(req_queue.get_nowait, None):
@@ -166,7 +166,7 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
prev_len = 0
token_ids = input_ids.copy()

- generator = model_inst.async_stream_infer(session_id,
+ generator = model_inst.async_stream_infer(sess.id,
input_ids=input_ids,
gen_config=GenerationConfig(max_new_tokens=output_seqlen,
temperature=temperature,
@@ -193,10 +193,9 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float

# for pytorch engine to restart a session
if self.backend == 'pytorch':
- await model_inst.async_end(session_id)
+ await model_inst.async_end(sess.id)

self.pbar.update(1)
- session_id += concurrency

def process_request(self, requests, profiler: Profiler, concurrency, temperature, top_p, top_k, stream_output,
skip_tokenize, skip_detokenize, cancel_rate):
@@ -219,8 +218,7 @@ def process_request(self, requests, profiler: Profiler, concurrency, temperature
# start threads
tasks = []
for i in range(concurrency):
- task = self._inference(req_queue, i, temperature, top_p, top_k, stream_output, skip_tokenize,
- skip_detokenize, concurrency)
+ task = self._inference(req_queue, temperature, top_p, top_k, stream_output, skip_tokenize, skip_detokenize)
tasks.append(task)

async def _gather_tasks(tasks):
@@ -311,7 +309,7 @@ def parse_args():
tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)
quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0)
dtype_act = ArgumentHelper.dtype(pt_group)

@@ -348,7 +346,7 @@ def main():
quant_policy=args.quant_policy,
num_tokens_per_iter=args.num_tokens_per_iter,
max_prefill_iters=args.max_prefill_iters,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
dtype=args.dtype,
communicator=args.communicator,
)
@@ def main():
max_batch_size=args.concurrency,
tp=args.tp,
eager_mode=args.eager_mode,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
quant_policy=args.quant_policy,
dtype=args.dtype,
distributed_executor_backend=args.distributed_executor_backend,
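Note on the profile_throughput.py changes above: because each Session object now carries its own id, _inference no longer needs a session_id parameter or the per-worker session_id += concurrency stride. A minimal sketch of the pattern, using a hypothetical stand-in Session class rather than the benchmark's real one:

# Sketch only: shows why per-session ids remove the manual id bookkeeping.
# `Session` here is a hypothetical stand-in, not the benchmark's actual class.
import itertools
from dataclasses import dataclass, field

_ids = itertools.count()


@dataclass
class Session:
    prompt: str
    id: int = field(default_factory=lambda: next(_ids))  # unique id minted per session


async def _inference(req_queue, infer):
    # Every dequeued request already knows its session id, so concurrent
    # workers can share one queue without striding ids by `concurrency`.
    for sess in iter(req_queue.get_nowait, None):
        await infer(sess.id, sess.prompt)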
9 changes: 5 additions & 4 deletions lmdeploy/api.py
@@ -69,17 +69,18 @@ def pipeline(model_path: str,
model_path = get_model(model_path, download_dir, revision)

task, pipeline_class = get_task(model_path)
- if task == 'vlm':
- if backend_config and backend_config.enable_prefix_caching:
- backend_config.enable_prefix_caching = False
- logger.warning('VLM does not support prefix caching.')

if type(backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind'
logger.info(f'Using {backend} engine')

+ if task == 'vlm':
+ if backend_config and backend_config.enable_prefix_caching:
+ backend_config.enable_prefix_caching = False
+ logger.warning('VLM does not support prefix caching.')

return pipeline_class(model_path,
backend=backend,
backend_config=backend_config,
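The VLM check in pipeline() is moved below the backend auto-detection so that it runs against the backend config that will actually be used, i.e. the one returned by autoget_backend_config, instead of a user-supplied config that may still be replaced. A condensed sketch of the resulting order (not the full function; the lmdeploy symbols are injected as parameters only to keep the snippet self-contained):

def resolve_backend_config(model_path, backend_config, task, *,
                           autoget_backend_config, PytorchEngineConfig, logger):
    # 1) Resolve the backend first; this may swap the config object entirely.
    if type(backend_config) is not PytorchEngineConfig:
        backend_config = autoget_backend_config(model_path, backend_config)
    # 2) Only then disable prefix caching for VLM tasks, on the resolved config.
    if task == 'vlm' and backend_config and backend_config.enable_prefix_caching:
        backend_config.enable_prefix_caching = False
        logger.warning('VLM does not support prefix caching.')
    return backend_config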
95 changes: 95 additions & 0 deletions lmdeploy/cli/chat.py
@@ -0,0 +1,95 @@
# Copyright (c) OpenMMLab. All rights reserved.
import fire

from lmdeploy import ChatTemplateConfig, GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy.archs import autoget_backend


def input_prompt():
"""Input a prompt in the consolo interface."""
print('\ndouble enter to end input >>> ', end='')
sentinel = '' # ends when this string is seen
return '\n'.join(iter(input, sentinel))


def build_pipe(model_path, backend, **kwargs):
# set enable_prefix_cache
disable_prefix_cache = kwargs.pop('disable_prefix_cache', False)
kwargs.update(enable_prefix_caching=not disable_prefix_cache)
# set engine config
engine_config = None
if backend == 'turbomind':
engine_config = TurbomindEngineConfig()
for key, value in kwargs.items():
if hasattr(TurbomindEngineConfig, key):
setattr(engine_config, key, value)
else:
engine_config = PytorchEngineConfig()
for key, value in kwargs.items():
if hasattr(PytorchEngineConfig, key):
setattr(engine_config, key, value)
if kwargs.get('adapters', None):
from .utils import get_lora_adapters
adapters = get_lora_adapters(kwargs['adapters'])
engine_config.adapters = adapters
# set chat template config
chat_template = kwargs.get('chat_template', None)
chat_template_config = None
if chat_template:
chat_template_config = ChatTemplateConfig(model_name=chat_template)

pipe = pipeline(model_path,
backend_config=engine_config,
chat_template_config=chat_template_config,
log_level='ERROR',
**kwargs)
return pipe


def build_gen_config(**kwargs):
gen_config = GenerationConfig(max_new_tokens=1024, top_k=40, top_p=0.8, temperature=0.8, repetition_penalty=1.0)
for key, value in kwargs.items():
if hasattr(GenerationConfig, key):
setattr(gen_config, key, value)
return gen_config


def main(model_path, backend, **kwargs):
if backend != 'pytorch':
# set auto backend mode
backend = autoget_backend(model_path)

pipe = build_pipe(model_path, backend, **kwargs)
gen_config = build_gen_config(**kwargs)

quit = False
while True:
with pipe.session(gen_config) as sess:
while True:
try:
prompt = input_prompt()
except KeyboardInterrupt:
quit = True
break
if prompt == 'end':
sess.close()
break
if prompt == 'exit':
quit = True
break
resps = sess(prompt)
try:
for resp in resps:
print(resp.text, end='', flush=True)
sess.messages.append(dict(role='assistant', content=resp.text))
except KeyboardInterrupt:
sess.stop()
finally:
print('\ncancelling the conversation')
if quit:
print('exiting...')
break


if __name__ == '__main__':
fire.Fire(main)
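Since the new file ends with fire.Fire(main), the chat loop can also be driven programmatically for a quick interactive smoke test. In the sketch below the model path is a placeholder, and the extra keyword arguments are only applied when the selected engine config defines a field of the same name (as build_pipe does above):

from lmdeploy.cli.chat import main

if __name__ == '__main__':
    # Placeholder model path; any non-'pytorch' backend is re-resolved by
    # autoget_backend(), exactly as main() above does.
    main('/path/to/model',
         backend='turbomind',
         tp=1,              # forwarded to the engine config if it has this field
         session_len=4096)  # likewise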
40 changes: 5 additions & 35 deletions lmdeploy/cli/cli.py
@@ -4,7 +4,7 @@
import os

from ..version import __version__
- from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters
+ from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class CLI(object):
@@ -104,7 +104,7 @@ def add_parser_chat():
tp_act = ArgumentHelper.tp(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)
quant_policy = ArgumentHelper.quant_policy(pt_group)

# turbomind args
@@ -218,39 +218,9 @@ def get_gpu_topo():
@staticmethod
def chat(args):
"""Chat with pytorch or turbomind engine."""
- from lmdeploy.archs import autoget_backend
-
- chat_template_config = get_chat_template(args.chat_template)
-
- backend = args.backend
- if backend != 'pytorch':
- # set auto backend mode
- backend = autoget_backend(args.model_path)
-
- if backend == 'pytorch':
- from lmdeploy.messages import PytorchEngineConfig
- from lmdeploy.pytorch.chat import run_chat
-
- adapters = get_lora_adapters(args.adapters)
- engine_config = PytorchEngineConfig(dtype=args.dtype,
- tp=args.tp,
- session_len=args.session_len,
- cache_max_entry_count=args.cache_max_entry_count,
- adapters=adapters,
- enable_prefix_caching=args.enable_prefix_caching,
- device_type=args.device,
- eager_mode=args.eager_mode,
- quant_policy=args.quant_policy)
- run_chat(args.model_path, engine_config, chat_template_config=chat_template_config)
- else:
- from lmdeploy.turbomind.chat import main as run_chat
- kwargs = convert_args(args)
- kwargs.pop('chat_template')
- kwargs.pop('backend')
- kwargs.pop('device')
- kwargs.pop('eager_mode')
- kwargs['chat_template_config'] = chat_template_config
- run_chat(**kwargs)
+ from .chat import main
+ kwargs = convert_args(args)
+ main(**kwargs)

@staticmethod
def add_parsers():
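With the per-backend branches gone, CLI.chat simply converts the parsed argparse namespace into keyword arguments and hands them to the shared entry point in lmdeploy/cli/chat.py. The sketch below assumes convert_args (from .utils) behaves roughly like vars() with private entries dropped; convert_args_sketch is a hypothetical stand-in, not the library helper:

import argparse


def convert_args_sketch(args: argparse.Namespace) -> dict:
    # Assumed behaviour: turn the namespace into plain keyword arguments.
    return {k: v for k, v in vars(args).items() if not k.startswith('_')}


def chat(args: argparse.Namespace):
    from lmdeploy.cli.chat import main  # one entry point for both engines
    main(**convert_args_sketch(args))   # model_path and backend arrive as kwargs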
12 changes: 6 additions & 6 deletions lmdeploy/cli/serve.py
@@ -60,7 +60,7 @@ def add_parser_gradio():
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)
max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group)
model_format_act = ArgumentHelper.model_format(pt_group)
# turbomind args
@@ -161,7 +161,7 @@ def add_parser_api_server():
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
- prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+ prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group)
max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group)
quant_policy = ArgumentHelper.quant_policy(pt_group)
model_format = ArgumentHelper.model_format(pt_group)
@@ -274,7 +274,7 @@ def gradio(args):
cache_max_entry_count=args.cache_max_entry_count,
block_size=args.cache_block_seq_len,
session_len=args.session_len,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
device_type=args.device,
quant_policy=args.quant_policy,
eager_mode=args.eager_mode,
@@ def gradio(args):
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count,
cache_block_seq_len=args.cache_block_seq_len,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
max_prefill_token_num=args.max_prefill_token_num,
communicator=args.communicator)
chat_template_config = get_chat_template(args.chat_template)
@@ -327,7 +327,7 @@ def api_server(args):
block_size=args.cache_block_seq_len,
session_len=args.session_len,
adapters=adapters,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
device_type=args.device,
quant_policy=args.quant_policy,
eager_mode=args.eager_mode,
@@ def api_server(args):
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count,
cache_block_seq_len=args.cache_block_seq_len,
- enable_prefix_caching=args.enable_prefix_caching,
+ enable_prefix_caching=not args.disable_prefix_caching,
max_prefill_token_num=args.max_prefill_token_num,
communicator=args.communicator)
chat_template_config = get_chat_template(args.chat_template)
6 changes: 3 additions & 3 deletions lmdeploy/cli/utils.py
@@ -476,13 +476,13 @@ def cache_block_seq_len(parser):
'be ignored')

@staticmethod
- def enable_prefix_caching(parser):
+ def disable_prefix_caching(parser):
"""Add argument enable_prefix_caching to parser."""

- return parser.add_argument('--enable-prefix-caching',
+ return parser.add_argument('--disable-prefix-caching',
action='store_true',
default=False,
- help='Enable cache and match prefix')
+ help='Disable prefix caching')

@staticmethod
def num_tokens_per_iter(parser):
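With ArgumentHelper.enable_prefix_caching renamed to disable_prefix_caching, prefix caching becomes opt-out: the parsers register --disable-prefix-caching with default=False, and every call site in this PR passes enable_prefix_caching=not args.disable_prefix_caching into the engine config. A minimal argparse sketch of that default behaviour (EngineConfig is a placeholder for TurbomindEngineConfig/PytorchEngineConfig):

import argparse
from dataclasses import dataclass


@dataclass
class EngineConfig:          # placeholder for the real engine configs
    enable_prefix_caching: bool = True


parser = argparse.ArgumentParser()
parser.add_argument('--disable-prefix-caching', action='store_true', default=False,
                    help='Disable prefix caching')

# No flag given -> prefix caching stays enabled by default.
args = parser.parse_args([])
assert EngineConfig(enable_prefix_caching=not args.disable_prefix_caching).enable_prefix_caching

# Explicit opt-out.
args = parser.parse_args(['--disable-prefix-caching'])
assert not EngineConfig(enable_prefix_caching=not args.disable_prefix_caching).enable_prefix_caching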