Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
lvhan028 committed Jul 22, 2023
1 parent 53f4559 commit 458d5e6
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 21 deletions.
9 changes: 4 additions & 5 deletions benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

We provide several profiling tools to benchmark our models.

## profiling with dataset
## Profile with a dataset

Download the dataset below or create your own dataset.

Expand All @@ -16,7 +16,6 @@ Profiling your model with `profile_throughput.py`
python profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
/path/to/your/model \
${ModelType} \
--concurrency 64
```

Expand All @@ -27,7 +26,6 @@ python profile_throughput.py \
```bash
python profile_generation.py \
/path/to/your/model \
${ModelType} \
--concurrency 8 --input_seqlen 0 --output_seqlen 2048
```

Expand All @@ -36,10 +34,11 @@ python profile_generation.py \
The tools above profile models with the Python API. `profile_serving.py` is used to benchmark serving performance.

```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

python profile_serving.py \
${TritonServerAddress} \
${ModelName} \
/path/to/tokenizer \
/path/to/dataset \
ShareGPT_V3_unfiltered_cleaned_split.json \
--concurrency 64
```
28 changes: 12 additions & 16 deletions lmdeploy/serve/turbomind/chatbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def stream_infer(self,
break
else:
yield status, res, tokens
if status.value >= 0:
if status.value == 0:
self._session.histories = \
self._session.histories + self._session.prompt + \
self._session.response
Expand Down Expand Up @@ -197,11 +197,11 @@ def end(self, session_id: int, *args, **kwargs):
request_output_len=0,
sequence_start=False,
sequence_end=True):
if status != StatusCode.TRITON_STREAM_END:
return status
if status.value < 0:
break

self.reset_session()
return StatusCode.TRITON_STREAM_END
return status

def cancel(self, session_id: int, *args, **kwargs):
"""Cancel the session during generating tokens.
Expand Down Expand Up @@ -244,7 +244,7 @@ def cancel(self, session_id: int, *args, **kwargs):
if status == StatusCode.TRITON_STREAM_END:
logger.info(f'cancel session {session_id} successfully')
if prev_session.histories:
logger.warn(f'TODO: start to recover session {session_id}')
logger.warning(f'TODO: start to recover session {session_id}')
else:
logger.info(f'cancel session {session_id} failed: {res}')
return status
Expand Down Expand Up @@ -285,7 +285,7 @@ def resume(self, session_id: int, *args, **kwargs):
sequence_start=True,
sequence_end=False):
if status.value < 0:
return status
break

self._session.histories = histories
return status
Expand Down Expand Up @@ -420,16 +420,12 @@ def _stream_infer(self,
request_output_len, sequence_start,
sequence_end, preseq_length, cancel))
producer.start()
for state, res, tokens in self.stream_consumer(self.postprocess, que,
session, input_tokens,
preseq_length, cancel,
logger, self.display,
self.profile_generation,
self.eos_id):
if state.value < 0:
yield state, res, 0
else:
yield state, res, tokens
for status, res, n_token in self.stream_consumer(
self.postprocess, que, session, input_tokens, preseq_length,
cancel, logger, self.display, self.profile_generation,
self.eos_id):
yield status, res, n_token

producer.join()
self._session = que.get()
curseq_length = self._session.sequence_length
Expand Down

0 comments on commit 458d5e6

Please sign in to comment.