Commit: update

lvhan028 committed Nov 10, 2023
1 parent 619f543 · commit 4a11a23

Showing 3 changed files with 5 additions and 54 deletions.
14 changes: 3 additions & 11 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py

@@ -42,9 +42,7 @@ def initialize(self, args):
         self.model_config = model_config = json.loads(args['model_config'])

         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
-            'INPUT_ID', 'REQUEST_INPUT_LEN', 'BAD_WORDS_IDS', 'STOP_WORDS_IDS'
-        ]
+        input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
         for input_name in input_names:
             setattr(
                 self,
@@ -89,8 +87,6 @@ def execute(self, requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
-            request_output_len = pb_utils.get_input_tensor_by_name(
-                request, 'REQUEST_OUTPUT_LEN').as_numpy()

             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
@@ -104,8 +100,6 @@
                 'REQUEST_INPUT_LEN',
                 np.array(request_input_len).astype(
                     self.request_input_len_dtype))
-            request_output_len_tensor = pb_utils.Tensor(
-                'REQUEST_OUTPUT_LEN', request_output_len)

             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -114,10 +108,8 @@
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(output_tensors=[
-                input_id_tensor, request_input_len_tensor,
-                request_output_len_tensor
-            ])
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[input_id_tensor, request_input_len_tensor])
             responses.append(inference_response)

         # You should return a list of pb_utils.InferenceResponse. Length
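The initialize hunk above is truncated inside the setattr call. For context, here is a plausible completion of that loop, a minimal sketch rather than the verbatim repository code: it resolves each remaining tensor's Triton dtype from the model config into a numpy dtype attribute (e.g. self.request_input_len_dtype, which execute uses above) via the standard pb_utils helpers.

    import json

    import triton_python_backend_utils as pb_utils  # provided inside Triton's Python backend


    class TritonPythonModel:

        def initialize(self, args):
            self.model_config = model_config = json.loads(args['model_config'])

            # Map each remaining tensor name to a numpy dtype attribute,
            # e.g. self.input_id_dtype and self.request_input_len_dtype.
            for input_name in ['INPUT_ID', 'REQUEST_INPUT_LEN']:
                setattr(
                    self, input_name.lower() + '_dtype',
                    pb_utils.triton_string_to_numpy(
                        pb_utils.get_output_config_by_name(
                            model_config, input_name)['data_type']))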

39 changes: 1 addition & 38 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt

@@ -7,53 +7,16 @@ input [
name: "QUERY"
data_type: TYPE_STRING
dims: [ -1 ]
},
{
name: "BAD_WORDS_DICT"
data_type: TYPE_STRING
dims: [ -1 ]
optional: true
},
{
name: "STOP_WORDS_DICT"
data_type: TYPE_STRING
dims: [ -1 ]
optional: true
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_UINT32
dims: [ -1 ]
}
]
output [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "REQUEST_INPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
},
{
name: "BAD_WORDS_IDS"
data_type: TYPE_INT32
dims: [ 2, -1 ]
},
{
name: "STOP_WORDS_IDS"
data_type: TYPE_INT32
dims: [ 2, -1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "PROMPT_LEARNING_TASK_NAME_IDS"
name: "REQUEST_INPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
}
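After this change, the preprocessing model's contract is a single QUERY string tensor in, with INPUT_ID and REQUEST_INPUT_LEN out. A minimal client-side sketch against the new config follows; the localhost:8001 address and the preprocessing model name are illustrative assumptions, not values from this commit.

    import numpy as np
    import tritonclient.grpc as grpcclient

    # Address and model name are assumptions; adjust to the actual deployment.
    with grpcclient.InferenceServerClient('localhost:8001') as client:
        # TYPE_STRING tensors travel as BYTES; send one query in a batch of one.
        query = np.array([[b'Hello, world']], dtype=object)

        inp = grpcclient.InferInput('QUERY', list(query.shape), 'BYTES')
        inp.set_data_from_numpy(query)

        result = client.infer('preprocessing', [inp])
        input_id = result.as_numpy('INPUT_ID')                    # token ids, dims [ -1 ]
        request_input_len = result.as_numpy('REQUEST_INPUT_LEN')  # dims [ 1 ]
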
6 changes: 1 addition & 5 deletions lmdeploy/serve/turbomind/utils.py

@@ -48,11 +48,7 @@ def infer(self, prompts: Union[str, List[str]]) -> tuple:
             f'{type(prompts)}'

         input0_data = np.array(input0).astype(object)
-        output0_len = np.ones_like(input0).astype(np.uint32)
-        inputs = [
-            prepare_tensor('QUERY', input0_data),
-            prepare_tensor('REQUEST_OUTPUT_LEN', output0_len)
-        ]
+        inputs = [prepare_tensor('QUERY', input0_data)]

         with grpcclient.InferenceServerClient(self.tritonserver_addr) as \
                 client:
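prepare_tensor is defined earlier in utils.py and is not shown in this hunk; a typical definition following the common tritonclient pattern (an assumption, not the verbatim source) looks like this:

    import tritonclient.grpc as grpcclient
    from tritonclient.utils import np_to_triton_dtype


    def prepare_tensor(name, input_tensor):
        """Wrap a numpy array in a named Triton gRPC InferInput."""
        t = grpcclient.InferInput(name, list(input_tensor.shape),
                                  np_to_triton_dtype(input_tensor.dtype))
        t.set_data_from_numpy(input_tensor)
        return t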
