Commit

tmpo

lvhan028 committed Aug 11, 2023
1 parent 262055e commit 13307ee
Showing 3 changed files with 78 additions and 37 deletions.
63 changes: 43 additions & 20 deletions lmdeploy/serve/turbomind/triton_models/postprocessing/1/model.py
@@ -41,7 +41,7 @@ def initialize(self, args):
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
-            model_config, 'OUTPUT')
+            model_config, 'new_token_text')
 
         # Convert Triton types to numpy types
         self.output_dtype = pb_utils.triton_string_to_numpy(
@@ -80,21 +80,33 @@ def execute(self, requests):
         # and create a pb_utils.InferenceResponse for each of them.
         for idx, request in enumerate(requests):
             # Get input tensors
-            tokens_batch = pb_utils.get_input_tensor_by_name(
-                request, 'TOKENS_BATCH').as_numpy()
-            sequence_length = pb_utils.get_input_tensor_by_name(
-                request, 'sequence_length').as_numpy()
+            prev_token_ids = pb_utils.get_input_tensor_by_name(
+                request, 'prev_token_ids').as_numpy().flatten().tolist()
+            prev_token_texts = pb_utils.get_input_tensor_by_name(
+                request, 'prev_token_texts').as_numpy().flatten().tolist()
+            token_ids = pb_utils.get_input_tensor_by_name(
+                request, 'token_ids').as_numpy().flatten().tolist()
 
+            print(prev_token_ids, prev_token_texts, token_ids)
+
+            prev_token_texts = [
+                token_text.decode('utf-8') for token_text in prev_token_texts
+            ]
+
+            print(prev_token_ids, prev_token_texts, token_ids)
+
             # Postprocessing output data.
-            outputs = self._postprocessing(tokens_batch.tolist(),
-                                           sequence_length)
+            new_token_text, output_text = self._postprocessing(
+                prev_token_ids, prev_token_texts, token_ids)
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
-            output_tensor = pb_utils.Tensor(
-                'OUTPUT',
-                np.array(outputs).astype(self.output_dtype))
-
+            new_token_text = pb_utils.Tensor(
+                'new_token_text',
+                np.array(new_token_text).astype(self.output_dtype))
+            output_text = pb_utils.Tensor(
+                'output_text',
+                np.array(output_text).astype(self.output_dtype))
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
             # Below is an example of how you can set errors in inference
@@ -103,7 +115,7 @@ def execute(self, requests):
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
             inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor])
+                output_tensors=[new_token_text, output_text])
             responses.append(inference_response)
 
         # You should return a list of pb_utils.InferenceResponse. Length
@@ -118,12 +130,23 @@ def finalize(self):
         """
         print('Cleaning up...')
 
-    def _postprocessing(self, tokens_batch, sequence_length):
+    def _postprocessing(self, prev_token_ids, prev_token_texts, new_token_ids):
         """decode token ids into texts."""
-        outputs = []
-        for beam_tokens, beam_len in zip(tokens_batch, sequence_length):
-            for tokens, _len in zip(beam_tokens, beam_len):
-                output = self.tokenizer.decode(tokens[:_len])
-                output = output.encode('utf8')
-                outputs.append(output)
-        return outputs
+
+        for new_token_id in new_token_ids:
+            new_token, output_text = self.tokenizer.decode_incrementally(
+                prev_token_ids, prev_token_texts, new_token_id)
+            if new_token is not None:
+                prev_token_texts.append(new_token)
+            prev_token_ids.append(new_token_id)
+
+            # print(f'{new_token}')
+        return [new_token], [output_text]
+        # for prev_token_ids, prev_token_texts, new_token_id in zip(
+        #         prev_token_batch, prev_token_text_batch, new_token_batch):
+        #     new_token, output_text = self.tokenizer.decode_incrementally(
+        #         prev_token_ids,
+        #         prev_token_texts,
+        #         new_token_id)
+        #     outputs.append((new_token, output_text))
+        # return outputs
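
Note (not part of the commit): the rewritten `_postprocessing` relies on a `tokenizer.decode_incrementally` helper whose body is not shown in this diff. The snippet below is a minimal, hedged sketch of one common way to implement incremental detokenization, assuming a HuggingFace `AutoTokenizer`; the function name, the `gpt2` tokenizer, and the choice to carry the running output as a single string (rather than the per-token text list used above) are illustrative assumptions, not lmdeploy's actual implementation, and running the demo requires `transformers` plus the `gpt2` tokenizer files.

# Illustrative sketch only -- not lmdeploy's decode_incrementally.
# Re-decodes the full id sequence each step and returns the text that appears
# beyond the previously emitted text; returns None while a character is still
# incomplete (the decoder yields a trailing replacement character).
from transformers import AutoTokenizer


def decode_incrementally(tokenizer, prev_token_ids, prev_output_text,
                         new_token_id):
    all_ids = list(prev_token_ids) + [new_token_id]
    output_text = tokenizer.decode(all_ids, skip_special_tokens=True)
    if output_text.endswith('\ufffd'):
        # The newest id does not yet complete a UTF-8 character; emit nothing.
        return None, prev_output_text
    return output_text[len(prev_output_text):], output_text


if __name__ == '__main__':
    tok = AutoTokenizer.from_pretrained('gpt2')  # any tokenizer works here
    ids, text = [], ''
    for token_id in tok.encode('hello world'):
        piece, text = decode_incrementally(tok, ids, text, token_id)
        ids.append(token_id)
        if piece is not None:
            print(repr(piece))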
24 changes: 17 additions & 7 deletions lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt
@@ -3,27 +3,37 @@ backend: "python"
 max_batch_size: 1
 input [
   {
-    name: "TOKENS_BATCH"
+    name: "prev_token_ids"
     data_type: TYPE_UINT32
-    dims: [ -1, -1 ]
+    dims: [ -1 ]
   },
   {
-    name: "sequence_length"
-    data_type: TYPE_UINT32
+    name: "prev_token_texts"
+    data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "token_ids"
+    data_type: TYPE_UINT32
+    dims: [-1]
+  }
 ]
 output [
   {
-    name: "OUTPUT"
+    name: "new_token_text"
     data_type: TYPE_STRING
-    dims: [ -1, -1 ]
+    dims: [ -1 ]
+  },
+  {
+    name: "output_text"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
   }
 ]
 
 instance_group [
   {
-    count: 1
+    count: 16
     kind: KIND_CPU
   }
 ]
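
For orientation (not part of the commit): on the client side, Triton's TYPE_UINT32 maps to numpy uint32 and TYPE_STRING travels as a numpy object array of bytes/str, and with max_batch_size: 1 each request carries a leading batch dimension of 1. A hypothetical payload matching the declarations above could therefore be built as follows; the id and text values are made up.

import numpy as np

# Hypothetical values, shaped [1, -1] to satisfy max_batch_size: 1.
prev_token_ids = np.array([[9843, 29892]], dtype=np.uint32)        # TYPE_UINT32
prev_token_texts = np.array([[b'Hello', b',']], dtype=np.object_)  # TYPE_STRING
token_ids = np.array([[3186]], dtype=np.uint32)                    # TYPE_UINT32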
28 changes: 18 additions & 10 deletions lmdeploy/serve/turbomind/utils.py
@@ -8,6 +8,7 @@
 
 def prepare_tensor(name, input_tensor):
     """Create grpcclient's InferInput instance according to a given tensor."""
+    print(name, input_tensor.shape)
     t = grpcclient.InferInput(name, list(input_tensor.shape),
                               np_to_triton_dtype(input_tensor.dtype))
     t.set_data_from_numpy(input_tensor)
@@ -76,25 +77,32 @@ def __init__(self, tritonserver_addr: str):
     def __call__(self, *args, **kwargs):
         return self.infer(*args, **kwargs)
 
-    def infer(self, output_ids: np.ndarray, seqlen: np.ndarray):
+    def infer(self, prev_token_ids: np.ndarray, prev_token_texts: np.ndarray,
+              token_ids: np.ndarray):
         """De-tokenize tokens for text.
         Args:
-            output_ids(np.ndarray): tokens' id
-            seqlen(np.ndarray): sequence length
+            prev_token_ids(np.ndarray): an array of token_id of
+                previously decoded tokens
+            prev_token_texts(np.ndarray): an array of string of
+                previously decoded tokens
+            token_ids(np.ndarray): an array of to-be-decoded tokens
         Returns:
-            str: decoded tokens
+            new_token_text: The new token as a string.
+            output_text: The new output text as a string.
         """
         inputs = [
-            prepare_tensor('TOKENS_BATCH', output_ids),
-            prepare_tensor('sequence_length', seqlen)
+            prepare_tensor('prev_token_ids', prev_token_ids),
+            prepare_tensor('prev_token_texts', prev_token_texts),
+            prepare_tensor('token_ids', token_ids)
         ]
-        inputs[0].set_data_from_numpy(output_ids)
-        inputs[1].set_data_from_numpy(seqlen)
 
         model_name = 'postprocessing'
         with grpcclient.InferenceServerClient(self.tritonserver_addr) \
                 as client:
             result = client.infer(model_name, inputs)
-            output0 = result.as_numpy('OUTPUT')
-            return output0
+            new_token_text = result[0].as_numpy('new_token_text')
+            output_text = result[1].as_numpy('output_text')
+
+        return new_token_text, output_text
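
Putting the pieces together (not part of the commit): a streaming caller would invoke this client once per generated token and carry the growing prefix between calls. The sketch below assumes the wrapper shown above is the `Postprocessor` class in `lmdeploy/serve/turbomind/utils.py` (the class name sits outside this hunk), that a Triton server with the `postprocessing` model is reachable at the placeholder address, and that the token ids are made up; how the returned arrays are unpacked may differ from what the server actually sends back.

import numpy as np

from lmdeploy.serve.turbomind.utils import Postprocessor  # assumed class name

postprocessor = Postprocessor('0.0.0.0:33337')  # placeholder server address

prev_ids, prev_texts = [], []
for new_id in (9843, 29892, 3186):  # hypothetical ids from a generation loop
    new_token_text, output_text = postprocessor(
        np.array([prev_ids], dtype=np.uint32),     # prev_token_ids
        np.array([prev_texts], dtype=np.object_),  # prev_token_texts
        np.array([[new_id]], dtype=np.uint32))     # token_ids
    prev_ids.append(new_id)
    # Keep the newly decoded piece so the next call sees the full prefix.
    piece = new_token_text.flatten()[0]
    prev_texts.append(piece.decode() if isinstance(piece, bytes) else str(piece))
    print(output_text)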
