Commit

add progress bar
lvhan028 committed Nov 27, 2023
1 parent 1f2c414 commit 831d9d2
Showing 4 changed files with 43 additions and 48 deletions.
64 changes: 27 additions & 37 deletions benchmark/profile_restful_api.py
@@ -7,8 +7,9 @@

 import fire
 import numpy as np
-import requests
+from tqdm import tqdm

+from lmdeploy.serve.openai.api_client import APIClient
 from lmdeploy.tokenizer import Tokenizer


@@ -61,60 +62,47 @@ def __init__(self,
                  temperature: float = 0.8,
                  top_p: float = 1.0):
         self.tokenizer = Tokenizer(tokenzier_path)
-        self.api_url = server_addr + '/v1/chat/completions'
+        self.server_addr = server_addr
         self.temperature = temperature
         self.top_p = top_p
+        client = APIClient(self.server_addr)
+        self.model_name = client.available_models[0]
+        self.pbar = None

     def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                    stream_output: bool):

         stats = []
+        client = APIClient(self.server_addr)

         for prompt, input_seqlen, output_seqlen in iter(
                 req_queue.get, [None, None, None]):
             timestamps = []
             tokens = []
             timestamps.append(time.perf_counter())
-            headers = {'content-type': 'application/json'}
-            pload = {
-                'model': 'llama',
-                'messages': prompt,
-                'temperature': self.temperature,
-                'top_p': self.top_p,
-                'n': 1,
-                'max_tokens': output_seqlen,
-                'stream': stream_output,
-                'session_id': session_id,
-                'ignore_eos': True,
-            }
-            response = requests.post(self.api_url,
-                                     headers=headers,
-                                     json=pload,
-                                     stream=stream_output)
-            for chunk in response.iter_lines(chunk_size=8192,
-                                             decode_unicode=False,
-                                             delimiter=b'\n'):
+            for output in client.chat_completions_v1(
+                    model=self.model_name,
+                    messages=prompt,
+                    temperature=self.temperature,
+                    top_p=self.top_p,
+                    n=1,
+                    max_tokens=output_seqlen,
+                    stream=stream_output,
+                    session_id=session_id,
+                    ignore_eos=True):
                 timestamps.append(time.perf_counter())
-                if chunk:
-                    data = json.loads(chunk.decode('utf-8'))
-                    n_token = data.pop('tokens', 0)
-                    tokens.append(n_token)

             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
             token_latency = np.round(timestamps[-1] - timestamps[0], 3)
-            completion_tokens = tokens[-1]
-            assert output_seqlen <= completion_tokens <= output_seqlen + 1, \
-                f'Error. session_id({session_id}) request {output_seqlen} ' \
-                f'tokens, but generate {completion_tokens} tokens.\n' \
-                f'prompt: {prompt}'
-            total_tokens = tokens[-1] + input_seqlen
+            # assert output.pop('finish_reason') == 'length', \
+            #     f'Error. session_id({session_id}) request {output_seqlen} ' \
+            #     f'tokens, but `finish_reason` is not `length`'
+            total_tokens = input_seqlen + output_seqlen
             stats.append([
-                first_token_latency, completion_tokens, output_seqlen,
+                first_token_latency, output_seqlen, output_seqlen,
                 total_tokens, token_latency
             ])
-            print(
-                f'session {session_id}: '
-                f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
-                f'completion_tokens {completion_tokens}')
+            self.pbar.update(1)

         res_queue.put((session_id, stats))

     def process_request(self,
@@ -125,6 +113,8 @@ def process_request(self,
         req_queue = Queue()
         threads = []

+        self.pbar = tqdm(total=len(requests))
+
         # feed request to q
         for req in requests:
             req_queue.put(req)
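Note: the benchmark above now drives the server through lmdeploy's own APIClient instead of hand-rolled requests.post calls, and reports progress with a tqdm bar rather than per-session prints. Below is a minimal sketch of that request path in isolation, assuming an api_server is already running at http://0.0.0.0:23333; the address, prompts, and token budget are illustrative and not part of the commit.

import time

from tqdm import tqdm

from lmdeploy.serve.openai.api_client import APIClient

# Assumption: an lmdeploy api_server is reachable at this address and
# serves at least one model.
server_addr = 'http://0.0.0.0:23333'
client = APIClient(server_addr)
model_name = client.available_models[0]

# Illustrative workload; the real benchmark samples prompts from a dataset.
prompts = [[{'role': 'user', 'content': 'Say hello.'}] for _ in range(4)]
pbar = tqdm(total=len(prompts))
first_token_latencies = []

for messages in prompts:
    timestamps = [time.perf_counter()]
    # With stream=True the client yields incremental chunks, so the second
    # timestamp approximates the first-token latency.
    for _ in client.chat_completions_v1(model=model_name,
                                        messages=messages,
                                        temperature=0.8,
                                        top_p=1.0,
                                        max_tokens=32,
                                        stream=True):
        timestamps.append(time.perf_counter())
    if len(timestamps) > 1:
        first_token_latencies.append(timestamps[1] - timestamps[0])
    pbar.update(1)

pbar.close()
if first_token_latencies:
    print(f'mean first-token latency: '
          f'{sum(first_token_latencies) / len(first_token_latencies):.3f}s')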
9 changes: 5 additions & 4 deletions benchmark/profile_serving.py
@@ -7,6 +7,7 @@

 import fire
 import numpy as np
+from tqdm import tqdm

 from lmdeploy.serve.turbomind.chatbot import Chatbot
 from lmdeploy.tokenizer import Tokenizer
@@ -68,6 +69,7 @@ def __init__(self,
         self.top_k = top_k
         self.top_p = top_p
         self.log_level = log_level
+        self.pbar = None

     def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                    stream_output: bool):
@@ -105,10 +107,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                 first_token_latency, completion_tokens, output_seqlen,
                 total_tokens, token_latency
             ])
-            print(
-                f'session {session_id}: '
-                f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
-                f'completion_tokens {completion_tokens}')
+            self.pbar.update(1)
         res_queue.put((session_id, stats))

     def process_request(self,
@@ -119,6 +118,8 @@ def process_request(self,
         req_queue = Queue()
         threads = []

+        self.pbar = tqdm(total=len(requests))
+
         # feed request to q
         for req in requests:
             req_queue.put(req)
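Note: profile_serving.py gets the same treatment: a tqdm bar shared by all worker threads replaces the per-session prints, and each worker ticks it once per finished request. A minimal sketch of that shared-bar pattern follows, with a stub worker standing in for the Chatbot-based _inference; the queue contents, concurrency, and sleep are illustrative.

from queue import Queue
from threading import Thread
import time

from tqdm import tqdm


def worker(req_queue: Queue, pbar: tqdm):
    # Stub for _inference: drain requests until the [None, None, None]
    # sentinel, ticking the shared bar once per finished request.
    # tqdm.update() takes an internal lock, so calling it from several
    # threads is safe here.
    for _prompt, _in_len, _out_len in iter(req_queue.get, [None, None, None]):
        time.sleep(0.01)  # stand-in for the actual generation call
        pbar.update(1)


requests = [['hi', 8, 16] for _ in range(32)]  # (prompt, in_len, out_len)
concurrency = 4

req_queue = Queue()
for req in requests:
    req_queue.put(req)
for _ in range(concurrency):
    req_queue.put([None, None, None])  # one stop sentinel per worker

pbar = tqdm(total=len(requests))
threads = [
    Thread(target=worker, args=(req_queue, pbar)) for _ in range(concurrency)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
pbar.close()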
16 changes: 10 additions & 6 deletions benchmark/profile_throughput.py
@@ -8,6 +8,7 @@

 import fire
 import numpy as np
+from tqdm import tqdm

 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.turbomind import TurboMind
@@ -65,6 +66,7 @@ def __init__(self, model_path: str, tp: int = 1, **kwargs):
                              **kwargs)
         self.tm_model = tm_model
         self.tokenizer = tm_model.tokenizer
+        self.pbar = None

     def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                    stream_output: bool):
@@ -95,16 +97,16 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
             token_latency = np.round(timestamps[-1] - timestamps[0], 3)
             completion_tokens = tokens[-1]
-            assert output_seqlen <= completion_tokens <= output_seqlen + 1
-            total_tokens = tokens[-1] + len(input_ids)
+            assert output_seqlen <= completion_tokens <= output_seqlen + 1, \
+                f'Error. session_id({session_id}) request {output_seqlen} ' \
+                f'tokens, but generate {completion_tokens} tokens.\n' \
+                f'prompt: {prompt}'
+            total_tokens = tokens[-1] + input_seqlen
             stats.append([
                 first_token_latency, completion_tokens, output_seqlen,
                 total_tokens, token_latency
             ])
-            print(
-                f'session {session_id}: '
-                f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
-                f'completion_tokens {completion_tokens}')
+            self.pbar.update(1)
         res_queue.put((session_id, stats))

     def process_request(self,
@@ -115,6 +117,8 @@ def process_request(self,
         req_queue = Queue()
         threads = []

+        self.pbar = tqdm(total=len(requests))
+
         # feed request to q
         for req in requests:
             req_queue.put(req)
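Note: profile_throughput.py follows the same pattern: the per-session print becomes self.pbar.update(1), and the bare assert gains the same diagnostic message used in the RESTful benchmark. If the printed per-session fields are still wanted, tqdm can carry them as a postfix on the bar instead of separate log lines; a small sketch of that variation (not something this commit does, and the numbers are placeholders):

import time

from tqdm import tqdm

# (session_id, input_seqlen, output_seqlen) placeholders
sessions = [(i, 32, 128) for i in range(8)]

pbar = tqdm(total=len(sessions))
for session_id, input_seqlen, output_seqlen in sessions:
    time.sleep(0.05)  # stand-in for the generation loop
    # set_postfix shows the fields the removed print used to emit,
    # without breaking the bar's single-line rendering.
    pbar.set_postfix(session=session_id,
                     input_seqlen=input_seqlen,
                     output_seqlen=output_seqlen)
    pbar.update(1)
pbar.close()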
2 changes: 1 addition & 1 deletion lmdeploy/serve/openai/api_server.py
@@ -480,7 +480,7 @@ def main(model_path: str,
          allow_credentials: bool = True,
          allow_methods: List[str] = ['*'],
          allow_headers: List[str] = ['*'],
-         log_level: str = 'INFO',
+         log_level: str = 'ERROR',
          **kwargs):
     """An example to perform model inference through the command line
     interface.
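Note: the api_server change only lowers the default log level from INFO to ERROR, presumably so that per-request server logs do not clutter the console while the benchmarks render their progress bars. A caller who wants the old verbosity back can pass it explicitly; a minimal sketch, with a placeholder model path:

from lmdeploy.serve.openai.api_server import main

if __name__ == '__main__':
    # 'INFO' restores the previous default; the model path is a placeholder.
    main('/path/to/your/model', log_level='INFO')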
