Skip to content

Commit

Permalink
update
Browse files (browse the repository at this point in the history)
  • Loading branch information
lvhan028 committed Oct 13, 2023
1 parent 1b935fc commit a4bd498
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 13 deletions.
24 changes: 12 additions & 12 deletions benchmark/profile_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def infer(model, session_id: int, input_ids: str, output_seqlen: int,
sequence_start=True,
sequence_end=True,
ignore_eos=True,
stream_output=True):
stream_output=False):
res, token = outputs[0]
timestamps.append(time.perf_counter())
tokens.append(token)
Expand Down Expand Up @@ -152,8 +152,8 @@ def profile_throughput(model_path: str,
f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
f'{token_latency_ave:.2f}s\n'
f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
return tm_model.model_name, first_token_latency_ave, throughput, \
tm_model.gpu_count
return tm_model.model_name, throughput, tm_model.gpu_count, \
first_token_latency_ave


class MemoryMonitor:
Expand Down Expand Up @@ -234,12 +234,12 @@ class ProfileResult:
batch: int
prompt_tokens: int
completion_tokens: int
first_token_latency: float
throughput_per_proc: float
throughput_per_node: float
mem_per_proc: float
mem_per_gpu: float
mem_per_node: float
first_token_latency: float


def parse_args():
Expand Down Expand Up @@ -294,7 +294,7 @@ def main():
output_seqlen=completion_tokens,
tp=args.tp)
output = Pool(1).map(profile_target, (args.model_path, ))
model_name, first_token_latency, throughput_per_proc, tp = output[
model_name, throughput_per_proc, tp, first_token_latency = output[
0]
time.sleep(5) # wait a while for releasing GPU mem
memory = MemoryMonitor.terminate()
Expand All @@ -304,28 +304,28 @@ def main():
batch=batch,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
first_token_latency=first_token_latency,
throughput_per_proc=throughput_per_proc,
throughput_per_node=throughput_per_proc / tp *
device_count,
mem_per_proc=memory,
mem_per_gpu=memory / tp,
mem_per_node=memory / tp * device_count))
mem_per_node=memory / tp * device_count,
first_token_latency=first_token_latency))
with open(args.dst_csv, 'w') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([
'batch', 'prompt_tokens', 'completion_tokens',
'first_token_latency', 'throughput_per_proc(token/s)',
'throughput_per_node(token/s)', 'mem_per_proc(GB)',
'mem_per_gpu(GB)', 'mem_per_node(GB)'
'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)',
'1st_token_latency(s)'
])
for re in results:
writer.writerow([
re.batch, re.prompt_tokens, re.completion_tokens,
f'{re.first_token_latency:.3f}',
f'{re.throughput_per_proc:.2f}',
f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}',
f'{re.first_token_latency:.3f}'
])


Expand Down
2 changes: 1 addition & 1 deletion generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=./install \
-DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
Expand Down

0 comments on commit a4bd498

Please sign in to comment.