update

lvhan028 · Oct 13, 2023 · a4bd498 · a4bd498
1 parent 1b935fc
commit a4bd498
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 13 deletions.
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
@@ -35,7 +35,7 @@ def infer(model, session_id: int, input_ids: str, output_seqlen: int,
                                             sequence_start=True,
                                             sequence_end=True,
                                             ignore_eos=True,
-                                            stream_output=True):
+                                            stream_output=False):
             res, token = outputs[0]
             timestamps.append(time.perf_counter())
             tokens.append(token)
@@ -152,8 +152,8 @@ def profile_throughput(model_path: str,
           f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
           f'{token_latency_ave:.2f}s\n'
           f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
-    return tm_model.model_name, first_token_latency_ave, throughput, \
-        tm_model.gpu_count
+    return tm_model.model_name, throughput, tm_model.gpu_count, \
+        first_token_latency_ave
 
 
 class MemoryMonitor:
@@ -234,12 +234,12 @@ class ProfileResult:
     batch: int
     prompt_tokens: int
     completion_tokens: int
-    first_token_latency: float
     throughput_per_proc: float
     throughput_per_node: float
     mem_per_proc: float
     mem_per_gpu: float
     mem_per_node: float
+    first_token_latency: float
 
 
 def parse_args():
@@ -294,7 +294,7 @@ def main():
                                      output_seqlen=completion_tokens,
                                      tp=args.tp)
             output = Pool(1).map(profile_target, (args.model_path, ))
-            model_name, first_token_latency, throughput_per_proc, tp = output[
+            model_name, throughput_per_proc, tp, first_token_latency = output[
                 0]
             time.sleep(5)  # wait a while for releasing GPU mem
             memory = MemoryMonitor.terminate()
@@ -304,28 +304,28 @@ def main():
                               batch=batch,
                               prompt_tokens=prompt_tokens,
                               completion_tokens=completion_tokens,
-                              first_token_latency=first_token_latency,
                               throughput_per_proc=throughput_per_proc,
                               throughput_per_node=throughput_per_proc / tp *
                               device_count,
                               mem_per_proc=memory,
                               mem_per_gpu=memory / tp,
-                              mem_per_node=memory / tp * device_count))
+                              mem_per_node=memory / tp * device_count,
+                              first_token_latency=first_token_latency))
     with open(args.dst_csv, 'w') as csvfile:
         writer = csv.writer(csvfile)
         writer.writerow([
             'batch', 'prompt_tokens', 'completion_tokens',
-            'first_token_latency', 'throughput_per_proc(token/s)',
-            'throughput_per_node(token/s)', 'mem_per_proc(GB)',
-            'mem_per_gpu(GB)', 'mem_per_node(GB)'
+            'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
+            'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)',
+            '1st_token_latency(s)'
         ])
         for re in results:
             writer.writerow([
                 re.batch, re.prompt_tokens, re.completion_tokens,
-                f'{re.first_token_latency:.3f}',
                 f'{re.throughput_per_proc:.2f}',
                 f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
-                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
+                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}',
+                f'{re.first_token_latency:.3f}'
             ])
 
 

diff --git a/generate.sh b/generate.sh
@@ -3,7 +3,7 @@
 cmake .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-    -DCMAKE_INSTALL_PREFIX=./install \
+    -DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
     -DBUILD_PY_FFI=ON \
     -DBUILD_MULTI_GPU=ON \
     -DCMAKE_CUDA_FLAGS="-lineinfo" \