Skip to content

Commit

Permalink
queue length for MTIA (facebookresearch#202)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookresearch#202

HTA computes the number of outstanding operations on each stream, which is represented as the queue length. It also generates another trace annotated with the queue length information.

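Add the MTIA launch event ID (the symbol for "runFunction - job_prep_and_submit_for_execution") to the runtime launch events query in the trace symbol table, so that MTIA launches are matched alongside the CUDA launch events.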

Reviewed By: fengxizhou, Chenguang-Zhu

Differential Revision:
D65774955

Privacy Context Container: L1188860
  • Loading branch information
fenypatel99 authored and facebook-github-bot committed Nov 13, 2024
1 parent 0af9308 commit 4d1bbf6
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 2 deletions.
6 changes: 4 additions & 2 deletions hta/common/trace_symbol_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,13 @@ def get_runtime_launch_events_query(self) -> str:
cuLaunchKernel_id = self.sym_index.get("cuLaunchKernel", self.NULL)
cudaMemcpyAsync_id = self.sym_index.get("cudaMemcpyAsync", self.NULL)
cudaMemsetAsync_id = self.sym_index.get("cudaMemsetAsync", self.NULL)

mtiaLaunchKernel_id = self.sym_index.get(
"runFunction - job_prep_and_submit_for_execution", self.NULL
)
return (
f"((name == {cudaMemsetAsync_id}) or (name == {cudaMemcpyAsync_id}) or "
f" (name == {cudaLaunchKernel_id}) or (name == {cudaLaunchKernelExC_id})"
f" or (name == {cuLaunchKernel_id})) and (index_correlation > 0)"
f" or (name == {cuLaunchKernel_id}) or (name == {mtiaLaunchKernel_id})) and (index_correlation > 0)"
)


Expand Down
58 changes: 58 additions & 0 deletions tests/test_trace_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,21 @@ def test_get_gpu_kernel_breakdown(self):
self.assertEqual(kernel_breakdown.iloc[151]["kernel_type"], "MEMORY")
self.assertEqual(kernel_breakdown.iloc[151]["sum (us)"], 1064)

def test_get_mtia_kernel_breakdown(self):
    """Verify the kernel breakdown tables for the single-rank MTIA trace.

    Calls ``get_gpu_kernel_breakdown`` on ``self.mtia_single_rank_trace_t``
    with visualization disabled and memory kernels included, then
    spot-checks selected rows of the two returned tables against
    expected values.
    """
    breakdowns = self.mtia_single_rank_trace_t.get_gpu_kernel_breakdown(
        visualize=False, include_memory_kernels=True
    )
    type_summary, per_kernel = breakdowns

    # First row of the per-kernel-type table.
    first_type_row = type_summary.iloc[0]
    self.assertEqual(first_type_row["kernel_type"], "COMPUTATION")
    self.assertEqual(first_type_row["sum"], 7305597)

    # First row of the per-kernel table.
    first_kernel_row = per_kernel.iloc[0]
    self.assertEqual(first_kernel_row["kernel_type"], "COMPUTATION")
    self.assertEqual(first_kernel_row["sum (us)"], 118526.0)

    # Row 6 of the per-kernel table is expected to be a memory kernel.
    seventh_kernel_row = per_kernel.iloc[6]
    self.assertEqual(seventh_kernel_row["kernel_type"], "MEMORY")
    self.assertEqual(seventh_kernel_row["sum (us)"], 400892.0)

def test_get_queue_length_stats(self):
qd_summary = self.vision_transformer_t.get_queue_length_summary(ranks=[0])
streams = qd_summary.index.to_list()
Expand Down Expand Up @@ -288,6 +303,49 @@ def test_get_queue_length_stats(self):
msg=f"queue_full_df = {queue_full_df}",
)

def test_get_mtia_queue_length_stats(self):
    """Verify queue length statistics for the single-rank MTIA trace.

    Covers three things on ``self.mtia_single_rank_trace_t``:
    the index of the queue length summary for rank 0, the summary
    statistics stored under index entry ``(0, 102)``, and the time
    spent at the maximum queue length.
    """
    trace = self.mtia_single_rank_trace_t

    qd_summary = trace.get_queue_length_summary(ranks=[0])
    # Two index entries are expected: (0, 1) and (0, 102).
    self.assertEqual(qd_summary.index.to_list(), [(0, 1), (0, 102)])

    stream102_stats = qd_summary.loc[0, 102]["queue_length"].to_dict()
    expected_stats = {
        "count": 6.0,
        "mean": 0.5,
        "std": 0.547723,
        "min": 0.0,
        "25%": 0.0,
        "50%": 0.5,
        "75%": 1.0,
        "max": 1.0,
    }
    for key in expected_stats:
        self.assertAlmostEqual(
            stream102_stats[key],
            expected_stats[key],
            places=2,
            msg=f"Stream 102 stats mismatch key={key}",
        )

    queue_len_ts_dict = trace.get_queue_length_time_series()
    # max_queue_length=1 is deliberately tiny: just a hack for testing.
    queue_full_df = trace.get_time_spent_blocked_on_full_queue(
        queue_len_ts_dict, max_queue_length=1
    )
    self.assertEqual(len(queue_full_df), 1)

    # Both remaining checks report the same frame on failure.
    failure_msg = f"queue_full_df = {queue_full_df}"
    first_row = queue_full_df.loc[0]
    self.assertAlmostEqual(
        first_row["duration_at_max_queue_length"],
        1060.0,
        msg=failure_msg,
    )
    self.assertAlmostEqual(
        first_row["relative_duration_at_max_queue_length"],
        0.000079,
        places=5,
        msg=failure_msg,
    )

@patch.object(hta.common.trace.Trace, "write_raw_trace")
def test_generate_trace_with_counters(self, mock_write_trace):
# Use a trace with some kernels missing attribution to operators
Expand Down

0 comments on commit 4d1bbf6

Please sign in to comment.