Skip to content

Commit

Permalink
queue length for MTIA (#202)
Browse files Browse the repository at this point in the history
Summary:

HTA computes the number of outstanding operations on each stream; this number is represented by the queue length. It generates another trace that includes the queue length info.

Add the MTIA launch id to the trace symbol table so that MTIA launch events are tracked by the runtime launch events query.

Reviewed By: fengxizhou, Chenguang-Zhu

Differential Revision:
D65774955

Privacy Context Container: L1188860
  • Loading branch information
fenypatel99 authored and facebook-github-bot committed Nov 13, 2024
1 parent e144b74 commit 74d08b1
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
6 changes: 4 additions & 2 deletions hta/common/trace_symbol_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,13 @@ def get_runtime_launch_events_query(self) -> str:
cuLaunchKernel_id = self.sym_index.get("cuLaunchKernel", self.NULL)
cudaMemcpyAsync_id = self.sym_index.get("cudaMemcpyAsync", self.NULL)
cudaMemsetAsync_id = self.sym_index.get("cudaMemsetAsync", self.NULL)

mtiaLaunchKernel_id = self.sym_index.get(
"runFunction - job_prep_and_submit_for_execution", self.NULL
)
return (
f"((name == {cudaMemsetAsync_id}) or (name == {cudaMemcpyAsync_id}) or "
f" (name == {cudaLaunchKernel_id}) or (name == {cudaLaunchKernelExC_id})"
f" or (name == {cuLaunchKernel_id})) and (index_correlation > 0)"
f" or (name == {cuLaunchKernel_id}) or (name == {mtiaLaunchKernel_id})) and (index_correlation > 0)"
)


Expand Down
43 changes: 43 additions & 0 deletions tests/test_trace_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,49 @@ def test_get_queue_length_stats(self):
msg=f"queue_full_df = {queue_full_df}",
)

def test_get_mtia_queue_length_stats(self):
    # Queue length checks on the single-rank MTIA trace fixture: the summary
    # index, the per-stream statistics for entry (0, 102), and the time
    # spent at the maximum queue length.
    trace = self.mtia_single_rank_trace_t

    # The summary index is expected to hold exactly the pairs (0, 1) and
    # (0, 102) -- presumably (rank, stream); confirm against the fixture.
    summary = trace.get_queue_length_summary(ranks=[0])
    self.assertEqual(summary.index.to_list(), [(0, 1), (0, 102)])

    observed = summary.loc[0, 102]["queue_length"].to_dict()
    expected_stats = {
        "count": 6.0,
        "mean": 0.5,
        "std": 0.547723,
        "min": 0.0,
        "25%": 0.0,
        "50%": 0.5,
        "75%": 1.0,
        "max": 1.0,
    }
    # Compare each statistic to two decimal places.
    for key, expval in expected_stats.items():
        self.assertAlmostEqual(
            observed[key],
            expval,
            places=2,
            msg=f"Stream 102 stats mismatch key={key}",
        )

    time_series = trace.get_queue_length_time_series()
    # max_queue_length=1 is just a hack for testing.
    queue_full_df = trace.get_time_spent_blocked_on_full_queue(
        time_series, max_queue_length=1
    )
    self.assertEqual(len(queue_full_df), 1)

    # Only one row is expected, so inspect the row labelled 0.
    first_row = queue_full_df.loc[0]
    self.assertAlmostEqual(
        first_row["duration_at_max_queue_length"],
        1060.0,
        msg=f"queue_full_df = {queue_full_df}",
    )
    self.assertAlmostEqual(
        first_row["relative_duration_at_max_queue_length"],
        0.000079,
        places=5,
        msg=f"queue_full_df = {queue_full_df}",
    )

@patch.object(hta.common.trace.Trace, "write_raw_trace")
def test_generate_trace_with_counters(self, mock_write_trace):
# Use a trace with some kernels missing attribution to operators
Expand Down

0 comments on commit 74d08b1

Please sign in to comment.