Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nsys report analyzer #65

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def get_arithmetic_intensity(kernel):
def read_ncu_report(report_path: str, required_metrics: List[str]):
assert os.path.exists(
report_path
), f"The NCU report at {report_path} does not exist. Ensure you add --metrics ncu_rep to your benchmark run."
), f"The NCU report at {report_path} does not exist."
import_ncu_python_path()
import ncu_report

Expand Down
105 changes: 105 additions & 0 deletions tritonbench/components/ncu/nsys_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import csv
import os
import subprocess
from typing import Dict, List

# The nsys metrics to the reports. The value is the list of reports of nsys.
# Maps each supported nsys metric to the list of `nsys stats` reports
# required to compute it.
nsys_metrics_to_reports = {
    # the sum of kernel execution time
    "nsys_gpu_kernel_sum": ["nvtx_kern_sum", "nvtx_sum"],
    # the overhead of kernel launch
    "nsys_launch_overhead": ["nvtx_kern_sum", "nvtx_sum"],
    # the names of kernels
    "nsys_kernel_names": ["nvtx_kern_sum"],
    # the durations of kernels
    "nsys_kernel_durations": ["nvtx_kern_sum"],
    # the duration of nvtx range
    "nsys_nvtx_range_duration": ["nvtx_sum"],
    # the number of kernels
    "nsys_num_of_kernels": ["nvtx_kern_sum"],
}


def read_nsys_report(
    report_path: str, required_metrics: List[str]
) -> Dict[str, List[float]]:
    """Compute the requested metrics from an nsys report file.

    Runs ``nsys stats`` to export the CSV reports needed for the requested
    metrics next to ``report_path``, parses them with ``csv.DictReader``,
    and derives the metric values.

    Args:
        report_path: Path to the ``.nsys-rep`` file.
        required_metrics: Metric names to compute. Only names present in
            ``nsys_metrics_to_reports`` are handled; unknown names are
            silently ignored (and omitted from the result).

    Returns:
        Mapping from metric name to its value. Durations are reported in
        milliseconds; ``nsys_kernel_durations`` is a list of strings so
        that downstream median-aggregation skips it.

    Raises:
        AssertionError: If the report file does not exist, or none of the
            required metrics are known.
        subprocess.CalledProcessError: If the ``nsys stats`` command fails.
        RuntimeError: If an expected CSV output file is not produced.
    """
    assert os.path.exists(
        report_path
    ), f"The nsys report at {report_path} does not exist."
    reports_required = []
    for metric in required_metrics:
        if metric in nsys_metrics_to_reports:
            reports_required.extend(nsys_metrics_to_reports[metric])
    # De-duplicate: several metrics may need the same report.
    reports_required = list(set(reports_required))
    assert reports_required, "No nsys reports required"
    # Build the command as an argument list (shell=False) rather than
    # splitting a shell-style string, so report paths containing spaces
    # are passed through intact.
    cmd = [
        "nsys",
        "stats",
        "--report",
        ",".join(reports_required),
        "--force-export=true",
        "--format",
        "csv",
        "--output",
        ".",
        "--force-overwrite=true",
        report_path,
    ]
    try:
        # Discard stderr instead of PIPE-ing it: an unread PIPE can fill
        # up and deadlock the child process.
        subprocess.check_call(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
    except subprocess.CalledProcessError as e:
        print(f"Failed to run nsys command: {' '.join(cmd)}\nError: {e}")
        raise e
    # `--output .` makes nsys write the CSVs next to the input report,
    # named <report_basename>_<report>.csv.
    base_path = os.path.dirname(report_path)
    base_name = os.path.splitext(os.path.basename(report_path))[0]

    results = {}
    csv_contents = {}

    for report in reports_required:
        csv_path = os.path.join(base_path, f"{base_name}_{report}.csv")
        if not os.path.exists(csv_path):
            raise RuntimeError(f"Expected CSV report not found at {csv_path}")

        # Read CSV using DictReader
        with open(csv_path, "r") as f:
            reader = csv.DictReader(f)
            csv_contents[report] = list(reader)
    kernel_duration = []
    kernel_names = []
    sum_kernel_duration = 0
    nvtx_range_duration = 0
    if "nvtx_kern_sum" in csv_contents:
        # gpu kernel execution time summary
        for row in csv_contents["nvtx_kern_sum"]:
            # convert ns -> ms
            kernel_duration.append(float(row["Total Time (ns)"]) / 1_000_000)
            kernel_names.append(row["Kernel Name"])
        sum_kernel_duration = sum(kernel_duration)
    if "nvtx_sum" in csv_contents:
        # It is supposed to be only one row. The nvtx range is `:tritonbench_range`
        assert len(csv_contents["nvtx_sum"]) == 1
        # TODO: nsys sometimes reports the nvtx range duration in a unit
        # other than ns (observed: us). Waiting for a reply from NVIDIA.
        nvtx_range_duration = (
            float(csv_contents["nvtx_sum"][0]["Total Time (ns)"]) / 1_000_000
        )

    # Define mapping of metrics to their values. The keys must be in nsys_bench_metrics.
    metrics_map = {
        # Because tritonbench takes the median of numerical values, we need to convert
        # the list of floats to a list of strings.
        "nsys_kernel_durations": [str(duration) for duration in kernel_duration],
        "nsys_kernel_names": kernel_names,
        "nsys_gpu_kernel_sum": sum_kernel_duration,
        "nsys_nvtx_range_duration": nvtx_range_duration,
        "nsys_launch_overhead": nvtx_range_duration - sum_kernel_duration,
        "nsys_num_of_kernels": len(kernel_names),
    }
    # Verify that metrics_map keys match nsys_metrics_to_reports keys
    assert set(metrics_map.keys()) == set(nsys_metrics_to_reports.keys()), (
        f"Mismatch between metrics_map keys and nsys_metrics_to_reports keys.\n"
        f"metrics_map keys: {set(metrics_map.keys())}\n"
        f"nsys_metrics_to_reports keys: {set(nsys_metrics_to_reports.keys())}"
    )
    # Add only requested metrics to results
    results.update(
        {
            metric: metrics_map[metric]
            for metric in required_metrics
            if metric in metrics_map
        }
    )

    return results
59 changes: 54 additions & 5 deletions tritonbench/utils/triton_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import torch
import triton

from tritonbench.components.ncu import analyzer as ncu_analyzer
from tritonbench.components.ncu import ncu_analyzer, nsys_analyzer
from tritonbench.utils.env_utils import (
apply_precision,
fresh_triton_cache,
Expand Down Expand Up @@ -68,7 +68,12 @@ class BenchmarkOperatorBackend:
REGISTERED_METRICS: Dict[str, List[str]] = {}
REGISTERED_X_VALS: Dict[str, str] = {}
BASELINE_BENCHMARKS: Dict[str, str] = {}
BASELINE_SKIP_METRICS = {"speedup", "accuracy", "mem_footprint_compression_ratio"}
BASELINE_SKIP_METRICS = {
"speedup",
"accuracy",
"mem_footprint_compression_ratio",
"nsys_gpu_speedup",
}
X_ONLY_METRICS = set(["hw_roofline"])
PRECISION_DTYPE_MAPPING = {
"fp32": torch.float32,
Expand Down Expand Up @@ -222,6 +227,8 @@ class BenchmarkOperatorMetrics:
mem_footprint_compression_ratio: Optional[float] = None
# gbps
gbps: Optional[float] = None
# speedup for the summary of kernel GPU time only
nsys_gpu_speedup: Optional[float] = None


BUILTIN_METRICS = {x.name for x in fields(BenchmarkOperatorMetrics)} - {"extra_metrics"}
Expand Down Expand Up @@ -307,9 +314,25 @@ def select_metric(backend, m):
)
metric_val = _metrics_dict.get(metric, None)
if isinstance(metric_val, list):
row.append(numpy.median(metric_val))
# Check if all elements are numbers before calculating median
if all(isinstance(x, Number) for x in metric_val):
row.append(numpy.median(metric_val))
else:
# For non-numeric lists, convert to string representation
metric_val_str = str(metric_val)
if ";" in metric_val_str:
logger.warning(
f"Metric value '{metric_val_str}' contains semicolon which may cause CSV parsing issues"
)
row.append(metric_val_str)
elif isinstance(metric_val, bool):
row.append(1.0 if metric_val else 0.0)
elif isinstance(metric_val, str):
if ";" in metric_val:
logger.warning(
f"Metric value '{metric_val}' contains semicolon which may cause CSV parsing issues"
)
row.append(metric_val)
else:
row.append(metric_val)
table.append(row)
Expand Down Expand Up @@ -1065,8 +1088,34 @@ def _init_extra_metrics() -> Dict[str, Any]:
metrics.ncu_rep_ir = self.ncu_trace(
input_id, fn_name, replay=True, profile_ir=True
)
if "nsys_rep" in self.required_metrics:
metrics.nsys_rep = self.nsys_rep(input_id, fn_name)
nsys_metrics = []
for metric_name in nsys_analyzer.nsys_metrics_to_reports.keys():
if metric_name in self.required_metrics:
nsys_metrics.append(metric_name)

if "nsys_rep" in self.required_metrics or nsys_metrics:
nsys_rep_path = self.nsys_rep(input_id, fn_name)
metrics.nsys_rep = nsys_rep_path
if nsys_metrics:
nsys_analyzer_results = nsys_analyzer.read_nsys_report(
nsys_rep_path, nsys_metrics
)
for metric_name, metric_value in nsys_analyzer_results.items():
metrics.extra_metrics[metric_name] = metric_value
if "nsys_gpu_speedup" in self.required_metrics:
baseline_nsys_gpu_kernel_sum = (
self.baseline_metrics.extra_metrics.get("nsys_gpu_kernel_sum", None)
if self.baseline_metrics
else None
)
current_nsys_gpu_kernel_sum = metrics.extra_metrics.get(
"nsys_gpu_kernel_sum", None
)
metrics.nsys_gpu_speedup = (
baseline_nsys_gpu_kernel_sum / current_nsys_gpu_kernel_sum
if baseline_nsys_gpu_kernel_sum and current_nsys_gpu_kernel_sum
else None
)
if "kineto_trace" in self.required_metrics:
metrics.kineto_trace = self.kineto_trace(input_id, fn)
if "best_config" in self.required_metrics:
Expand Down
Loading