Skip to content

Commit 8ef60d1

Browse files
authored
[support bundle] Add perfetto tracing to bundle collection (#9482)
This adds a final step to bundle collection: convert all bundle collection steps into a Perfetto-formatted trace file and store it in "meta/trace.json". The cost is small, and the trace makes it easy to inspect how long each step of bundle collection took. Example output, visualized at ui.perfetto.dev: <img width="1041" height="565" alt="image" src="https://github.com/user-attachments/assets/ffdc244f-4b3e-433b-b6f5-35f55b9c3714" />
1 parent 56244ec commit 8ef60d1

File tree

5 files changed

+257
-0
lines changed

5 files changed

+257
-0
lines changed

nexus/src/app/background/tasks/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ pub mod region_snapshot_replacement_start;
4343
pub mod region_snapshot_replacement_step;
4444
pub mod saga_recovery;
4545
pub mod service_firewall_rules;
46+
pub mod support_bundle;
4647
pub mod support_bundle_collector;
4748
pub mod sync_service_zone_nat;
4849
pub mod sync_switch_configuration;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Support bundle related types and utilities
6+
7+
pub mod perfetto;
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Perfetto Trace Event format support for visualizing support bundle collection
6+
7+
use serde::Deserialize;
8+
use serde::Serialize;
9+
10+
/// Represents a Perfetto Trace Event format JSON file for visualization.
11+
///
12+
/// This format is used by the Perfetto trace viewer (<https://ui.perfetto.dev/>)
13+
/// to visualize timing information for operations.
///
/// The serde renames below map the snake_case Rust fields onto the camelCase
/// JSON keys `traceEvents` and `displayTimeUnit`.
14+
#[derive(Serialize, Deserialize)]
15+
pub struct Trace {
16+
/// The events contained in this trace (JSON key: `traceEvents`)
#[serde(rename = "traceEvents")]
17+
pub trace_events: Vec<TraceEvent>,
18+
/// Display unit for time values in the UI (e.g., "ms" for milliseconds).
/// Serialized under the JSON key `displayTimeUnit`.
19+
#[serde(rename = "displayTimeUnit")]
20+
pub display_time_unit: String,
21+
}
22+
23+
/// A single event in the Perfetto Trace Event format.
24+
///
25+
/// This represents a complete event (duration event) showing when an operation
26+
/// started and how long it took.
///
/// No serde renames are applied here, so each field is serialized under its
/// Rust field name (`name`, `cat`, `ph`, `ts`, `dur`, `pid`, `tid`, `args`).
27+
#[derive(Serialize, Deserialize)]
28+
pub struct TraceEvent {
29+
/// Human-readable name of the event
30+
pub name: String,
31+
/// Category name (abbreviated as "cat" in Perfetto format).
32+
/// Used to group related events together in the trace viewer.
33+
pub cat: String,
34+
/// Phase type (abbreviated as "ph" in Perfetto format).
35+
/// "X" means a "Complete" event with both timestamp and duration.
/// The support bundle collector always writes "X".
36+
pub ph: String,
37+
/// Timestamp in microseconds (abbreviated as "ts" in Perfetto format).
38+
/// Represents when the event started, as microseconds since the epoch.
39+
pub ts: i64,
40+
/// Duration in microseconds (abbreviated as "dur" in Perfetto format).
41+
/// How long the event took to complete.
42+
pub dur: i64,
43+
/// Process ID. Used to separate events into different process lanes
44+
/// in the trace viewer.
/// The support bundle collector uses the constant value 1 for every event.
45+
pub pid: u32,
46+
/// Thread ID. Used to separate events into different thread lanes
47+
/// within a process in the trace viewer.
/// The support bundle collector fills this with a per-step sequence
/// number (1, 2, 3, ...) rather than an actual thread ID.
48+
pub tid: usize,
49+
/// Arbitrary key-value pairs with additional event metadata.
/// The support bundle collector stores the step's status under "status".
50+
pub args: serde_json::Value,
51+
}

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ use zip::ZipArchive;
7878
use zip::ZipWriter;
7979
use zip::write::FullFileOptions;
8080

81+
use super::support_bundle::perfetto;
82+
8183
// We use "/var/tmp" to use Nexus' filesystem for temporary storage,
8284
// rather than "/tmp", which would keep this collected data in-memory.
8385
const TEMPDIR: &str = "/var/tmp";
@@ -1037,11 +1039,94 @@ impl BundleCollection {
10371039
//
10381040
// Only finish if we've exhausted all possible steps and joined all spawned work.
10391041
if steps.is_empty() {
1042+
// Write trace file before returning
1043+
if let Err(err) = self.write_trace_file(output, &report).await {
1044+
warn!(
1045+
self.log,
1046+
"Failed to write trace file";
1047+
"error" => ?err
1048+
);
1049+
}
10401050
return report;
10411051
}
10421052
}
10431053
}
10441054

1055+
// Write a Perfetto Trace Event format JSON file for visualization.
//
// Creates the "meta" directory under `output` (if it does not exist yet),
// converts every step recorded in `report` into one trace event, and writes
// the result as pretty-printed JSON to "meta/trace.json".
//
// Returns an error if the directory cannot be created, the trace cannot be
// serialized, or the file cannot be written.
1056+
async fn write_trace_file(
1057+
&self,
1058+
output: &Utf8TempDir,
1059+
report: &SupportBundleCollectionReport,
1060+
) -> anyhow::Result<()> {
1061+
let meta_dir = output.path().join("meta");
1062+
tokio::fs::create_dir_all(&meta_dir).await.with_context(|| {
1063+
format!("Failed to create meta directory {meta_dir}")
1064+
})?;
1065+
1066+
let trace_path = meta_dir.join("trace.json");
1067+
1068+
// Convert steps to Perfetto Trace Event format.
1069+
// Sort steps by start time and assign each a unique sequential ID.
1070+
//
1071+
// This is necessary because the trace event format does not like
1072+
// multiple slices to overlap - so we make each slice distinct.
1073+
//
1074+
// Ideally we'd be able to correlate these with actual tokio tasks,
1075+
// but it's hard to convert tokio::task::Id to a u64 because
1076+
// of https://github.com/tokio-rs/tokio/issues/7430
1077+
let mut sorted_steps: Vec<_> = report.steps.iter().collect();
1078+
sorted_steps.sort_by_key(|s| s.start);
1079+
1080+
// Generate trace events - each step gets a unique ID (1, 2, 3, ...)
1081+
// based on its start time order
1082+
let trace_events: Vec<_> = sorted_steps
1083+
.iter()
1084+
.enumerate()
1085+
.map(|(i, step)| {
1086+
let start_us = step.start.timestamp_micros();
1087+
// `num_microseconds()` yields an Option; a `None` is recorded as a
// zero-length event. `.max(0)` clamps a negative span (an end time
// before the start time) to zero as well.
let duration_us = (step.end - step.start)
1088+
.num_microseconds()
1089+
.unwrap_or(0)
1090+
.max(0);
1091+
// IDs are 1-based: the earliest-starting step gets ID 1.
let step_id = i + 1;
1092+
1093+
perfetto::TraceEvent {
1094+
name: step.name.clone(),
1095+
cat: "bundle_collection".to_string(),
1096+
ph: "X".to_string(),
1097+
ts: start_us,
1098+
dur: duration_us,
1099+
// Every event shares the same process ID; the per-step ID goes in
// `tid` so that each step is given its own lane.
pid: 1,
1100+
tid: step_id,
1101+
args: json!({
1102+
"status": step.status.to_string(),
1103+
}),
1104+
}
1105+
})
1106+
.collect();
1107+
1108+
let trace = perfetto::Trace {
1109+
trace_events,
1110+
display_time_unit: "ms".to_string(),
1111+
};
1112+
1113+
let trace_content = serde_json::to_string_pretty(&trace)
1114+
.context("Failed to serialize trace JSON")?;
1115+
1116+
tokio::fs::write(&trace_path, trace_content).await.with_context(
1117+
|| format!("Failed to write trace file to {trace_path}"),
1118+
)?;
1119+
1120+
info!(
1121+
self.log,
1122+
"Wrote trace file";
1123+
"path" => %trace_path,
1124+
"num_events" => trace.trace_events.len()
1125+
);
1126+
1127+
Ok(())
1128+
}
1129+
10451130
async fn collect_bundle_id(
10461131
&self,
10471132
dir: &Utf8Path,
@@ -2528,6 +2613,117 @@ mod test {
25282613
assert!(report.is_none());
25292614
}
25302615

2616+
// Collects a support bundle, downloads "meta/trace.json" from it, and checks
// that the file parses as a Perfetto trace whose events line up with the
// steps in the collection report.
#[nexus_test(server = crate::Server)]
2617+
async fn test_trace_file_generated(cptestctx: &ControlPlaneTestContext) {
2618+
let nexus = &cptestctx.server.server_context().nexus;
2619+
let datastore = nexus.datastore();
2620+
let resolver = nexus.resolver();
2621+
let opctx = OpContext::for_tests(
2622+
cptestctx.logctx.log.clone(),
2623+
datastore.clone(),
2624+
);
2625+
2626+
// Before we can create any bundles, we need to create the
2627+
// space for them to be provisioned.
2628+
let _datasets =
2629+
TestDataset::setup(cptestctx, &datastore, &opctx, 1).await;
2630+
2631+
// Create a bundle to collect
2632+
let bundle = datastore
2633+
.support_bundle_create(
2634+
&opctx,
2635+
"For trace file testing",
2636+
nexus.id(),
2637+
None,
2638+
)
2639+
.await
2640+
.expect("Couldn't allocate a support bundle");
2641+
2642+
let collector = SupportBundleCollector::new(
2643+
datastore.clone(),
2644+
resolver.clone(),
2645+
false,
2646+
nexus.id(),
2647+
);
2648+
2649+
// Collect the bundle
2650+
let mut request = BundleRequest::default();
2651+
request.data_selection.insert(BundleData::HostInfo(HashSet::new()));
2652+
let report = collector
2653+
.collect_bundle(&opctx, &request)
2654+
.await
2655+
.expect("Collection should have succeeded")
2656+
.expect("Should have generated a report");
2657+
2658+
// Download the trace file from the bundle
2659+
let head = false;
2660+
let range = None;
2661+
let response = nexus
2662+
.support_bundle_download(
2663+
&opctx,
2664+
bundle.id.into(),
2665+
SupportBundleQueryType::Path {
2666+
file_path: "meta/trace.json".to_string(),
2667+
},
2668+
head,
2669+
range,
2670+
)
2671+
.await
2672+
.expect("Should be able to download trace file");
2673+
2674+
// Parse the trace file using our Perfetto structs
2675+
let body_bytes =
2676+
response.into_body().collect().await.unwrap().to_bytes();
2677+
let trace: perfetto::Trace = serde_json::from_slice(&body_bytes)
2678+
.expect("Trace file should be valid Perfetto JSON");
2679+
2680+
// Verify display time unit
2681+
assert_eq!(
2682+
trace.display_time_unit, "ms",
2683+
"Display time unit should be milliseconds"
2684+
);
2685+
2686+
// We should have at least the main collection steps
2687+
assert!(
2688+
!trace.trace_events.is_empty(),
2689+
"Should have at least one trace event"
2690+
);
2691+
2692+
// Verify each event has the expected structure
2693+
for event in &trace.trace_events {
2694+
// Verify category
2695+
assert_eq!(
2696+
event.cat, "bundle_collection",
2697+
"Event should have category 'bundle_collection'"
2698+
);
2699+
// Verify phase type
2700+
assert_eq!(event.ph, "X", "Event should be Complete event type");
2701+
// Verify timestamps and durations are non-negative
2702+
assert!(event.ts >= 0, "Event timestamp should be non-negative");
2703+
assert!(event.dur >= 0, "Event duration should be non-negative");
2704+
// Verify process and thread IDs are set
2705+
assert_eq!(event.pid, 1, "All events should have pid=1");
2706+
assert!(event.tid > 0, "Event thread ID should be positive");
2707+
}
2708+
2709+
// Verify we have the same number of events as steps in the report
2710+
assert_eq!(
2711+
trace.trace_events.len(),
2712+
report.steps.len(),
2713+
"Number of events should match number of steps"
2714+
);
2715+
2716+
// Verify step names match between report and trace.
// Names are compared as sets, so the ordering of events in the trace
// versus steps in the report does not matter here.
2717+
let trace_names: std::collections::HashSet<_> =
2718+
trace.trace_events.iter().map(|e| e.name.as_str()).collect();
2719+
let report_names: std::collections::HashSet<_> =
2720+
report.steps.iter().map(|s| s.name.as_str()).collect();
2721+
assert_eq!(
2722+
trace_names, report_names,
2723+
"Trace event names should match report step names"
2724+
);
2725+
}
2726+
25312727
#[nexus_test(server = crate::Server)]
25322728
async fn test_collect_chunked(cptestctx: &ControlPlaneTestContext) {
25332729
let nexus = &cptestctx.server.server_context().nexus;

nexus/tests/integration_tests/support_bundles.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,8 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) {
530530
let archive = ZipArchive::new(Cursor::new(&contents)).unwrap();
531531
let mut names = archive.file_names();
532532
assert_eq!(names.next(), Some("bundle_id.txt"));
533+
assert_eq!(names.next(), Some("meta/"));
534+
assert_eq!(names.next(), Some("meta/trace.json"));
533535
assert_eq!(names.next(), Some("rack/"));
534536
assert!(names.any(|n| n == "sp_task_dumps/"));
535537
// There's much more data in the bundle, but validating it isn't the point

0 commit comments

Comments
 (0)