Skip to content

Commit f60f53f

Browse files
committed
tracing
1 parent c3a2245 commit f60f53f

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,11 +1037,94 @@ impl BundleCollection {
10371037
//
10381038
// Only finish if we've exhausted all possible steps and joined all spawned work.
10391039
if steps.is_empty() {
1040+
// Write trace file before returning
1041+
if let Err(err) = self.write_trace_file(output, &report).await {
1042+
warn!(
1043+
self.log,
1044+
"Failed to write trace file";
1045+
"error" => ?err
1046+
);
1047+
}
10401048
return report;
10411049
}
10421050
}
10431051
}
10441052

1053+
// Write a Perfetto Event format JSON file for visualization
1054+
async fn write_trace_file(
1055+
&self,
1056+
output: &Utf8TempDir,
1057+
report: &SupportBundleCollectionReport,
1058+
) -> anyhow::Result<()> {
1059+
let meta_dir = output.path().join("meta");
1060+
tokio::fs::create_dir_all(&meta_dir).await.with_context(|| {
1061+
format!("Failed to create meta directory {meta_dir}")
1062+
})?;
1063+
1064+
let trace_path = meta_dir.join("trace.json");
1065+
1066+
// Convert steps to Perfetto Trace Event format.
1067+
// Sort steps by start time and assign each a unique sequential ID.
1068+
//
1069+
// This is necessary because the trace event format does not like
1070+
// multiple slices to overlap - so we make each slice distinct.
1071+
//
1072+
// Ideally we'd be able to correlate these with actual tokio tasks,
1073+
// but it's hard to convert tokio::task::Id to a u64 because
1074+
// of https://github.com/tokio-rs/tokio/issues/7430
1075+
let mut sorted_steps: Vec<_> = report.steps.iter().collect();
1076+
sorted_steps.sort_by_key(|s| s.start);
1077+
1078+
// Generate trace events - each step gets a unique ID (1, 2, 3, ...)
1079+
// based on its start time order
1080+
let trace_events: Vec<_> = sorted_steps
1081+
.iter()
1082+
.enumerate()
1083+
.map(|(i, step)| {
1084+
let start_us = step.start.timestamp_micros();
1085+
let duration_us = (step.end - step.start)
1086+
.num_microseconds()
1087+
.unwrap_or(0)
1088+
.max(0);
1089+
let step_id = i + 1;
1090+
1091+
json!({
1092+
"name": step.name,
1093+
"cat": "bundle_collection",
1094+
"ph": "X", // Complete event (has duration)
1095+
"ts": start_us,
1096+
"dur": duration_us,
1097+
"pid": 1,
1098+
"tid": step_id,
1099+
"args": {
1100+
"status": step.status.to_string(),
1101+
}
1102+
})
1103+
})
1104+
.collect();
1105+
1106+
let trace_json = json!({
1107+
"traceEvents": trace_events,
1108+
"displayTimeUnit": "ms",
1109+
});
1110+
1111+
let trace_content = serde_json::to_string_pretty(&trace_json)
1112+
.context("Failed to serialize trace JSON")?;
1113+
1114+
tokio::fs::write(&trace_path, trace_content).await.with_context(
1115+
|| format!("Failed to write trace file to {trace_path}"),
1116+
)?;
1117+
1118+
info!(
1119+
self.log,
1120+
"Wrote trace file";
1121+
"path" => %trace_path,
1122+
"num_events" => trace_events.len()
1123+
);
1124+
1125+
Ok(())
1126+
}
1127+
10451128
async fn collect_bundle_id(
10461129
&self,
10471130
dir: &Utf8Path,
@@ -2528,6 +2611,130 @@ mod test {
25282611
assert!(report.is_none());
25292612
}
25302613

2614+
#[nexus_test(server = crate::Server)]
2615+
async fn test_trace_file_generated(cptestctx: &ControlPlaneTestContext) {
2616+
let nexus = &cptestctx.server.server_context().nexus;
2617+
let datastore = nexus.datastore();
2618+
let resolver = nexus.resolver();
2619+
let opctx = OpContext::for_tests(
2620+
cptestctx.logctx.log.clone(),
2621+
datastore.clone(),
2622+
);
2623+
2624+
// Before we can create any bundles, we need to create the
2625+
// space for them to be provisioned.
2626+
let _datasets =
2627+
TestDataset::setup(cptestctx, &datastore, &opctx, 1).await;
2628+
2629+
// Create a bundle to collect
2630+
let bundle = datastore
2631+
.support_bundle_create(
2632+
&opctx,
2633+
"For trace file testing",
2634+
nexus.id(),
2635+
None,
2636+
)
2637+
.await
2638+
.expect("Couldn't allocate a support bundle");
2639+
2640+
let collector = SupportBundleCollector::new(
2641+
datastore.clone(),
2642+
resolver.clone(),
2643+
false,
2644+
nexus.id(),
2645+
);
2646+
2647+
// Collect the bundle
2648+
let mut request = BundleRequest::default();
2649+
request.data_selection.insert(BundleData::HostInfo(HashSet::new()));
2650+
let report = collector
2651+
.collect_bundle(&opctx, &request)
2652+
.await
2653+
.expect("Collection should have succeeded")
2654+
.expect("Should have generated a report");
2655+
2656+
// Download the trace file from the bundle
2657+
let head = false;
2658+
let range = None;
2659+
let response = nexus
2660+
.support_bundle_download(
2661+
&opctx,
2662+
bundle.id.into(),
2663+
SupportBundleQueryType::Path {
2664+
file_path: "meta/trace.json".to_string(),
2665+
},
2666+
head,
2667+
range,
2668+
)
2669+
.await
2670+
.expect("Should be able to download trace file");
2671+
2672+
// Parse the trace file as JSON
2673+
let body_bytes =
2674+
response.into_body().collect().await.unwrap().to_bytes();
2675+
let trace_json: serde_json::Value = serde_json::from_slice(&body_bytes)
2676+
.expect("Trace file should be valid JSON");
2677+
2678+
// Verify the structure matches Perfetto Trace Event format
2679+
let trace_events = trace_json
2680+
.get("traceEvents")
2681+
.expect("Should have traceEvents field")
2682+
.as_array()
2683+
.expect("traceEvents should be an array");
2684+
2685+
// We should have at least the main collection steps
2686+
assert!(
2687+
!trace_events.is_empty(),
2688+
"Should have at least one trace event"
2689+
);
2690+
2691+
// Verify each event has the expected fields
2692+
for event in trace_events {
2693+
assert!(event.get("name").is_some(), "Event should have name");
2694+
assert_eq!(
2695+
event.get("cat").and_then(|v| v.as_str()),
2696+
Some("bundle_collection"),
2697+
"Event should have category 'bundle_collection'"
2698+
);
2699+
assert_eq!(
2700+
event.get("ph").and_then(|v| v.as_str()),
2701+
Some("X"),
2702+
"Event should be Complete event type"
2703+
);
2704+
assert!(
2705+
event.get("ts").and_then(|v| v.as_i64()).is_some(),
2706+
"Event should have timestamp"
2707+
);
2708+
assert!(
2709+
event.get("dur").and_then(|v| v.as_i64()).is_some(),
2710+
"Event should have duration"
2711+
);
2712+
assert!(
2713+
event.get("args").is_some(),
2714+
"Event should have args field"
2715+
);
2716+
}
2717+
2718+
// Verify we have the same number of events as steps in the report
2719+
assert_eq!(
2720+
trace_events.len(),
2721+
report.steps.len(),
2722+
"Number of events should match number of steps"
2723+
);
2724+
2725+
// Verify step names match between report and trace
2726+
let trace_names: std::collections::HashSet<_> = trace_events
2727+
.iter()
2728+
.filter_map(|e| e.get("name").and_then(|v| v.as_str()))
2729+
.collect();
2730+
let report_names: std::collections::HashSet<_> =
2731+
report.steps.iter().map(|s| s.name.as_str()).collect();
2732+
assert_eq!(
2733+
trace_names, report_names,
2734+
"Trace event names should match report step names"
2735+
);
2736+
}
2737+
25312738
#[nexus_test(server = crate::Server)]
25322739
async fn test_collect_chunked(cptestctx: &ControlPlaneTestContext) {
25332740
let nexus = &cptestctx.server.server_context().nexus;

0 commit comments

Comments
 (0)