Skip to content
Merged
1 change: 1 addition & 0 deletions nexus/src/app/background/tasks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub mod region_snapshot_replacement_start;
pub mod region_snapshot_replacement_step;
pub mod saga_recovery;
pub mod service_firewall_rules;
pub mod support_bundle;
pub mod support_bundle_collector;
pub mod sync_service_zone_nat;
pub mod sync_switch_configuration;
Expand Down
7 changes: 7 additions & 0 deletions nexus/src/app/background/tasks/support_bundle/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support bundle related types and utilities

pub mod perfetto;
51 changes: 51 additions & 0 deletions nexus/src/app/background/tasks/support_bundle/perfetto.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Perfetto Trace Event format support for visualizing support bundle collection

use serde::Deserialize;
use serde::Serialize;

/// Represents a Perfetto Trace Event format JSON file for visualization.
///
/// This format is used by the Perfetto trace viewer (<https://ui.perfetto.dev/>)
/// to visualize timing information for operations.
///
/// The Rust field names are snake_case; the `serde(rename)` attributes map
/// them onto the camelCase keys that appear in the JSON document.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Trace {
    /// All events recorded in this trace (JSON key: `traceEvents`)
    #[serde(rename = "traceEvents")]
    pub trace_events: Vec<TraceEvent>,
    /// Display unit for time values in the UI (e.g., "ms" for milliseconds)
    /// (JSON key: `displayTimeUnit`)
    #[serde(rename = "displayTimeUnit")]
    pub display_time_unit: String,
}

/// A single event in the Perfetto Trace Event format.
///
/// This represents a complete event (duration event) showing when an operation
/// started and how long it took.
///
/// Field names intentionally match the abbreviated JSON keys of the format,
/// so no serde renames are needed. Fields are serialized in declaration order.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TraceEvent {
    /// Human-readable name of the event
    pub name: String,
    /// Category name (abbreviated as "cat" in Perfetto format).
    /// Used to group related events together in the trace viewer.
    pub cat: String,
    /// Phase type (abbreviated as "ph" in Perfetto format).
    /// "X" means a "Complete" event with both timestamp and duration.
    pub ph: String,
    /// Timestamp in microseconds (abbreviated as "ts" in Perfetto format).
    /// Represents when the event started, as microseconds since the epoch.
    pub ts: i64,
    /// Duration in microseconds (abbreviated as "dur" in Perfetto format).
    /// How long the event took to complete.
    pub dur: i64,
    /// Process ID. Used to separate events into different process lanes
    /// in the trace viewer.
    pub pid: u32,
    /// Thread ID. Used to separate events into different thread lanes
    /// within a process in the trace viewer.
    pub tid: usize,
    /// Arbitrary key-value pairs with additional event metadata
    pub args: serde_json::Value,
}
196 changes: 196 additions & 0 deletions nexus/src/app/background/tasks/support_bundle_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ use zip::ZipArchive;
use zip::ZipWriter;
use zip::write::FullFileOptions;

use super::support_bundle::perfetto;

// We use "/var/tmp" to use Nexus' filesystem for temporary storage,
// rather than "/tmp", which would keep this collected data in-memory.
const TEMPDIR: &str = "/var/tmp";
Expand Down Expand Up @@ -1037,11 +1039,94 @@ impl BundleCollection {
//
// Only finish if we've exhausted all possible steps and joined all spawned work.
if steps.is_empty() {
// Write trace file before returning
if let Err(err) = self.write_trace_file(output, &report).await {
warn!(
self.log,
"Failed to write trace file";
"error" => ?err
);
}
return report;
}
}
}

/// Writes `meta/trace.json` under `output`: a Perfetto Trace Event format
/// JSON file containing one "Complete" event per step in `report`.
///
/// Returns an error if the `meta` directory cannot be created, the trace
/// cannot be serialized, or the file cannot be written.
async fn write_trace_file(
    &self,
    output: &Utf8TempDir,
    report: &SupportBundleCollectionReport,
) -> anyhow::Result<()> {
    let meta_dir = output.path().join("meta");
    tokio::fs::create_dir_all(&meta_dir).await.with_context(|| {
        format!("Failed to create meta directory {meta_dir}")
    })?;
    let trace_path = meta_dir.join("trace.json");

    // Order the steps by start time. The sort is stable, so steps sharing a
    // start time keep the order they have in the report.
    let mut ordered: Vec<_> = report.steps.iter().collect();
    ordered.sort_by(|a, b| a.start.cmp(&b.start));

    // Each step is placed in its own "thread" lane, numbered 1, 2, 3, ... in
    // start-time order, because the trace event format does not like
    // multiple slices overlapping in a single lane.
    //
    // Ideally the lanes would correspond to actual tokio tasks, but it's
    // hard to convert tokio::task::Id to a u64 because of
    // https://github.com/tokio-rs/tokio/issues/7430
    let mut trace_events = Vec::with_capacity(ordered.len());
    for (lane, step) in (1usize..).zip(ordered) {
        // Clamp to zero: a negative span (end before start) or one that
        // does not fit in an i64 of microseconds is recorded as empty.
        let dur = match (step.end - step.start).num_microseconds() {
            Some(us) if us > 0 => us,
            _ => 0,
        };

        trace_events.push(perfetto::TraceEvent {
            name: step.name.clone(),
            cat: String::from("bundle_collection"),
            ph: String::from("X"),
            ts: step.start.timestamp_micros(),
            dur,
            pid: 1,
            tid: lane,
            args: json!({
                "status": step.status.to_string(),
            }),
        });
    }

    let trace = perfetto::Trace {
        trace_events,
        display_time_unit: String::from("ms"),
    };

    let serialized = serde_json::to_string_pretty(&trace)
        .context("Failed to serialize trace JSON")?;
    tokio::fs::write(&trace_path, serialized).await.with_context(
        || format!("Failed to write trace file to {trace_path}"),
    )?;

    info!(
        self.log,
        "Wrote trace file";
        "path" => %trace_path,
        "num_events" => trace.trace_events.len()
    );

    Ok(())
}

async fn collect_bundle_id(
&self,
dir: &Utf8Path,
Expand Down Expand Up @@ -2528,6 +2613,117 @@ mod test {
assert!(report.is_none());
}

// Collects a support bundle and checks that the resulting bundle contains a
// `meta/trace.json` file that parses as Perfetto trace JSON and describes
// the same steps as the collection report.
#[nexus_test(server = crate::Server)]
async fn test_trace_file_generated(cptestctx: &ControlPlaneTestContext) {
    let nexus = &cptestctx.server.server_context().nexus;
    let datastore = nexus.datastore();
    let resolver = nexus.resolver();
    let opctx = OpContext::for_tests(
        cptestctx.logctx.log.clone(),
        datastore.clone(),
    );

    // Bundles need somewhere to be provisioned, so set up a dataset first.
    let _datasets =
        TestDataset::setup(cptestctx, &datastore, &opctx, 1).await;

    // Allocate the bundle that the collector will pick up.
    let new_bundle = datastore
        .support_bundle_create(
            &opctx,
            "For trace file testing",
            nexus.id(),
            None,
        )
        .await
        .expect("Couldn't allocate a support bundle");

    let collector = SupportBundleCollector::new(
        datastore.clone(),
        resolver.clone(),
        false,
        nexus.id(),
    );

    // Run the collection, requesting host info.
    let mut bundle_request = BundleRequest::default();
    bundle_request
        .data_selection
        .insert(BundleData::HostInfo(HashSet::new()));
    let collection_report = collector
        .collect_bundle(&opctx, &bundle_request)
        .await
        .expect("Collection should have succeeded")
        .expect("Should have generated a report");

    // Fetch the trace file out of the finished bundle: a full download of
    // the single path, not a HEAD request and with no byte range.
    let (head, range) = (false, None);
    let download = nexus
        .support_bundle_download(
            &opctx,
            new_bundle.id.into(),
            SupportBundleQueryType::Path {
                file_path: "meta/trace.json".to_string(),
            },
            head,
            range,
        )
        .await
        .expect("Should be able to download trace file");

    // Deserialize the body with the same Perfetto structs used to write it.
    let trace_bytes =
        download.into_body().collect().await.unwrap().to_bytes();
    let parsed: perfetto::Trace = serde_json::from_slice(&trace_bytes)
        .expect("Trace file should be valid Perfetto JSON");

    assert_eq!(
        parsed.display_time_unit, "ms",
        "Display time unit should be milliseconds"
    );

    // At minimum, the main collection steps should be present.
    assert!(
        !parsed.trace_events.is_empty(),
        "Should have at least one trace event"
    );

    // Check the per-event structure.
    for ev in &parsed.trace_events {
        // Fixed category and "Complete" phase
        assert_eq!(
            ev.cat, "bundle_collection",
            "Event should have category 'bundle_collection'"
        );
        assert_eq!(ev.ph, "X", "Event should be Complete event type");
        // Timestamps and durations must be non-negative
        assert!(ev.ts >= 0, "Event timestamp should be non-negative");
        assert!(ev.dur >= 0, "Event duration should be non-negative");
        // Single process lane; thread lanes are numbered starting at 1
        assert_eq!(ev.pid, 1, "All events should have pid=1");
        assert!(ev.tid > 0, "Event thread ID should be positive");
    }

    // One trace event per step in the report...
    assert_eq!(
        parsed.trace_events.len(),
        collection_report.steps.len(),
        "Number of events should match number of steps"
    );

    // ...and the two must agree on the set of step names.
    let trace_names: std::collections::HashSet<&str> = parsed
        .trace_events
        .iter()
        .map(|ev| ev.name.as_str())
        .collect();
    let report_names: std::collections::HashSet<&str> = collection_report
        .steps
        .iter()
        .map(|step| step.name.as_str())
        .collect();
    assert_eq!(
        trace_names, report_names,
        "Trace event names should match report step names"
    );
}

#[nexus_test(server = crate::Server)]
async fn test_collect_chunked(cptestctx: &ControlPlaneTestContext) {
let nexus = &cptestctx.server.server_context().nexus;
Expand Down
2 changes: 2 additions & 0 deletions nexus/tests/integration_tests/support_bundles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,8 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) {
let archive = ZipArchive::new(Cursor::new(&contents)).unwrap();
let mut names = archive.file_names();
assert_eq!(names.next(), Some("bundle_id.txt"));
assert_eq!(names.next(), Some("meta/"));
assert_eq!(names.next(), Some("meta/trace.json"));
assert_eq!(names.next(), Some("rack/"));
assert!(names.any(|n| n == "sp_task_dumps/"));
// There's much more data in the bundle, but validating it isn't the point
Expand Down
Loading