Skip to content

Commit a4b0acf

Browse files
committed
Capture SP task dumps in support bundles
Update the support bundle collector to capture task dumps from the SPs.
1 parent 69a8d6b commit a4b0acf

File tree

2 files changed

+167
-19
lines changed

2 files changed

+167
-19
lines changed

nexus/src/app/background/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ impl BackgroundTasksInitializer {
505505
task_impl: Box::new(
506506
support_bundle_collector::SupportBundleCollector::new(
507507
datastore.clone(),
508+
resolver.clone(),
508509
config.support_bundle_collector.disable,
509510
nexus_id,
510511
),

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 166 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
use crate::app::background::BackgroundTask;
88
use anyhow::Context;
9+
use base64::Engine;
910
use camino::Utf8DirEntry;
1011
use camino::Utf8Path;
1112
use camino_tempfile::Utf8TempDir;
@@ -15,6 +16,10 @@ use futures::FutureExt;
1516
use futures::StreamExt;
1617
use futures::future::BoxFuture;
1718
use futures::stream::FuturesUnordered;
19+
use gateway_client::Client as MgsClient;
20+
use gateway_client::types::SpIdentifier;
21+
use internal_dns_resolver::Resolver;
22+
use internal_dns_types::names::ServiceName;
1823
use nexus_db_model::SupportBundle;
1924
use nexus_db_model::SupportBundleState;
2025
use nexus_db_queries::authz;
@@ -36,7 +41,9 @@ use omicron_uuid_kinds::SupportBundleUuid;
3641
use omicron_uuid_kinds::ZpoolUuid;
3742
use serde_json::json;
3843
use sha2::{Digest, Sha256};
44+
use slog_error_chain::InlineErrorChain;
3945
use std::future::Future;
46+
use std::io::Cursor;
4047
use std::io::Write;
4148
use std::sync::Arc;
4249
use tokio::io::AsyncReadExt;
@@ -84,17 +91,19 @@ enum DatabaseBundleCleanupResult {
8491
/// The background task responsible for cleaning and collecting support bundles
8592
pub struct SupportBundleCollector {
8693
datastore: Arc<DataStore>,
94+
resolver: Resolver,
8795
disable: bool,
8896
nexus_id: OmicronZoneUuid,
8997
}
9098

9199
impl SupportBundleCollector {
92100
pub fn new(
93101
datastore: Arc<DataStore>,
102+
resolver: Resolver,
94103
disable: bool,
95104
nexus_id: OmicronZoneUuid,
96105
) -> Self {
97-
SupportBundleCollector { datastore, disable, nexus_id }
106+
SupportBundleCollector { datastore, resolver, disable, nexus_id }
98107
}
99108

100109
// Tells a sled agent to delete a support bundle
@@ -376,6 +385,7 @@ impl SupportBundleCollector {
376385

377386
let collection = Arc::new(BundleCollection {
378387
datastore: self.datastore.clone(),
388+
resolver: self.resolver.clone(),
379389
log: opctx.log.new(slog::o!("bundle" => bundle.id.to_string())),
380390
opctx: opctx.child(std::collections::BTreeMap::new()),
381391
request: request.clone(),
@@ -419,6 +429,7 @@ impl SupportBundleCollector {
419429
// Wraps up all arguments to perform a single support bundle collection
420430
struct BundleCollection {
421431
datastore: Arc<DataStore>,
432+
resolver: Resolver,
422433
log: slog::Logger,
423434
opctx: OpContext,
424435
request: BundleRequest,
@@ -605,6 +616,17 @@ impl BundleCollection {
605616
}
606617
}
607618

619+
let sp_dumps_dir = dir.path().join("sp_task_dumps");
620+
tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| {
621+
format!("failed to create SP task dump directory {sp_dumps_dir}")
622+
})?;
623+
624+
if let Err(e) =
625+
save_all_sp_dumps(log, &self.resolver, &sp_dumps_dir).await
626+
{
627+
error!(log, "failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref()));
628+
};
629+
608630
Ok(report)
609631
}
610632

@@ -981,6 +1003,86 @@ where
9811003
Ok(())
9821004
}
9831005

1006+
/// Collect task dumps from all SPs via MGS and save them to a directory.
1007+
async fn save_all_sp_dumps(
1008+
log: &slog::Logger,
1009+
resolver: &Resolver,
1010+
sp_dumps_dir: &Utf8Path,
1011+
) -> anyhow::Result<()> {
1012+
let mgs_client = resolver
1013+
.lookup_socket_v6(ServiceName::ManagementGatewayService)
1014+
.await
1015+
.map(|sockaddr| {
1016+
let url = format!("http://{}", sockaddr);
1017+
gateway_client::Client::new(&url, log.clone())
1018+
})
1019+
.context("failed to resolve address of MGS")?;
1020+
1021+
let all_sps = mgs_client
1022+
.sp_all_ids()
1023+
.await
1024+
.context("failed to get list of SPs from MGS")?
1025+
.into_inner();
1026+
1027+
let mut futures = futures::stream::iter(all_sps.into_iter())
1028+
.map(|sp| {
1029+
let mgs_client = mgs_client.clone();
1030+
1031+
async move {
1032+
save_sp_dumps(mgs_client, sp, &sp_dumps_dir)
1033+
.await
1034+
.with_context(|| format!("SP {} {}", sp.type_, sp.slot))
1035+
}
1036+
})
1037+
.buffer_unordered(10);
1038+
1039+
while let Some(result) = futures.next().await {
1040+
if let Err(e) = result {
1041+
error!(
1042+
log,
1043+
"failed to capture task dumps";
1044+
"error" => InlineErrorChain::new(e.as_ref())
1045+
);
1046+
}
1047+
}
1048+
1049+
Ok(())
1050+
}
1051+
1052+
/// Fetch and save task dumps from a single SP.
1053+
async fn save_sp_dumps(
1054+
mgs_client: MgsClient,
1055+
sp: SpIdentifier,
1056+
sp_dumps_dir: &Utf8Path,
1057+
) -> anyhow::Result<()> {
1058+
let dump_count = mgs_client
1059+
.sp_task_dump_count(sp.type_, sp.slot)
1060+
.await
1061+
.context("failed to get task dump count from SP")?
1062+
.into_inner();
1063+
1064+
let output_dir = sp_dumps_dir.join(format!("{}/{}", sp.type_, sp.slot));
1065+
tokio::fs::create_dir_all(&output_dir).await?;
1066+
1067+
for i in 0..dump_count {
1068+
let task_dump = mgs_client
1069+
.sp_task_dump_get(sp.type_, sp.slot, i)
1070+
.await
1071+
.with_context(|| format!("failed to get task dump {i} from SP"))?
1072+
.into_inner();
1073+
1074+
let zip_bytes = base64::engine::general_purpose::STANDARD
1075+
.decode(task_dump.base64_zip)
1076+
.context("failed to decode base64-encoded SP task dump zip")?;
1077+
let mut z = zip::ZipArchive::new(Cursor::new(zip_bytes))
1078+
.context("failed to open SP task dump zip")?;
1079+
z.extract(&output_dir).with_context(|| {
1080+
format!("failed to extract SP task dump zip file to: {output_dir}")
1081+
})?;
1082+
}
1083+
Ok(())
1084+
}
1085+
9841086
#[cfg(test)]
9851087
mod test {
9861088
use super::*;
@@ -1037,12 +1139,17 @@ mod test {
10371139
async fn test_cleanup_noop(cptestctx: &ControlPlaneTestContext) {
10381140
let nexus = &cptestctx.server.server_context().nexus;
10391141
let datastore = nexus.datastore();
1142+
let resolver = nexus.resolver();
10401143
let opctx = OpContext::for_tests(
10411144
cptestctx.logctx.log.clone(),
10421145
datastore.clone(),
10431146
);
1044-
let collector =
1045-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1147+
let collector = SupportBundleCollector::new(
1148+
datastore.clone(),
1149+
resolver.clone(),
1150+
false,
1151+
nexus.id(),
1152+
);
10461153

10471154
let report = collector
10481155
.cleanup_destroyed_bundles(&opctx)
@@ -1058,12 +1165,17 @@ mod test {
10581165
async fn test_collect_noop(cptestctx: &ControlPlaneTestContext) {
10591166
let nexus = &cptestctx.server.server_context().nexus;
10601167
let datastore = nexus.datastore();
1168+
let resolver = nexus.resolver();
10611169
let opctx = OpContext::for_tests(
10621170
cptestctx.logctx.log.clone(),
10631171
datastore.clone(),
10641172
);
1065-
let collector =
1066-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1173+
let collector = SupportBundleCollector::new(
1174+
datastore.clone(),
1175+
resolver.clone(),
1176+
false,
1177+
nexus.id(),
1178+
);
10671179

10681180
let request = BundleRequest::default();
10691181
let report = collector
@@ -1224,6 +1336,7 @@ mod test {
12241336
async fn test_collect_one(cptestctx: &ControlPlaneTestContext) {
12251337
let nexus = &cptestctx.server.server_context().nexus;
12261338
let datastore = nexus.datastore();
1339+
let resolver = nexus.resolver();
12271340
let opctx = OpContext::for_tests(
12281341
cptestctx.logctx.log.clone(),
12291342
datastore.clone(),
@@ -1242,8 +1355,12 @@ mod test {
12421355
.expect("Couldn't allocate a support bundle");
12431356
assert_eq!(bundle.state, SupportBundleState::Collecting);
12441357

1245-
let collector =
1246-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1358+
let collector = SupportBundleCollector::new(
1359+
datastore.clone(),
1360+
resolver.clone(),
1361+
false,
1362+
nexus.id(),
1363+
);
12471364

12481365
// The bundle collection should complete successfully.
12491366
let request = BundleRequest {
@@ -1279,6 +1396,7 @@ mod test {
12791396
async fn test_collect_many(cptestctx: &ControlPlaneTestContext) {
12801397
let nexus = &cptestctx.server.server_context().nexus;
12811398
let datastore = nexus.datastore();
1399+
let resolver = nexus.resolver();
12821400
let opctx = OpContext::for_tests(
12831401
cptestctx.logctx.log.clone(),
12841402
datastore.clone(),
@@ -1299,8 +1417,12 @@ mod test {
12991417
.await
13001418
.expect("Couldn't allocate a second support bundle");
13011419

1302-
let collector =
1303-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1420+
let collector = SupportBundleCollector::new(
1421+
datastore.clone(),
1422+
resolver.clone(),
1423+
false,
1424+
nexus.id(),
1425+
);
13041426

13051427
// Each time we call "collect_bundle", we collect a SINGLE bundle.
13061428
let request = BundleRequest { skip_sled_info: true };
@@ -1355,6 +1477,7 @@ mod test {
13551477
) {
13561478
let nexus = &cptestctx.server.server_context().nexus;
13571479
let datastore = nexus.datastore();
1480+
let resolver = nexus.resolver();
13581481
let opctx = OpContext::for_tests(
13591482
cptestctx.logctx.log.clone(),
13601483
datastore.clone(),
@@ -1384,8 +1507,12 @@ mod test {
13841507
.await
13851508
.unwrap();
13861509

1387-
let collector =
1388-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1510+
let collector = SupportBundleCollector::new(
1511+
datastore.clone(),
1512+
resolver.clone(),
1513+
false,
1514+
nexus.id(),
1515+
);
13891516

13901517
let report = collector
13911518
.cleanup_destroyed_bundles(&opctx)
@@ -1410,6 +1537,7 @@ mod test {
14101537
) {
14111538
let nexus = &cptestctx.server.server_context().nexus;
14121539
let datastore = nexus.datastore();
1540+
let resolver = nexus.resolver();
14131541
let opctx = OpContext::for_tests(
14141542
cptestctx.logctx.log.clone(),
14151543
datastore.clone(),
@@ -1427,8 +1555,12 @@ mod test {
14271555
.expect("Couldn't allocate a support bundle");
14281556
assert_eq!(bundle.state, SupportBundleState::Collecting);
14291557

1430-
let collector =
1431-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1558+
let collector = SupportBundleCollector::new(
1559+
datastore.clone(),
1560+
resolver.clone(),
1561+
false,
1562+
nexus.id(),
1563+
);
14321564
let request = BundleRequest { skip_sled_info: true };
14331565
let report = collector
14341566
.collect_bundle(&opctx, &request)
@@ -1475,6 +1607,7 @@ mod test {
14751607
) {
14761608
let nexus = &cptestctx.server.server_context().nexus;
14771609
let datastore = nexus.datastore();
1610+
let resolver = nexus.resolver();
14781611
let opctx = OpContext::for_tests(
14791612
cptestctx.logctx.log.clone(),
14801613
datastore.clone(),
@@ -1506,8 +1639,12 @@ mod test {
15061639
.await
15071640
.unwrap();
15081641

1509-
let collector =
1510-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1642+
let collector = SupportBundleCollector::new(
1643+
datastore.clone(),
1644+
resolver.clone(),
1645+
false,
1646+
nexus.id(),
1647+
);
15111648

15121649
let report = collector
15131650
.cleanup_destroyed_bundles(&opctx)
@@ -1535,6 +1672,7 @@ mod test {
15351672
) {
15361673
let nexus = &cptestctx.server.server_context().nexus;
15371674
let datastore = nexus.datastore();
1675+
let resolver = nexus.resolver();
15381676
let opctx = OpContext::for_tests(
15391677
cptestctx.logctx.log.clone(),
15401678
datastore.clone(),
@@ -1552,8 +1690,12 @@ mod test {
15521690
.expect("Couldn't allocate a support bundle");
15531691
assert_eq!(bundle.state, SupportBundleState::Collecting);
15541692

1555-
let collector =
1556-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1693+
let collector = SupportBundleCollector::new(
1694+
datastore.clone(),
1695+
resolver.clone(),
1696+
false,
1697+
nexus.id(),
1698+
);
15571699
let request = BundleRequest { skip_sled_info: true };
15581700
let report = collector
15591701
.collect_bundle(&opctx, &request)
@@ -1609,6 +1751,7 @@ mod test {
16091751
) {
16101752
let nexus = &cptestctx.server.server_context().nexus;
16111753
let datastore = nexus.datastore();
1754+
let resolver = nexus.resolver();
16121755
let opctx = OpContext::for_tests(
16131756
cptestctx.logctx.log.clone(),
16141757
datastore.clone(),
@@ -1626,8 +1769,12 @@ mod test {
16261769
.expect("Couldn't allocate a support bundle");
16271770
assert_eq!(bundle.state, SupportBundleState::Collecting);
16281771

1629-
let collector =
1630-
SupportBundleCollector::new(datastore.clone(), false, nexus.id());
1772+
let collector = SupportBundleCollector::new(
1773+
datastore.clone(),
1774+
resolver.clone(),
1775+
false,
1776+
nexus.id(),
1777+
);
16311778
let request = BundleRequest { skip_sled_info: true };
16321779
let report = collector
16331780
.collect_bundle(&opctx, &request)

0 commit comments

Comments
 (0)