6
6
7
7
use crate :: app:: background:: BackgroundTask ;
8
8
use anyhow:: Context ;
9
+ use base64:: Engine ;
9
10
use camino:: Utf8DirEntry ;
10
11
use camino:: Utf8Path ;
11
12
use camino_tempfile:: Utf8TempDir ;
@@ -15,6 +16,10 @@ use futures::FutureExt;
15
16
use futures:: StreamExt ;
16
17
use futures:: future:: BoxFuture ;
17
18
use futures:: stream:: FuturesUnordered ;
19
+ use gateway_client:: Client as MgsClient ;
20
+ use gateway_client:: types:: SpIdentifier ;
21
+ use internal_dns_resolver:: Resolver ;
22
+ use internal_dns_types:: names:: ServiceName ;
18
23
use nexus_db_model:: SupportBundle ;
19
24
use nexus_db_model:: SupportBundleState ;
20
25
use nexus_db_queries:: authz;
@@ -36,7 +41,9 @@ use omicron_uuid_kinds::SupportBundleUuid;
36
41
use omicron_uuid_kinds:: ZpoolUuid ;
37
42
use serde_json:: json;
38
43
use sha2:: { Digest , Sha256 } ;
44
+ use slog_error_chain:: InlineErrorChain ;
39
45
use std:: future:: Future ;
46
+ use std:: io:: Cursor ;
40
47
use std:: io:: Write ;
41
48
use std:: sync:: Arc ;
42
49
use tokio:: io:: AsyncReadExt ;
@@ -84,17 +91,19 @@ enum DatabaseBundleCleanupResult {
84
91
/// The background task responsible for cleaning and collecting support bundles
85
92
pub struct SupportBundleCollector {
86
93
datastore : Arc < DataStore > ,
94
+ resolver : Resolver ,
87
95
disable : bool ,
88
96
nexus_id : OmicronZoneUuid ,
89
97
}
90
98
91
99
impl SupportBundleCollector {
92
100
pub fn new (
93
101
datastore : Arc < DataStore > ,
102
+ resolver : Resolver ,
94
103
disable : bool ,
95
104
nexus_id : OmicronZoneUuid ,
96
105
) -> Self {
97
- SupportBundleCollector { datastore, disable, nexus_id }
106
+ SupportBundleCollector { datastore, resolver , disable, nexus_id }
98
107
}
99
108
100
109
// Tells a sled agent to delete a support bundle
@@ -376,6 +385,7 @@ impl SupportBundleCollector {
376
385
377
386
let collection = Arc :: new ( BundleCollection {
378
387
datastore : self . datastore . clone ( ) ,
388
+ resolver : self . resolver . clone ( ) ,
379
389
log : opctx. log . new ( slog:: o!( "bundle" => bundle. id. to_string( ) ) ) ,
380
390
opctx : opctx. child ( std:: collections:: BTreeMap :: new ( ) ) ,
381
391
request : request. clone ( ) ,
@@ -419,6 +429,7 @@ impl SupportBundleCollector {
419
429
// Wraps up all arguments to perform a single support bundle collection
420
430
struct BundleCollection {
421
431
datastore : Arc < DataStore > ,
432
+ resolver : Resolver ,
422
433
log : slog:: Logger ,
423
434
opctx : OpContext ,
424
435
request : BundleRequest ,
@@ -605,6 +616,17 @@ impl BundleCollection {
605
616
}
606
617
}
607
618
619
+ let sp_dumps_dir = dir. path ( ) . join ( "sp_task_dumps" ) ;
620
+ tokio:: fs:: create_dir_all ( & sp_dumps_dir) . await . with_context ( || {
621
+ format ! ( "failed to create SP task dump directory {sp_dumps_dir}" )
622
+ } ) ?;
623
+
624
+ if let Err ( e) =
625
+ save_all_sp_dumps ( log, & self . resolver , & sp_dumps_dir) . await
626
+ {
627
+ error ! ( log, "failed to capture SP task dumps" ; "error" => InlineErrorChain :: new( e. as_ref( ) ) ) ;
628
+ } ;
629
+
608
630
Ok ( report)
609
631
}
610
632
@@ -981,6 +1003,86 @@ where
981
1003
Ok ( ( ) )
982
1004
}
983
1005
1006
+ /// Collect task dumps from all SPs via MGS and save them to a directory.
1007
+ async fn save_all_sp_dumps (
1008
+ log : & slog:: Logger ,
1009
+ resolver : & Resolver ,
1010
+ sp_dumps_dir : & Utf8Path ,
1011
+ ) -> anyhow:: Result < ( ) > {
1012
+ let mgs_client = resolver
1013
+ . lookup_socket_v6 ( ServiceName :: ManagementGatewayService )
1014
+ . await
1015
+ . map ( |sockaddr| {
1016
+ let url = format ! ( "http://{}" , sockaddr) ;
1017
+ gateway_client:: Client :: new ( & url, log. clone ( ) )
1018
+ } )
1019
+ . context ( "failed to resolve address of MGS" ) ?;
1020
+
1021
+ let all_sps = mgs_client
1022
+ . sp_all_ids ( )
1023
+ . await
1024
+ . context ( "failed to get list of SPs from MGS" ) ?
1025
+ . into_inner ( ) ;
1026
+
1027
+ let mut futures = futures:: stream:: iter ( all_sps. into_iter ( ) )
1028
+ . map ( |sp| {
1029
+ let mgs_client = mgs_client. clone ( ) ;
1030
+
1031
+ async move {
1032
+ save_sp_dumps ( mgs_client, sp, & sp_dumps_dir)
1033
+ . await
1034
+ . with_context ( || format ! ( "SP {} {}" , sp. type_, sp. slot) )
1035
+ }
1036
+ } )
1037
+ . buffer_unordered ( 10 ) ;
1038
+
1039
+ while let Some ( result) = futures. next ( ) . await {
1040
+ if let Err ( e) = result {
1041
+ error ! (
1042
+ log,
1043
+ "failed to capture task dumps" ;
1044
+ "error" => InlineErrorChain :: new( e. as_ref( ) )
1045
+ ) ;
1046
+ }
1047
+ }
1048
+
1049
+ Ok ( ( ) )
1050
+ }
1051
+
1052
+ /// Fetch and save task dumps from a single SP.
1053
+ async fn save_sp_dumps (
1054
+ mgs_client : MgsClient ,
1055
+ sp : SpIdentifier ,
1056
+ sp_dumps_dir : & Utf8Path ,
1057
+ ) -> anyhow:: Result < ( ) > {
1058
+ let dump_count = mgs_client
1059
+ . sp_task_dump_count ( sp. type_ , sp. slot )
1060
+ . await
1061
+ . context ( "failed to get task dump count from SP" ) ?
1062
+ . into_inner ( ) ;
1063
+
1064
+ let output_dir = sp_dumps_dir. join ( format ! ( "{}/{}" , sp. type_, sp. slot) ) ;
1065
+ tokio:: fs:: create_dir_all ( & output_dir) . await ?;
1066
+
1067
+ for i in 0 ..dump_count {
1068
+ let task_dump = mgs_client
1069
+ . sp_task_dump_get ( sp. type_ , sp. slot , i)
1070
+ . await
1071
+ . with_context ( || format ! ( "failed to get task dump {i} from SP" ) ) ?
1072
+ . into_inner ( ) ;
1073
+
1074
+ let zip_bytes = base64:: engine:: general_purpose:: STANDARD
1075
+ . decode ( task_dump. base64_zip )
1076
+ . context ( "failed to decode base64-encoded SP task dump zip" ) ?;
1077
+ let mut z = zip:: ZipArchive :: new ( Cursor :: new ( zip_bytes) )
1078
+ . context ( "failed to open SP task dump zip" ) ?;
1079
+ z. extract ( & output_dir) . with_context ( || {
1080
+ format ! ( "failed to extract SP task dump zip file to: {output_dir}" )
1081
+ } ) ?;
1082
+ }
1083
+ Ok ( ( ) )
1084
+ }
1085
+
984
1086
#[ cfg( test) ]
985
1087
mod test {
986
1088
use super :: * ;
@@ -1037,12 +1139,17 @@ mod test {
1037
1139
async fn test_cleanup_noop ( cptestctx : & ControlPlaneTestContext ) {
1038
1140
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1039
1141
let datastore = nexus. datastore ( ) ;
1142
+ let resolver = nexus. resolver ( ) ;
1040
1143
let opctx = OpContext :: for_tests (
1041
1144
cptestctx. logctx . log . clone ( ) ,
1042
1145
datastore. clone ( ) ,
1043
1146
) ;
1044
- let collector =
1045
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1147
+ let collector = SupportBundleCollector :: new (
1148
+ datastore. clone ( ) ,
1149
+ resolver. clone ( ) ,
1150
+ false ,
1151
+ nexus. id ( ) ,
1152
+ ) ;
1046
1153
1047
1154
let report = collector
1048
1155
. cleanup_destroyed_bundles ( & opctx)
@@ -1058,12 +1165,17 @@ mod test {
1058
1165
async fn test_collect_noop ( cptestctx : & ControlPlaneTestContext ) {
1059
1166
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1060
1167
let datastore = nexus. datastore ( ) ;
1168
+ let resolver = nexus. resolver ( ) ;
1061
1169
let opctx = OpContext :: for_tests (
1062
1170
cptestctx. logctx . log . clone ( ) ,
1063
1171
datastore. clone ( ) ,
1064
1172
) ;
1065
- let collector =
1066
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1173
+ let collector = SupportBundleCollector :: new (
1174
+ datastore. clone ( ) ,
1175
+ resolver. clone ( ) ,
1176
+ false ,
1177
+ nexus. id ( ) ,
1178
+ ) ;
1067
1179
1068
1180
let request = BundleRequest :: default ( ) ;
1069
1181
let report = collector
@@ -1224,6 +1336,7 @@ mod test {
1224
1336
async fn test_collect_one ( cptestctx : & ControlPlaneTestContext ) {
1225
1337
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1226
1338
let datastore = nexus. datastore ( ) ;
1339
+ let resolver = nexus. resolver ( ) ;
1227
1340
let opctx = OpContext :: for_tests (
1228
1341
cptestctx. logctx . log . clone ( ) ,
1229
1342
datastore. clone ( ) ,
@@ -1242,8 +1355,12 @@ mod test {
1242
1355
. expect ( "Couldn't allocate a support bundle" ) ;
1243
1356
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1244
1357
1245
- let collector =
1246
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1358
+ let collector = SupportBundleCollector :: new (
1359
+ datastore. clone ( ) ,
1360
+ resolver. clone ( ) ,
1361
+ false ,
1362
+ nexus. id ( ) ,
1363
+ ) ;
1247
1364
1248
1365
// The bundle collection should complete successfully.
1249
1366
let request = BundleRequest {
@@ -1279,6 +1396,7 @@ mod test {
1279
1396
async fn test_collect_many ( cptestctx : & ControlPlaneTestContext ) {
1280
1397
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1281
1398
let datastore = nexus. datastore ( ) ;
1399
+ let resolver = nexus. resolver ( ) ;
1282
1400
let opctx = OpContext :: for_tests (
1283
1401
cptestctx. logctx . log . clone ( ) ,
1284
1402
datastore. clone ( ) ,
@@ -1299,8 +1417,12 @@ mod test {
1299
1417
. await
1300
1418
. expect ( "Couldn't allocate a second support bundle" ) ;
1301
1419
1302
- let collector =
1303
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1420
+ let collector = SupportBundleCollector :: new (
1421
+ datastore. clone ( ) ,
1422
+ resolver. clone ( ) ,
1423
+ false ,
1424
+ nexus. id ( ) ,
1425
+ ) ;
1304
1426
1305
1427
// Each time we call "collect_bundle", we collect a SINGLE bundle.
1306
1428
let request = BundleRequest { skip_sled_info : true } ;
@@ -1355,6 +1477,7 @@ mod test {
1355
1477
) {
1356
1478
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1357
1479
let datastore = nexus. datastore ( ) ;
1480
+ let resolver = nexus. resolver ( ) ;
1358
1481
let opctx = OpContext :: for_tests (
1359
1482
cptestctx. logctx . log . clone ( ) ,
1360
1483
datastore. clone ( ) ,
@@ -1384,8 +1507,12 @@ mod test {
1384
1507
. await
1385
1508
. unwrap ( ) ;
1386
1509
1387
- let collector =
1388
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1510
+ let collector = SupportBundleCollector :: new (
1511
+ datastore. clone ( ) ,
1512
+ resolver. clone ( ) ,
1513
+ false ,
1514
+ nexus. id ( ) ,
1515
+ ) ;
1389
1516
1390
1517
let report = collector
1391
1518
. cleanup_destroyed_bundles ( & opctx)
@@ -1410,6 +1537,7 @@ mod test {
1410
1537
) {
1411
1538
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1412
1539
let datastore = nexus. datastore ( ) ;
1540
+ let resolver = nexus. resolver ( ) ;
1413
1541
let opctx = OpContext :: for_tests (
1414
1542
cptestctx. logctx . log . clone ( ) ,
1415
1543
datastore. clone ( ) ,
@@ -1427,8 +1555,12 @@ mod test {
1427
1555
. expect ( "Couldn't allocate a support bundle" ) ;
1428
1556
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1429
1557
1430
- let collector =
1431
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1558
+ let collector = SupportBundleCollector :: new (
1559
+ datastore. clone ( ) ,
1560
+ resolver. clone ( ) ,
1561
+ false ,
1562
+ nexus. id ( ) ,
1563
+ ) ;
1432
1564
let request = BundleRequest { skip_sled_info : true } ;
1433
1565
let report = collector
1434
1566
. collect_bundle ( & opctx, & request)
@@ -1475,6 +1607,7 @@ mod test {
1475
1607
) {
1476
1608
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1477
1609
let datastore = nexus. datastore ( ) ;
1610
+ let resolver = nexus. resolver ( ) ;
1478
1611
let opctx = OpContext :: for_tests (
1479
1612
cptestctx. logctx . log . clone ( ) ,
1480
1613
datastore. clone ( ) ,
@@ -1506,8 +1639,12 @@ mod test {
1506
1639
. await
1507
1640
. unwrap ( ) ;
1508
1641
1509
- let collector =
1510
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1642
+ let collector = SupportBundleCollector :: new (
1643
+ datastore. clone ( ) ,
1644
+ resolver. clone ( ) ,
1645
+ false ,
1646
+ nexus. id ( ) ,
1647
+ ) ;
1511
1648
1512
1649
let report = collector
1513
1650
. cleanup_destroyed_bundles ( & opctx)
@@ -1535,6 +1672,7 @@ mod test {
1535
1672
) {
1536
1673
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1537
1674
let datastore = nexus. datastore ( ) ;
1675
+ let resolver = nexus. resolver ( ) ;
1538
1676
let opctx = OpContext :: for_tests (
1539
1677
cptestctx. logctx . log . clone ( ) ,
1540
1678
datastore. clone ( ) ,
@@ -1552,8 +1690,12 @@ mod test {
1552
1690
. expect ( "Couldn't allocate a support bundle" ) ;
1553
1691
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1554
1692
1555
- let collector =
1556
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1693
+ let collector = SupportBundleCollector :: new (
1694
+ datastore. clone ( ) ,
1695
+ resolver. clone ( ) ,
1696
+ false ,
1697
+ nexus. id ( ) ,
1698
+ ) ;
1557
1699
let request = BundleRequest { skip_sled_info : true } ;
1558
1700
let report = collector
1559
1701
. collect_bundle ( & opctx, & request)
@@ -1609,6 +1751,7 @@ mod test {
1609
1751
) {
1610
1752
let nexus = & cptestctx. server . server_context ( ) . nexus ;
1611
1753
let datastore = nexus. datastore ( ) ;
1754
+ let resolver = nexus. resolver ( ) ;
1612
1755
let opctx = OpContext :: for_tests (
1613
1756
cptestctx. logctx . log . clone ( ) ,
1614
1757
datastore. clone ( ) ,
@@ -1626,8 +1769,12 @@ mod test {
1626
1769
. expect ( "Couldn't allocate a support bundle" ) ;
1627
1770
assert_eq ! ( bundle. state, SupportBundleState :: Collecting ) ;
1628
1771
1629
- let collector =
1630
- SupportBundleCollector :: new ( datastore. clone ( ) , false , nexus. id ( ) ) ;
1772
+ let collector = SupportBundleCollector :: new (
1773
+ datastore. clone ( ) ,
1774
+ resolver. clone ( ) ,
1775
+ false ,
1776
+ nexus. id ( ) ,
1777
+ ) ;
1631
1778
let request = BundleRequest { skip_sled_info : true } ;
1632
1779
let report = collector
1633
1780
. collect_bundle ( & opctx, & request)
0 commit comments