Skip to content

Commit 5d08d2f

Browse files
authored
split the DebugCollector into different modules (#9493)
1 parent 745d832 commit 5d08d2f

File tree

10 files changed

+556
-409
lines changed

10 files changed

+556
-409
lines changed
Lines changed: 206 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,206 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use super::helpers::RealCoreDumpAdm;
6+
use super::helpers::RealZfs;
7+
use super::helpers::RealZone;
8+
use super::worker::CoreZpool;
9+
use super::worker::DebugCollectorCmd;
10+
use super::worker::DebugCollectorWorker;
11+
use super::worker::DebugZpool;
12+
use super::worker::DumpSlicePath;
13+
use camino::Utf8Path;
14+
use illumos_utils::zpool::ZpoolHealth;
15+
use omicron_common::disk::DiskVariant;
16+
use sled_storage::config::MountConfig;
17+
use sled_storage::disk::Disk;
18+
use slog::Logger;
19+
use slog::error;
20+
use slog::info;
21+
use slog::o;
22+
use slog::warn;
23+
use std::sync::Arc;
24+
use tokio::sync::oneshot;
25+
26+
/// Handle to the DebugCollectorWorker, used by the DebugCollectorTask
27+
///
28+
/// The DebugCollectorTask (a tiny task that passes information from the rest of
29+
/// sled agent to this subystem) has this handle and uses it to send commands to
30+
/// the DebugCollectorWorker.
31+
pub struct DebugCollector {
32+
tx: tokio::sync::mpsc::Sender<DebugCollectorCmd>,
33+
mount_config: Arc<MountConfig>,
34+
_poller: tokio::task::JoinHandle<()>,
35+
log: Logger,
36+
}
37+
38+
impl DebugCollector {
39+
pub fn new(log: &Logger, mount_config: Arc<MountConfig>) -> Self {
40+
let (tx, rx) = tokio::sync::mpsc::channel(16);
41+
let worker = DebugCollectorWorker::new(
42+
Box::new(RealCoreDumpAdm {}),
43+
Box::new(RealZfs {}),
44+
Box::new(RealZone {}),
45+
log.new(o!("component" => "DebugCollector-worker")),
46+
rx,
47+
);
48+
let _poller =
49+
tokio::spawn(async move { worker.poll_file_archival().await });
50+
let log = log.new(o!("component" => "DebugCollector"));
51+
Self { tx, mount_config, _poller, log }
52+
}
53+
54+
/// Given the set of all managed disks, updates the dump device location
55+
/// for logs and dumps.
56+
///
57+
/// This function returns only once this request has been handled, which
58+
/// can be used as a signal by callers that any "old disks" are no longer
59+
/// being used by [DebugCollector].
60+
pub async fn update_dumpdev_setup(
61+
&self,
62+
disks: impl Iterator<Item = &Disk>,
63+
) {
64+
let log = &self.log;
65+
let mut m2_dump_slices = Vec::new();
66+
let mut u2_debug_datasets = Vec::new();
67+
let mut m2_core_datasets = Vec::new();
68+
let mount_config = self.mount_config.clone();
69+
for disk in disks {
70+
match disk.variant() {
71+
DiskVariant::M2 => {
72+
// We only setup dump devices on real disks
73+
if !disk.is_synthetic() {
74+
match disk.dump_device_devfs_path(false) {
75+
Ok(path) => {
76+
m2_dump_slices.push(DumpSlicePath::from(path))
77+
}
78+
Err(err) => {
79+
warn!(
80+
log,
81+
"Error getting dump device devfs path: \
82+
{err:?}"
83+
);
84+
}
85+
}
86+
}
87+
let name = disk.zpool_name();
88+
if let Ok(info) =
89+
illumos_utils::zpool::Zpool::get_info(&name.to_string())
90+
.await
91+
{
92+
if info.health() == ZpoolHealth::Online {
93+
m2_core_datasets.push(CoreZpool {
94+
mount_config: mount_config.clone(),
95+
name: *name,
96+
});
97+
} else {
98+
warn!(
99+
log,
100+
"Zpool {name:?} not online, won't attempt to \
101+
save process core dumps there"
102+
);
103+
}
104+
}
105+
}
106+
DiskVariant::U2 => {
107+
let name = disk.zpool_name();
108+
if let Ok(info) =
109+
illumos_utils::zpool::Zpool::get_info(&name.to_string())
110+
.await
111+
{
112+
if info.health() == ZpoolHealth::Online {
113+
u2_debug_datasets.push(DebugZpool {
114+
mount_config: mount_config.clone(),
115+
name: *name,
116+
});
117+
} else {
118+
warn!(
119+
log,
120+
"Zpool {name:?} not online, won't attempt to \
121+
save kernel core dumps there"
122+
);
123+
}
124+
}
125+
}
126+
}
127+
}
128+
129+
let (tx, rx) = oneshot::channel();
130+
if let Err(err) = self
131+
.tx
132+
.send(DebugCollectorCmd::UpdateDumpdevSetup {
133+
dump_slices: m2_dump_slices,
134+
debug_datasets: u2_debug_datasets,
135+
core_datasets: m2_core_datasets,
136+
update_complete_tx: tx,
137+
})
138+
.await
139+
{
140+
error!(log, "DebugCollector channel closed: {:?}", err.0);
141+
};
142+
143+
if let Err(err) = rx.await {
144+
error!(log, "DebugCollector failed to await update"; "err" => ?err);
145+
}
146+
}
147+
148+
/// Request archive of logs from the specified directory, which is assumed
149+
/// to correspond to the root filesystem of a zone that is no longer
150+
/// running.
151+
///
152+
/// Unlike typical log file archival, this includes non-rotated log files.
153+
///
154+
/// This makes a best-effort and logs failures rather than reporting them to
155+
/// the caller.
156+
///
157+
/// When this future completes, the request has only been enqueued. To know
158+
/// when archival has completed, you must wait on the receive side of
159+
/// `completion_tx`.
160+
pub async fn archive_former_zone_root(
161+
&self,
162+
zone_root: &Utf8Path,
163+
completion_tx: oneshot::Sender<()>,
164+
) {
165+
let log = self.log.new(o!("zone_root" => zone_root.to_string()));
166+
167+
// Validate the path that we were given. We're only ever given zone
168+
// root filesystems, whose basename is always a zonename, and we always
169+
// prefix our zone names with `oxz_`. If that's not what we find here,
170+
// log an error and bail out. These error cases should be impossible to
171+
// hit in practice.
172+
let Some(file_name) = zone_root.file_name() else {
173+
error!(
174+
log,
175+
"cannot archive former zone root";
176+
"error" => "path has no filename part",
177+
);
178+
return;
179+
};
180+
181+
if !file_name.starts_with("oxz_") {
182+
error!(
183+
log,
184+
"cannot archive former zone root";
185+
"error" => "filename does not start with \"oxz_\"",
186+
);
187+
return;
188+
}
189+
190+
info!(log, "requesting archive of former zone root");
191+
let zone_root = zone_root.to_owned();
192+
let zone_name = file_name.to_string();
193+
let cmd = DebugCollectorCmd::ArchiveFormerZoneRoot {
194+
zone_root,
195+
zone_name,
196+
completion_tx,
197+
};
198+
if let Err(_) = self.tx.send(cmd).await {
199+
error!(
200+
log,
201+
"failed to request archive of former zone root";
202+
"error" => "DebugCollector channel closed"
203+
);
204+
}
205+
}
206+
}

0 commit comments

Comments
 (0)