Skip to content

Commit 2f1373e

Browse files
authored
Flesh out status report from reconfigurator rendezvous RPW (#7401)
Example output from `omicron-dev run-all` (so no meaningful rows, but just checking the field formatting and omdb deserialization): ``` task: "blueprint_rendezvous" configured period: every 5m currently executing: no last completed activation: iter 5, triggered by a dependent task completing started at 2025-01-24T21:07:40.740Z (222s ago) and ran for 68ms target blueprint: 72d199ef-d6bd-4895-8858-006f3ed212ad inventory collection: 7dfbf99c-3613-498c-a871-cfb9b2945abe debug_dataset rendezvous counts: num_inserted: 0 num_already_exist: 0 num_not_in_inventory: 0 num_tombstoned: 0 num_already_tombstoned: 0 crucible_dataset rendezvous counts: num_inserted: 0 num_already_exist: 0 num_not_in_inventory: 0 ``` Closes #7392.
1 parent 8f7a673 commit 2f1373e

File tree

4 files changed

+142
-14
lines changed

4 files changed

+142
-14
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,6 +1128,27 @@ fn print_task_blueprint_rendezvous(details: &serde_json::Value) {
11281128
" inventory collection: {}",
11291129
status.inventory_collection_id
11301130
);
1131+
println!(" debug_dataset rendezvous counts:");
1132+
println!(
1133+
" num_inserted: {}",
1134+
status.stats.debug_dataset.num_inserted
1135+
);
1136+
println!(
1137+
" num_already_exist: {}",
1138+
status.stats.debug_dataset.num_already_exist
1139+
);
1140+
println!(
1141+
" num_not_in_inventory: {}",
1142+
status.stats.debug_dataset.num_not_in_inventory
1143+
);
1144+
println!(
1145+
" num_tombstoned: {}",
1146+
status.stats.debug_dataset.num_tombstoned
1147+
);
1148+
println!(
1149+
" num_already_tombstoned: {}",
1150+
status.stats.debug_dataset.num_already_tombstoned
1151+
);
11311152
println!(" crucible_dataset rendezvous counts:");
11321153
println!(
11331154
" num_inserted: {}",

nexus/reconfigurator/rendezvous/src/debug_dataset.rs

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use nexus_db_queries::db::model::RendezvousDebugDataset;
1010
use nexus_db_queries::db::DataStore;
1111
use nexus_types::deployment::BlueprintDatasetConfig;
1212
use nexus_types::deployment::BlueprintDatasetDisposition;
13+
use nexus_types::internal_api::background::DebugDatasetsRendezvousStats;
1314
use omicron_common::api::internal::shared::DatasetKind;
1415
use omicron_uuid_kinds::BlueprintUuid;
1516
use omicron_uuid_kinds::DatasetUuid;
@@ -23,7 +24,7 @@ pub(crate) async fn reconcile_debug_datasets(
2324
blueprint_id: BlueprintUuid,
2425
blueprint_datasets: impl Iterator<Item = &BlueprintDatasetConfig>,
2526
inventory_datasets: &BTreeSet<DatasetUuid>,
26-
) -> anyhow::Result<()> {
27+
) -> anyhow::Result<DebugDatasetsRendezvousStats> {
2728
// We expect basically all executions of this task to do nothing: we're
2829
// activated periodically, and only do work when a dataset has been
2930
// newly-added or newly-expunged.
@@ -39,27 +40,42 @@ pub(crate) async fn reconcile_debug_datasets(
3940
.map(|d| (d.id(), d))
4041
.collect::<BTreeMap<_, _>>();
4142

43+
let mut stats = DebugDatasetsRendezvousStats::default();
44+
4245
for dataset in blueprint_datasets.filter(|d| d.kind == DatasetKind::Debug) {
4346
match dataset.disposition {
4447
BlueprintDatasetDisposition::InService => {
4548
// Only attempt to insert this dataset if it has shown up in
4649
// inventory (required for correctness) and isn't already
4750
// present in the db (performance optimization only). Inserting
4851
// an already-present row is a no-op, so it's safe to skip.
49-
if inventory_datasets.contains(&dataset.id)
50-
&& !existing_db_datasets.contains_key(&dataset.id)
51-
{
52+
if existing_db_datasets.contains_key(&dataset.id) {
53+
stats.num_already_exist += 1;
54+
} else if !inventory_datasets.contains(&dataset.id) {
55+
stats.num_not_in_inventory += 1;
56+
} else {
5257
let db_dataset = RendezvousDebugDataset::new(
5358
dataset.id,
5459
dataset.pool.id(),
5560
blueprint_id,
5661
);
57-
datastore
62+
let did_insert = datastore
5863
.debug_dataset_insert_if_not_exists(opctx, db_dataset)
5964
.await
6065
.with_context(|| {
6166
format!("failed to insert dataset {}", dataset.id)
62-
})?;
67+
})?
68+
.is_some();
69+
70+
if did_insert {
71+
stats.num_inserted += 1;
72+
} else {
73+
// This means we hit the TOCTOU race mentioned above:
74+
// when we queried the DB this row didn't exist, but
75+
// another Nexus must have beat us to actually inserting
76+
// it.
77+
stats.num_already_exist += 1;
78+
}
6379
}
6480
}
6581
BlueprintDatasetDisposition::Expunged => {
@@ -81,7 +97,9 @@ pub(crate) async fn reconcile_debug_datasets(
8197
.get(&dataset.id)
8298
.map(|d| d.is_tombstoned())
8399
.unwrap_or(false);
84-
if !already_tombstoned {
100+
if already_tombstoned {
101+
stats.num_already_tombstoned += 1;
102+
} else {
85103
if datastore
86104
.debug_dataset_tombstone(
87105
opctx,
@@ -96,17 +114,23 @@ pub(crate) async fn reconcile_debug_datasets(
96114
)
97115
})?
98116
{
117+
stats.num_tombstoned += 1;
99118
info!(
100119
opctx.log, "tombstoned expunged dataset";
101120
"dataset_id" => %dataset.id,
102121
);
122+
} else {
123+
// Similar TOCTOU race lost as above; this dataset was
124+
// either already tombstoned by another racing Nexus, or
125+
// has been hard deleted.
126+
stats.num_already_tombstoned += 1;
103127
}
104128
}
105129
}
106130
}
107131
}
108132

109-
Ok(())
133+
Ok(stats)
110134
}
111135

112136
#[cfg(test)]
@@ -213,15 +237,15 @@ mod tests {
213237
))| {
214238
let blueprint_id = BlueprintUuid::new_v4();
215239

216-
let datastore_datasets = runtime.block_on(async {
240+
let (result_stats, datastore_datasets) = runtime.block_on(async {
217241
let (blueprint_datasets, inventory_datasets) = proptest_do_prep(
218242
opctx,
219243
datastore,
220244
blueprint_id,
221245
&prep,
222246
).await;
223247

224-
reconcile_debug_datasets(
248+
let result_stats = reconcile_debug_datasets(
225249
opctx,
226250
datastore,
227251
blueprint_id,
@@ -231,15 +255,19 @@ mod tests {
231255
.await
232256
.expect("reconciled debug dataset");
233257

234-
datastore
258+
let datastore_datasets = datastore
235259
.debug_dataset_list_all_batched(opctx)
236260
.await
237261
.unwrap()
238262
.into_iter()
239263
.map(|d| (d.id(), d))
240-
.collect::<BTreeMap<_, _>>()
264+
.collect::<BTreeMap<_, _>>();
265+
266+
(result_stats, datastore_datasets)
241267
});
242268

269+
let mut expected_stats = DebugDatasetsRendezvousStats::default();
270+
243271
for (id, prep) in prep {
244272
let id: DatasetUuid = u32_to_id(id);
245273

@@ -252,6 +280,32 @@ mod tests {
252280
prep.disposition == ArbitraryDisposition::InService;
253281
let in_inventory = prep.in_inventory;
254282

283+
// Validate rendezvous output
284+
match (in_db_before, in_service, in_inventory) {
285+
// "Not in database and expunged" is consistent with "hard
286+
// deleted", which we can't separate from "already
287+
// tombstoned".
288+
(false, false, _) => {
289+
expected_stats.num_already_tombstoned += 1;
290+
}
291+
// "In database and expunged" should result in tombstoning.
292+
(true, false, _) => {
293+
expected_stats.num_tombstoned += 1;
294+
}
295+
// In service but already existed
296+
(true, true, _) => {
297+
expected_stats.num_already_exist += 1;
298+
}
299+
// In service, not in db yet, but not in inventory
300+
(false, true, false) => {
301+
expected_stats.num_not_in_inventory += 1;
302+
}
303+
// In service, not in db yet, present in inventory
304+
(false, true, true) => {
305+
expected_stats.num_inserted += 1;
306+
}
307+
}
308+
255309
// Validate database state
256310
match (in_db_before, in_service, in_inventory) {
257311
// Wasn't in DB, isn't in service: should still not be in db
@@ -298,6 +352,8 @@ mod tests {
298352
}
299353
}
300354
}
355+
356+
assert_eq!(result_stats, expected_stats);
301357
});
302358

303359
runtime.block_on(db.terminate());

nexus/reconfigurator/rendezvous/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pub async fn reconcile_blueprint_rendezvous_tables(
3030
.flat_map(|sled| sled.datasets.iter().flat_map(|d| d.id))
3131
.collect();
3232

33-
debug_dataset::reconcile_debug_datasets(
33+
let debug_dataset = debug_dataset::reconcile_debug_datasets(
3434
opctx,
3535
datastore,
3636
blueprint.id,
@@ -51,7 +51,7 @@ pub async fn reconcile_blueprint_rendezvous_tables(
5151
)
5252
.await?;
5353

54-
Ok(BlueprintRendezvousStats { crucible_dataset })
54+
Ok(BlueprintRendezvousStats { debug_dataset, crucible_dataset })
5555
}
5656

5757
#[cfg(test)]

nexus/types/src/internal_api/background.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ pub struct BlueprintRendezvousStatus {
247247

248248
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
249249
pub struct BlueprintRendezvousStats {
250+
pub debug_dataset: DebugDatasetsRendezvousStats,
250251
pub crucible_dataset: CrucibleDatasetsRendezvousStats,
251252
}
252253

@@ -283,3 +284,53 @@ impl slog::KV for CrucibleDatasetsRendezvousStats {
283284
Ok(())
284285
}
285286
}
287+
288+
#[derive(
289+
Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize,
290+
)]
291+
pub struct DebugDatasetsRendezvousStats {
292+
/// Number of new Debug datasets recorded.
293+
///
294+
/// This is a count of in-service Debug datasets that were also present
295+
/// in inventory and newly-inserted into `rendezvous_debug_dataset`.
296+
pub num_inserted: usize,
297+
/// Number of Debug datasets that would have been inserted, except
298+
/// records for them already existed.
299+
pub num_already_exist: usize,
300+
/// Number of Debug datasets that the current blueprint says are
301+
/// in-service, but we did not attempt to insert them because they're not
302+
/// present in the latest inventory collection.
303+
pub num_not_in_inventory: usize,
304+
/// Number of Debug datasets that we tombstoned based on their disposition
305+
/// in the current blueprint being expunged.
306+
pub num_tombstoned: usize,
307+
/// Number of Debug datasets that we would have tombstoned, except they were
308+
/// already tombstoned or deleted.
309+
pub num_already_tombstoned: usize,
310+
}
311+
312+
impl slog::KV for DebugDatasetsRendezvousStats {
313+
fn serialize(
314+
&self,
315+
_record: &slog::Record,
316+
serializer: &mut dyn slog::Serializer,
317+
) -> slog::Result {
318+
let Self {
319+
num_inserted,
320+
num_already_exist,
321+
num_not_in_inventory,
322+
num_tombstoned,
323+
num_already_tombstoned,
324+
} = *self;
325+
serializer.emit_usize("num_inserted".into(), num_inserted)?;
326+
serializer.emit_usize("num_already_exist".into(), num_already_exist)?;
327+
serializer
328+
.emit_usize("num_not_in_inventory".into(), num_not_in_inventory)?;
329+
serializer.emit_usize("num_tombstoned".into(), num_tombstoned)?;
330+
serializer.emit_usize(
331+
"num_already_tombstoned".into(),
332+
num_already_tombstoned,
333+
)?;
334+
Ok(())
335+
}
336+
}

0 commit comments

Comments
 (0)