diff --git a/Cargo.lock b/Cargo.lock index 52cc88e4625..0723344d8c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7516,6 +7516,7 @@ dependencies = [ "nexus-inventory", "nexus-reconfigurator-preparation", "nexus-saga-recovery", + "nexus-sled-agent-shared", "nexus-test-utils", "nexus-test-utils-macros", "nexus-types", diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 4ed8ddb8d8a..11bfa8ac318 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -44,6 +44,7 @@ nexus-db-schema.workspace = true nexus-inventory.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-saga-recovery.workspace = true +nexus-sled-agent-shared.workspace = true nexus-types.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index e8b3d1e58cf..9ab25e2ae13 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -123,6 +123,10 @@ use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::pagination::paginated; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::queries::region_allocation; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; +use nexus_sled_agent_shared::inventory::OmicronSledConfig; +use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; @@ -149,6 +153,7 @@ use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::DownstairsRegionUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::ParseError; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::PropolisUuid; @@ -7329,27 +7334,204 @@ fn inv_collection_print_sleds(collection: &Collection) { println!(" reservation: {reservation:?}, quota: {quota:?}"); } - println!( - " zones generation: {} (count: {})", - sled.omicron_zones.generation, - sled.omicron_zones.zones.len(), - ); + if let Some(config) = &sled.ledgered_sled_config { + inv_collection_print_sled_config("LEDGERED", config); + } else { + println!(" no ledgered sled config"); + } - if sled.omicron_zones.zones.is_empty() { - continue; + if let Some(last_reconciliation) = &sled.last_reconciliation { + if Some(&last_reconciliation.last_reconciled_config) + == sled.ledgered_sled_config.as_ref() + { + println!(" last reconciled config: matches ledgered config"); + } else { + inv_collection_print_sled_config( + "LAST RECONCILED CONFIG", + &last_reconciliation.last_reconciled_config, + ); + let disk_errs = collect_config_reconciler_errors( + &last_reconciliation.external_disks, + ); + let dataset_errs = collect_config_reconciler_errors( + &last_reconciliation.datasets, + ); + let zone_errs = collect_config_reconciler_errors( + &last_reconciliation.zones, + ); + for (label, errs) in [ + ("disk", disk_errs), + ("dataset", dataset_errs), + ("zone", zone_errs), + ] { + if errs.is_empty() { + println!(" all {label}s reconciled successfully"); + } else { + println!( + " {} {label} reconciliation errors:", + errs.len() + ); + for err in errs { + println!(" {err}"); + } + } + } + } } - println!(" ZONES FOUND"); - for z in &sled.omicron_zones.zones { - println!( - " zone {} (type {})", - z.id, - 
z.zone_type.kind().report_str()
-            );
+        print!("    reconciler task status: ");
+        match &sled.reconciler_status {
+            ConfigReconcilerInventoryStatus::NotYetRun => {
+                println!("not yet run");
+            }
+            ConfigReconcilerInventoryStatus::Running {
+                config,
+                started_at,
+                running_for,
+            } => {
+                println!("running for {running_for:?} (since {started_at})");
+                if Some(config) == sled.ledgered_sled_config.as_ref() {
+                    println!("      reconciling currently-ledgered config");
+                } else {
+                    inv_collection_print_sled_config(
+                        "RECONCILING CONFIG",
+                        config,
+                    );
+                }
+            }
+            ConfigReconcilerInventoryStatus::Idle { completed_at, ran_for } => {
+                println!(
+                    "idle (finished at {completed_at} \
+                     after running for {ran_for:?})"
+                );
+            }
+        }
     }
 }
 
+fn collect_config_reconciler_errors<T: std::fmt::Display>(
+    results: &BTreeMap<T, ConfigReconcilerInventoryResult>,
+) -> Vec<String> {
+    results
+        .iter()
+        .filter_map(|(id, result)| match result {
+            ConfigReconcilerInventoryResult::Ok => None,
+            ConfigReconcilerInventoryResult::Err { message } => {
+                Some(format!("{id}: {message}"))
+            }
+        })
+        .collect()
+}
+
+fn inv_collection_print_sled_config(label: &str, config: &OmicronSledConfig) {
+    let OmicronSledConfig {
+        generation,
+        disks,
+        datasets,
+        zones,
+        remove_mupdate_override,
+    } = config;
+
+    println!("\n{label} SLED CONFIG");
+    println!("    generation: {}", generation);
+    println!("    remove_mupdate_override: {remove_mupdate_override:?}");
+
+    if disks.is_empty() {
+        println!("    disk config empty");
+    } else {
+        #[derive(Tabled)]
+        #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+        struct DiskRow {
+            id: PhysicalDiskUuid,
+            zpool_id: ZpoolUuid,
+            vendor: String,
+            model: String,
+            serial: String,
+        }
+
+        let rows = disks.iter().map(|d| DiskRow {
+            id: d.id,
+            zpool_id: d.pool_id,
+            vendor: d.identity.vendor.clone(),
+            model: d.identity.model.clone(),
+            serial: d.identity.serial.clone(),
+        });
+        let table = tabled::Table::new(rows)
+            .with(tabled::settings::Style::empty())
+            .with(tabled::settings::Padding::new(8, 1, 0, 0))
+            .to_string();
+        println!("    DISKS: {}", disks.len());
+        println!("{table}");
+    }
+
+    if datasets.is_empty() {
+        println!("    dataset config empty");
+    } else {
+        #[derive(Tabled)]
+        #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+        struct DatasetRow {
+            id: DatasetUuid,
+            name: String,
+            compression: String,
+            quota: String,
+            reservation: String,
+        }
+
+        let rows = datasets.iter().map(|d| DatasetRow {
+            id: d.id,
+            name: d.name.full_name(),
+            compression: d.inner.compression.to_string(),
+            quota: d
+                .inner
+                .quota
+                .map(|q| q.to_string())
+                .unwrap_or_else(|| "none".to_string()),
+            reservation: d
+                .inner
+                .reservation
+                .map(|r| r.to_string())
+                .unwrap_or_else(|| "none".to_string()),
+        });
+        let table = tabled::Table::new(rows)
+            .with(tabled::settings::Style::empty())
+            .with(tabled::settings::Padding::new(8, 1, 0, 0))
+            .to_string();
+        println!("    DATASETS: {}", datasets.len());
+        println!("{table}");
+    }
+
+    if zones.is_empty() {
+        println!("    zone config empty");
+    } else {
+        #[derive(Tabled)]
+        #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+        struct ZoneRow {
+            id: OmicronZoneUuid,
+            kind: &'static str,
+            image_source: String,
+        }
+
+        let rows = zones.iter().map(|z| ZoneRow {
+            id: z.id,
+            kind: z.zone_type.kind().report_str(),
+            image_source: match &z.image_source {
+                OmicronZoneImageSource::InstallDataset => {
+                    "install-dataset".to_string()
+                }
+                OmicronZoneImageSource::Artifact { hash } => {
+                    format!("artifact: {hash}")
+                }
+            },
+        });
+        let table = tabled::Table::new(rows)
+            .with(tabled::settings::Style::empty())
+            .with(tabled::settings::Padding::new(8, 1, 0, 0))
+            .to_string();
+        println!("    ZONES: {}", zones.len());
+        println!("{table}");
+    }
+}
+
 fn inv_collection_print_keeper_membership(collection: &Collection) {
     println!("\nKEEPER MEMBERSHIP");
     for k in &collection.clickhouse_keeper_cluster_membership {
diff --git a/id-map/src/lib.rs b/id-map/src/lib.rs
index c8e6699717e..20b37aa776a 100644
--- a/id-map/src/lib.rs
+++ b/id-map/src/lib.rs
@@ -289,6 +289,7 @@ impl Diffable for IdMap {
 
 /// Wrapper around a `&'a mut T` that panics when dropped if the borrowed
 /// value's `id()` has changed since the wrapper was created.
+#[derive(Debug)]
 pub struct RefMut<'a, T: IdMappable> {
     original_id: T::Id,
     // Always `Some(_)` until the `RefMut` is consumed by `into_ref()`.
diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs
index 1bf5b0ea7ba..8630d94659b 100644
--- a/live-tests/tests/test_nexus_add_remove.rs
+++ b/live-tests/tests/test_nexus_add_remove.rs
@@ -186,9 +186,21 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) {
             let agent = latest_collection.sled_agents.get(&sled_id).expect(
                 "collection information for the sled we added a Nexus to",
             );
-            if agent.omicron_zones.zones.iter().any(|z| z.id == new_zone.id) {
-                debug!(log, "zone still present in inventory");
-                return Err(CondCheckError::<()>::NotYet);
+            if let Some(config) = &agent.ledgered_sled_config {
+                if config.zones.iter().any(|z| z.id == new_zone.id) {
+                    debug!(log, "zone still present in ledger");
+                    return Err(CondCheckError::<()>::NotYet);
+                }
+            }
+            if let Some(config) = agent
+                .last_reconciliation
+                .as_ref()
+                .map(|lr| &lr.last_reconciled_config)
+            {
+                if config.zones.iter().any(|z| z.id == new_zone.id) {
+                    debug!(log, "zone still present in inventory");
+                    return Err(CondCheckError::<()>::NotYet);
+                }
+            }
             return Ok(latest_collection);
         },
diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs
index 4e6653ea808..4b1b525e0e6 100644
--- a/nexus-sled-agent-shared/src/inventory.rs
+++ b/nexus-sled-agent-shared/src/inventory.rs
@@ -111,11 +111,12 @@ pub struct Inventory {
     pub usable_hardware_threads: u32,
     pub usable_physical_ram: ByteCount,
     pub reservoir_size: ByteCount,
-    pub omicron_zones: OmicronZonesConfig,
     pub disks: Vec<InventoryDisk>,
     pub zpools: Vec<InventoryZpool>,
     pub datasets: Vec<InventoryDataset>,
-    pub omicron_physical_disks_generation: Generation,
+    pub ledgered_sled_config: Option<OmicronSledConfig>,
+    pub reconciler_status: ConfigReconcilerInventoryStatus,
+    pub last_reconciliation: Option<ConfigReconcilerInventory>,
 }
 
 /// Describes the last attempt made by the sled-agent-config-reconciler to
@@ -196,6 +197,18 @@ pub struct OmicronSledConfig {
     pub remove_mupdate_override: Option<MupdateOverrideUuid>,
 }
 
+impl Default for OmicronSledConfig {
+    fn default() -> Self {
+        Self {
+            generation: Generation::new(),
+            disks: IdMap::default(),
+            datasets: IdMap::default(),
+            zones: IdMap::default(),
+            remove_mupdate_override: None,
+        }
+    }
+}
+
 impl Ledgerable for OmicronSledConfig {
     fn is_newer_than(&self, other: &Self) -> bool {
         self.generation > other.generation
diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs
index 62abfacdab2..eb41c2091f6 100644
--- a/nexus/db-model/src/inventory.rs
+++ b/nexus/db-model/src/inventory.rs
@@ -4,11 +4,13 @@
 //! Types for representing the hardware/software inventory in the database
 
+use crate::ArtifactHash;
+use crate::Generation;
 use crate::PhysicalDiskKind;
 use crate::omicron_zone_config::{self, OmicronZoneNic};
 use crate::typed_uuid::DbTypedUuid;
 use crate::{
-    ByteCount, Generation, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32,
+    ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32,
     impl_enum_type, ipv6,
 };
 use anyhow::{Context, Result, anyhow, bail};
@@ -24,24 +26,40 @@ use diesel::{serialize, sql_types};
 use ipnetwork::IpNetwork;
 use nexus_db_schema::schema::{
     hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership,
-    inv_collection, inv_collection_error, inv_dataset, inv_nvme_disk_firmware,
-    inv_omicron_zone, inv_omicron_zone_nic, inv_physical_disk,
-    inv_root_of_trust, inv_root_of_trust_page, inv_service_processor,
-    inv_sled_agent, inv_sled_omicron_zones, inv_zpool, sw_caboose,
+    inv_collection, inv_collection_error, inv_dataset,
+    inv_last_reconciliation_dataset_result,
+    inv_last_reconciliation_disk_result, inv_last_reconciliation_zone_result,
+    inv_nvme_disk_firmware, inv_omicron_sled_config,
+    inv_omicron_sled_config_dataset, inv_omicron_sled_config_disk,
+    inv_omicron_sled_config_zone, inv_omicron_sled_config_zone_nic,
+    inv_physical_disk, inv_root_of_trust, inv_root_of_trust_page,
+    inv_service_processor, inv_sled_agent, inv_zpool, sw_caboose,
     sw_root_of_trust_page,
 };
+use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
 use nexus_sled_agent_shared::inventory::{
-    OmicronZoneConfig, OmicronZoneDataset, OmicronZoneImageSource,
-    OmicronZoneType,
+    ConfigReconcilerInventoryResult, OmicronSledConfig, OmicronZoneConfig,
+    OmicronZoneDataset, OmicronZoneImageSource, OmicronZoneType,
 };
 use nexus_types::inventory::{
     BaseboardId, Caboose, Collection, NvmeFirmware, PowerState, RotPage,
     RotSlot,
 };
+use omicron_common::api::external;
 use omicron_common::api::internal::shared::NetworkInterface;
+use omicron_common::disk::DatasetConfig;
+use omicron_common::disk::DatasetName;
+use omicron_common::disk::DiskIdentity;
+use omicron_common::disk::OmicronPhysicalDiskConfig;
 use omicron_common::zpool_name::ZpoolName;
 use omicron_uuid_kinds::DatasetKind;
+use omicron_uuid_kinds::DatasetUuid;
 use omicron_uuid_kinds::GenericUuid;
+use omicron_uuid_kinds::MupdateOverrideKind;
+use omicron_uuid_kinds::MupdateOverrideUuid;
+use omicron_uuid_kinds::OmicronSledConfigKind;
+use omicron_uuid_kinds::OmicronSledConfigUuid;
+use omicron_uuid_kinds::PhysicalDiskUuid;
 use omicron_uuid_kinds::SledKind;
 use omicron_uuid_kinds::SledUuid;
 use omicron_uuid_kinds::ZpoolKind;
@@ -50,6 +68,7 @@ use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind};
 use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid};
 use std::collections::BTreeSet;
 use std::net::{IpAddr, SocketAddrV6};
+use std::time::Duration;
 use thiserror::Error;
 use uuid::Uuid;
 
@@ -793,13 +812,119 @@ pub struct InvSledAgent {
     pub usable_hardware_threads: SqlU32,
     pub usable_physical_ram: ByteCount,
     pub reservoir_size: ByteCount,
-    pub omicron_physical_disks_generation: Generation,
+    // Soft foreign key to an `InvOmicronSledConfig`
+    pub ledgered_sled_config: Option<DbTypedUuid<OmicronSledConfigKind>>,
+
+    // Soft foreign key to an `InvOmicronSledConfig`. May or may not be the same
+    // as `ledgered_sled_config`.
+    pub last_reconciliation_sled_config:
+        Option<DbTypedUuid<OmicronSledConfigKind>>,
+
+    #[diesel(embed)]
+    pub reconciler_status: InvConfigReconcilerStatus,
 }
 
+/// See [`nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus`].
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_sled_agent)]
+pub struct InvConfigReconcilerStatus {
+    pub reconciler_status_kind: InvConfigReconcilerStatusKind,
+    // Soft foreign key to an `InvOmicronSledConfig`. May or may not be the same
+    // as either of the other sled config keys above. Only populated if
+    // `reconciler_status_kind` is `Running`.
+    pub reconciler_status_sled_config:
+        Option<DbTypedUuid<OmicronSledConfigKind>>,
+    // Interpretation varies based on `reconciler_status_kind`:
+    //
+    // * `NotYetRun` - always `None`
+    // * `Running` - `started_at` time
+    // * `Idle` - `completed_at` time
+    pub reconciler_status_timestamp: Option<DateTime<Utc>>,
+    // Interpretation varies based on `reconciler_status_kind`:
+    //
+    // * `NotYetRun` - always `None`
+    // * `Running` - `running_for` duration
+    // * `Idle` - `ran_for` duration
+    pub reconciler_status_duration_secs: Option<f64>,
+}
+
+impl InvConfigReconcilerStatus {
+    /// Convert `self` to a [`ConfigReconcilerInventoryStatus`].
+    ///
+    /// `get_config` should perform the lookup of a serialized
+    /// `OmicronSledConfig` given its ID. It will be called only if `self.kind`
+    /// is `InvConfigReconcilerStatusKind::Running`; the other variants do not
+    /// carry a sled config ID.
+    pub fn to_status<F>(
+        &self,
+        get_config: F,
+    ) -> anyhow::Result<ConfigReconcilerInventoryStatus>
+    where
+        F: FnOnce(&OmicronSledConfigUuid) -> Option<OmicronSledConfig>,
+    {
+        let status = match self.reconciler_status_kind {
+            InvConfigReconcilerStatusKind::NotYetRun => {
+                ConfigReconcilerInventoryStatus::NotYetRun
+            }
+            InvConfigReconcilerStatusKind::Running => {
+                let config_id = self.reconciler_status_sled_config.context(
+                    "missing reconciler status sled config for kind 'running'",
+                )?;
+                let config = get_config(&config_id.into())
+                    .context("missing sled config we should have fetched")?;
+                ConfigReconcilerInventoryStatus::Running {
+                    config,
+                    started_at: self.reconciler_status_timestamp.context(
+                        "missing reconciler status timestamp \
+                         for kind 'running'",
+                    )?,
+                    running_for: Duration::from_secs_f64(
+                        self.reconciler_status_duration_secs.context(
+                            "missing reconciler status duration \
+                             for kind 'running'",
+                        )?,
+                    ),
+                }
+            }
+            InvConfigReconcilerStatusKind::Idle => {
+                ConfigReconcilerInventoryStatus::Idle {
+                    completed_at: self.reconciler_status_timestamp.context(
+                        "missing reconciler status timestamp for kind 'idle'",
+                    )?,
+                    ran_for: Duration::from_secs_f64(
+                        self.reconciler_status_duration_secs.context(
+                            "missing reconciler status duration \
+                             for kind 'idle'",
+                        )?,
+                    ),
+                }
+            }
+        };
+        Ok(status)
+    }
+}
+
+// See [`nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus`].
+impl_enum_type!(
+    InvConfigReconcilerStatusKindEnum:
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
+    pub enum InvConfigReconcilerStatusKind;
+
+    // Enum values
+    NotYetRun => b"not-yet-run"
+    Running => b"running"
+    Idle => b"idle"
+);
+
 impl InvSledAgent {
+    /// Construct a new `InvSledAgent`.
     pub fn new_without_baseboard(
         collection_id: CollectionUuid,
         sled_agent: &nexus_types::inventory::SledAgent,
+        ledgered_sled_config: Option<OmicronSledConfigUuid>,
+        last_reconciliation_sled_config: Option<OmicronSledConfigUuid>,
+        reconciler_status: InvConfigReconcilerStatus,
     ) -> Result<InvSledAgent, anyhow::Error> {
         // It's irritating to have to check this case at runtime. The challenge
        // is that if this sled agent does have a baseboard id, we don't know
@@ -837,14 +962,131 @@ impl InvSledAgent {
                     sled_agent.usable_physical_ram,
                 ),
                 reservoir_size: ByteCount::from(sled_agent.reservoir_size),
-                omicron_physical_disks_generation: Generation::from(
-                    sled_agent.omicron_physical_disks_generation,
-                ),
+                ledgered_sled_config: ledgered_sled_config.map(From::from),
+                last_reconciliation_sled_config:
+                    last_reconciliation_sled_config.map(From::from),
+                reconciler_status,
             })
         }
     }
 }
 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_last_reconciliation_disk_result)]
+pub struct InvLastReconciliationDiskResult {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_id: DbTypedUuid<SledKind>,
+    pub disk_id: DbTypedUuid<omicron_uuid_kinds::PhysicalDiskKind>,
+    pub error_message: Option<String>,
+}
+
+impl InvLastReconciliationDiskResult {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        disk_id: PhysicalDiskUuid,
+        result: ConfigReconcilerInventoryResult,
+    ) -> Self {
+        let error_message = match result {
+            ConfigReconcilerInventoryResult::Ok => None,
+            ConfigReconcilerInventoryResult::Err { message } => Some(message),
+        };
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_id: sled_id.into(),
+            disk_id: disk_id.into(),
+            error_message,
+        }
+    }
+}
+
+impl From<InvLastReconciliationDiskResult> for ConfigReconcilerInventoryResult {
+    fn from(result: InvLastReconciliationDiskResult) -> Self {
+        match result.error_message {
+            None => Self::Ok,
+            Some(message) => Self::Err { message },
+        }
+    }
+}
+
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_last_reconciliation_dataset_result)]
+pub struct InvLastReconciliationDatasetResult {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_id: DbTypedUuid<SledKind>,
+    pub dataset_id: DbTypedUuid<DatasetKind>,
+    pub error_message: Option<String>,
+}
+
+impl InvLastReconciliationDatasetResult {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        dataset_id: DatasetUuid,
+        result: ConfigReconcilerInventoryResult,
+    ) -> Self {
+        let error_message = match result {
+            ConfigReconcilerInventoryResult::Ok => None,
+            ConfigReconcilerInventoryResult::Err { message } => Some(message),
+        };
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_id: sled_id.into(),
+            dataset_id: dataset_id.into(),
+            error_message,
+        }
+    }
+}
+
+impl From<InvLastReconciliationDatasetResult>
+    for ConfigReconcilerInventoryResult
+{
+    fn from(result: InvLastReconciliationDatasetResult) -> Self {
+        match result.error_message {
+            None => Self::Ok,
+            Some(message) => Self::Err { message },
+        }
+    }
+}
+
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_last_reconciliation_zone_result)]
+pub struct InvLastReconciliationZoneResult {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_id: DbTypedUuid<SledKind>,
+    pub zone_id: DbTypedUuid<OmicronZoneKind>,
+    pub error_message: Option<String>,
+}
+
+impl InvLastReconciliationZoneResult {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        zone_id: OmicronZoneUuid,
+        result: ConfigReconcilerInventoryResult,
+    ) -> Self {
+        let error_message = match result {
+            ConfigReconcilerInventoryResult::Ok => None,
+            ConfigReconcilerInventoryResult::Err { message } => Some(message),
+        };
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_id: sled_id.into(),
+            zone_id: zone_id.into(),
+            error_message,
+        }
+    }
+}
+
+impl From<InvLastReconciliationZoneResult> for ConfigReconcilerInventoryResult {
+    fn from(result: InvLastReconciliationZoneResult) -> Self {
+        match result.error_message {
+            None => Self::Ok,
+            Some(message) => Self::Err { message },
+        }
+    }
+}
+
 /// See [`nexus_types::inventory::PhysicalDisk`].
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
 #[diesel(table_name = inv_physical_disk)]
@@ -1164,32 +1406,28 @@ impl From<InvDataset> for nexus_types::inventory::Dataset {
     }
 }
 
-/// Information about a sled's Omicron zones, part of
-/// [`nexus_types::inventory::SledAgent`].
-///
-/// TODO: This table is vestigial and can be combined with `InvSledAgent`. See
-/// [issue #6770](https://github.com/oxidecomputer/omicron/issues/6770).
+/// Top-level information contained in an [`OmicronSledConfig`].
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
-#[diesel(table_name = inv_sled_omicron_zones)]
-pub struct InvSledOmicronZones {
+#[diesel(table_name = inv_omicron_sled_config)]
+pub struct InvOmicronSledConfig {
     pub inv_collection_id: DbTypedUuid<CollectionKind>,
-    pub time_collected: DateTime<Utc>,
-    pub source: String,
-    pub sled_id: DbTypedUuid<SledKind>,
+    pub id: DbTypedUuid<OmicronSledConfigKind>,
     pub generation: Generation,
+    pub remove_mupdate_override: Option<DbTypedUuid<MupdateOverrideKind>>,
 }
 
-impl InvSledOmicronZones {
+impl InvOmicronSledConfig {
     pub fn new(
         inv_collection_id: CollectionUuid,
-        sled_agent: &nexus_types::inventory::SledAgent,
-    ) -> InvSledOmicronZones {
-        InvSledOmicronZones {
+        id: OmicronSledConfigUuid,
+        generation: external::Generation,
+        remove_mupdate_override: Option<MupdateOverrideUuid>,
+    ) -> Self {
+        Self {
             inv_collection_id: inv_collection_id.into(),
-            time_collected: sled_agent.time_collected,
-            source: sled_agent.source.clone(),
-            sled_id: sled_agent.sled_id.into(),
-            generation: Generation(sled_agent.omicron_zones.generation),
+            id: id.into(),
+            generation: Generation(generation),
+            remove_mupdate_override: remove_mupdate_override.map(From::from),
         }
     }
 }
@@ -1275,12 +1513,131 @@ impl From<ZoneKind> for ZoneType {
     }
 }
 
+/// See [`omicron_common::disk::OmicronPhysicalDiskConfig`].
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_omicron_sled_config_disk)]
+pub struct InvOmicronSledConfigDisk {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_config_id: DbTypedUuid<OmicronSledConfigKind>,
+    pub id: DbTypedUuid<omicron_uuid_kinds::PhysicalDiskKind>,
+
+    pub vendor: String,
+    pub serial: String,
+    pub model: String,
+
+    pub pool_id: DbTypedUuid<ZpoolKind>,
+}
+
+impl InvOmicronSledConfigDisk {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_config_id: OmicronSledConfigUuid,
+        disk_config: OmicronPhysicalDiskConfig,
+    ) -> Self {
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_config_id: sled_config_id.into(),
+            id: disk_config.id.into(),
+            vendor: disk_config.identity.vendor,
+            serial: disk_config.identity.serial,
+            model: disk_config.identity.model,
+            pool_id: disk_config.pool_id.into(),
+        }
+    }
+}
+
+impl From<InvOmicronSledConfigDisk> for OmicronPhysicalDiskConfig {
+    fn from(disk: InvOmicronSledConfigDisk) -> Self {
+        Self {
+            identity: DiskIdentity {
+                vendor: disk.vendor,
+                serial: disk.serial,
+                model: disk.model,
+            },
+            id: disk.id.into(),
+            pool_id: disk.pool_id.into(),
+        }
+    }
+}
+
+/// See [`omicron_common::disk::DatasetConfig`].
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_omicron_sled_config_dataset)]
+pub struct InvOmicronSledConfigDataset {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_config_id: DbTypedUuid<OmicronSledConfigKind>,
+    pub id: DbTypedUuid<DatasetKind>,
+
+    pub pool_id: DbTypedUuid<ZpoolKind>,
+    pub kind: crate::DatasetKind,
+    zone_name: Option<String>,
+
+    pub quota: Option<ByteCount>,
+    pub reservation: Option<ByteCount>,
+    pub compression: String,
+}
+
+impl InvOmicronSledConfigDataset {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_config_id: OmicronSledConfigUuid,
+        dataset_config: &DatasetConfig,
+    ) -> Self {
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_config_id: sled_config_id.into(),
+            id: dataset_config.id.into(),
+            pool_id: dataset_config.name.pool().id().into(),
+            kind: dataset_config.name.kind().into(),
+            zone_name: dataset_config.name.kind().zone_name().map(String::from),
+            quota: dataset_config.inner.quota.map(|q| q.into()),
+            reservation: dataset_config.inner.reservation.map(|r| r.into()),
+            compression: dataset_config.inner.compression.to_string(),
+        }
+    }
+}
+
+impl TryFrom<InvOmicronSledConfigDataset> for DatasetConfig {
+    type Error = anyhow::Error;
+
+    fn try_from(
+        dataset: InvOmicronSledConfigDataset,
+    ) -> Result<Self, Self::Error> {
+        let pool = omicron_common::zpool_name::ZpoolName::new_external(
+            dataset.pool_id.into(),
+        );
+        let kind =
+            crate::DatasetKind::try_into_api(dataset.kind, dataset.zone_name)?;
+
+        Ok(Self {
+            id: dataset.id.into(),
+            name: DatasetName::new(pool, kind),
+            inner: omicron_common::disk::SharedDatasetConfig {
+                quota: dataset.quota.map(|b| b.into()),
+                reservation: dataset.reservation.map(|b| b.into()),
+                compression: dataset.compression.parse()?,
+            },
+        })
+    }
+}
+
+impl_enum_type!(
+    InvZoneImageSourceEnum:
+
+    #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq)]
+    pub enum InvZoneImageSource;
+
+    // Enum values
+    InstallDataset => b"install_dataset"
+    Artifact => b"artifact"
+);
+
+/// See [`nexus_sled_agent_shared::inventory::OmicronZoneConfig`].
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
-#[diesel(table_name = inv_omicron_zone)]
-pub struct InvOmicronZone {
+#[diesel(table_name = inv_omicron_sled_config_zone)]
+pub struct InvOmicronSledConfigZone {
     pub inv_collection_id: DbTypedUuid<CollectionKind>,
-    pub sled_id: DbTypedUuid<SledKind>,
+    pub sled_config_id: DbTypedUuid<OmicronSledConfigKind>,
     pub id: DbTypedUuid<OmicronZoneKind>,
     pub zone_type: ZoneType,
     pub primary_service_ip: ipv6::Ipv6Addr,
@@ -1300,21 +1657,32 @@ pub struct InvOmicronZone {
     pub snat_first_port: Option<SqlU16>,
     pub snat_last_port: Option<SqlU16>,
     pub filesystem_pool: Option<DbTypedUuid<ZpoolKind>>,
+    pub image_source: InvZoneImageSource,
+    pub image_artifact_sha256: Option<ArtifactHash>,
 }
 
-impl InvOmicronZone {
+impl InvOmicronSledConfigZone {
     pub fn new(
         inv_collection_id: CollectionUuid,
-        sled_id: SledUuid,
+        sled_config_id: OmicronSledConfigUuid,
         zone: &OmicronZoneConfig,
-    ) -> Result<InvOmicronZone, anyhow::Error> {
+    ) -> Result<InvOmicronSledConfigZone, anyhow::Error> {
+        let (image_source, image_artifact_sha256) = match &zone.image_source {
+            OmicronZoneImageSource::InstallDataset => {
+                (InvZoneImageSource::InstallDataset, None)
+            }
+            OmicronZoneImageSource::Artifact { hash } => {
+                (InvZoneImageSource::Artifact, Some(ArtifactHash(*hash)))
+            }
+        };
+
         // Create a dummy record to start, then fill in the rest
         // according to the zone type
-        let mut inv_omicron_zone = InvOmicronZone {
+        let mut inv_omicron_zone = InvOmicronSledConfigZone {
             // Fill in the known fields that don't require inspecting
             // `zone.zone_type`
             inv_collection_id: inv_collection_id.into(),
-            sled_id: sled_id.into(),
+            sled_config_id: sled_config_id.into(),
             id: zone.id.into(),
             filesystem_pool: zone
                 .filesystem_pool
@@ -1342,6 +1710,8 @@ impl InvOmicronZone {
             snat_ip: None,
             snat_first_port: None,
             snat_last_port: None,
+            image_source,
+            image_artifact_sha256,
         };
 
         match &zone.zone_type {
@@ -1491,7 +1861,7 @@ impl InvOmicronZone {
 
     pub fn into_omicron_zone_config(
         self,
-        nic_row: Option<InvOmicronZoneNic>,
+        nic_row: Option<InvOmicronSledConfigZoneNic>,
     ) -> Result<OmicronZoneConfig, anyhow::Error> {
         // Build up a set of common fields for our `OmicronZoneType`s
         //
@@ -1629,21 +1999,40 @@ impl InvOmicronZone {
             }
         };
 
+        let image_source = match (self.image_source, self.image_artifact_sha256)
+        {
+            (InvZoneImageSource::InstallDataset, None) => {
+                OmicronZoneImageSource::InstallDataset
+            }
+            (InvZoneImageSource::Artifact, Some(ArtifactHash(hash))) => {
+                OmicronZoneImageSource::Artifact { hash }
+            }
+            (InvZoneImageSource::InstallDataset, Some(_))
+            | (InvZoneImageSource::Artifact, None) => {
+                bail!(
+                    "invalid image source column combination: {:?}, {:?}",
+                    self.image_source,
+                    self.image_artifact_sha256
+                )
+            }
+        };
+
         Ok(OmicronZoneConfig {
             id: self.id.into(),
             filesystem_pool: self
                 .filesystem_pool
                 .map(|id| ZpoolName::new_external(id.into())),
             zone_type,
-            image_source: OmicronZoneImageSource::InstallDataset,
+            image_source,
         })
     }
 }
 
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
-#[diesel(table_name = inv_omicron_zone_nic)]
-pub struct InvOmicronZoneNic {
+#[diesel(table_name = inv_omicron_sled_config_zone_nic)]
+pub struct InvOmicronSledConfigZoneNic {
     inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_config_id: DbTypedUuid<OmicronSledConfigKind>,
     pub id: Uuid,
     name: Name,
     ip: IpNetwork,
@@ -1654,8 +2043,8 @@ pub struct InvOmicronZoneNic {
     slot: SqlU8,
 }
 
-impl From<InvOmicronZoneNic> for OmicronZoneNic {
-    fn from(value: InvOmicronZoneNic) -> Self {
+impl From<InvOmicronSledConfigZoneNic> for OmicronZoneNic {
+    fn from(value: InvOmicronSledConfigZoneNic) -> Self {
         OmicronZoneNic {
             id: value.id,
             name: value.name,
@@ -1669,17 +2058,19 @@ impl From<InvOmicronZoneNic> for OmicronZoneNic {
     }
 }
 
-impl InvOmicronZoneNic {
+impl InvOmicronSledConfigZoneNic {
     pub fn new(
         inv_collection_id: CollectionUuid,
+        sled_config_id: OmicronSledConfigUuid,
         zone: &OmicronZoneConfig,
-    ) -> Result<Option<InvOmicronZoneNic>, anyhow::Error> {
+    ) -> Result<Option<InvOmicronSledConfigZoneNic>, anyhow::Error> {
         let Some(nic) = zone.zone_type.service_vnic() else {
             return Ok(None);
         };
         let nic = OmicronZoneNic::new(zone.id, nic)?;
         Ok(Some(Self {
             inv_collection_id: inv_collection_id.into(),
+            sled_config_id: sled_config_id.into(),
             id: nic.id,
             name: nic.name,
             ip: nic.ip,
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 23a206039e6..3a787f4e141 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(143, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(144, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     //  |  leaving the first copy as an example for the next person.
     //  v
     //  KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(144, "inventory-omicron-sled-config"),
     KnownVersion::new(143, "alerts-renamening"),
     KnownVersion::new(142, "bp-add-remove-mupdate-override"),
     KnownVersion::new(141, "caboose-sign-value"),
diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs
index 809df30aa16..1df74355b8f 100644
--- a/nexus/db-queries/src/db/datastore/deployment.rs
+++ b/nexus/db-queries/src/db/datastore/deployment.rs
@@ -2135,7 +2135,7 @@ mod tests {
         assert_eq!(blueprint1.sleds.len(), collection.sled_agents.len());
         assert_eq!(
             blueprint1.all_omicron_zones(BlueprintZoneDisposition::any).count(),
-            collection.all_omicron_zones().count()
+            collection.all_ledgered_omicron_zones().count()
         );
 
         // All zones should be in service.
assert_all_zones_in_service(&blueprint1); diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index a64d2310f8b..4b84da350d0 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -25,6 +25,7 @@ use diesel::expression::SelectableHelper; use diesel::sql_types::Nullable; use futures::FutureExt; use futures::future::BoxFuture; +use id_map::{IdMap, IdMappable}; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; @@ -35,16 +36,23 @@ use nexus_db_model::InvCaboose; use nexus_db_model::InvClickhouseKeeperMembership; use nexus_db_model::InvCollection; use nexus_db_model::InvCollectionError; +use nexus_db_model::InvConfigReconcilerStatus; +use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvLastReconciliationDatasetResult; +use nexus_db_model::InvLastReconciliationDiskResult; +use nexus_db_model::InvLastReconciliationZoneResult; use nexus_db_model::InvNvmeDiskFirmware; -use nexus_db_model::InvOmicronZone; -use nexus_db_model::InvOmicronZoneNic; +use nexus_db_model::InvOmicronSledConfig; +use nexus_db_model::InvOmicronSledConfigDataset; +use nexus_db_model::InvOmicronSledConfigDisk; +use nexus_db_model::InvOmicronSledConfigZone; +use nexus_db_model::InvOmicronSledConfigZoneNic; use nexus_db_model::InvPhysicalDisk; use nexus_db_model::InvRootOfTrust; use nexus_db_model::InvRotPage; use nexus_db_model::InvServiceProcessor; use nexus_db_model::InvSledAgent; -use nexus_db_model::InvSledOmicronZones; use nexus_db_model::InvZpool; use nexus_db_model::RotImageError; use nexus_db_model::SledRole; @@ -54,24 +62,34 @@ use nexus_db_model::SqlU32; use nexus_db_model::SwCaboose; use nexus_db_model::SwRotPage; use nexus_db_model::to_db_typed_uuid; -use nexus_db_schema::enums::CabooseWhichEnum; use nexus_db_schema::enums::HwPowerStateEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::RotImageErrorEnum; use nexus_db_schema::enums::RotPageWhichEnum; use nexus_db_schema::enums::SledRoleEnum; use nexus_db_schema::enums::SpTypeEnum; -use nexus_sled_agent_shared::inventory::OmicronZonesConfig; +use nexus_db_schema::enums::{ + CabooseWhichEnum, InvConfigReconcilerStatusKindEnum, +}; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; +use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Collection; use nexus_types::inventory::PhysicalDiskFirmware; +use nexus_types::inventory::SledAgent; use omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::bail_unless; use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronSledConfigUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -162,32 +180,6 @@ impl DataStore { } } - // Pull Omicron zone-related metadata out of all sled agents. 
-        //
-        // TODO: InvSledOmicronZones is a vestigial table kept for backwards
-        // compatibility -- the only unique data within it (the generation
-        // number) can be moved into `InvSledAgent` in the future. See
-        // oxidecomputer/omicron#6770.
-        let sled_omicron_zones = collection
-            .sled_agents
-            .values()
-            .map(|sled_agent| {
-                InvSledOmicronZones::new(collection_id, sled_agent)
-            })
-            .collect::<Vec<_>>();
-
-        // Pull Omicron zones out of all sled agents.
-        let omicron_zones: Vec<_> = collection
-            .sled_agents
-            .iter()
-            .flat_map(|(sled_id, sled_agent)| {
-                sled_agent.omicron_zones.zones.iter().map(|zone| {
-                    InvOmicronZone::new(collection_id, *sled_id, zone)
-                        .map_err(|e| Error::internal_error(&e.to_string()))
-                })
-            })
-            .collect::<Result<Vec<_>, _>>()?;
-
         // Pull disks out of all sled agents
         let disks: Vec<_> = collection
             .sled_agents
@@ -222,6 +214,38 @@ impl DataStore {
             })
             .collect();
 
+        // Build up a list of `OmicronSledConfig`s we need to insert. Each sled
+        // has 0-3:
+        //
+        // * The ledgered sled config (if the sled has gotten a config from RSS
+        //   or Nexus)
+        // * The most-recently-reconciled config (if the sled-agent's config
+        //   reconciler has run since the last time it started)
+        // * The currently-being-reconciled config (if the sled-agent's config
+        //   reconciler was actively running when inventory was collected)
+        //
+        // If more than one of these is present, they may be equal or distinct;
+        // for any that are equal, we'll only insert one copy and reuse the
+        // foreign key ID for subsequent instances.
+        //
+        // For each of these configs, we need to insert the top-level scalar
+        // data (held in an `InvOmicronSledConfig`), plus all of their zones,
+        // zone NICs, datasets, and disks, and we need to collect the config
+        // reconciler properties for each sled. We need all of this to
+        // construct `InvSledAgent` instances below.
+        let ConfigReconcilerRows {
+            sled_configs: omicron_sled_configs,
+            disks: omicron_sled_config_disks,
+            datasets: omicron_sled_config_datasets,
+            zones: omicron_sled_config_zones,
+            zone_nics: omicron_sled_config_zone_nics,
+            disk_results: reconciler_disk_results,
+            dataset_results: reconciler_dataset_results,
+            zone_results: reconciler_zone_results,
+            mut config_reconciler_fields_by_sled,
+        } = ConfigReconcilerRows::new(collection_id, collection)
+            .map_err(|e| Error::internal_error(&format!("{e:#}")))?;
+
         // Partition the sled agents into those with an associated baseboard id
         // and those without one. We handle these pretty differently.
         let (sled_agents_baseboards, sled_agents_no_baseboards): (
@@ -235,24 +259,24 @@ impl DataStore {
             .into_iter()
             .map(|sled_agent| {
                 assert!(sled_agent.baseboard_id.is_none());
-                InvSledAgent::new_without_baseboard(collection_id, sled_agent)
-                    .map_err(|e| Error::internal_error(&e.to_string()))
+                let ConfigReconcilerFields {
+                    ledgered_sled_config,
+                    last_reconciliation_sled_config,
+                    reconciler_status,
+                } = config_reconciler_fields_by_sled
+                    .remove(&sled_agent.sled_id)
+                    .expect("all sled IDs should exist");
+                InvSledAgent::new_without_baseboard(
+                    collection_id,
+                    sled_agent,
+                    ledgered_sled_config,
+                    last_reconciliation_sled_config,
+                    reconciler_status,
+                )
+                .map_err(|e| Error::internal_error(&e.to_string()))
             })
             .collect::<Result<Vec<_>, Error>>()?;
 
-        let omicron_zone_nics = collection
-            .sled_agents
-            .values()
-            .flat_map(|sled_agent| {
-                sled_agent.omicron_zones.zones.iter().filter_map(|found_zone| {
-                    InvOmicronZoneNic::new(collection_id, found_zone)
-                        .with_context(|| format!("zone {:?}", found_zone.id))
-                        .map_err(|e| Error::internal_error(&format!("{:#}", e)))
-                        .transpose()
-                })
-            })
-            .collect::<Result<Vec<_>, _>>()?;
-
         let mut inv_clickhouse_keeper_memberships = Vec::new();
         for membership in &collection.clickhouse_keeper_cluster_membership {
             inv_clickhouse_keeper_memberships.push(
@@ -851,6 +875,158 @@ impl DataStore {
             }
         }
 
+        // Insert rows for all the sled configs we found.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut configs = omicron_sled_configs.into_iter();
+            loop {
+                let some_configs =
+                    configs.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_configs.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_omicron_sled_config)
+                    .values(some_configs)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled configs' disks we found.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_disk::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut disks = omicron_sled_config_disks.into_iter();
+            loop {
+                let some_disks =
+                    disks.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_disks.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_omicron_sled_config_disk)
+                    .values(some_disks)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled configs' datasets we found.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_dataset::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut datasets = omicron_sled_config_datasets.into_iter();
+            loop {
+                let some_datasets =
+                    datasets.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_datasets.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_omicron_sled_config_dataset)
+                    .values(some_datasets)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled configs' zones we found.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_zone::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut zones = omicron_sled_config_zones.into_iter();
+            loop {
+                let some_zones =
+                    zones.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_zones.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_omicron_sled_config_zone)
+                    .values(some_zones)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled configs' zones' nics we found.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_zone_nic::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut zone_nics = omicron_sled_config_zone_nics.into_iter();
+            loop {
+                let some_zone_nics =
+                    zone_nics.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_zone_nics.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_omicron_sled_config_zone_nic)
+                    .values(some_zone_nics)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled config reconciler disk results
+        {
+            use nexus_db_schema::schema::inv_last_reconciliation_disk_result::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut disk_results = reconciler_disk_results.into_iter();
+            loop {
+                let some_disk_results =
+                    disk_results.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_disk_results.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_last_reconciliation_disk_result)
+                    .values(some_disk_results)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled config reconciler dataset results
+        {
+            use nexus_db_schema::schema::inv_last_reconciliation_dataset_result::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut dataset_results = reconciler_dataset_results.into_iter();
+            loop {
+                let some_dataset_results =
+                    dataset_results.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_dataset_results.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_last_reconciliation_dataset_result)
+                    .values(some_dataset_results)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
+        // Insert rows for all the sled config reconciler zone results
+        {
+            use nexus_db_schema::schema::inv_last_reconciliation_zone_result::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut zone_results = reconciler_zone_results.into_iter();
+            loop {
+                let some_zone_results =
+                    zone_results.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_zone_results.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_last_reconciliation_zone_result)
+                    .values(some_zone_results)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
         // Insert rows for the sled agents that we found. In practice, we'd
         // expect these to all have baseboards (if using Oxide hardware) or
         // none have baseboards (if not).
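Each insert block above repeats the same chunking shape: drain the row iterator SQL_BATCH_SIZE items at a time and issue one INSERT per chunk, so no single statement grows without bound as collections get larger. A minimal standalone sketch of that loop (insert_in_batches and insert_chunk are illustrative names, not part of this change):

    // Drain `rows` in fixed-size chunks; call `insert_chunk` once per chunk.
    fn insert_in_batches<T>(
        batch_size: usize,
        rows: Vec<T>,
        mut insert_chunk: impl FnMut(Vec<T>),
    ) {
        let mut rows = rows.into_iter();
        loop {
            let chunk: Vec<T> = rows.by_ref().take(batch_size).collect();
            if chunk.is_empty() {
                break;
            }
            // In the datastore code above, this is the
            // diesel::insert_into(...).values(chunk).execute_async(&conn) call.
            insert_chunk(chunk);
        }
    }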
@@ -865,6 +1041,13 @@ impl DataStore {
                 let baseboard_id = sled_agent.baseboard_id.as_ref().expect(
                     "already selected only sled agents with baseboards",
                 );
+                let ConfigReconcilerFields {
+                    ledgered_sled_config,
+                    last_reconciliation_sled_config,
+                    reconciler_status,
+                } = config_reconciler_fields_by_sled
+                    .remove(&sled_agent.sled_id)
+                    .expect("all sled IDs should exist");
                 let selection = nexus_db_schema::schema::hw_baseboard_id::table
                     .select((
                         db_collection_id
@@ -897,10 +1080,21 @@
                             sled_agent.reservoir_size,
                         )
                         .into_sql::<diesel::sql_types::BigInt>(),
-                        nexus_db_model::Generation(
-                            sled_agent.omicron_physical_disks_generation,
-                        )
-                        .into_sql::<diesel::sql_types::BigInt>(),
+                        ledgered_sled_config
+                            .map(|id| id.into_untyped_uuid())
+                            .into_sql::<Nullable<diesel::sql_types::Uuid>>(),
+                        last_reconciliation_sled_config
+                            .map(|id| id.into_untyped_uuid())
+                            .into_sql::<Nullable<diesel::sql_types::Uuid>>(),
+                        reconciler_status.reconciler_status_kind
+                            .into_sql::<InvConfigReconcilerStatusKindEnum>(),
+                        reconciler_status.reconciler_status_sled_config
+                            .map(|id| id.into_untyped_uuid())
+                            .into_sql::<Nullable<diesel::sql_types::Uuid>>(),
+                        reconciler_status.reconciler_status_timestamp
+                            .into_sql::<Nullable<diesel::sql_types::Timestamptz>>(),
+                        reconciler_status.reconciler_status_duration_secs
+                            .into_sql::<Nullable<diesel::sql_types::Double>>(),
                     ))
                     .filter(
                         baseboard_dsl::part_number
@@ -926,7 +1120,12 @@
                         sa_dsl::usable_hardware_threads,
                         sa_dsl::usable_physical_ram,
                         sa_dsl::reservoir_size,
-                        sa_dsl::omicron_physical_disks_generation,
+                        sa_dsl::ledgered_sled_config,
+                        sa_dsl::last_reconciliation_sled_config,
+                        sa_dsl::reconciler_status_kind,
+                        sa_dsl::reconciler_status_sled_config,
+                        sa_dsl::reconciler_status_timestamp,
+                        sa_dsl::reconciler_status_duration_secs,
                     ))
                     .execute_async(&conn)
                     .await?;
@@ -946,7 +1145,12 @@
                     _usable_hardware_threads,
                     _usable_physical_ram,
                     _reservoir_size,
-                    _omicron_physical_disks_generation,
+                    _ledgered_sled_config,
+                    _last_reconciliation_sled_config,
+                    _reconciler_status_kind,
+                    _reconciler_status_sled_config,
+                    _reconciler_status_timestamp,
+                    _reconciler_status_duration_secs,
                 ) = sa_dsl::inv_sled_agent::all_columns();
             }
 
@@ -962,32 +1166,6 @@ impl DataStore {
                 .await?;
         }
 
-        // Insert all the Omicron zones that we found.
-        {
-            use nexus_db_schema::schema::inv_sled_omicron_zones::dsl as sled_zones;
-            let _ = diesel::insert_into(sled_zones::inv_sled_omicron_zones)
-                .values(sled_omicron_zones)
-                .execute_async(&conn)
-                .await?;
-        }
-
-        {
-            use nexus_db_schema::schema::inv_omicron_zone::dsl as omicron_zone;
-            let _ = diesel::insert_into(omicron_zone::inv_omicron_zone)
-                .values(omicron_zones)
-                .execute_async(&conn)
-                .await?;
-        }
-
-        {
-            use nexus_db_schema::schema::inv_omicron_zone_nic::dsl as omicron_zone_nic;
-            let _ =
-                diesel::insert_into(omicron_zone_nic::inv_omicron_zone_nic)
-                    .values(omicron_zone_nics)
-                    .execute_async(&conn)
-                    .await?;
-        }
-
         // Insert the clickhouse keeper memberships we've received
         {
             use nexus_db_schema::schema::inv_clickhouse_keeper_membership::dsl;
@@ -1256,7 +1434,32 @@ impl DataStore {
         let conn = self.pool_connection_authorized(opctx).await?;
         let db_collection_id = to_db_typed_uuid(collection_id);
 
-        let (
+        // Helper to pack and unpack all the counts of rows we delete in the
+        // transaction below (exclusively used for logging).
+ struct NumRowsDeleted { + ncollections: usize, + nsps: usize, + nrots: usize, + ncabooses: usize, + nrot_pages: usize, + nsled_agents: usize, + ndatasets: usize, + nphysical_disks: usize, + nnvme_disk_firmware: usize, + nlast_reconciliation_disk_results: usize, + nlast_reconciliation_dataset_results: usize, + nlast_reconciliation_zone_results: usize, + nomicron_sled_configs: usize, + nomicron_sled_config_disks: usize, + nomicron_sled_config_datasets: usize, + nomicron_sled_config_zones: usize, + nomicron_sled_config_zone_nics: usize, + nzpools: usize, + nerrors: usize, + nclickhouse_keeper_membership: usize, + } + + let NumRowsDeleted { ncollections, nsps, nrots, @@ -1265,14 +1468,19 @@ impl DataStore { nsled_agents, ndatasets, nphysical_disks, - nnvme_disk_disk_firmware, - nsled_agent_zones, - nzones, - nnics, + nnvme_disk_firmware, + nlast_reconciliation_disk_results, + nlast_reconciliation_dataset_results, + nlast_reconciliation_zone_results, + nomicron_sled_configs, + nomicron_sled_config_disks, + nomicron_sled_config_datasets, + nomicron_sled_config_zones, + nomicron_sled_config_zone_nics, nzpools, nerrors, nclickhouse_keeper_membership, - ) = + } = self.transaction_retry_wrapper("inventory_delete_collection") .transaction(&conn, |conn| async move { // Remove the record describing the collection itself. @@ -1357,7 +1565,7 @@ impl DataStore { }; // Remove rows for NVMe physical disk firmware found. - let nnvme_disk_firwmare = { + let nnvme_disk_firmware = { use nexus_db_schema::schema::inv_nvme_disk_firmware::dsl; diesel::delete(dsl::inv_nvme_disk_firmware.filter( dsl::inv_collection_id.eq(db_collection_id), @@ -1366,28 +1574,69 @@ impl DataStore { .await? }; - // Remove rows associated with Omicron zones - let nsled_agent_zones = { - use nexus_db_schema::schema::inv_sled_omicron_zones::dsl; - diesel::delete(dsl::inv_sled_omicron_zones.filter( + // Remove rows associated with the last reconciliation + // result (disks, datasets, and zones). + let nlast_reconciliation_disk_results = { + use nexus_db_schema::schema::inv_last_reconciliation_disk_result::dsl; + diesel::delete(dsl::inv_last_reconciliation_disk_result.filter( dsl::inv_collection_id.eq(db_collection_id), )) .execute_async(&conn) .await? }; - - let nzones = { - use nexus_db_schema::schema::inv_omicron_zone::dsl; - diesel::delete(dsl::inv_omicron_zone.filter( + let nlast_reconciliation_dataset_results = { + use nexus_db_schema::schema::inv_last_reconciliation_dataset_result::dsl; + diesel::delete(dsl::inv_last_reconciliation_dataset_result.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nlast_reconciliation_zone_results = { + use nexus_db_schema::schema::inv_last_reconciliation_zone_result::dsl; + diesel::delete(dsl::inv_last_reconciliation_zone_result.filter( dsl::inv_collection_id.eq(db_collection_id), )) .execute_async(&conn) .await? }; - let nnics = { - use nexus_db_schema::schema::inv_omicron_zone_nic::dsl; - diesel::delete(dsl::inv_omicron_zone_nic.filter( + // Remove rows associated with `OmicronSledConfig`s. + let nomicron_sled_configs = { + use nexus_db_schema::schema::inv_omicron_sled_config::dsl; + diesel::delete(dsl::inv_omicron_sled_config.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? 
+ }; + let nomicron_sled_config_disks = { + use nexus_db_schema::schema::inv_omicron_sled_config_disk::dsl; + diesel::delete(dsl::inv_omicron_sled_config_disk.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nomicron_sled_config_datasets = { + use nexus_db_schema::schema::inv_omicron_sled_config_dataset::dsl; + diesel::delete(dsl::inv_omicron_sled_config_dataset.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nomicron_sled_config_zones = { + use nexus_db_schema::schema::inv_omicron_sled_config_zone::dsl; + diesel::delete(dsl::inv_omicron_sled_config_zone.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nomicron_sled_config_zone_nics = { + use nexus_db_schema::schema::inv_omicron_sled_config_zone_nic::dsl; + diesel::delete(dsl::inv_omicron_sled_config_zone_nic.filter( dsl::inv_collection_id.eq(db_collection_id), )) .execute_async(&conn) @@ -1425,7 +1674,7 @@ impl DataStore { .await? }; - Ok(( + Ok(NumRowsDeleted { ncollections, nsps, nrots, @@ -1434,14 +1683,19 @@ impl DataStore { nsled_agents, ndatasets, nphysical_disks, - nnvme_disk_firwmare, - nsled_agent_zones, - nzones, - nnics, + nnvme_disk_firmware, + nlast_reconciliation_disk_results, + nlast_reconciliation_dataset_results, + nlast_reconciliation_zone_results, + nomicron_sled_configs, + nomicron_sled_config_disks, + nomicron_sled_config_datasets, + nomicron_sled_config_zones, + nomicron_sled_config_zone_nics, nzpools, nerrors, nclickhouse_keeper_membership, - )) + }) }) .await .map_err(|error| { @@ -1458,10 +1712,18 @@ impl DataStore { "nsled_agents" => nsled_agents, "ndatasets" => ndatasets, "nphysical_disks" => nphysical_disks, - "nnvme_disk_firmware" => nnvme_disk_disk_firmware, - "nsled_agent_zones" => nsled_agent_zones, - "nzones" => nzones, - "nnics" => nnics, + "nnvme_disk_firmware" => nnvme_disk_firmware, + "nlast_reconciliation_disk_results" => + nlast_reconciliation_disk_results, + "nlast_reconciliation_dataset_results" => + nlast_reconciliation_dataset_results, + "nlast_reconciliation_zone_results" => + nlast_reconciliation_zone_results, + "nomicron_sled_configs" => nomicron_sled_configs, + "nomicron_sled_config_disks" => nomicron_sled_config_disks, + "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, + "nomicron_sled_config_zones" => nomicron_sled_config_zones, + "nomicron_sled_config_zone_nics" => nomicron_sled_config_zone_nics, "nzpools" => nzpools, "nerrors" => nerrors, "nclickhouse_keeper_membership" => nclickhouse_keeper_membership @@ -2122,100 +2384,107 @@ impl DataStore { ); } - // Now read the Omicron zones. + // Now read the `OmicronSledConfig`s. + // + // There are 0-3 sled configs per sled agent (ledgered sled + // config, last reconciled config, and actively-being-reconciled + // config). Each of these is keyed by a database-only ID; these IDs are + // referenced by `inv_sled_agent`. Before we read `inv_sled_agent`, + // build up the map of all the sled configs it might reference. // - // In the first pass, we'll load the "inv_sled_omicron_zones" records. - // There's one of these per sled. It does not contain the actual list - // of zones -- basically just collection metadata and the generation - // number. We'll assemble these directly into the data structure we're - // trying to build, which maps sled ids to objects describing the zones - // found on each sled. 
-        let mut omicron_zones: BTreeMap<SledUuid, OmicronZonesConfig> = {
-            use nexus_db_schema::schema::inv_sled_omicron_zones::dsl;
+        // In this first pass, we'll load the `inv_omicron_sled_config`
+        // records. They do not contain the actual disk/dataset/zone configs;
+        // we'll start with empty sets and build those up by reading
+        // additional tables below.
+        let mut omicron_sled_configs = {
+            use nexus_db_schema::schema::inv_omicron_sled_config::dsl;
 
-            let mut zones = BTreeMap::new();
+            let mut configs = IdMap::new();
 
             let mut paginator = Paginator::new(batch_size);
             while let Some(p) = paginator.next() {
                 let batch = paginated(
-                    dsl::inv_sled_omicron_zones,
-                    dsl::sled_id,
+                    dsl::inv_omicron_sled_config,
+                    dsl::id,
                     &p.current_pagparams(),
                 )
                 .filter(dsl::inv_collection_id.eq(db_id))
-                .select(InvSledOmicronZones::as_select())
+                .select(InvOmicronSledConfig::as_select())
                 .load_async(&*conn)
                 .await
                 .map_err(|e| {
                     public_error_from_diesel(e, ErrorHandler::Server)
                 })?;
-                paginator = p.found_batch(&batch, &|row| row.sled_id);
-                zones.extend(batch.into_iter().map(|sled_zones_config| {
-                    (
-                        sled_zones_config.sled_id.into(),
-                        OmicronZonesConfig {
-                            generation: sled_zones_config.generation.into(),
-                            zones: Vec::new(),
+                paginator = p.found_batch(&batch, &|row| row.id);
+                for sled_config in batch {
+                    configs.insert(OmicronSledConfigWithId {
+                        id: sled_config.id.into(),
+                        config: OmicronSledConfig {
+                            generation: sled_config.generation.into(),
+                            remove_mupdate_override: sled_config
+                                .remove_mupdate_override
+                                .map(From::from),
+                            disks: IdMap::default(),
+                            datasets: IdMap::default(),
+                            zones: IdMap::default(),
                         },
-                    )
-                }))
+                    });
+                }
             }
 
-            zones
+            configs
         };
 
         // Assemble a mutable map of all the NICs found, by NIC id.  As we
         // match these up with the corresponding zone below, we'll remove items
         // from this set.  That way we can tell if the same NIC was used twice
         // or not used at all.
-        let mut omicron_zone_nics: BTreeMap<_, _> =
-            {
-                use nexus_db_schema::schema::inv_omicron_zone_nic::dsl;
+        let mut omicron_zone_nics: BTreeMap<_, _> = {
+            use nexus_db_schema::schema::inv_omicron_sled_config_zone_nic::dsl;
 
-                let mut nics = BTreeMap::new();
+            let mut nics = BTreeMap::new();
 
-                let mut paginator = Paginator::new(batch_size);
-                while let Some(p) = paginator.next() {
-                    let batch = paginated(
-                        dsl::inv_omicron_zone_nic,
-                        dsl::id,
-                        &p.current_pagparams(),
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_omicron_sled_config_zone_nic,
+                    dsl::id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvOmicronSledConfigZoneNic::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.id);
+                nics.extend(batch.into_iter().map(|found_zone_nic| {
+                    (
+                        (found_zone_nic.sled_config_id, found_zone_nic.id),
+                        found_zone_nic,
                     )
-                    .filter(dsl::inv_collection_id.eq(db_id))
-                    .select(InvOmicronZoneNic::as_select())
-                    .load_async(&*conn)
-                    .await
-                    .map_err(|e| {
-                        public_error_from_diesel(e, ErrorHandler::Server)
-                    })?;
-                    paginator = p.found_batch(&batch, &|row| row.id);
-                    nics.extend(batch.into_iter().map(|found_zone_nic| {
-                        (found_zone_nic.id, found_zone_nic)
-                    }));
-                }
+                }));
+            }
 
-                nics
-            };
+            nics
+        };
 
-        // Now load the actual list of zones from all sleds.
+        // Now load the actual list of zones from all configs.
         let omicron_zones_list = {
-            use nexus_db_schema::schema::inv_omicron_zone::dsl;
+            use nexus_db_schema::schema::inv_omicron_sled_config_zone::dsl;
 
             let mut zones = Vec::new();
             let mut paginator = Paginator::new(batch_size);
             while let Some(p) = paginator.next() {
                 let mut batch = paginated(
-                    dsl::inv_omicron_zone,
+                    dsl::inv_omicron_sled_config_zone,
                     dsl::id,
                     &p.current_pagparams(),
                 )
                 .filter(dsl::inv_collection_id.eq(db_id))
-                // It's not strictly necessary to order these by id. Doing so
-                // ensures a consistent representation for `Collection`, which
-                // makes testing easier. It's already indexed to do this, too.
-                .order_by(dsl::id)
-                .select(InvOmicronZone::as_select())
+                .select(InvOmicronSledConfigZone::as_select())
                 .load_async(&*conn)
                 .await
                 .map_err(|e| {
@@ -2231,12 +2500,13 @@ impl DataStore {
             let nic_row = z
                 .nic_id
                 .map(|id| {
-                    // This error means that we found a row in inv_omicron_zone
-                    // that references a NIC by id but there's no corresponding
-                    // row in inv_omicron_zone_nic with that id. This should be
-                    // impossible and reflects either a bug or database
+                    // This error means that we found a row in
+                    // inv_omicron_sled_config_zone that references a NIC by id
+                    // but there's no corresponding row in
+                    // inv_omicron_sled_config_zone_nic with that id. This
+                    // should be impossible and reflects either a bug or
+                    // database
                     // corruption.
-                    omicron_zone_nics.remove(&id).ok_or_else(|| {
+                    omicron_zone_nics.remove(&(z.sled_config_id, id)).ok_or_else(|| {
                         Error::internal_error(&format!(
                             "zone {:?}: expected to find NIC {:?}, but didn't",
                             z.id, z.nic_id
@@ -2244,15 +2514,16 @@ impl DataStore {
                     })
                 })
                 .transpose()?;
-            let map =
-                omicron_zones.get_mut(&z.sled_id.into()).ok_or_else(|| {
-                    // This error means that we found a row in inv_omicron_zone
-                    // with no associated record in inv_sled_omicron_zones.
-                    // This should be impossible and reflects either a bug or
-                    // database corruption.
+            let mut config_with_id = omicron_sled_configs
+                .get_mut(&z.sled_config_id.into())
+                .ok_or_else(|| {
+                    // This error means that we found a row in
+                    // inv_omicron_sled_config_zone with no associated record
+                    // in inv_omicron_sled_config. This should be impossible
+                    // and reflects either a bug or database corruption.
                     Error::internal_error(&format!(
-                        "zone {:?}: unknown sled: {:?}",
-                        z.id, z.sled_id
+                        "zone {:?}: unknown config ID: {:?}",
+                        z.id, z.sled_config_id
                     ))
                 })?;
             let zone_id = z.id;
@@ -2264,9 +2535,195 @@ impl DataStore {
                 .map_err(|e| {
                     Error::internal_error(&format!("{:#}", e.to_string()))
                 })?;
-            map.zones.push(zone);
+            config_with_id.config.zones.insert(zone);
         }
 
+        bail_unless!(
+            omicron_zone_nics.is_empty(),
+            "found extra Omicron zone NICs: {:?}",
+            omicron_zone_nics.keys()
+        );
+
+        // Now load the datasets from all configs.
+        // Now load the datasets from all configs.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_dataset::dsl;
+
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_omicron_sled_config_dataset,
+                    dsl::id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvOmicronSledConfigDataset::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.id);
+
+                for row in batch {
+                    let mut config_with_id = omicron_sled_configs
+                        .get_mut(&row.sled_config_id.into())
+                        .ok_or_else(|| {
+                            Error::internal_error(&format!(
+                                "dataset config {:?}: unknown config ID: {:?}",
+                                row.id, row.sled_config_id
+                            ))
+                        })?;
+                    config_with_id.config.datasets.insert(
+                        row.try_into().map_err(|e| {
+                            Error::internal_error(&format!("{e:#}"))
+                        })?,
+                    );
+                }
+            }
+        }
+
+        // Now load the disks from all configs.
+        {
+            use nexus_db_schema::schema::inv_omicron_sled_config_disk::dsl;
+
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_omicron_sled_config_disk,
+                    dsl::id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvOmicronSledConfigDisk::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.id);
+
+                for row in batch {
+                    let mut config_with_id = omicron_sled_configs
+                        .get_mut(&row.sled_config_id.into())
+                        .ok_or_else(|| {
+                            Error::internal_error(&format!(
+                                "disk config {:?}: unknown config ID: {:?}",
+                                row.id, row.sled_config_id
+                            ))
+                        })?;
+                    config_with_id.config.disks.insert(row.into());
+                }
+            }
+        }
+
+        // Load all the config reconciler disk results; build a map of maps
+        // keyed by sled ID.
+        let mut last_reconciliation_disk_results = {
+            use nexus_db_schema::schema::inv_last_reconciliation_disk_result::dsl;
+
+            let mut results: BTreeMap<
+                SledUuid,
+                BTreeMap<PhysicalDiskUuid, ConfigReconcilerInventoryResult>,
+            > = BTreeMap::new();
+
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_last_reconciliation_disk_result,
+                    dsl::disk_id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvLastReconciliationDiskResult::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.disk_id);
+
+                for row in batch {
+                    let sled_map =
+                        results.entry(row.sled_id.into()).or_default();
+                    sled_map.insert(row.disk_id.into(), row.into());
+                }
+            }
+
+            results
+        };
+
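Each reconciler-result loader (disks above; datasets and zones follow) groups flat rows into a map of maps with `entry(...).or_default()`; the same shape in miniature (toy integer keys, illustrative only):

use std::collections::BTreeMap;

fn main() {
    // Group flat (sled, item, result) rows into a map of maps keyed by sled.
    let rows = [(1u32, 10u32, "ok"), (1, 11, "err"), (2, 10, "ok")];
    let mut by_sled: BTreeMap<u32, BTreeMap<u32, &str>> = BTreeMap::new();
    for (sled, item, result) in rows {
        by_sled.entry(sled).or_default().insert(item, result);
    }
    assert_eq!(by_sled[&1].len(), 2);
    assert_eq!(by_sled[&2].len(), 1);
}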
+        // Load all the config reconciler dataset results; build a map of maps
+        // keyed by sled ID.
+        let mut last_reconciliation_dataset_results = {
+            use nexus_db_schema::schema::inv_last_reconciliation_dataset_result::dsl;
+
+            let mut results: BTreeMap<
+                SledUuid,
+                BTreeMap<DatasetUuid, ConfigReconcilerInventoryResult>,
+            > = BTreeMap::new();
+
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_last_reconciliation_dataset_result,
+                    dsl::dataset_id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvLastReconciliationDatasetResult::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.dataset_id);
+
+                for row in batch {
+                    let sled_map =
+                        results.entry(row.sled_id.into()).or_default();
+                    sled_map.insert(row.dataset_id.into(), row.into());
+                }
+            }
+
+            results
+        };
+
+        // Load all the config reconciler zone results; build a map of maps
+        // keyed by sled ID.
+        let mut last_reconciliation_zone_results = {
+            use nexus_db_schema::schema::inv_last_reconciliation_zone_result::dsl;
+
+            let mut results: BTreeMap<
+                SledUuid,
+                BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
+            > = BTreeMap::new();
+
+            let mut paginator = Paginator::new(batch_size);
+            while let Some(p) = paginator.next() {
+                let batch = paginated(
+                    dsl::inv_last_reconciliation_zone_result,
+                    dsl::zone_id,
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvLastReconciliationZoneResult::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| row.zone_id);
+
+                for row in batch {
+                    let sled_map =
+                        results.entry(row.sled_id.into()).or_default();
+                    sled_map.insert(row.zone_id.into(), row.into());
+                }
+            }
+
+            results
+        };
+
        // Now load the clickhouse keeper cluster memberships
        let clickhouse_keeper_cluster_membership = {
            use nexus_db_schema::schema::inv_clickhouse_keeper_membership::dsl;
@@ -2298,12 +2755,6 @@ impl DataStore {
            memberships
        };

-        bail_unless!(
-            omicron_zone_nics.is_empty(),
-            "found extra Omicron zone NICs: {:?}",
-            omicron_zone_nics.keys()
-        );
-
        // Finally, build up the sled-agent map using the sled agent and
        // omicron zone rows. A for loop is easier to understand than into_iter
        // + filter_map + return Result + collect.
@@ -2321,34 +2772,59 @@ impl DataStore {
                })
                .transpose()?;

-            // Look up the Omicron zones.
-            //
-            // Older versions of Nexus fetched the Omicron zones in a separate
-            // request from the other sled agent data. The database model still
-            // accounts for the possibility that for a given (collection, sled)
-            // pair, one of those queries succeeded while the other failed. But
-            // this has since been changed to fetch all the data in a single
-            // query, which means that newer collections will either have both
-            // sets of data or neither of them.
-            //
-            // If it _is_ the case that one of the pieces of data is missing,
-            // log that as a warning and drop the sled from the collection.
-            // This should only happen for old collections, and only in the
-            // unlikely case that exactly one of the two related requests
-            // failed.
-            //
-            // TODO: Update the database model to reflect the new reality
-            // (oxidecomputer/omicron#6770).
- let Some(omicron_zones) = omicron_zones.remove(&sled_id) else { - warn!( - self.log, - "no sled Omicron zone data present -- assuming that collection was done - by an old Nexus version and dropping sled from it"; - "collection" => %id, - "sled_id" => %sled_id, - ); - continue; - }; + // Convert all the sled config foreign keys back into fully realized + // `OmicronSledConfig`s. We have to clone these: the same + // `OmicronSledConfig` may be referenced up to three times by this + // sled. + let ledgered_sled_config = s + .ledgered_sled_config + .map(|id| { + omicron_sled_configs.get(&id.into()).as_ref() + .map(|c| c.config.clone()) + .ok_or_else(|| { + Error::internal_error( + "missing sled config that we should have fetched", + ) + }) + }) + .transpose()?; + let reconciler_status = s + .reconciler_status + .to_status(|config_id| { + omicron_sled_configs + .get(config_id) + .as_ref() + .map(|c| c.config.clone()) + }) + .map_err(|e| Error::internal_error(&format!("{e:#}")))?; + let last_reconciliation = s + .last_reconciliation_sled_config + .map(|id| { + let last_reconciled_config = omicron_sled_configs + .get(&id.into()) + .as_ref() + .ok_or_else(|| { + Error::internal_error( + "missing sled config that we \ + should have fetched", + ) + })? + .config + .clone(); + Ok::<_, Error>(ConfigReconcilerInventory { + last_reconciled_config, + external_disks: last_reconciliation_disk_results + .remove(&sled_id) + .unwrap_or_default(), + datasets: last_reconciliation_dataset_results + .remove(&sled_id) + .unwrap_or_default(), + zones: last_reconciliation_zone_results + .remove(&sled_id) + .unwrap_or_default(), + }) + }) + .transpose()?; let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, @@ -2365,7 +2841,6 @@ impl DataStore { usable_hardware_threads: u32::from(s.usable_hardware_threads), usable_physical_ram: s.usable_physical_ram.into(), reservoir_size: s.reservoir_size.into(), - omicron_zones, // For disks, zpools, and datasets, the map for a sled ID is // only populated if there is at least one disk/zpool/dataset // for that sled. The `unwrap_or_default` calls cover the case @@ -2382,13 +2857,31 @@ impl DataStore { .get(sled_id.as_untyped_uuid()) .map(|datasets| datasets.to_vec()) .unwrap_or_default(), - omicron_physical_disks_generation: s - .omicron_physical_disks_generation - .into(), + ledgered_sled_config, + reconciler_status, + last_reconciliation, }; sled_agents.insert(sled_id, sled_agent); } + // Check that we consumed all the reconciliation results we found in + // this collection. 
+        bail_unless!(
+            last_reconciliation_disk_results.is_empty(),
+            "found extra config reconciliation disk results: {:?}",
+            last_reconciliation_disk_results.keys()
+        );
+        bail_unless!(
+            last_reconciliation_dataset_results.is_empty(),
+            "found extra config reconciliation dataset results: {:?}",
+            last_reconciliation_dataset_results.keys()
+        );
+        bail_unless!(
+            last_reconciliation_zone_results.is_empty(),
+            "found extra config reconciliation zone results: {:?}",
+            last_reconciliation_zone_results.keys()
+        );
+
        Ok(Collection {
            id,
            errors,
@@ -2408,6 +2901,232 @@ impl DataStore {
    }
}

+#[derive(Debug)]
+struct OmicronSledConfigWithId {
+    id: OmicronSledConfigUuid,
+    config: OmicronSledConfig,
+}
+
+impl IdMappable for OmicronSledConfigWithId {
+    type Id = OmicronSledConfigUuid;
+
+    fn id(&self) -> Self::Id {
+        self.id
+    }
+}
+
+#[derive(Debug)]
+struct ConfigReconcilerFields {
+    ledgered_sled_config: Option<OmicronSledConfigUuid>,
+    last_reconciliation_sled_config: Option<OmicronSledConfigUuid>,
+    reconciler_status: InvConfigReconcilerStatus,
+}
+
+// Helper to build the sets of rows for all the `OmicronSledConfig`s and
+// per-sled config reconciler status rows for an inventory collection.
+#[derive(Debug, Default)]
+struct ConfigReconcilerRows {
+    sled_configs: Vec<InvOmicronSledConfig>,
+    disks: Vec<InvOmicronSledConfigDisk>,
+    datasets: Vec<InvOmicronSledConfigDataset>,
+    zones: Vec<InvOmicronSledConfigZone>,
+    zone_nics: Vec<InvOmicronSledConfigZoneNic>,
+    disk_results: Vec<InvLastReconciliationDiskResult>,
+    dataset_results: Vec<InvLastReconciliationDatasetResult>,
+    zone_results: Vec<InvLastReconciliationZoneResult>,
+    config_reconciler_fields_by_sled:
+        BTreeMap<SledUuid, ConfigReconcilerFields>,
+}
+
+impl ConfigReconcilerRows {
+    fn new(
+        collection_id: CollectionUuid,
+        collection: &Collection,
+    ) -> anyhow::Result<Self> {
+        let mut this = Self::default();
+        for (sled_id, sled_agent) in &collection.sled_agents {
+            this.accumulate(collection_id, *sled_id, sled_agent)?;
+        }
+        Ok(this)
+    }
+
+    fn accumulate(
+        &mut self,
+        collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        sled_agent: &SledAgent,
+    ) -> anyhow::Result<()> {
+        let mut ledgered_sled_config = None;
+        if let Some(config) = &sled_agent.ledgered_sled_config {
+            ledgered_sled_config =
+                Some(self.accumulate_config(collection_id, config)?);
+        }
+
+        let mut last_reconciliation_sled_config = None;
+        if let Some(last_reconciliation) = &sled_agent.last_reconciliation {
+            // If this config exactly matches the ledgered sled config, we can
+            // reuse the foreign key; otherwise, accumulate the new one.
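A compact sketch of the foreign-key reuse described in the comment above and implemented next in the diff (string configs and integer ids stand in for `OmicronSledConfig` and `OmicronSledConfigUuid`):

use std::collections::BTreeMap;

fn store(
    stored: &mut BTreeMap<u32, String>,
    next_id: &mut u32,
    config: &str,
) -> u32 {
    *next_id += 1;
    stored.insert(*next_id, config.to_string());
    *next_id
}

fn main() {
    let mut stored = BTreeMap::new();
    let mut next_id = 0;

    let ledgered = "config-gen-4";
    let last_reconciled = "config-gen-4"; // identical by value

    let ledgered_id = store(&mut stored, &mut next_id, ledgered);
    // Equal configs share one stored row; only a differing config would be
    // accumulated under a fresh id.
    let reconciled_id = if last_reconciled == ledgered {
        ledgered_id
    } else {
        store(&mut stored, &mut next_id, last_reconciled)
    };

    assert_eq!(ledgered_id, reconciled_id);
    assert_eq!(stored.len(), 1);
}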
+ if Some(&last_reconciliation.last_reconciled_config) + == sled_agent.ledgered_sled_config.as_ref() + { + last_reconciliation_sled_config = ledgered_sled_config; + } else { + last_reconciliation_sled_config = + Some(self.accumulate_config( + collection_id, + &last_reconciliation.last_reconciled_config, + )?); + } + + self.disk_results.extend( + last_reconciliation.external_disks.iter().map( + |(disk_id, result)| { + InvLastReconciliationDiskResult::new( + collection_id, + sled_id, + *disk_id, + result.clone(), + ) + }, + ), + ); + self.dataset_results.extend( + last_reconciliation.datasets.iter().map( + |(dataset_id, result)| { + InvLastReconciliationDatasetResult::new( + collection_id, + sled_id, + *dataset_id, + result.clone(), + ) + }, + ), + ); + self.zone_results.extend(last_reconciliation.zones.iter().map( + |(zone_id, result)| { + InvLastReconciliationZoneResult::new( + collection_id, + sled_id, + *zone_id, + result.clone(), + ) + }, + )); + } + + let reconciler_status = match &sled_agent.reconciler_status { + ConfigReconcilerInventoryStatus::NotYetRun => { + InvConfigReconcilerStatus { + reconciler_status_kind: + InvConfigReconcilerStatusKind::NotYetRun, + reconciler_status_sled_config: None, + reconciler_status_timestamp: None, + reconciler_status_duration_secs: None, + } + } + ConfigReconcilerInventoryStatus::Running { + config, + started_at, + running_for, + } => { + // If this config exactly matches the ledgered or + // most-recently-reconciled configs, we can reuse those IDs. + // Otherwise, accumulate a new one. + let reconciler_status_sled_config = if Some(config) + == sled_agent.ledgered_sled_config.as_ref() + { + ledgered_sled_config + } else if Some(config) + == sled_agent + .last_reconciliation + .as_ref() + .map(|lr| &lr.last_reconciled_config) + { + last_reconciliation_sled_config + } else { + Some(self.accumulate_config(collection_id, config)?) 
)
+                };
+                InvConfigReconcilerStatus {
+                    reconciler_status_kind:
+                        InvConfigReconcilerStatusKind::Running,
+                    reconciler_status_sled_config:
+                        reconciler_status_sled_config.map(to_db_typed_uuid),
+                    reconciler_status_timestamp: Some(*started_at),
+                    reconciler_status_duration_secs: Some(
+                        running_for.as_secs_f64(),
+                    ),
+                }
+            }
+            ConfigReconcilerInventoryStatus::Idle { completed_at, ran_for } => {
+                InvConfigReconcilerStatus {
+                    reconciler_status_kind: InvConfigReconcilerStatusKind::Idle,
+                    reconciler_status_sled_config: None,
+                    reconciler_status_timestamp: Some(*completed_at),
+                    reconciler_status_duration_secs: Some(
+                        ran_for.as_secs_f64(),
+                    ),
+                }
+            }
+        };
+
+        self.config_reconciler_fields_by_sled.insert(
+            sled_id,
+            ConfigReconcilerFields {
+                ledgered_sled_config,
+                last_reconciliation_sled_config,
+                reconciler_status,
+            },
+        );
+
+        Ok(())
+    }
+
+    fn accumulate_config(
+        &mut self,
+        collection_id: CollectionUuid,
+        config: &OmicronSledConfig,
+    ) -> anyhow::Result<OmicronSledConfigUuid> {
+        let sled_config_id = OmicronSledConfigUuid::new_v4();
+
+        self.sled_configs.push(InvOmicronSledConfig::new(
+            collection_id,
+            sled_config_id,
+            config.generation,
+            config.remove_mupdate_override,
+        ));
+        self.disks.extend(config.disks.iter().map(|disk| {
+            InvOmicronSledConfigDisk::new(
+                collection_id,
+                sled_config_id,
+                disk.clone(),
+            )
+        }));
+        self.datasets.extend(config.datasets.iter().map(|dataset| {
+            InvOmicronSledConfigDataset::new(
+                collection_id,
+                sled_config_id,
+                dataset,
+            )
+        }));
+        for zone in &config.zones {
+            self.zones.push(InvOmicronSledConfigZone::new(
+                collection_id,
+                sled_config_id,
+                zone,
+            )?);
+            if let Some(nic) = InvOmicronSledConfigZoneNic::new(
+                collection_id,
+                sled_config_id,
+                zone,
+            )? {
+                self.zone_nics.push(nic);
+            }
+        }
+
+        Ok(sled_config_id)
+    }
+}
+
/// Extra interfaces that are not intended (and potentially unsafe) for use in
/// Nexus, but useful for testing and `omdb`
pub trait DataStoreInventoryTest: Send + Sync {
@@ -2468,15 +3187,24 @@ mod test {
    use nexus_db_schema::schema;
    use nexus_inventory::examples::Representative;
    use nexus_inventory::examples::representative;
+    use nexus_inventory::now_db_precision;
+    use nexus_sled_agent_shared::inventory::{
+        ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
+        ConfigReconcilerInventoryStatus, OmicronZoneImageSource,
+    };
    use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL;
    use nexus_types::inventory::BaseboardId;
    use nexus_types::inventory::CabooseWhich;
    use nexus_types::inventory::RotPageWhich;
    use omicron_common::api::external::Error;
    use omicron_test_utils::dev;
-    use omicron_uuid_kinds::CollectionUuid;
+    use omicron_uuid_kinds::{
+        CollectionUuid, DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid,
+    };
    use pretty_assertions::assert_eq;
    use std::num::NonZeroU32;
+    use std::time::Duration;
+    use tufaceous_artifact::ArtifactHash;

    struct CollectionCounts {
        baseboards: usize,
@@ -3000,19 +3728,55 @@ mod test {
            .unwrap();
        assert_eq!(0, count);
        let count =
-            schema::inv_sled_omicron_zones::dsl::inv_sled_omicron_zones
+            schema::inv_omicron_sled_config::dsl::inv_omicron_sled_config
                .select(diesel::dsl::count_star())
                .first_async::<i64>(&conn)
                .await
                .unwrap();
        assert_eq!(0, count);
-        let count = schema::inv_omicron_zone::dsl::inv_omicron_zone
+        let count = schema::inv_last_reconciliation_disk_result::dsl::inv_last_reconciliation_disk_result
            .select(diesel::dsl::count_star())
            .first_async::<i64>(&conn)
            .await
            .unwrap();
        assert_eq!(0, count);
-        let count = schema::inv_omicron_zone_nic::dsl::inv_omicron_zone_nic
+        let count = schema::inv_last_reconciliation_dataset_result::dsl::inv_last_reconciliation_dataset_result
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_last_reconciliation_zone_result::dsl::inv_last_reconciliation_zone_result
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_omicron_sled_config::dsl::inv_omicron_sled_config
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_omicron_sled_config_disk::dsl::inv_omicron_sled_config_disk
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_omicron_sled_config_dataset::dsl::inv_omicron_sled_config_dataset
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_omicron_sled_config_zone::dsl::inv_omicron_sled_config_zone
+            .select(diesel::dsl::count_star())
+            .first_async::<i64>(&conn)
+            .await
+            .unwrap();
+        assert_eq!(0, count);
+        let count = schema::inv_omicron_sled_config_zone_nic::dsl::inv_omicron_sled_config_zone_nic
            .select(diesel::dsl::count_star())
            .first_async::<i64>(&conn)
            .await
@@ -3186,4 +3950,151 @@ mod test {
        db.terminate().await;
        logctx.cleanup_successful();
    }
+
+    #[tokio::test]
+    async fn test_reconciler_status_fields() {
+        // Setup
+        let logctx = dev::test_setup_log("reconciler_status_fields");
+        let db = TestDatabase::new_with_datastore(&logctx.log).await;
+        let (opctx, datastore) = (db.opctx(), db.datastore());
+
+        // Start with a representative collection.
+        let Representative { builder, .. } = representative();
+        let mut collection = builder.build();
+
+        // Mutate the sled-agent contents to test variants of the reconciler
+        // fields:
+        //
+        // * try all three `ConfigReconcilerInventoryStatus` variants
+        // * add a `last_reconciliation` with a mix of success/error results
+        let mut sled_agents = collection.sled_agents.values_mut();
+        let sa1 = sled_agents.next().expect("at least 1 sled agent");
+        let sa2 = sled_agents.next().expect("at least 2 sled agents");
+        let sa3 = sled_agents.next().expect("at least 3 sled agents");
+
+        sa1.reconciler_status = ConfigReconcilerInventoryStatus::NotYetRun;
+        sa1.last_reconciliation = None;
+
+        sa2.reconciler_status = ConfigReconcilerInventoryStatus::Running {
+            config: sa2.ledgered_sled_config.clone().unwrap(),
+            started_at: now_db_precision(),
+            running_for: Duration::from_secs(1),
+        };
+        sa2.last_reconciliation = Some({
+            let make_result = |kind, i| {
+                if i % 2 == 0 {
+                    ConfigReconcilerInventoryResult::Ok
+                } else {
+                    ConfigReconcilerInventoryResult::Err {
+                        message: format!("fake {kind} error {i}"),
+                    }
+                }
+            };
+            ConfigReconcilerInventory {
+                last_reconciled_config: sa2
+                    .ledgered_sled_config
+                    .clone()
+                    .unwrap(),
+                external_disks: (0..10)
+                    .map(|i| {
+                        (PhysicalDiskUuid::new_v4(), make_result("disk", i))
+                    })
+                    .collect(),
+                datasets: (0..10)
+                    .map(|i| (DatasetUuid::new_v4(), make_result("dataset", i)))
+                    .collect(),
+                zones: (0..10)
+                    .map(|i| {
+                        (OmicronZoneUuid::new_v4(), make_result("zone", i))
+                    })
+                    .collect(),
+            }
+        });
+
+        sa3.reconciler_status = ConfigReconcilerInventoryStatus::Idle {
+            completed_at: now_db_precision(),
+            ran_for: Duration::from_secs(5),
+        };
+
+        // Write it to the db; read it back and check it survived.
+ datastore + .inventory_insert_collection(&opctx, &collection) + .await + .expect("failed to insert collection"); + let collection_read = datastore + .inventory_collection_read(&opctx, collection.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection, collection_read); + + // Now delete it and ensure we remove everything. + datastore + .inventory_delete_collection(&opctx, collection.id) + .await + .expect("failed to prune collections"); + + // Read all "inv_" tables and ensure that they're empty + check_all_inv_tables(&datastore, AllInvTables::AreEmpty).await.expect( + "All inv_... tables should be deleted alongside collection", + ); + + // Clean up. + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_zone_image_source() { + // Setup + let logctx = dev::test_setup_log("zone_image_source"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with a representative collection. + let Representative { builder, .. } = representative(); + let mut collection = builder.build(); + + // Mutate some zones on one of the sleds to have various image sources. + { + // Find a sled that has a few zones in one of its sled configs. + let sa = collection + .sled_agents + .values_mut() + .find(|sa| { + sa.ledgered_sled_config + .as_ref() + .map_or(false, |config| config.zones.len() >= 3) + }) + .expect("at least one sled has 3 or more zones"); + let mut zones_iter = + sa.ledgered_sled_config.as_mut().unwrap().zones.iter_mut(); + + let mut z1 = zones_iter.next().expect("at least 1 zone"); + let mut z2 = zones_iter.next().expect("at least 2 zones"); + let mut z3 = zones_iter.next().expect("at least 3 zones"); + z1.image_source = OmicronZoneImageSource::InstallDataset; + z2.image_source = OmicronZoneImageSource::Artifact { + hash: ArtifactHash([0; 32]), + }; + z3.image_source = OmicronZoneImageSource::Artifact { + hash: ArtifactHash([1; 32]), + }; + eprintln!("changed image sources: {z1:?} {z2:?} {z3:?}"); + } + + // Write it to the db; read it back and check it survived. + datastore + .inventory_insert_collection(&opctx, &collection) + .await + .expect("failed to insert collection"); + let collection_read = datastore + .inventory_collection_read(&opctx, collection.id) + .await + .expect("failed to read collection back"); + assert_eq!(collection, collection_read); + + // Clean up. 
+        db.terminate().await;
+        logctx.cleanup_successful();
+    }
}
diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs
index 1d2423ddeac..abfcb113a8b 100644
--- a/nexus/db-queries/src/db/datastore/physical_disk.rs
+++ b/nexus/db-queries/src/db/datastore/physical_disk.rs
@@ -339,7 +339,8 @@ mod test {
    use dropshot::PaginationOrder;
    use nexus_db_lookup::LookupPath;
    use nexus_sled_agent_shared::inventory::{
-        Baseboard, Inventory, InventoryDisk, OmicronZonesConfig, SledRole,
+        Baseboard, ConfigReconcilerInventoryStatus, Inventory, InventoryDisk,
+        SledRole,
    };
    use nexus_types::identity::Asset;
    use omicron_common::api::external::ByteCount;
@@ -692,15 +693,13 @@ mod test {
                sled_id: SledUuid::from_untyped_uuid(sled.id()),
                usable_hardware_threads: 10,
                usable_physical_ram: ByteCount::from(1024 * 1024),
-                omicron_zones: OmicronZonesConfig {
-                    generation: OmicronZonesConfig::INITIAL_GENERATION,
-                    zones: vec![],
-                },
                disks,
                zpools: vec![],
                datasets: vec![],
-                omicron_physical_disks_generation:
-                    omicron_common::api::external::Generation::new(),
+                ledgered_sled_config: None,
+                reconciler_status:
+                    ConfigReconcilerInventoryStatus::NotYetRun,
+                last_reconciliation: None,
            },
        )
        .unwrap();
diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs
index 917705d9e7e..6f8cd6f1774 100644
--- a/nexus/db-schema/src/enums.rs
+++ b/nexus/db-schema/src/enums.rs
@@ -46,6 +46,8 @@ define_enums! {
    InstanceAutoRestartPolicyEnum => "instance_auto_restart",
    InstanceStateEnum => "instance_state_v2",
    InstanceIntendedStateEnum => "instance_intended_state",
+    InvConfigReconcilerStatusKindEnum => "inv_config_reconciler_status_kind",
+    InvZoneImageSourceEnum => "inv_zone_image_source",
    IpAttachStateEnum => "ip_attach_state",
    IpKindEnum => "ip_kind",
    IpPoolResourceTypeEnum => "ip_pool_resource_type",
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs
index 1fd05de42fe..9875d3bc395 100644
--- a/nexus/db-schema/src/schema.rs
+++ b/nexus/db-schema/src/schema.rs
@@ -1582,7 +1582,45 @@ table! {
        usable_hardware_threads -> Int8,
        usable_physical_ram -> Int8,
        reservoir_size -> Int8,
-        omicron_physical_disks_generation -> Int8,
+
+        ledgered_sled_config -> Nullable<Uuid>,
+        last_reconciliation_sled_config -> Nullable<Uuid>,
+        reconciler_status_kind -> crate::enums::InvConfigReconcilerStatusKindEnum,
+        reconciler_status_sled_config -> Nullable<Uuid>,
+        reconciler_status_timestamp -> Nullable<Timestamptz>,
+        reconciler_status_duration_secs -> Nullable<Float8>,
+    }
+}
+
+table! {
+    inv_last_reconciliation_disk_result (inv_collection_id, sled_id, disk_id) {
+        inv_collection_id -> Uuid,
+        sled_id -> Uuid,
+        disk_id -> Uuid,
+
+        error_message -> Nullable<Text>,
+    }
+}
+
+table! {
+    inv_last_reconciliation_dataset_result
+        (inv_collection_id, sled_id, dataset_id)
+    {
+        inv_collection_id -> Uuid,
+        sled_id -> Uuid,
+        dataset_id -> Uuid,
+
+        error_message -> Nullable<Text>,
+    }
+}
+
+table! {
+    inv_last_reconciliation_zone_result (inv_collection_id, sled_id, zone_id) {
+        inv_collection_id -> Uuid,
+        sled_id -> Uuid,
+        zone_id -> Uuid,
+
+        error_message -> Nullable<Text>,
    }
}
@@ -1640,20 +1678,19 @@ table! {
    }
}
table! {
-    inv_sled_omicron_zones (inv_collection_id, sled_id) {
+    inv_omicron_sled_config (inv_collection_id, id) {
        inv_collection_id -> Uuid,
-        time_collected -> Timestamptz,
-        source -> Text,
-        sled_id -> Uuid,
+        id -> Uuid,
        generation -> Int8,
+        remove_mupdate_override -> Nullable<Uuid>,
    }
}
table! {
-    inv_omicron_zone (inv_collection_id, id) {
+    inv_omicron_sled_config_zone (inv_collection_id, sled_config_id, id) {
        inv_collection_id -> Uuid,
-        sled_id -> Uuid,
+        sled_config_id -> Uuid,
        id -> Uuid,
        zone_type -> crate::enums::ZoneTypeEnum,
@@ -1675,12 +1712,16 @@ table! {
        snat_first_port -> Nullable<Int4>,
        snat_last_port -> Nullable<Int4>,
        filesystem_pool -> Nullable<Uuid>,
+
+        image_source -> crate::enums::InvZoneImageSourceEnum,
+        image_artifact_sha256 -> Nullable<Text>,
    }
}
table! {
-    inv_omicron_zone_nic (inv_collection_id, id) {
+    inv_omicron_sled_config_zone_nic (inv_collection_id, sled_config_id, id) {
        inv_collection_id -> Uuid,
+        sled_config_id -> Uuid,
        id -> Uuid,
        name -> Text,
        ip -> Inet,
@@ -1692,6 +1733,38 @@ table! {
    }
}
+table! {
+    inv_omicron_sled_config_dataset (inv_collection_id, sled_config_id, id) {
+        inv_collection_id -> Uuid,
+        sled_config_id -> Uuid,
+        sled_id -> Uuid,
+        id -> Uuid,
+
+        pool_id -> Uuid,
+        kind -> crate::enums::DatasetKindEnum,
+        zone_name -> Nullable<Text>,
+
+        quota -> Nullable<Int8>,
+        reservation -> Nullable<Int8>,
+        compression -> Text,
+    }
+}
+
+table! {
+    inv_omicron_sled_config_disk (inv_collection_id, sled_config_id, id) {
+        inv_collection_id -> Uuid,
+        sled_config_id -> Uuid,
+        sled_id -> Uuid,
+        id -> Uuid,
+
+        vendor -> Text,
+        serial -> Text,
+        model -> Text,
+
+        pool_id -> Uuid,
+    }
+}
+
table! {
    inv_clickhouse_keeper_membership (inv_collection_id, queried_keeper_id) {
        inv_collection_id -> Uuid,
@@ -2152,14 +2225,35 @@ allow_tables_to_appear_in_same_query!(external_ip, project);
allow_tables_to_appear_in_same_query!(external_ip, ip_pool_resource);
allow_tables_to_appear_in_same_query!(external_ip, vmm);
allow_tables_to_appear_in_same_query!(external_ip, network_interface);
-allow_tables_to_appear_in_same_query!(external_ip, inv_omicron_zone);
-allow_tables_to_appear_in_same_query!(external_ip, inv_omicron_zone_nic);
-allow_tables_to_appear_in_same_query!(inv_omicron_zone, inv_omicron_zone_nic);
-allow_tables_to_appear_in_same_query!(network_interface, inv_omicron_zone);
-allow_tables_to_appear_in_same_query!(network_interface, inv_omicron_zone_nic);
+allow_tables_to_appear_in_same_query!(
+    external_ip,
+    inv_omicron_sled_config_zone
+);
+allow_tables_to_appear_in_same_query!(
+    external_ip,
+    inv_omicron_sled_config_zone_nic
+);
+allow_tables_to_appear_in_same_query!(
+    inv_omicron_sled_config_zone,
+    inv_omicron_sled_config_zone_nic
+);
+allow_tables_to_appear_in_same_query!(
+    network_interface,
+    inv_omicron_sled_config_zone
+);
+allow_tables_to_appear_in_same_query!(
+    network_interface,
+    inv_omicron_sled_config_zone_nic
+);
allow_tables_to_appear_in_same_query!(network_interface, inv_collection);
-allow_tables_to_appear_in_same_query!(inv_omicron_zone, inv_collection);
-allow_tables_to_appear_in_same_query!(inv_omicron_zone_nic, inv_collection);
+allow_tables_to_appear_in_same_query!(
+    inv_omicron_sled_config_zone,
+    inv_collection
+);
+allow_tables_to_appear_in_same_query!(
+    inv_omicron_sled_config_zone_nic,
+    inv_collection
+);
allow_tables_to_appear_in_same_query!(external_ip, inv_collection);
allow_tables_to_appear_in_same_query!(external_ip, internet_gateway);
allow_tables_to_appear_in_same_query!(external_ip, internet_gateway_ip_pool);
diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs
index 683d028a136..7c75c94de84 100644
--- a/nexus/inventory/src/builder.rs
+++ b/nexus/inventory/src/builder.rs
@@ -149,12 +149,6 @@ impl CollectionBuilder {
    /// Assemble a complete `Collection` representation
    pub fn build(mut self) -> Collection {
-        // 
This is not strictly necessary. But for testing, it's helpful for - // things to be in sorted order. - for v in self.sleds.values_mut() { - v.omicron_zones.zones.sort_by(|a, b| a.id.cmp(&b.id)); - } - Collection { id: self.rng.id_rng.next(), errors: self.errors.into_iter().map(|e| e.to_string()).collect(), @@ -531,7 +525,6 @@ impl CollectionBuilder { reservoir_size: inventory.reservoir_size, time_collected, sled_id, - omicron_zones: inventory.omicron_zones, disks: inventory.disks.into_iter().map(|d| d.into()).collect(), zpools: inventory .zpools @@ -543,8 +536,9 @@ impl CollectionBuilder { .into_iter() .map(|d| d.into()) .collect(), - omicron_physical_disks_generation: inventory - .omicron_physical_disks_generation, + ledgered_sled_config: inventory.ledgered_sled_config, + reconciler_status: inventory.reconciler_status, + last_reconciliation: inventory.last_reconciliation, }; if let Some(previous) = self.sleds.get(&sled_id) { diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 2e2e4c1ef36..d7f418c32cb 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -409,6 +409,7 @@ mod test { use crate::StaticSledAgentEnumerator; use gateway_messages::SpPort; use id_map::IdMap; + use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; @@ -426,6 +427,52 @@ mod test { use std::net::SocketAddrV6; use std::sync::Arc; + fn dump_sled_config(s: &mut String, config: &OmicronSledConfig) { + let OmicronSledConfig { + generation, + disks, + datasets, + zones, + remove_mupdate_override, + } = config; + + writeln!(s, " generation: {generation}").unwrap(); + writeln!( + s, + " remove_mupdate_override: {remove_mupdate_override:?}" + ) + .unwrap(); + for disk in disks { + writeln!( + s, + " disk {}: {} / {} / {}", + disk.id, + disk.identity.vendor, + disk.identity.model, + disk.identity.serial + ) + .unwrap(); + } + for dataset in datasets { + writeln!( + s, + " dataset {}: {}", + dataset.id, + dataset.name.full_name() + ) + .unwrap(); + } + for zone in zones { + writeln!( + s, + " zone {} type {}", + zone.id, + zone.zone_type.kind().report_str(), + ) + .unwrap(); + } + } + fn dump_collection(collection: &Collection) -> String { // Construct a stable, human-readable summary of the Collection // contents. 
We could use a `Debug` impl for this, but that's not quite @@ -520,21 +567,51 @@ mod test { write!(&mut s, " baseboard {:?}\n", sled_info.baseboard_id) .unwrap(); - write!( - &mut s, - " zone generation: {:?}\n", - sled_info.omicron_zones.generation - ) - .unwrap(); - write!(&mut s, " zones found:\n").unwrap(); - for zone in &sled_info.omicron_zones.zones { - write!( + if let Some(config) = &sled_info.ledgered_sled_config { + writeln!(&mut s, " ledgered sled config:").unwrap(); + dump_sled_config(&mut s, config); + } else { + writeln!(&mut s, " no ledgered sled config").unwrap(); + } + + if let Some(last_reconciliation) = &sled_info.last_reconciliation { + writeln!(&mut s, " last reconciled config:").unwrap(); + dump_sled_config( &mut s, - " zone {} type {}\n", - zone.id, - zone.zone_type.kind().report_str(), - ) - .unwrap(); + &last_reconciliation.last_reconciled_config, + ); + for (id, result) in &last_reconciliation.external_disks { + writeln!(&mut s, " result for disk {id}: {result:?}") + .unwrap(); + } + for (id, result) in &last_reconciliation.datasets { + writeln!(&mut s, " result for dataset {id}: {result:?}") + .unwrap(); + } + for (id, result) in &last_reconciliation.zones { + writeln!(&mut s, " result for zone {id}: {result:?}") + .unwrap(); + } + } else { + writeln!(&mut s, " no completed reconciliation").unwrap(); + } + + match &sled_info.reconciler_status { + ConfigReconcilerInventoryStatus::NotYetRun => { + writeln!(&mut s, " reconciler task not yet run") + .unwrap(); + } + ConfigReconcilerInventoryStatus::Running { config, .. } => { + writeln!( + &mut s, + " reconciler task running with config:" + ) + .unwrap(); + dump_sled_config(&mut s, config); + } + ConfigReconcilerInventoryStatus::Idle { .. } => { + writeln!(&mut s, " reconciler task idle").unwrap(); + } } } diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index a4ef8596c96..a7603195273 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -5,6 +5,7 @@ //! 
Example collections used for testing

use crate::CollectionBuilder;
+use crate::now_db_precision;
use clickhouse_admin_types::ClickhouseKeeperClusterMembership;
use clickhouse_admin_types::KeeperId;
use gateway_client::types::PowerState;
@@ -14,22 +15,35 @@ use gateway_client::types::SpState;
use gateway_client::types::SpType;
use gateway_types::rot::RotSlot;
use nexus_sled_agent_shared::inventory::Baseboard;
+use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
+use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
+use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
use nexus_sled_agent_shared::inventory::Inventory;
use nexus_sled_agent_shared::inventory::InventoryDataset;
use nexus_sled_agent_shared::inventory::InventoryDisk;
use nexus_sled_agent_shared::inventory::InventoryZpool;
+use nexus_sled_agent_shared::inventory::OmicronSledConfig;
use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
use nexus_sled_agent_shared::inventory::SledRole;
use nexus_types::inventory::BaseboardId;
use nexus_types::inventory::CabooseWhich;
use nexus_types::inventory::RotPage;
use nexus_types::inventory::RotPageWhich;
+use nexus_types::inventory::ZpoolName;
use omicron_common::api::external::ByteCount;
-use omicron_common::api::external::Generation;
+use omicron_common::disk::DatasetConfig;
+use omicron_common::disk::DatasetKind;
+use omicron_common::disk::DatasetName;
use omicron_common::disk::DiskVariant;
+use omicron_common::disk::OmicronPhysicalDiskConfig;
+use omicron_common::disk::SharedDatasetConfig;
+use omicron_uuid_kinds::PhysicalDiskUuid;
use omicron_uuid_kinds::SledUuid;
+use omicron_uuid_kinds::ZpoolUuid;
use std::sync::Arc;
+use std::time::Duration;
use strum::IntoEnumIterator;
+use uuid::Uuid;

/// Returns an example Collection used for testing
///
@@ -295,6 +309,54 @@ pub fn representative() -> Representative {
    let sled16: OmicronZonesConfig = serde_json::from_str(sled16_data).unwrap();
    let sled17: OmicronZonesConfig = serde_json::from_str(sled17_data).unwrap();
+    // Convert these to `OmicronSledConfig`s. We'll start with empty disks and
+    // datasets for now, and add to them below for sled14.
+    let mut sled14 = OmicronSledConfig {
+        generation: sled14.generation,
+        disks: Default::default(),
+        datasets: Default::default(),
+        zones: sled14.zones.into_iter().collect(),
+        remove_mupdate_override: None,
+    };
+    let sled16 = OmicronSledConfig {
+        generation: sled16.generation,
+        disks: Default::default(),
+        datasets: Default::default(),
+        zones: sled16.zones.into_iter().collect(),
+        remove_mupdate_override: None,
+    };
+    let sled17 = OmicronSledConfig {
+        generation: sled17.generation,
+        disks: Default::default(),
+        datasets: Default::default(),
+        zones: sled17.zones.into_iter().collect(),
+        remove_mupdate_override: None,
+    };
+
+    // Create iterators producing fixed IDs.
+    let mut disk_id_iter = std::iter::from_fn({
+        // "physicaldisk"
+        let mut value = "70687973-6963-616c-6469-736b00000000"
+            .parse::<Uuid>()
+            .unwrap()
+            .as_u128();
+        move || {
+            value += 1;
+            Some(PhysicalDiskUuid::from_u128(value))
+        }
+    });
+    let mut zpool_id_iter = std::iter::from_fn({
+        // "zpool"
+        let mut value = "7a706f6f-6c00-0000-0000-000000000000"
+            .parse::<Uuid>()
+            .unwrap()
+            .as_u128();
+        move || {
+            value += 1;
+            Some(ZpoolUuid::from_u128(value))
+        }
+    });
+
    // Report some sled agents.
    //
    // This first one will match "sled1_bb"'s baseboard information.
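The fixed-seed iterators above keep example IDs deterministic across test runs; the same trick in isolation (plain `u128`s rather than the typed UUID wrappers, purely illustrative):

fn main() {
    // Start from a fixed 128-bit value (the ASCII bytes of "physicaldisk"
    // padded with zeros) and count upward, so generated IDs are stable
    // across runs.
    let mut value: u128 = 0x70687973_6963_616c_6469_736b_0000_0000;
    let mut ids = std::iter::from_fn(move || {
        value += 1;
        Some(value)
    });
    assert_eq!(ids.next(), Some(0x70687973_6963_616c_6469_736b_0000_0001));
    assert_eq!(ids.next(), Some(0x70687973_6963_616c_6469_736b_0000_0002));
}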
@@ -362,19 +424,41 @@ pub fn representative() -> Representative {
            slot_firmware_versions: vec![Some("EXAMP1".to_string())],
        },
    ];
-    let zpools = vec![InventoryZpool {
-        id: "283f5475-2606-4e83-b001-9a025dbcb8a0".parse().unwrap(),
-        total_size: ByteCount::from(4096),
-    }];
+    let mut zpools = Vec::new();
+    for disk in &disks {
+        let pool_id = zpool_id_iter.next().unwrap();
+        sled14.disks.insert(OmicronPhysicalDiskConfig {
+            identity: disk.identity.clone(),
+            id: disk_id_iter.next().unwrap(),
+            pool_id,
+        });
+        zpools.push(InventoryZpool {
+            id: pool_id,
+            total_size: ByteCount::from(4096),
+        });
+    }
+    let dataset_name = DatasetName::new(
+        ZpoolName::new_external(zpools[0].id),
+        DatasetKind::Debug,
+    );
    let datasets = vec![InventoryDataset {
        id: Some("afc00483-0d7b-4181-87d5-0def937d3cd7".parse().unwrap()),
-        name: "mydataset".to_string(),
+        name: dataset_name.full_name(),
        available: ByteCount::from(1024),
        used: ByteCount::from(0),
        quota: None,
        reservation: None,
        compression: "lz4".to_string(),
    }];
+    sled14.datasets.insert(DatasetConfig {
+        id: datasets[0].id.unwrap(),
+        name: dataset_name,
+        inner: SharedDatasetConfig {
+            compression: datasets[0].compression.parse().unwrap(),
+            quota: datasets[0].quota,
+            reservation: datasets[0].reservation,
+        },
+    });

    builder
        .found_sled_inventory(
@@ -387,10 +471,10 @@
                    revision: 0,
                },
                SledRole::Gimlet,
-                sled14,
                disks,
                zpools,
                datasets,
+                Some(sled14),
            ),
        )
        .unwrap();
@@ -415,10 +499,10 @@
                    revision: 0,
                },
                SledRole::Scrimlet,
-                sled16,
                vec![],
                vec![],
                vec![],
+                Some(sled16),
            ),
        )
        .unwrap();
@@ -438,10 +522,10 @@
                    model: String::from("fellofftruck"),
                },
                SledRole::Gimlet,
-                sled17,
                vec![],
                vec![],
                vec![],
+                Some(sled17),
            ),
        )
        .unwrap();
@@ -459,15 +543,12 @@
                sled_agent_id_unknown,
                Baseboard::Unknown,
                SledRole::Gimlet,
-                // We only have omicron zones for three sleds so use empty zone
-                // info here.
-                OmicronZonesConfig {
-                    generation: Generation::new(),
-                    zones: Vec::new(),
-                },
                vec![],
                vec![],
                vec![],
+                // We only have omicron zones for three sleds so report no sled
+                // config here.
+                None,
            ),
        )
        .unwrap();
@@ -556,11 +637,45 @@
pub fn sled_agent(
    sled_id: SledUuid,
    baseboard: Baseboard,
    sled_role: SledRole,
-    omicron_zones: OmicronZonesConfig,
    disks: Vec<InventoryDisk>,
    zpools: Vec<InventoryZpool>,
    datasets: Vec<InventoryDataset>,
+    ledgered_sled_config: Option<OmicronSledConfig>,
) -> Inventory {
+    // Assume the `ledgered_sled_config` was reconciled successfully. 
+ let last_reconciliation = ledgered_sled_config.clone().map(|config| { + let external_disks = config + .disks + .iter() + .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + let datasets = config + .datasets + .iter() + .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + let zones = config + .zones + .iter() + .map(|z| (z.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + ConfigReconcilerInventory { + last_reconciled_config: config, + external_disks, + datasets, + zones, + } + }); + + let reconciler_status = if last_reconciliation.is_some() { + ConfigReconcilerInventoryStatus::Idle { + completed_at: now_db_precision(), + ran_for: Duration::from_secs(5), + } + } else { + ConfigReconcilerInventoryStatus::NotYetRun + }; + Inventory { baseboard, reservoir_size: ByteCount::from(1024), @@ -569,10 +684,11 @@ pub fn sled_agent( sled_id, usable_hardware_threads: 10, usable_physical_ram: ByteCount::from(1024 * 1024), - omicron_zones, disks, zpools, datasets, - omicron_physical_disks_generation: Generation::new(), + ledgered_sled_config, + reconciler_status, + last_reconciliation, } } diff --git a/nexus/inventory/tests/output/collector_basic.txt b/nexus/inventory/tests/output/collector_basic.txt index b3b70755574..3eed7f61adb 100644 --- a/nexus/inventory/tests/output/collector_basic.txt +++ b/nexus/inventory/tests/output/collector_basic.txt @@ -89,13 +89,19 @@ rot pages found: sled agents found: sled 03265caf-da7d-46c7-b1c2-39fa90ce5c65 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-03265caf-da7d-46c7-b1c2-39fa90ce5c65" }) - zone generation: Generation(3) - zones found: + ledgered sled config: + generation: 3 + remove_mupdate_override: None zone 8b88a56f-3eb6-4d80-ba42-75d867bc427d type oximeter + no completed reconciliation + reconciler task not yet run sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) - zone generation: Generation(3) - zones found: + ledgered sled config: + generation: 3 + remove_mupdate_override: None zone 5125277f-0988-490b-ac01-3bba20cc8f07 type oximeter + no completed reconciliation + reconciler task not yet run errors: diff --git a/nexus/inventory/tests/output/collector_sled_agent_errors.txt b/nexus/inventory/tests/output/collector_sled_agent_errors.txt index 113d5c89c08..e4f3efc8739 100644 --- a/nexus/inventory/tests/output/collector_sled_agent_errors.txt +++ b/nexus/inventory/tests/output/collector_sled_agent_errors.txt @@ -88,9 +88,12 @@ rot pages found: sled agents found: sled 9cb9b78f-5614-440c-b66d-e8e81fab69b0 (Scrimlet) baseboard Some(BaseboardId { part_number: "sim-gimlet", serial_number: "sim-9cb9b78f-5614-440c-b66d-e8e81fab69b0" }) - zone generation: Generation(3) - zones found: + ledgered sled config: + generation: 3 + remove_mupdate_override: None zone 5125277f-0988-490b-ac01-3bba20cc8f07 type oximeter + no completed reconciliation + reconciler task not yet run errors: error: Sled Agent "http://[100::1]:45678": inventory: Communication Error <> diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 81126677e7e..7d3b8187a2a 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -655,11 +655,13 @@ mod test { let mut blueprint_sleds = BTreeMap::new(); for (sled_id, sa) in collection.sled_agents { + let ledgered_sled_config = + 
sa.ledgered_sled_config.unwrap_or_default(); + // Convert the inventory `OmicronZonesConfig`s into // `BlueprintZoneConfig`s. This is going to get more painful over // time as we add to blueprints, but for now we can make this work. - let zones = sa - .omicron_zones + let zones = ledgered_sled_config .zones .into_iter() .map(|config| -> BlueprintZoneConfig { @@ -678,7 +680,7 @@ mod test { sled_id, BlueprintSledConfig { state: SledState::Active, - sled_agent_generation: sa.omicron_zones.generation, + sled_agent_generation: ledgered_sled_config.generation, disks: IdMap::new(), datasets: IdMap::new(), zones, diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 2e889f2d2d4..7822d0cfed8 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -2694,9 +2694,14 @@ pub mod test { let sled_id = { let mut selected_sled_id = None; for (sled_id, sa) in &mut collection.sled_agents { - let nzones_before_retain = sa.omicron_zones.zones.len(); - sa.omicron_zones.zones.retain(|z| !z.zone_type.is_nexus()); - if sa.omicron_zones.zones.len() < nzones_before_retain { + let sa_zones = &mut sa + .ledgered_sled_config + .as_mut() + .expect("example should have a sled config") + .zones; + let nzones_before_retain = sa_zones.len(); + sa_zones.retain(|z| !z.zone_type.is_nexus()); + if sa_zones.len() < nzones_before_retain { selected_sled_id = Some(*sled_id); // Also remove this zone from the blueprint. let mut removed_nexus = None; diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 48314befabd..65f4c0ff334 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -14,7 +14,6 @@ use crate::planner::rng::PlannerRng; use crate::system::SledBuilder; use crate::system::SystemDescription; use nexus_inventory::CollectionBuilderRng; -use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_types::deployment::Blueprint; use nexus_types::deployment::OmicronZoneNic; use nexus_types::deployment::PlanningInput; @@ -486,15 +485,7 @@ impl ExampleSystemBuilder { for (sled_id, sled_cfg) in &blueprint.sleds { let sled_cfg = sled_cfg.clone().into_in_service_sled_config(); - system - .sled_set_omicron_zones( - *sled_id, - OmicronZonesConfig { - generation: sled_cfg.generation, - zones: sled_cfg.zones.into_iter().collect(), - }, - ) - .unwrap(); + system.sled_set_omicron_config(*sled_id, sled_cfg).unwrap(); } // We just ensured that a handful of datasets should exist in @@ -612,8 +603,10 @@ mod tests { nexus_zones.len(), nexus_zones, ); - let nexus_zones = - collection_zones_of_kind(&example.collection, ZoneKind::Nexus); + let nexus_zones = collection_ledgered_zones_of_kind( + &example.collection, + ZoneKind::Nexus, + ); assert_eq!( nexus_zones.len(), 6, @@ -631,7 +624,7 @@ mod tests { internal_dns_zones.len(), internal_dns_zones, ); - let internal_dns_zones = collection_zones_of_kind( + let internal_dns_zones = collection_ledgered_zones_of_kind( &example.collection, ZoneKind::InternalDns, ); @@ -652,7 +645,7 @@ mod tests { external_dns_zones.len(), external_dns_zones, ); - let external_dns_zones = collection_zones_of_kind( + let external_dns_zones = collection_ledgered_zones_of_kind( &example.collection, ZoneKind::ExternalDns, ); @@ -673,7 +666,7 @@ mod tests { crucible_pantry_zones.len(), crucible_pantry_zones, ); - let 
crucible_pantry_zones = collection_zones_of_kind( + let crucible_pantry_zones = collection_ledgered_zones_of_kind( &example.collection, ZoneKind::CruciblePantry, ); @@ -700,12 +693,12 @@ mod tests { .collect() } - fn collection_zones_of_kind( + fn collection_ledgered_zones_of_kind( collection: &Collection, kind: ZoneKind, ) -> Vec<&OmicronZoneConfig> { collection - .all_omicron_zones() + .all_ledgered_omicron_zones() .filter(|zone| zone.zone_type.kind() == kind) .collect() } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 38f85e772fc..04b61483705 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -218,7 +218,13 @@ impl<'a> Planner<'a> { .inventory .sled_agents .get(&sled_id) - .map(|sa| sa.omicron_physical_disks_generation) + // TODO-correctness For now this is correct, because sled-agent + // doesn't ledger a config until it's applied it. However, once + // https://github.com/oxidecomputer/omicron/pull/8064 lands, + // sled-agent will ledger a config and then later reconcile it; do + // we need to wait for that reconciliation to decommission disks? + .and_then(|sa| sa.ledgered_sled_config.as_ref()) + .map(|config| config.generation) else { // There is no current inventory for the sled agent, so we cannot // decommission any disks. @@ -418,14 +424,18 @@ impl<'a> Planner<'a> { as_of_generation, ready_for_cleanup, } if !ready_for_cleanup => { - if sled_inv.omicron_zones.generation >= as_of_generation - && !sled_inv - .omicron_zones - .zones - .iter() - .any(|z| z.id == zone.id) - { - zones_ready_for_cleanup.push(zone.id); + // TODO-correctness For now this is correct, because + // sled-agent doesn't ledger a config until it's applied it. + // However, as a part of landing + // https://github.com/oxidecomputer/omicron/pull/8064, + // this needs to change to check the last reconciled config + // instead of just the ledgered config. + if let Some(config) = &sled_inv.ledgered_sled_config { + if config.generation >= as_of_generation + && !config.zones.contains_key(&zone.id) + { + zones_ready_for_cleanup.push(zone.id); + } } } BlueprintZoneDisposition::InService @@ -558,16 +568,25 @@ impl<'a> Planner<'a> { // the NTP zone or switching between an internal NTP vs. boundary // NTP zone), we'll need to be careful how we do it to avoid a // problem here. + // + // TODO-cleanup Once + // https://github.com/oxidecomputer/omicron/pull/8064 lands, the + // above comment will be overly conservative; sled-agent won't + // reject configs just because time isn't sync'd yet. We may be able + // to remove this check entirely. (It's probably also fine to keep + // it for now; removing it just saves us an extra planning iteration + // when adding a new sled.) 
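The guarded inventory lookup implemented just below (an optional ledgered config, then a scan of its zones) has this shape in miniature (toy types, illustrative only):

fn main() {
    // A sled with no ledgered config can't be running NTP yet; otherwise,
    // scan the config's zone list for any NTP-flavored zone.
    let ledgered: Option<Vec<&str>> = Some(vec!["crucible", "boundary_ntp"]);
    let has_ntp = ledgered
        .as_ref()
        .map_or(false, |zones| zones.iter().any(|z| z.contains("ntp")));
    assert!(has_ntp);
}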
let has_ntp_inventory = self .inventory .sled_agents .get(&sled_id) .map(|sled_agent| { - sled_agent - .omicron_zones - .zones - .iter() - .any(|z| z.zone_type.is_ntp()) + sled_agent.ledgered_sled_config.as_ref().map_or( + false, + |config| { + config.zones.iter().any(|z| z.zone_type.is_ntp()) + }, + ) }) .unwrap_or(false); if !has_ntp_inventory { @@ -995,7 +1014,6 @@ pub(crate) mod test { use clickhouse_admin_types::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::KeeperId; use expectorate::assert_contents; - use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDiffSummary; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; @@ -1180,18 +1198,15 @@ pub(crate) mod test { // example.collection -- this should be addressed via API improvements. example .system - .sled_set_omicron_zones(new_sled_id, { - let sled_cfg = blueprint4 + .sled_set_omicron_config( + new_sled_id, + blueprint4 .sleds .get(&new_sled_id) .expect("blueprint should contain zones for new sled") .clone() - .into_in_service_sled_config(); - OmicronZonesConfig { - generation: sled_cfg.generation, - zones: sled_cfg.zones.into_iter().collect(), - } - }) + .into_in_service_sled_config(), + ) .unwrap(); let collection = example.system.to_collection_builder().unwrap().build(); @@ -2033,19 +2048,10 @@ pub(crate) mod test { let mut collection = example.collection; let input = example.input; - // The initial collection configuration has generation 1 // The initial blueprint configuration has generation 2 let (sled_id, sled_config) = blueprint1.sleds.first_key_value().unwrap(); assert_eq!(sled_config.sled_agent_generation, Generation::from_u32(2)); - assert_eq!( - collection - .sled_agents - .get(&sled_id) - .unwrap() - .omicron_physical_disks_generation, - Generation::new() - ); // All disks should have an `InService` disposition and `Active` state for disk in &sled_config.disks { @@ -2130,7 +2136,10 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .omicron_physical_disks_generation = Generation::from_u32(3); + .ledgered_sled_config + .as_mut() + .unwrap() + .generation = Generation::from_u32(3); let blueprint3 = Planner::new_based_on( logctx.log.clone(), @@ -4078,6 +4087,7 @@ pub(crate) mod test { // * same inventory as above // * inventory reports a new generation (but zone still running) // * inventory reports zone not running (but still the old generation) + eprintln!("planning with no inventory change..."); assert_planning_makes_no_changes( &logctx.log, &blueprint2, @@ -4085,6 +4095,15 @@ pub(crate) mod test { &collection, TEST_NAME, ); + // TODO-cleanup These checks depend on `last_reconciled_config`, which + // is not yet populated; uncomment these and check them by mutating + // `last_reconciled_config` once + // https://github.com/oxidecomputer/omicron/pull/8064 lands. We could + // just mutate `ledgered_sled_config` in the meantime (as this + // commented-out code does below), but that's not really checking what + // we care about. 
+ /* + eprintln!("planning with generation bump but zone still running..."); assert_planning_makes_no_changes( &logctx.log, &blueprint2, @@ -4095,12 +4114,15 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .omicron_zones + .ledgered_sled_config + .as_mut() + .unwrap() .generation = bp2_generation; collection }, TEST_NAME, ); + eprintln!("planning with zone gone but generation not bumped..."); assert_planning_makes_no_changes( &logctx.log, &blueprint2, @@ -4111,23 +4133,28 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .omicron_zones + .ledgered_sled_config + .as_mut() + .unwrap() .zones .retain(|z| z.id != nexus_config.id); collection }, TEST_NAME, ); + */ // Now make both changes to the inventory. { - let zones = &mut collection + let config = &mut collection .sled_agents .get_mut(&sled_id) .unwrap() - .omicron_zones; - zones.generation = bp2_generation; - zones.zones.retain(|z| z.id != nexus_config.id); + .ledgered_sled_config + .as_mut() + .unwrap(); + config.generation = bp2_generation; + config.zones.retain(|z| z.id != nexus_config.id); } // Run the planner. It mark our Nexus zone as ready for cleanup now that @@ -4269,13 +4296,15 @@ pub(crate) mod test { // Make the inventory changes necessary for cleanup to proceed. { - let zones = &mut collection + let config = &mut collection .sled_agents .get_mut(&sled_id) .unwrap() - .omicron_zones; - zones.generation = bp2_generation; - zones.zones.retain(|z| z.id != internal_dns_config.id); + .ledgered_sled_config + .as_mut() + .unwrap(); + config.generation = bp2_generation; + config.zones.retain(|z| z.id != internal_dns_config.id); } // Run the planner. It should mark our internal DNS zone as ready for diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 472bfe3bfa5..12219daaf81 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -13,11 +13,12 @@ use ipnet::Ipv6Net; use ipnet::Ipv6Subnets; use nexus_inventory::CollectionBuilder; use nexus_sled_agent_shared::inventory::Baseboard; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::InventoryDataset; use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::InventoryZpool; -use nexus_sled_agent_shared::inventory::OmicronZonesConfig; +use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::SledRole; use nexus_types::deployment::ClickhousePolicy; use nexus_types::deployment::CockroachDbClusterVersion; @@ -307,7 +308,7 @@ impl SystemDescription { sled.unique, sled.hardware, hardware_slot, - sled.omicron_zones, + sled.sled_config, sled.npools, ); self.sleds.insert(sled_id, Arc::new(sled)); @@ -367,7 +368,7 @@ impl SystemDescription { !self.sleds.is_empty() } - /// Set Omicron zones for a sled. + /// Set Omicron config for a sled. /// /// The zones will be reported in the collection generated by /// [`Self::to_collection_builder`]. @@ -376,17 +377,23 @@ impl SystemDescription { /// /// # Notes /// - /// It is okay to call `sled_set_omicron_zones` in ways that wouldn't + /// It is okay to call `sled_set_omicron_config` in ways that wouldn't /// happen in production, such as to test illegal states. 
-    pub fn sled_set_omicron_zones(
+    pub fn sled_set_omicron_config(
        &mut self,
        sled_id: SledUuid,
-        omicron_zones: OmicronZonesConfig,
+        sled_config: OmicronSledConfig,
    ) -> anyhow::Result<&mut Self> {
        let sled = self.sleds.get_mut(&sled_id).with_context(|| {
            format!("attempted to access sled {} not found in system", sled_id)
        })?;
-        Arc::make_mut(sled).inventory_sled_agent.omicron_zones = omicron_zones;
+        let sled = Arc::make_mut(sled);
+
+        sled.inventory_sled_agent.ledgered_sled_config = Some(sled_config);
+        sled.inventory_sled_agent.reconciler_status =
+            ConfigReconcilerInventoryStatus::NotYetRun;
+        sled.inventory_sled_agent.last_reconciliation = None;
+
        Ok(self)
    }
@@ -490,7 +497,7 @@ pub struct SledBuilder {
    hardware: SledHardware,
    hardware_slot: Option<u16>,
    sled_role: SledRole,
-    omicron_zones: OmicronZonesConfig,
+    sled_config: OmicronSledConfig,
    npools: u8,
}
@@ -508,11 +515,7 @@ impl SledBuilder {
            hardware: SledHardware::Gimlet,
            hardware_slot: None,
            sled_role: SledRole::Gimlet,
-            omicron_zones: OmicronZonesConfig {
-                // The initial generation is the one with no zones.
-                generation: OmicronZonesConfig::INITIAL_GENERATION,
-                zones: Vec::new(),
-            },
+            sled_config: OmicronSledConfig::default(),
            npools: Self::DEFAULT_NPOOLS,
        }
    }
@@ -601,7 +604,7 @@ impl Sled {
        unique: Option<String>,
        hardware: SledHardware,
        hardware_slot: u16,
-        omicron_zones: OmicronZonesConfig,
+        sled_config: OmicronSledConfig,
        nzpools: u8,
    ) -> Sled {
        use typed_rng::TypedUuidRng;
@@ -688,7 +691,6 @@ impl Sled {
                sled_id,
                usable_hardware_threads: 10,
                usable_physical_ram: ByteCount::from(1024 * 1024),
-                omicron_zones,
                // Populate disks, appearing like a real device.
                disks: zpools
                    .values()
@@ -714,7 +716,9 @@
                    })
                    .collect(),
                datasets: vec![],
-                omicron_physical_disks_generation: Generation::new(),
+                ledgered_sled_config: Some(sled_config),
+                reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun,
+                last_reconciliation: None,
            }
        };
@@ -849,11 +853,12 @@ impl Sled {
            sled_id,
            usable_hardware_threads: inv_sled_agent.usable_hardware_threads,
            usable_physical_ram: inv_sled_agent.usable_physical_ram,
-            omicron_zones: inv_sled_agent.omicron_zones.clone(),
            disks: vec![],
            zpools: vec![],
            datasets: vec![],
-            omicron_physical_disks_generation: Generation::new(),
+            ledgered_sled_config: inv_sled_agent.ledgered_sled_config.clone(),
+            reconciler_status: inv_sled_agent.reconciler_status.clone(),
+            last_reconciliation: inv_sled_agent.last_reconciliation.clone(),
        };

        Sled {
diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs
index 02d49e7f590..a1c1869fed9 100644
--- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs
+++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs
@@ -17,9 +17,7 @@ use nexus_db_lookup::LookupPath;
use nexus_db_model::Ipv4NatValues;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::DataStore;
-use nexus_sled_agent_shared::inventory::{
-    OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig,
-};
+use nexus_sled_agent_shared::inventory::OmicronZoneType;
use omicron_common::address::{MAX_PORT, MIN_PORT};
use omicron_uuid_kinds::GenericUuid;
use serde_json::json;
@@ -127,8 +125,22 @@ impl BackgroundTask for ServiceZoneNatTracker {
                let sled_address = oxnet::Ipv6Net::host_net(*sled.ip);

-                let zones_config: OmicronZonesConfig = sa.omicron_zones;
-                let zones: Vec<OmicronZoneConfig> = zones_config.zones;
+                // TODO-correctness Looking at inventory here is a little
+                // sketchy. 
We currently check the most-recently-ledgered zones + // which tells us what services sled-agent thinks it's supposed + // to be running. It might be better to check either: + // + // * `sa.last_reconciliation` (to know what zones are actually + // running; this requires + // https://github.com/oxidecomputer/omicron/pull/8064 landing) + // if the goal is to sync what's actually on the sled + // * a rendezvous table populated by reconfigurator if the goal + // is to sync with what Nexus thinks is supposed to be + // running on the sled + let zones = sa + .ledgered_sled_config + .map(|config| config.zones) + .unwrap_or_default(); for zone in zones { let zone_type: OmicronZoneType = zone.zone_type; diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 406ccb6e7bf..1fa3651dd74 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -10,6 +10,7 @@ use nexus_config::SchemaConfig; use nexus_db_lookup::DataStoreConnection; use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SCHEMA_VERSION as LATEST_SCHEMA_VERSION; +use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::{AllSchemaVersions, SchemaVersion}; use nexus_db_queries::db::DISALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::pub_test_utils::TestDatabase; @@ -25,6 +26,7 @@ use pretty_assertions::{assert_eq, assert_ne}; use semver::Version; use similar_asserts; use slog::Logger; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::net::IpAddr; use std::sync::Mutex; @@ -64,10 +66,16 @@ async fn test_setup<'a>( // Only returns an error if the transaction failed to commit. async fn apply_update_as_transaction_inner( client: &omicron_test_utils::dev::db::Client, - sql: &str, + step: &SchemaUpgradeStep, ) -> Result<(), tokio_postgres::Error> { client.batch_execute("BEGIN;").await.expect("Failed to BEGIN transaction"); - client.batch_execute(&sql).await.expect("Failed to execute update"); + if let Err(err) = client.batch_execute(step.sql()).await { + panic!( + "Failed to execute update step {}: {}", + step.label(), + InlineErrorChain::new(&err) + ); + }; client.batch_execute("COMMIT;").await?; Ok(()) } @@ -78,13 +86,16 @@ async fn apply_update_as_transaction_inner( async fn apply_update_as_transaction( log: &Logger, client: &omicron_test_utils::dev::db::Client, - sql: &str, + step: &SchemaUpgradeStep, ) { loop { - match apply_update_as_transaction_inner(client, sql).await { + match apply_update_as_transaction_inner(client, step).await { Ok(()) => break, Err(err) => { - warn!(log, "Failed to apply update as transaction"; "err" => err.to_string()); + warn!( + log, "Failed to apply update as transaction"; + InlineErrorChain::new(&err), + ); client .batch_execute("ROLLBACK;") .await @@ -95,7 +106,7 @@ async fn apply_update_as_transaction( continue; } } - panic!("Failed to apply update: {err}"); + panic!("Failed to apply update {}: {err}", step.label()); } } } @@ -142,7 +153,7 @@ async fn apply_update( ); for _ in 0..times_to_apply { - apply_update_as_transaction(&log, &client, step.sql()).await; + apply_update_as_transaction(&log, &client, step).await; // The following is a set of "versions exempt from being // re-applied" multiple times. PLEASE AVOID ADDING TO THIS LIST.
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index a793ac9bc1a..79c79a9a1e3 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -19,14 +19,15 @@ pub use gateway_client::types::PowerState; pub use gateway_client::types::RotImageError; pub use gateway_client::types::SpType; pub use gateway_types::rot::RotSlot; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::InventoryDataset; use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::InventoryZpool; +use nexus_sled_agent_shared::inventory::OmicronSledConfig; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; -use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_sled_agent_shared::inventory::SledRole; use omicron_common::api::external::ByteCount; -use omicron_common::api::external::Generation; pub use omicron_common::api::internal::shared::NetworkInterface; pub use omicron_common::api::internal::shared::NetworkInterfaceKind; pub use omicron_common::api::internal::shared::SourceNatConfig; @@ -166,11 +167,15 @@ impl Collection { .and_then(|by_bb| by_bb.get(baseboard_id)) } - /// Iterate over all the Omicron zones in the collection - pub fn all_omicron_zones( + /// Iterate over all the Omicron zones in the sled-agent ledgers of this + /// collection + pub fn all_ledgered_omicron_zones( &self, ) -> impl Iterator<Item = &OmicronZoneConfig> { - self.sled_agents.values().flat_map(|sa| sa.omicron_zones.zones.iter()) + self.sled_agents + .values() + .filter_map(|sa| sa.ledgered_sled_config.as_ref()) + .flat_map(|config| config.zones.iter()) } /// Iterate over the sled ids of sleds identified as Scrimlets @@ -543,17 +548,10 @@ pub struct SledAgent { pub usable_hardware_threads: u32, pub usable_physical_ram: ByteCount, pub reservoir_size: ByteCount, - pub omicron_zones: OmicronZonesConfig, pub disks: Vec<PhysicalDisk>, pub zpools: Vec<Zpool>, pub datasets: Vec<Dataset>, - /// As part of reconfigurator planning we need to know the control plane - /// disks configuration that the sled-agent has seen last. Specifically, - /// this allows the planner to know if a disk expungement has been seen by - /// the sled-agent, so that the planner can decommission the expunged disk. - /// - /// This field corresponds to the `generation` field in - /// `OmicronPhysicalDisksConfig` that is stored in the blueprint and sent to - /// the sled-agent via the executor over the internal API.
- pub omicron_physical_disks_generation: Generation, + pub ledgered_sled_config: Option<OmicronSledConfig>, + pub reconciler_status: ConfigReconcilerInventoryStatus, + pub last_reconciliation: Option<ConfigReconcilerInventory>, } diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 3aeac93c776..4279de8605d 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3513,6 +3513,147 @@ } ] }, + "ConfigReconcilerInventory": { + "description": "Describes the last attempt made by the sled-agent-config-reconciler to reconcile the current sled config against the actual state of the sled.", + "type": "object", + "properties": { + "datasets": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ConfigReconcilerInventoryResult" + } + }, + "external_disks": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ConfigReconcilerInventoryResult" + } + }, + "last_reconciled_config": { + "$ref": "#/components/schemas/OmicronSledConfig" + }, + "zones": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ConfigReconcilerInventoryResult" + } + } + }, + "required": [ + "datasets", + "external_disks", + "last_reconciled_config", + "zones" + ] + }, + "ConfigReconcilerInventoryResult": { + "oneOf": [ + { + "type": "object", + "properties": { + "result": { + "type": "string", + "enum": [ + "ok" + ] + } + }, + "required": [ + "result" + ] + }, + { + "type": "object", + "properties": { + "message": { + "type": "string" + }, + "result": { + "type": "string", + "enum": [ + "err" + ] + } + }, + "required": [ + "message", + "result" + ] + } + ] + }, + "ConfigReconcilerInventoryStatus": { + "description": "Status of the sled-agent-config-reconciler task.", + "oneOf": [ + { + "description": "The reconciler task has not yet run for the first time since sled-agent started.", + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "not_yet_run" + ] + } + }, + "required": [ + "status" + ] + }, + { + "description": "The reconciler task is actively running.", + "type": "object", + "properties": { + "config": { + "$ref": "#/components/schemas/OmicronSledConfig" + }, + "running_for": { + "$ref": "#/components/schemas/Duration" + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "status": { + "type": "string", + "enum": [ + "running" + ] + } + }, + "required": [ + "config", + "running_for", + "started_at", + "status" + ] + }, + { + "description": "The reconciler task is currently idle, but previously did complete a reconciliation attempt.\n\nThis variant does not include the `OmicronSledConfig` used in the last attempt, because that's always available via [`ConfigReconcilerInventory::last_reconciled_config`].", + "type": "object", + "properties": { + "completed_at": { + "type": "string", + "format": "date-time" + }, + "ran_for": { + "$ref": "#/components/schemas/Duration" + }, + "status": { + "type": "string", + "enum": [ + "idle" + ] + } + }, + "required": [ + "completed_at", + "ran_for", + "status" + ] + } + ] + }, "Cpuid": { "description": "A set of CPUID values to expose to a guest.", "type": "object", @@ -4889,11 +5030,24 @@ "$ref": "#/components/schemas/InventoryDisk" } }, - "omicron_physical_disks_generation": { - "$ref": "#/components/schemas/Generation" + "last_reconciliation": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/ConfigReconcilerInventory" + } + ] }, - "omicron_zones": { - "$ref": "#/components/schemas/OmicronZonesConfig" + "ledgered_sled_config": { + "nullable":
true, + "allOf": [ + { + "$ref": "#/components/schemas/OmicronSledConfig" + } + ] + }, + "reconciler_status": { + "$ref": "#/components/schemas/ConfigReconcilerInventoryStatus" }, "reservoir_size": { "$ref": "#/components/schemas/ByteCount" @@ -4926,8 +5080,7 @@ "baseboard", "datasets", "disks", - "omicron_physical_disks_generation", - "omicron_zones", + "reconciler_status", "reservoir_size", "sled_agent_address", "sled_id", @@ -5991,31 +6144,6 @@ } ] }, - "OmicronZonesConfig": { - "description": "Describes the set of Omicron-managed zones running on a sled", - "type": "object", - "properties": { - "generation": { - "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "zones": { - "description": "list of running zones", - "type": "array", - "items": { - "$ref": "#/components/schemas/OmicronZoneConfig" - } - } - }, - "required": [ - "generation", - "zones" - ] - }, "P9fs": { "description": "Describes a filesystem to expose through a P9 device.\n\nThis is only supported by Propolis servers compiled with the `falcon` feature.", "type": "object", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index dfa1f2f5f4e..257db58c75f 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3569,6 +3569,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.sled_role AS ENUM ( 'gimlet' ); +CREATE TYPE IF NOT EXISTS omicron.public.inv_config_reconciler_status_kind +AS ENUM ( + 'not-yet-run', + 'running', + 'idle' +); + -- observations from and about sled agents CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( -- where this observation came from @@ -3599,8 +3606,42 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( usable_physical_ram INT8 NOT NULL, reservoir_size INT8 CHECK (reservoir_size < usable_physical_ram) NOT NULL, - -- The last generation of OmicronPhysicalDisksConfig seen by the sled-agent - omicron_physical_disks_generation INT8 NOT NULL, + -- Currently-ledgered `OmicronSledConfig` + -- (foreign key into `inv_omicron_sled_config` table) + -- This is optional because newly-added sleds don't yet have a config. + ledgered_sled_config UUID, + + -- Most-recently-reconciled `OmicronSledConfig` + -- (foreign key into `inv_omicron_sled_config` table) + -- This is optional because the reconciler may not have run yet + last_reconciliation_sled_config UUID, + + -- Columns making up the status of the config reconciler. 
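+ -- In effect this is a tagged union: `reconciler_status_kind` selects
+ -- which of the nullable columns below must be populated, enforced by the
+ -- two CHECK constraints at the end of this table definition.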
+ reconciler_status_kind omicron.public.inv_config_reconciler_status_kind NOT NULL, + -- (foreign key into `inv_omicron_sled_config` table) + -- only present if `reconciler_status_kind = 'running'` + reconciler_status_sled_config UUID, + -- only present if `reconciler_status_kind != 'not-yet-run'` + reconciler_status_timestamp TIMESTAMPTZ, + -- only present if `reconciler_status_kind != 'not-yet-run'` + reconciler_status_duration_secs FLOAT, + + CONSTRAINT reconciler_status_sled_config_present_if_running CHECK ( + (reconciler_status_kind = 'running' + AND reconciler_status_sled_config IS NOT NULL) + OR + (reconciler_status_kind != 'running' + AND reconciler_status_sled_config IS NULL) + ), + CONSTRAINT reconciler_status_timing_present_unless_not_yet_run CHECK ( + (reconciler_status_kind = 'not-yet-run' + AND reconciler_status_timestamp IS NULL + AND reconciler_status_duration_secs IS NULL) + OR + (reconciler_status_kind != 'not-yet-run' + AND reconciler_status_timestamp IS NOT NULL + AND reconciler_status_duration_secs IS NOT NULL) + ), PRIMARY KEY (inv_collection_id, sled_id) ); @@ -3705,25 +3746,77 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_dataset ( PRIMARY KEY (inv_collection_id, sled_id, name) ); --- TODO: This table is vestigial and can be combined with `inv_sled_agent`. See --- https://github.com/oxidecomputer/omicron/issues/6770. -CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_omicron_zones ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- ID of this sled config. A given inventory report from a sled agent may + -- contain 0-3 sled configs, so we generate these IDs on insertion and + -- record them as the foreign keys in `inv_sled_agent`. 
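+ -- ("0-3" because `inv_sled_agent` can point at up to three distinct
+ -- configs per sled: the ledgered config, the last-reconciled config, and
+ -- the config currently being reconciled.)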
+ id UUID NOT NULL, + + -- config generation + generation INT8 NOT NULL, + + -- remove mupdate override ID, if set + remove_mupdate_override UUID, + + PRIMARY KEY (inv_collection_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_disk_result ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, - -- when this observation was made - time_collected TIMESTAMPTZ NOT NULL, - -- URL of the sled agent that reported this data - source TEXT NOT NULL, -- unique id for this sled (should be foreign keys into `sled` table, though -- it's conceivable a sled will report an id that we don't know about) sled_id UUID NOT NULL, - -- OmicronZonesConfig generation reporting these zones - generation INT8 NOT NULL, + -- unique id for this physical disk + disk_id UUID NOT NULL, - PRIMARY KEY (inv_collection_id, sled_id) + -- error message; if NULL, an "ok" result + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, disk_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_dataset_result ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about) + sled_id UUID NOT NULL, + + -- unique id for this dataset + dataset_id UUID NOT NULL, + + -- error message; if NULL, an "ok" result + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, dataset_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_zone_result ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about) + sled_id UUID NOT NULL, + + -- unique id for this zone + zone_id UUID NOT NULL, + + -- error message; if NULL, an "ok" result + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, zone_id) ); CREATE TYPE IF NOT EXISTS omicron.public.zone_type AS ENUM ( @@ -3741,15 +3834,18 @@ CREATE TYPE IF NOT EXISTS omicron.public.zone_type AS ENUM ( 'oximeter' ); --- observations from sled agents about Omicron-managed zones -CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_zone ( +CREATE TYPE IF NOT EXISTS omicron.public.inv_zone_image_source AS ENUM ( + 'install_dataset', + 'artifact' +); + +-- `zones` portion of an `OmicronSledConfig` observed from sled-agent +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_zone ( -- where this observation came from - -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, - -- unique id for this sled (should be foreign keys into `sled` table, though - -- it's conceivable a sled will report an id that we don't know about) - sled_id UUID NOT NULL, + -- (foreign key into `inv_omicron_sled_config` table) + sled_config_id UUID NOT NULL, -- unique id for this zone id UUID NOT NULL, @@ -3781,7 +3877,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_zone ( dataset_zpool_name TEXT, -- Zones with external IPs have an associated NIC and sockaddr for listening - -- (first is a foreign key into `inv_omicron_zone_nic`) + -- (first is a foreign key into `inv_omicron_sled_config_zone_nic`) nic_id UUID, -- Properties for internal DNS servers @@ -3810,14 +3906,34 @@ CREATE TABLE IF NOT EXISTS 
omicron.public.inv_omicron_zone ( -- Eventually, that nullability should be removed. filesystem_pool UUID, - PRIMARY KEY (inv_collection_id, id) + -- zone image source + image_source omicron.public.inv_zone_image_source NOT NULL, + image_artifact_sha256 STRING(64), + + CONSTRAINT zone_image_source_artifact_hash_present CHECK ( + (image_source = 'artifact' + AND image_artifact_sha256 IS NOT NULL) + OR + (image_source != 'artifact' + AND image_artifact_sha256 IS NULL) + ), + + PRIMARY KEY (inv_collection_id, sled_config_id, id) ); -CREATE INDEX IF NOT EXISTS inv_omicron_zone_nic_id ON omicron.public.inv_omicron_zone - (nic_id) STORING (sled_id, primary_service_ip, second_service_ip, snat_ip); +CREATE INDEX IF NOT EXISTS inv_omicron_sled_config_zone_nic_id + ON omicron.public.inv_omicron_sled_config_zone (nic_id) + STORING ( + primary_service_ip, + second_service_ip, + snat_ip + ); -CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_zone_nic ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_zone_nic ( + -- where this observation came from inv_collection_id UUID NOT NULL, + + sled_config_id UUID NOT NULL, id UUID NOT NULL, name TEXT NOT NULL, ip INET NOT NULL, @@ -3827,7 +3943,49 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_zone_nic ( is_primary BOOLEAN NOT NULL, slot INT2 NOT NULL, - PRIMARY KEY (inv_collection_id, id) + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_dataset ( + -- where this observation came from + inv_collection_id UUID NOT NULL, + + -- foreign key into the `inv_omicron_sled_config` table + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + + pool_id UUID NOT NULL, + kind omicron.public.dataset_kind NOT NULL, + -- Only valid if kind = zone + zone_name TEXT, + + quota INT8, + reservation INT8, + compression TEXT NOT NULL, + + CONSTRAINT zone_name_for_zone_kind CHECK ( + (kind != 'zone') OR + (kind = 'zone' AND zone_name IS NOT NULL) + ), + + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_disk ( + -- where this observation came from + inv_collection_id UUID NOT NULL, + + -- foreign key into the `inv_omicron_sled_config` table + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + + vendor TEXT NOT NULL, + serial TEXT NOT NULL, + model TEXT NOT NULL, + + pool_id UUID NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_config_id, id) ); CREATE TABLE IF NOT EXISTS omicron.public.inv_clickhouse_keeper_membership ( @@ -4033,11 +4191,6 @@ CREATE TYPE IF NOT EXISTS omicron.public.bp_zone_image_source AS ENUM ( ); -- description of omicron zones specified in a blueprint --- --- This is currently identical to `inv_omicron_zone`, except that the foreign --- keys reference other blueprint tables intead of inventory tables. We expect --- their sameness to diverge over time as either inventory or blueprints (or --- both) grow context-specific properties. 
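+-- (The nearest inventory analogue is now `inv_omicron_sled_config_zone`,
+-- which is keyed by `sled_config_id` rather than `sled_id`.)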
CREATE TABLE IF NOT EXISTS omicron.public.bp_omicron_zone ( -- foreign key into the `blueprint` table blueprint_id UUID NOT NULL, @@ -5518,7 +5671,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '143.0.0', NULL) + (TRUE, NOW(), NOW(), '144.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/inventory-omicron-sled-config/README.md b/schema/crdb/inventory-omicron-sled-config/README.md new file mode 100644 index 00000000000..016815f0df8 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/README.md @@ -0,0 +1,6 @@ +This directory contains migrations that make several changes to the +representation of inventory collections. The changes are not backwards +compatible: we're transitioning from recording only each sled's +`OmicronZonesConfig` to its entire `OmicronSledConfig`; we have no way of +backfilling the missing disk and dataset config information for preexisting +collections. Therefore, these migrations start by dropping all old collections. diff --git a/schema/crdb/inventory-omicron-sled-config/up01.sql b/schema/crdb/inventory-omicron-sled-config/up01.sql new file mode 100644 index 00000000000..6291d13498a --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up01.sql @@ -0,0 +1,22 @@ +-- Delete all existing inventory collections. Inventory collections are +-- routinely pruned by Nexus, so this should only delete a handful (3-6 +-- collections total). It does mean we will have no collections until Nexus +-- collects a new one after this migration completes, but that should happen +-- soon after Nexus starts and is not a prerequisite for the control plane +-- coming online. + +set local disallow_full_table_scans = off; + +-- This omits a few tables because we're about to drop them entirely anyway. 
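+-- (Specifically, this skips `inv_omicron_zone`, `inv_omicron_zone_nic`, and
+-- `inv_sled_omicron_zones`, which up02-up04 drop outright.)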
+DELETE FROM omicron.public.inv_collection WHERE 1=1; +DELETE FROM omicron.public.inv_collection_error WHERE 1=1; +DELETE FROM omicron.public.inv_service_processor WHERE 1=1; +DELETE FROM omicron.public.inv_root_of_trust WHERE 1=1; +DELETE FROM omicron.public.inv_caboose WHERE 1=1; +DELETE FROM omicron.public.inv_root_of_trust_page WHERE 1=1; +DELETE FROM omicron.public.inv_sled_agent WHERE 1=1; +DELETE FROM omicron.public.inv_physical_disk WHERE 1=1; +DELETE FROM omicron.public.inv_nvme_disk_firmware WHERE 1=1; +DELETE FROM omicron.public.inv_zpool WHERE 1=1; +DELETE FROM omicron.public.inv_dataset WHERE 1=1; +DELETE FROM omicron.public.inv_clickhouse_keeper_membership WHERE 1=1; diff --git a/schema/crdb/inventory-omicron-sled-config/up02.sql b/schema/crdb/inventory-omicron-sled-config/up02.sql new file mode 100644 index 00000000000..4bb2c9cd87c --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up02.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS omicron.public.inv_omicron_zone_nic; diff --git a/schema/crdb/inventory-omicron-sled-config/up03.sql b/schema/crdb/inventory-omicron-sled-config/up03.sql new file mode 100644 index 00000000000..885910058ad --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up03.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS omicron.public.inv_omicron_zone; diff --git a/schema/crdb/inventory-omicron-sled-config/up04.sql b/schema/crdb/inventory-omicron-sled-config/up04.sql new file mode 100644 index 00000000000..3e86e9e013b --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up04.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS omicron.public.inv_sled_omicron_zones; diff --git a/schema/crdb/inventory-omicron-sled-config/up05.sql b/schema/crdb/inventory-omicron-sled-config/up05.sql new file mode 100644 index 00000000000..f9aad6f539b --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up05.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.inv_config_reconciler_status_kind +AS ENUM ( + 'not-yet-run', + 'running', + 'idle' +); diff --git a/schema/crdb/inventory-omicron-sled-config/up06.sql b/schema/crdb/inventory-omicron-sled-config/up06.sql new file mode 100644 index 00000000000..81606ff93b4 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up06.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.inv_zone_image_source AS ENUM ( + 'install_dataset', + 'artifact' +); diff --git a/schema/crdb/inventory-omicron-sled-config/up07.sql b/schema/crdb/inventory-omicron-sled-config/up07.sql new file mode 100644 index 00000000000..08099e00357 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up07.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + DROP COLUMN IF EXISTS omicron_physical_disks_generation; diff --git a/schema/crdb/inventory-omicron-sled-config/up08.sql b/schema/crdb/inventory-omicron-sled-config/up08.sql new file mode 100644 index 00000000000..aa0338e9f81 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up08.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS ledgered_sled_config UUID; diff --git a/schema/crdb/inventory-omicron-sled-config/up09.sql b/schema/crdb/inventory-omicron-sled-config/up09.sql new file mode 100644 index 00000000000..08d82a608c5 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up09.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS last_reconciliation_sled_config UUID; diff --git a/schema/crdb/inventory-omicron-sled-config/up10.sql 
b/schema/crdb/inventory-omicron-sled-config/up10.sql new file mode 100644 index 00000000000..4d0916d2b70 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up10.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS reconciler_status_kind omicron.public.inv_config_reconciler_status_kind NOT NULL; diff --git a/schema/crdb/inventory-omicron-sled-config/up11.sql b/schema/crdb/inventory-omicron-sled-config/up11.sql new file mode 100644 index 00000000000..96b61433887 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up11.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS reconciler_status_sled_config UUID; diff --git a/schema/crdb/inventory-omicron-sled-config/up12.sql b/schema/crdb/inventory-omicron-sled-config/up12.sql new file mode 100644 index 00000000000..c9ef703b5eb --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up12.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS reconciler_status_timestamp TIMESTAMPTZ; diff --git a/schema/crdb/inventory-omicron-sled-config/up13.sql b/schema/crdb/inventory-omicron-sled-config/up13.sql new file mode 100644 index 00000000000..d04a88d3b42 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up13.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD COLUMN IF NOT EXISTS reconciler_status_duration_secs FLOAT; diff --git a/schema/crdb/inventory-omicron-sled-config/up14.sql b/schema/crdb/inventory-omicron-sled-config/up14.sql new file mode 100644 index 00000000000..3720d1dcf4b --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up14.sql @@ -0,0 +1,9 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD CONSTRAINT IF NOT EXISTS + reconciler_status_sled_config_present_if_running CHECK ( + (reconciler_status_kind = 'running' + AND reconciler_status_sled_config IS NOT NULL) + OR + (reconciler_status_kind != 'running' + AND reconciler_status_sled_config IS NULL) + ); diff --git a/schema/crdb/inventory-omicron-sled-config/up15.sql b/schema/crdb/inventory-omicron-sled-config/up15.sql new file mode 100644 index 00000000000..f17e4ae9a47 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up15.sql @@ -0,0 +1,11 @@ +ALTER TABLE omicron.public.inv_sled_agent + ADD CONSTRAINT IF NOT EXISTS + reconciler_status_timing_present_unless_not_yet_run CHECK ( + (reconciler_status_kind = 'not-yet-run' + AND reconciler_status_timestamp IS NULL + AND reconciler_status_duration_secs IS NULL) + OR + (reconciler_status_kind != 'not-yet-run' + AND reconciler_status_timestamp IS NOT NULL + AND reconciler_status_duration_secs IS NOT NULL) + ); diff --git a/schema/crdb/inventory-omicron-sled-config/up16.sql b/schema/crdb/inventory-omicron-sled-config/up16.sql new file mode 100644 index 00000000000..ec098436958 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up16.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config ( + inv_collection_id UUID NOT NULL, + id UUID NOT NULL, + generation INT8 NOT NULL, + remove_mupdate_override UUID, + PRIMARY KEY (inv_collection_id, id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up17.sql b/schema/crdb/inventory-omicron-sled-config/up17.sql new file mode 100644 index 00000000000..77fb24e2ec3 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up17.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_disk_result ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, +
disk_id UUID NOT NULL, + error_message TEXT, + PRIMARY KEY (inv_collection_id, sled_id, disk_id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up18.sql b/schema/crdb/inventory-omicron-sled-config/up18.sql new file mode 100644 index 00000000000..269c103025b --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up18.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_dataset_result ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + dataset_id UUID NOT NULL, + error_message TEXT, + PRIMARY KEY (inv_collection_id, sled_id, dataset_id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up19.sql b/schema/crdb/inventory-omicron-sled-config/up19.sql new file mode 100644 index 00000000000..7652b0f5d19 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up19.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_last_reconciliation_zone_result ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + zone_id UUID NOT NULL, + error_message TEXT, + PRIMARY KEY (inv_collection_id, sled_id, zone_id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up20.sql b/schema/crdb/inventory-omicron-sled-config/up20.sql new file mode 100644 index 00000000000..8e9e4f1dee4 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up20.sql @@ -0,0 +1,39 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_zone ( + inv_collection_id UUID NOT NULL, + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + zone_type omicron.public.zone_type NOT NULL, + primary_service_ip INET NOT NULL, + primary_service_port INT4 + CHECK (primary_service_port BETWEEN 0 AND 65535) + NOT NULL, + second_service_ip INET, + second_service_port INT4 + CHECK (second_service_port IS NULL + OR second_service_port BETWEEN 0 AND 65535), + dataset_zpool_name TEXT, + nic_id UUID, + dns_gz_address INET, + dns_gz_address_index INT8, + ntp_ntp_servers TEXT[], + ntp_dns_servers INET[], + ntp_domain TEXT, + nexus_external_tls BOOLEAN, + nexus_external_dns_servers INET ARRAY, + snat_ip INET, + snat_first_port INT4 + CHECK (snat_first_port IS NULL OR snat_first_port BETWEEN 0 AND 65535), + snat_last_port INT4 + CHECK (snat_last_port IS NULL OR snat_last_port BETWEEN 0 AND 65535), + filesystem_pool UUID, + image_source omicron.public.inv_zone_image_source NOT NULL, + image_artifact_sha256 STRING(64), + CONSTRAINT zone_image_source_artifact_hash_present CHECK ( + (image_source = 'artifact' + AND image_artifact_sha256 IS NOT NULL) + OR + (image_source != 'artifact' + AND image_artifact_sha256 IS NULL) + ), + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up21.sql b/schema/crdb/inventory-omicron-sled-config/up21.sql new file mode 100644 index 00000000000..e4eb5f19947 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up21.sql @@ -0,0 +1,7 @@ +CREATE INDEX IF NOT EXISTS inv_omicron_sled_config_zone_nic_id + ON omicron.public.inv_omicron_sled_config_zone (nic_id) + STORING ( + primary_service_ip, + second_service_ip, + snat_ip + ); diff --git a/schema/crdb/inventory-omicron-sled-config/up22.sql b/schema/crdb/inventory-omicron-sled-config/up22.sql new file mode 100644 index 00000000000..a4368e12158 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up22.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_zone_nic ( + inv_collection_id UUID NOT NULL, + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + name TEXT 
NOT NULL, + ip INET NOT NULL, + mac INT8 NOT NULL, + subnet INET NOT NULL, + vni INT8 NOT NULL, + is_primary BOOLEAN NOT NULL, + slot INT2 NOT NULL, + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up23.sql b/schema/crdb/inventory-omicron-sled-config/up23.sql new file mode 100644 index 00000000000..7654cd2051c --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up23.sql @@ -0,0 +1,16 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_dataset ( + inv_collection_id UUID NOT NULL, + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + pool_id UUID NOT NULL, + kind omicron.public.dataset_kind NOT NULL, + zone_name TEXT, + quota INT8, + reservation INT8, + compression TEXT NOT NULL, + CONSTRAINT zone_name_for_zone_kind CHECK ( + (kind != 'zone') OR + (kind = 'zone' AND zone_name IS NOT NULL) + ), + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); diff --git a/schema/crdb/inventory-omicron-sled-config/up24.sql b/schema/crdb/inventory-omicron-sled-config/up24.sql new file mode 100644 index 00000000000..707e494cf25 --- /dev/null +++ b/schema/crdb/inventory-omicron-sled-config/up24.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_omicron_sled_config_disk ( + inv_collection_id UUID NOT NULL, + sled_config_id UUID NOT NULL, + id UUID NOT NULL, + vendor TEXT NOT NULL, + serial TEXT NOT NULL, + model TEXT NOT NULL, + pool_id UUID NOT NULL, + PRIMARY KEY (inv_collection_id, sled_config_id, id) +); diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 3a58e9583a8..d5910b2837e 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -1150,10 +1150,9 @@ impl ServicePortBuilder { #[cfg(test)] mod tests { use super::*; - use nexus_sled_agent_shared::inventory::OmicronZonesConfig; + use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use omicron_common::address::IpRange; use omicron_common::api::external::ByteCount; - use omicron_common::api::external::Generation; use omicron_common::api::internal::shared::AllowedSourceIps; use omicron_common::api::internal::shared::RackNetworkConfig; use oxnet::Ipv6Net; @@ -1369,14 +1368,12 @@ mod tests { usable_hardware_threads: 32, usable_physical_ram: ByteCount::try_from(1_u64 << 40).unwrap(), reservoir_size: ByteCount::try_from(1_u64 << 40).unwrap(), - omicron_zones: OmicronZonesConfig { - generation: OmicronZonesConfig::INITIAL_GENERATION, - zones: vec![], - }, disks, zpools: vec![], datasets: vec![], - omicron_physical_disks_generation: Generation::new(), + ledgered_sled_config: None, + reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, + last_reconciliation: None, }, is_scrimlet, )]; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 1503ca5f268..ad1778501bb 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1657,8 +1657,8 @@ mod test { use crate::rack_setup::plan::service::{Plan as ServicePlan, SledInfo}; use nexus_reconfigurator_blippy::{Blippy, BlippyReportSortKey}; use nexus_sled_agent_shared::inventory::{ - Baseboard, Inventory, InventoryDisk, OmicronZoneType, - OmicronZonesConfig, SledRole, + Baseboard, ConfigReconcilerInventoryStatus, Inventory, InventoryDisk, + OmicronZoneType, SledRole, }; use omicron_common::{ address::{Ipv6Subnet, SLED_PREFIX, get_sled_address}, @@ -1685,10 +1685,6 @@ mod test { usable_hardware_threads: 
32, usable_physical_ram: ByteCount::from_gibibytes_u32(16), reservoir_size: ByteCount::from_gibibytes_u32(0), - omicron_zones: OmicronZonesConfig { - generation: Generation::new(), - zones: vec![], - }, disks: (0..u2_count) .map(|i| InventoryDisk { identity: DiskIdentity { @@ -1707,7 +1703,9 @@ .collect(), zpools: vec![], datasets: vec![], - omicron_physical_disks_generation: Generation::new(), + ledgered_sled_config: None, + reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, + last_reconciliation: None, }, true, ) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 032c7205729..61adf7e0c59 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -24,8 +24,9 @@ use dropshot::Body; use dropshot::HttpError; use futures::Stream; use nexus_sled_agent_shared::inventory::{ - Inventory, InventoryDataset, InventoryDisk, InventoryZpool, - OmicronSledConfig, OmicronSledConfigResult, OmicronZonesConfig, SledRole, + ConfigReconcilerInventoryStatus, Inventory, InventoryDataset, + InventoryDisk, InventoryZpool, OmicronSledConfig, OmicronSledConfigResult, + OmicronZonesConfig, SledRole, }; use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, @@ -727,6 +728,21 @@ impl SledAgent { }; let storage = self.storage.lock(); + + let disks_config = + storage.omicron_physical_disks_list().unwrap_or_default(); + let datasets_config = + storage.datasets_config_list().unwrap_or_default(); + let zones_config = self.fake_zones.lock().unwrap().clone(); + + let sled_config = OmicronSledConfig { + generation: zones_config.generation, + disks: disks_config.disks.into_iter().collect(), + datasets: datasets_config.datasets.into_values().collect(), + zones: zones_config.zones.into_iter().collect(), + remove_mupdate_override: None, + }; + Ok(Inventory { sled_id: self.id, sled_agent_address, @@ -741,7 +757,6 @@ impl SledAgent { self.config.hardware.reservoir_ram, ) .context("reservoir_size")?, - omicron_zones: self.fake_zones.lock().unwrap().clone(), disks: storage .physical_disks() .values() @@ -789,7 +804,9 @@ impl SledAgent { .collect::<Vec<_>>() }) .unwrap_or_else(|_| vec![]), - omicron_physical_disks_generation: Generation::new(), + ledgered_sled_config: Some(sled_config), + reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, + last_reconciliation: None, }) } diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index a8b1f26384b..e09e14790ea 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -1619,7 +1619,7 @@ impl StorageInner { } pub fn omicron_physical_disks_list( - &mut self, + &self, ) -> Result<OmicronPhysicalDisksConfig, HttpError> { let Some(config) = self.config.as_ref() else { return Err(HttpError::for_not_found( diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 30d87583007..cff825ba474 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -32,8 +32,9 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use illumos_utils::opte::PortManager; use nexus_sled_agent_shared::inventory::{ - Inventory, InventoryDataset, InventoryDisk, InventoryZpool, - OmicronSledConfig, OmicronSledConfigResult, OmicronZonesConfig, SledRole, + ConfigReconcilerInventoryStatus, Inventory, InventoryDataset, + InventoryDisk, InventoryZpool, OmicronSledConfig, OmicronSledConfigResult, + OmicronZonesConfig, SledRole, }; use omicron_common::address::{ Ipv6Subnet, SLED_PREFIX, get_sled_address, get_switch_zone_address, @@ -74,6 +75,7 @@
use sled_hardware_types::Baseboard; use sled_hardware_types::underlay::BootstrapInterface; use sled_storage::manager::StorageHandle; use slog::Logger; +use slog_error_chain::InlineErrorChain; use sprockets_tls::keys::SprocketsConfig; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddrV6}; @@ -294,16 +296,21 @@ pub enum InventoryError { // system. #[error(transparent)] BadByteCount(#[from] ByteCountRangeError), + #[error("failed to get current ledgered disks")] + GetDisksConfig(#[source] sled_storage::error::Error), + #[error("failed to get current ledgered datasets")] + GetDatasetsConfig(#[source] sled_storage::error::Error), } impl From<InventoryError> for omicron_common::api::external::Error { fn from(inventory_error: InventoryError) -> Self { match inventory_error { - e @ InventoryError::BadByteCount(..) => { - omicron_common::api::external::Error::internal_error(&format!( - "{:#}", - e - )) + e @ (InventoryError::BadByteCount(..) + | InventoryError::GetDisksConfig(_) + | InventoryError::GetDatasetsConfig(_)) => { + omicron_common::api::external::Error::internal_error( + &InlineErrorChain::new(&e).to_string(), + ) } } } @@ -1355,10 +1362,31 @@ impl SledAgent { let mut disks = vec![]; let mut zpools = vec![]; let mut datasets = vec![]; - let (all_disks, omicron_zones) = tokio::join!( + let (all_disks, disks_config, datasets_config, omicron_zones) = tokio::join!( self.storage().get_latest_disks(), + self.storage().omicron_physical_disks_list(), + self.storage().datasets_config_list(), self.inner.services.omicron_zones_list() ); + + // RSS asks for our inventory _before_ it sends us an + // `OmicronSledConfig`; echo back the default (empty) disk and dataset + // configs if we have no ledger at all. + let disks_config = match disks_config { + Ok(disks_config) => disks_config, + Err(sled_storage::error::Error::LedgerNotFound) => { + OmicronPhysicalDisksConfig::default() + } + Err(err) => return Err(InventoryError::GetDisksConfig(err)), + }; + let datasets_config = match datasets_config { + Ok(datasets_config) => datasets_config, + Err(sled_storage::error::Error::LedgerNotFound) => { + DatasetsConfig::default() + } + Err(err) => return Err(InventoryError::GetDatasetsConfig(err)), + }; + for (identity, variant, slot, firmware) in all_disks.iter_all() { disks.push(InventoryDisk { identity: identity.clone(), @@ -1410,6 +1438,16 @@ impl SledAgent { datasets.extend(inv_props); } + // Reassemble our combined sled config from its separate pieces. (This + // will go away once we start ledgering the config as a single unit.) + let sled_config = OmicronSledConfig { + generation: omicron_zones.generation, + disks: disks_config.disks.into_iter().collect(), + datasets: datasets_config.datasets.into_values().collect(), + zones: omicron_zones.zones.into_iter().collect(), + remove_mupdate_override: None, + }; + Ok(Inventory { sled_id, sled_agent_address, @@ -1418,11 +1456,14 @@ impl SledAgent { usable_hardware_threads, usable_physical_ram: ByteCount::try_from(usable_physical_ram)?, reservoir_size, - omicron_zones, disks, zpools, datasets, - omicron_physical_disks_generation: *all_disks.generation(), + // These fields will come from the reconciler once it's integrated. + // For now, we can report our ledgered config but nothing else.
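+ // (Concretely, `reconciler_status` stays `NotYetRun` and
+ // `last_reconciliation` stays `None` until that work lands.)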
+ ledgered_sled_config: Some(sled_config), + reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, + last_reconciliation: None, }) } diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 3eddf8011af..1d10e9642ec 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -66,6 +66,11 @@ impl_typed_uuid_kind! { Instance => "instance", LoopbackAddress => "loopback_address", MupdateOverride => "mupdate_override", + // `OmicronSledConfig`s do not themselves contain IDs, but we generate IDs + // for them when they're serialized to the database during inventory + // collection. This ID type is therefore only used by nexus-db-model and + // nexus-db-queries. + OmicronSledConfig => "omicron_sled_config", OmicronZone => "service", PhysicalDisk => "physical_disk", Propolis => "propolis",
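// A sketch of how the new kind is consumed (assuming `impl_typed_uuid_kind!`
// generates an `OmicronSledConfigKind` plus an `OmicronSledConfigUuid` alias,
// as it does for the other kinds listed above):
//
//     use omicron_uuid_kinds::OmicronSledConfigUuid;
//
//     // Nexus mints one ID per sled config row at insertion time;
//     // sled-agent itself never sees or stores this ID.
//     let sled_config_id = OmicronSledConfigUuid::new_v4();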