From cd64b8b5cee0ae77ab3497432d9a3efbbc05496a Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Thu, 30 Oct 2025 21:59:41 +0000
Subject: [PATCH 01/12] Local storage 3/4: Allocate and deallocate

A local storage allocation is defined as a slice of the local storage
dataset for a U2. This commit adds the logic to create local storage
dataset allocations during instance allocation. This is done by
modifying the sled reservation query to:

- find all the local storage disks attached to an instance
- determine which of those do not have an allocation yet
- match the requests for local storage allocation to zpools with
  available space, and find a valid configuration of allocations

These query changes take into account existing Crucible dataset usage,
the control plane storage buffer, and the tombstone status of the local
storage dataset.

`instance_ensure_registered` now fills in the delegated_zvols entries
for local storage disks.

During the instance start saga, _after_ the instance's state has been
changed to `starting`, the saga lists all the local storage disks, grabs
their allocations, and sends ensure requests to sled agents for those
local storage allocations. This occurs after the VMM record has been
created, which means that either each of those local storage disks now
has an allocation or the reservation failed.

This PR also introduces a repetition syntax for creating a batch of
declared saga actions:

```
ENSURE_LOCAL_STORAGE (0, 1, 2, 3, 4, 5, 6, 7) -> "ensure_local_storage" {
    + sis_ensure_local_storage
    // No undo action for this, that is handled in the disk delete saga!
}
```

During the disk delete saga, local storage disks will go through a new
path to clean up those allocations, involving a deallocation in the
database and a delete request to the appropriate sled agent.
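As a rough, standalone sketch of the request-to-zpool matching described
above (hypothetical `Request`/`Zpool` types and sizes, not the code in
this patch), the following mirrors the queue-based search in spirit: each
unallocated local storage disk must land on a distinct zpool with enough
free space, and every complete assignment is collected as a valid
configuration.

```rust
// Simplified illustration only: enumerate assignments of local storage
// requests to zpools such that each request fits and no two requests
// share a zpool, via breadth-first search over partial assignments.
use std::collections::{HashSet, VecDeque};

#[derive(Clone, Debug)]
struct Request {
    disk_id: u32,
    size_needed: u64,
}

#[derive(Clone, Debug, Hash, PartialEq, Eq)]
struct Candidate {
    pool_id: u32,
}

#[derive(Clone, Debug)]
struct Zpool {
    pool_id: u32,
    free_space: u64,
}

fn valid_assignments(
    requests: &[Request],
    zpools: &[Zpool],
) -> Vec<Vec<(u32, u32)>> {
    let mut valid = Vec::new();
    let mut queue = VecDeque::new();

    // Start with nothing assigned and every zpool still available.
    let all_pools: HashSet<Candidate> =
        zpools.iter().map(|z| Candidate { pool_id: z.pool_id }).collect();
    queue.push_back((Vec::new(), all_pools, 0usize));

    while let Some((assigned, pools_left, idx)) = queue.pop_front() {
        if idx == requests.len() {
            // Every request has a zpool: this is a complete, valid
            // configuration of allocations.
            valid.push(assigned);
            continue;
        }

        let request = &requests[idx];
        for zpool in zpools {
            let candidate = Candidate { pool_id: zpool.pool_id };
            // A zpool is a candidate if it has room for this request and
            // has not already been claimed by an earlier request in this
            // partial assignment.
            if zpool.free_space >= request.size_needed
                && pools_left.contains(&candidate)
            {
                let mut next_assigned = assigned.clone();
                next_assigned.push((request.disk_id, zpool.pool_id));

                let mut next_pools = pools_left.clone();
                next_pools.remove(&candidate);

                queue.push_back((next_assigned, next_pools, idx + 1));
            }
        }
    }

    valid
}

fn main() {
    let requests = vec![
        Request { disk_id: 1, size_needed: 64 },
        Request { disk_id: 2, size_needed: 64 },
    ];
    let zpools = vec![
        Zpool { pool_id: 10, free_space: 100 },
        Zpool { pool_id: 11, free_space: 100 },
    ];

    // Two disks and two big-enough zpools: both orderings are valid, and
    // no assignment places both disks on the same zpool.
    for assignment in valid_assignments(&requests, &zpools) {
        println!("{assignment:?}");
    }
}
```

In the actual reservation path the candidate set is further restricted by
Crucible dataset usage, the control plane storage buffer, and the dataset
tombstone status, and each candidate configuration is then attempted with
the sled insert resource query.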
--- Cargo.lock | 23 +- Cargo.toml | 2 + nexus/Cargo.toml | 2 + nexus/db-model/src/affinity.rs | 2 +- .../src/local_storage_dataset_allocation.rs | 24 + nexus/db-queries/Cargo.toml | 1 + nexus/db-queries/src/db/datastore/disk.rs | 29 +- .../src/db/datastore/local_storage.rs | 89 + nexus/db-queries/src/db/datastore/sled.rs | 2685 ++++++++++++++++- nexus/db-queries/src/db/datastore/zpool.rs | 175 ++ .../src/db/queries/sled_reservation.rs | 422 ++- ...sert_resource_query_with_local_storage.sql | 204 ++ nexus/db-schema/src/schema.rs | 3 + nexus/src/app/instance.rs | 33 +- nexus/src/app/sagas/disk_delete.rs | 102 +- nexus/src/app/sagas/instance_create.rs | 35 +- nexus/src/app/sagas/instance_start.rs | 200 +- nexus/src/app/sagas/mod.rs | 188 ++ nexus/test-utils/src/background.rs | 73 + nexus/tests/integration_tests/instances.rs | 2 +- 20 files changed, 4190 insertions(+), 104 deletions(-) create mode 100644 nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql diff --git a/Cargo.lock b/Cargo.lock index af4763cdfc0..acaebab9ce7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1722,7 +1722,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -4874,7 +4874,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.1", "system-configuration", "tokio", "tower-service", @@ -5884,7 +5884,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -6788,6 +6788,7 @@ dependencies = [ "nexus-sled-agent-shared", "nexus-test-utils", "nexus-types", + "nonempty", "omicron-cockroach-metrics", "omicron-common", "omicron-passwords", @@ -7598,6 +7599,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonempty" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9737e026353e5cd0736f98eddae28665118eb6f6600902a7f50db585621fecb6" + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -8381,6 +8388,7 @@ dependencies = [ "scim2-rs", "scim2-test-client", "semver 1.0.27", + "seq-macro", "serde", "serde_json", "serde_urlencoded", @@ -8395,6 +8403,7 @@ dependencies = [ "slog-error-chain", "slog-term", "sp-sim", + "static_assertions", "steno", "strum 0.27.2", "subprocess", @@ -12411,6 +12420,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -16241,7 +16256,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4c38098a7b6..f352b0ed67c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -575,6 +575,7 @@ nexus-test-utils = { path = "nexus/test-utils" } nexus-types = { path = "nexus/types" } nix = { version = "0.30", features = ["fs", "net"] } nom = "7.1.3" +nonempty = "0.12.0" num-integer = "0.1.46" num = { version = "0.4.3", default-features = false, features 
= [ "libm" ] } omicron-clickhouse-admin = { path = "clickhouse-admin" } @@ -687,6 +688,7 @@ schemars = "0.8.22" scopeguard = "1.2.0" secrecy = "0.10.3" semver = { version = "1.0.26", features = ["std", "serde"] } +seq-macro = "0.3.6" serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } serde_cbor = "0.11.2" serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 1145813c01d..47a8ae8cf4b 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -93,6 +93,7 @@ ring.workspace = true samael.workspace = true schemars = { workspace = true, features = ["chrono", "uuid1"] } semver.workspace = true +seq-macro.workspace = true serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true @@ -106,6 +107,7 @@ slog-dtrace.workspace = true slog-error-chain.workspace = true display-error-chain.workspace = true slog-term.workspace = true +static_assertions.workspace = true steno.workspace = true tempfile.workspace = true thiserror.workspace = true diff --git a/nexus/db-model/src/affinity.rs b/nexus/db-model/src/affinity.rs index dbef84e9672..06c223e5941 100644 --- a/nexus/db-model/src/affinity.rs +++ b/nexus/db-model/src/affinity.rs @@ -151,7 +151,7 @@ impl From for AffinityGroupUpdate { #[diesel(table_name = anti_affinity_group)] pub struct AntiAffinityGroup { #[diesel(embed)] - identity: AntiAffinityGroupIdentity, + pub identity: AntiAffinityGroupIdentity, pub project_id: Uuid, pub policy: AffinityPolicy, pub failure_domain: FailureDomain, diff --git a/nexus/db-model/src/local_storage_dataset_allocation.rs b/nexus/db-model/src/local_storage_dataset_allocation.rs index 3b7741d42b4..47cc969154f 100644 --- a/nexus/db-model/src/local_storage_dataset_allocation.rs +++ b/nexus/db-model/src/local_storage_dataset_allocation.rs @@ -40,6 +40,30 @@ pub struct LocalStorageDatasetAllocation { } impl LocalStorageDatasetAllocation { + /// These records are normally created during sled reservation, but for unit + /// tests add this `new` function. + pub fn new_for_tests_only( + id: DatasetUuid, + time_created: DateTime, + local_storage_dataset_id: DatasetUuid, + pool_id: ExternalZpoolUuid, + sled_id: SledUuid, + dataset_size: ByteCount, + ) -> Self { + Self { + id: id.into(), + + time_created, + time_deleted: None, + + local_storage_dataset_id: local_storage_dataset_id.into(), + pool_id: pool_id.into(), + sled_id: sled_id.into(), + + dataset_size, + } + } + pub fn id(&self) -> DatasetUuid { self.id.into() } diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 314f9614a91..98420f54264 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -30,6 +30,7 @@ internal-dns-types.workspace = true ipnetwork.workspace = true itertools.workspace = true macaddr.workspace = true +nonempty.workspace = true oxnet.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index c4ad17943df..f5fe96ab944 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -434,13 +434,28 @@ impl DataStore { authz_instance: &authz::Instance, pagparams: &PaginatedBy<'_>, ) -> ListResultVec { - use nexus_db_schema::schema::disk::dsl; - use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl; - use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl; + let conn = self.pool_connection_authorized(opctx).await?; opctx.authorize(authz::Action::ListChildren, authz_instance).await?; - let conn = self.pool_connection_authorized(opctx).await?; + self.instance_list_disks_impl_unauth( + &conn, + authz_instance.id(), + pagparams, + ) + .await + } + + /// List disks associated with a given instance by name. + pub async fn instance_list_disks_impl_unauth( + &self, + conn: &async_bb8_diesel::Connection, + instance_id: Uuid, + pagparams: &PaginatedBy<'_>, + ) -> ListResultVec { + use nexus_db_schema::schema::disk::dsl; + use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl; + use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl; let results = match pagparams { PaginatedBy::Id(pagparams) => { @@ -461,13 +476,13 @@ impl DataStore { .on(dsl::id.eq(disk_type_local_storage_dsl::disk_id)), ) .filter(dsl::time_deleted.is_null()) - .filter(dsl::attach_instance_id.eq(authz_instance.id())) + .filter(dsl::attach_instance_id.eq(instance_id)) .select(( model::Disk::as_select(), Option::::as_select(), Option::::as_select(), )) - .get_results_async(&*conn) + .get_results_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -494,7 +509,7 @@ impl DataStore { let allocation = dsl::local_storage_dataset_allocation .filter(dsl::id.eq(to_db_typed_uuid(allocation_id))) .select(LocalStorageDatasetAllocation::as_select()) - .first_async(&*conn) + .first_async(conn) .await .map_err(|e| { public_error_from_diesel( diff --git a/nexus/db-queries/src/db/datastore/local_storage.rs b/nexus/db-queries/src/db/datastore/local_storage.rs index 1f2a0503220..09914809b17 100644 --- a/nexus/db-queries/src/db/datastore/local_storage.rs +++ b/nexus/db-queries/src/db/datastore/local_storage.rs @@ -9,7 +9,9 @@ use crate::authz; use crate::context::OpContext; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; +use crate::db::datastore::DbConnection; use crate::db::datastore::SQL_BATCH_SIZE; +use crate::db::model::LocalStorageDatasetAllocation; use crate::db::model::RendezvousLocalStorageDataset; use crate::db::model::Zpool; use crate::db::model::to_db_typed_uuid; @@ -18,6 +20,9 @@ use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use diesel::sql_types::BigInt; +use diesel::sql_types::Nullable; +use diesel::sql_types::Numeric; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use omicron_common::api::external::CreateResult; @@ -31,6 +36,9 @@ use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use uuid::Uuid; +// XXX is it bad that Numeric -> BigInt here? +define_sql_function! 
{ fn coalesce(x: Nullable, y: BigInt) -> BigInt; } + impl DataStore { /// List all LocalStorage datasets, making as many queries as needed to get /// them all @@ -152,4 +160,85 @@ impl DataStore { }) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + pub(super) async fn delete_local_storage_dataset_allocation_in_txn( + conn: &async_bb8_diesel::Connection, + local_storage_dataset_allocation_id: DatasetUuid, + ) -> Result<(), diesel::result::Error> { + use nexus_db_schema::schema::local_storage_dataset_allocation::dsl; + + // If this is deleted already, then return Ok - this function should be + // idempotent, it is called from a saga node. + + let id = to_db_typed_uuid(local_storage_dataset_allocation_id); + + let allocation = dsl::local_storage_dataset_allocation + .filter(dsl::id.eq(id)) + .select(LocalStorageDatasetAllocation::as_select()) + .get_result_async(conn) + .await?; + + if allocation.time_deleted.is_some() { + return Ok(()); + } + + // Set the time_deleted + + diesel::update(dsl::local_storage_dataset_allocation) + .filter(dsl::id.eq(id)) + .set(dsl::time_deleted.eq(Utc::now())) + .execute_async(conn) + .await?; + + // Recompute the associated local storage dataset's size_used + + let dataset_id: DatasetUuid = allocation.local_storage_dataset_id(); + + use nexus_db_schema::schema::rendezvous_local_storage_dataset::dsl as dataset_dsl; + + diesel::update(dataset_dsl::rendezvous_local_storage_dataset) + .filter(dataset_dsl::id.eq(to_db_typed_uuid(dataset_id))) + .set( + dataset_dsl::size_used.eq(coalesce( + dsl::local_storage_dataset_allocation + .filter( + dsl::local_storage_dataset_id + .eq(to_db_typed_uuid(dataset_id)), + ) + .filter(dsl::time_deleted.is_null()) + .select(diesel::dsl::sum(dsl::dataset_size)) + .single_value(), + 0, + )), + ) + .execute_async(conn) + .await?; + + Ok(()) + } + + /// Mark the local storage dataset allocation as deleted, and re-compute the + /// appropriate local storage dataset size_used column. 
+ pub async fn delete_local_storage_dataset_allocation( + &self, + opctx: &OpContext, + local_storage_dataset_allocation_id: DatasetUuid, + ) -> Result<(), Error> { + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "delete_local_storage_dataset_allocation", + ) + .transaction(&conn, |conn| async move { + Self::delete_local_storage_dataset_allocation_in_txn( + &conn, + local_storage_dataset_allocation_id, + ) + .await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 889f0700e99..965f1dcbb70 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -9,7 +9,9 @@ use super::SQL_BATCH_SIZE; use crate::authz; use crate::context::OpContext; use crate::db; +use crate::db::datastore::LocalStorageDisk; use crate::db::datastore::ValidateTransition; +use crate::db::datastore::zpool::ZpoolGetForSledReservationResult; use crate::db::model::AffinityPolicy; use crate::db::model::Sled; use crate::db::model::SledResourceVmm; @@ -19,6 +21,9 @@ use crate::db::model::to_db_sled_policy; use crate::db::model::to_db_typed_uuid; use crate::db::pagination::Paginator; use crate::db::pagination::paginated; +use crate::db::queries::disk::MAX_DISKS_PER_INSTANCE; +use crate::db::queries::sled_reservation::LocalStorageAllocation; +use crate::db::queries::sled_reservation::LocalStorageAllocationRequired; use crate::db::queries::sled_reservation::sled_find_targets_query; use crate::db::queries::sled_reservation::sled_insert_resource_query; use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus}; @@ -35,6 +40,7 @@ use nexus_types::deployment::SledFilter; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::identity::Asset; +use nonempty::NonEmpty; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -42,13 +48,17 @@ use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::ResourceType; +use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::bail_unless; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; use slog::Logger; use std::collections::HashSet; +use std::collections::VecDeque; use std::fmt; use strum::IntoEnumIterator; use thiserror::Error; @@ -471,23 +481,122 @@ impl DataStore { .limit(1) .load_async(&*conn) .await?; + if !old_resource.is_empty() { return Ok(old_resource[0].clone()); } - let must_use_sleds: HashSet<&SledUuid> = + // Get a list of local storage disks attached to this instance + let local_storage_disks: Vec = self + .instance_list_disks_impl_unauth( + &conn, + instance_id.into_untyped_uuid(), + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(MAX_DISKS_PER_INSTANCE) + .unwrap(), + }), + ) + .await? 
+ .into_iter() + .filter_map(|disk| match disk { + db::datastore::Disk::LocalStorage(disk) => Some(disk), + db::datastore::Disk::Crucible(_) => None, + }) + .collect(); + + // Gather constraints for the instance placement, starting with the + // arguments to this function. + let maybe_must_use_sleds: Option> = if let Some(must_select_from) = constraints.must_select_from() { - must_select_from.into_iter().collect() + Some(must_select_from.into_iter().cloned().collect()) } else { - HashSet::default() + None }; + // If any local storage disks have been allocated already, then this + // constraints VMM placement and where other unallocated local storage + // must be. + let maybe_must_use_sleds = if !local_storage_disks.is_empty() { + // Any local storage disk that was allocated already will have a + // sled_id. + let local_storage_allocation_sleds: HashSet = + local_storage_disks + .iter() + .filter_map(|disk| { + disk.local_storage_dataset_allocation + .as_ref() + .map(|allocation| allocation.sled_id()) + }) + .collect(); + + if local_storage_allocation_sleds.is_empty() { + // None of this instance's local storage disks have been + // allocated yet. + maybe_must_use_sleds + } else { + if local_storage_allocation_sleds.len() != 1 { + // It's an error for multiple sleds to host local storage + // disks, that makes no sense! + return Err(SledReservationTransactionError::Connection( + Error::internal_error(&format!( + "local storage disks for instance {instance_id} \ + allocated on multiple sleds \ + {local_storage_allocation_sleds:?}" + )), + )); + } + + // If there's already a list of sleds to use, that must be + // filtered further. Otherwise return the one sled that local + // storage disks have already been allocated on. + match maybe_must_use_sleds { + Some(must_use_sleds) => Some( + must_use_sleds + .into_iter() + .filter(|sled| { + local_storage_allocation_sleds.contains(&sled) + }) + .collect(), + ), + + None => Some(local_storage_allocation_sleds), + } + } + } else { + // `local_storage_disks` is empty, so that does not have an impact + // on the existing hash set + maybe_must_use_sleds + }; + + // If filtering has removed all the sleds from the constraint, then we + // cannot satisfy this allocation. + if let Some(must_use_sleds) = &maybe_must_use_sleds { + if must_use_sleds.is_empty() { + error!( + &opctx.log, + "no sleds available after filtering for instance \ + {instance_id}", + ); + + // Nothing will satisfy this allocation, return an error here. + return Err(SledReservationTransactionError::Reservation( + SledReservationError::NotFound, + )); + } + } + // Query for the set of possible sleds using a CTE. // // Note that this is not transactional, to reduce contention. // However, that lack of transactionality means we need to validate // our constraints again when we later try to INSERT the reservation. - let possible_sleds = sled_find_targets_query(instance_id, &resources, constraints.cpu_families()) + let possible_sleds = sled_find_targets_query( + instance_id, + &resources, + constraints.cpu_families(), + ) .get_results_async::<( // Sled UUID Uuid, @@ -508,6 +617,7 @@ impl DataStore { let mut unpreferred = HashSet::new(); let mut required = HashSet::new(); let mut preferred = HashSet::new(); + for (sled_id, fits, affinity_policy, anti_affinity_policy) in possible_sleds { @@ -517,18 +627,30 @@ impl DataStore { // queries like this. 
let sled_id = SledUuid::from_untyped_uuid(sled_id); - if fits - && (must_use_sleds.is_empty() - || must_use_sleds.contains(&sled_id)) - { - sled_targets.insert(sled_id); + if fits { + // If there is a Some list of sleds to select from, only add + // this target if it is in that list. A None list means that any + // sled could be a target. + match &maybe_must_use_sleds { + Some(must_use_sleds) => { + if must_use_sleds.contains(&sled_id) { + sled_targets.insert(sled_id); + } + } + + None => { + sled_targets.insert(sled_id); + } + } } + if let Some(policy) = affinity_policy { match policy { AffinityPolicy::Fail => required.insert(sled_id), AffinityPolicy::Allow => preferred.insert(sled_id), }; } + if let Some(policy) = anti_affinity_policy { match policy { AffinityPolicy::Fail => banned.insert(sled_id), @@ -537,13 +659,19 @@ impl DataStore { } } + let local_storage_allocation_required: Vec<&LocalStorageDisk> = + local_storage_disks + .iter() + .filter(|disk| disk.local_storage_dataset_allocation.is_none()) + .collect(); + // We loop here because our attempts to INSERT may be violated by // concurrent operations. We'll respond by looking through a slightly // smaller set of possible sleds. // // In the uncontended case, however, we'll only iterate through this // loop once. - loop { + 'outer: loop { // Pick a reservation target, given the constraints we previously // saw in the database. let sled_target = pick_sled_reservation_target( @@ -564,14 +692,374 @@ impl DataStore { resources.clone(), ); - // Try to INSERT the record. If this is still a valid target, we'll - // use it. If it isn't a valid target, we'll shrink the set of - // viable sled targets and try again. - let rows_inserted = sled_insert_resource_query(&resource) + if local_storage_allocation_required.is_empty() { + // If no local storage allocation is required, then simply try + // allocating a VMM to this sled. + // + // Try to INSERT the record. If this is still a valid target, + // we'll use it. If it isn't a valid target, we'll shrink the + // set of viable sled targets and try again. + let rows_inserted = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ) .execute_async(&*conn) .await?; - if rows_inserted > 0 { - return Ok(resource); + + if rows_inserted > 0 { + return Ok(resource); + } + } else { + // If local storage allocation is required, match the requests + // with all the zpools of this sled that have available space. + // This routine finds all possible configurations that would + // satisfy the requests for local storage and tries them all. + + // First, each request for local storage can possibly be + // satisfied by a number of zpools on the sled. Find this list + // of candidate zpools for each required local storage + // allocation. + + #[derive(Clone, Debug, Hash, PartialEq, Eq)] + struct CandidateDataset { + rendezvous_local_storage_dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + sled_id: SledUuid, + } + + #[derive(Clone, Debug)] + struct PossibleAllocationsForRequest<'a> { + request: &'a LocalStorageDisk, + candidate_datasets: HashSet, + } + + let zpools_for_sled = self + .zpool_get_for_sled_reservation(&opctx, sled_target) + .await?; + + // If there's an existing local storage allocation on a zpool, + // remove that from the list of candidates. Local storage for + // the same Instance should not share any zpools. 
+ let local_storage_zpools_used: HashSet = + local_storage_disks + .iter() + .filter_map(|disk| { + disk.local_storage_dataset_allocation.as_ref().map( + |allocation| { + ZpoolUuid::from_untyped_uuid( + allocation + .pool_id() + .into_untyped_uuid(), + ) + }, + ) + }) + .collect(); + + let zpools_for_sled: Vec<_> = zpools_for_sled + .into_iter() + .filter(|zpool_get_result| { + !local_storage_zpools_used + .contains(&zpool_get_result.pool.id()) + }) + .collect(); + + if local_storage_allocation_required.len() + > zpools_for_sled.len() + { + // Not enough zpools to satisfy the number of allocations + // required. Find another sled! + sled_targets.remove(&sled_target); + banned.remove(&sled_target); + unpreferred.remove(&sled_target); + preferred.remove(&sled_target); + + continue; + } + + // If there's an existing local storage allocation on a zpool, + // remove that from the list of candidates. Local storage for + // the same Instance should not share any zpools. + let local_storage_zpools_used: HashSet = + local_storage_disks + .iter() + .filter_map(|disk| { + disk.local_storage_dataset_allocation.as_ref().map( + |allocation| { + ZpoolUuid::from_untyped_uuid( + allocation + .pool_id() + .into_untyped_uuid(), + ) + }, + ) + }) + .collect(); + + let zpools_for_sled: Vec<_> = zpools_for_sled + .into_iter() + .filter(|zpool_get_result| { + !local_storage_zpools_used + .contains(&zpool_get_result.pool.id()) + }) + .collect(); + + if local_storage_allocation_required.len() + > zpools_for_sled.len() + { + // Not enough zpools to satisfy the number of allocations + // required. Find another sled! + sled_targets.remove(&sled_target); + banned.remove(&sled_target); + unpreferred.remove(&sled_target); + preferred.remove(&sled_target); + + continue; + } + + let mut allocations_to_perform = + Vec::with_capacity(local_storage_allocation_required.len()); + + for request in &local_storage_allocation_required { + // Find all the zpools that could satisfy this local storage + // request. These will be filtered later. + let candidate_datasets: HashSet = + zpools_for_sled + .iter() + .filter(|zpool_get_result| { + let ZpoolGetForSledReservationResult { + pool, + last_inv_total_size, + rendezvous_local_storage_dataset_id: _, + crucible_dataset_usage, + local_storage_usage, + } = zpool_get_result; + + // The total request size for the local storage + // dataset allocation is the disk size plus the + // required overhead. + let request_size: i64 = + request.size().to_bytes() as i64 + + request + .required_dataset_overhead() + .to_bytes() + as i64; + + let new_size_used: i64 = crucible_dataset_usage + + local_storage_usage + + request_size; + + let control_plane_storage_buffer: i64 = + pool.control_plane_storage_buffer().into(); + let adjusted_total: i64 = last_inv_total_size + - control_plane_storage_buffer; + + // Any zpool that has space for this local + // storage dataset allocation is considered a + // candidate. + new_size_used < adjusted_total + }) + .map(|zpool_get_result| CandidateDataset { + rendezvous_local_storage_dataset_id: + zpool_get_result + .rendezvous_local_storage_dataset_id, + pool_id: zpool_get_result.pool.id(), + sled_id: zpool_get_result.pool.sled_id(), + }) + .collect(); + + if candidate_datasets.is_empty() { + // if there's no local storage datasets on this sled for + // this request's size, then try another sled. 
+ sled_targets.remove(&sled_target); + banned.remove(&sled_target); + unpreferred.remove(&sled_target); + preferred.remove(&sled_target); + + continue 'outer; + } + + allocations_to_perform.push( + PossibleAllocationsForRequest { + request, + candidate_datasets, + }, + ); + } + + // From the list of allocations to perform, and all the + // candidate local storage datasets that could fit those + // allocations, find a list of all valid request -> zpool + // mappings. + + struct PossibleAllocations { + allocations: Vec, + candidates_left: HashSet, + request_index: usize, + } + + struct ValidatedAllocations { + allocations: NonEmpty, + } + + let mut validated_allocations: Vec = + vec![]; + let mut queue = VecDeque::new(); + + // Start from no allocations made yet, a list of allocations to + // perform (stored in `requests`), and all of the available + // zpools on the sled. + + queue.push_back(PossibleAllocations { + allocations: vec![], + candidates_left: zpools_for_sled + .iter() + .map(|zpool_get_result| CandidateDataset { + rendezvous_local_storage_dataset_id: + zpool_get_result + .rendezvous_local_storage_dataset_id, + pool_id: zpool_get_result.pool.id(), + sled_id: zpool_get_result.pool.sled_id(), + }) + .collect(), + request_index: 0, + }); + + // Find each valid allocation by, for each request: + // + // - selecting a local storage dataset from the list of local + // storage datasets that have not been used yet that could + // fulfill that request + // + // - removing that selected local storage dataset from the list + // of candidates + // + // - considering a set of allocations as "valid" if all requests + // have been matched with a local storage dataset. + + while let Some(possible_allocation) = queue.pop_front() { + let PossibleAllocations { + allocations, + candidates_left, + request_index, + } = possible_allocation; + + // If we have an allocation for each possible allocation, + // this one is valid. Add it to the list! + if request_index == allocations_to_perform.len() { + let allocations_len = allocations.len(); + + match NonEmpty::from_vec(allocations) { + Some(allocations) => { + validated_allocations + .push(ValidatedAllocations { allocations }); + } + + None => { + // There should be `request_index` entries in + // the `allocations` vec, this is weird! + error!( + &opctx.log, + "expected {request_index} in the \ + allocations vec, saw {allocations_len}", + ); + } + } + + continue; + } + + // Try to allocate the Nth possible allocation + let request = &allocations_to_perform[request_index]; + + // Create a possible config based on the what datasets are + // left, and the candidate datasets for this request. + for candidate_dataset in &request.candidate_datasets { + if candidates_left.contains(candidate_dataset) { + // This request could be satisfied by this dataset. + // Select it and search further. 
+ + let mut set_allocations = allocations.clone(); + set_allocations.push(LocalStorageAllocation { + disk_id: request.request.id(), + + local_storage_dataset_allocation_id: + DatasetUuid::new_v4(), + + required_dataset_size: { + let request_size: i64 = + request.request.size().to_bytes() + as i64 + + request + .request + .required_dataset_overhead() + .to_bytes() + as i64; + request_size + }, + + local_storage_dataset_id: candidate_dataset + .rendezvous_local_storage_dataset_id, + + pool_id: candidate_dataset.pool_id, + + sled_id: candidate_dataset.sled_id, + }); + + // Note by removing a candidate dataset from the + // list in this way, this step mandates that a + // single local storage dataset is not used for + // multiple local storage allocations for a given + // instance. + + let mut set_candidates_left = + candidates_left.clone(); + set_candidates_left.remove(candidate_dataset); + + queue.push_back(PossibleAllocations { + allocations: set_allocations, + candidates_left: set_candidates_left, + request_index: request_index + 1, + }); + } + + // Else there are no candidate datasets left for this + // request, and therefore the list of requests cannot be + // fulfilled by the current mapping of requests to + // datasets + } + } + + // Loop here over each possible set of local storage allocations + // required, and attempt the sled insert resource query with + // that particular allocation. + // + // If the `validated_allocations` list is empty, control will + // pass to the end of the loop marked with 'outer, which will + // then try the next possible sled target. In the case where + // there were existing local storage allocations there will + // _not_ be any more sleds to try and the user will see a + // capacity error. + + for valid_allocation in validated_allocations { + let ValidatedAllocations { allocations } = valid_allocation; + + // Try to INSERT the record plus the new local storage + // allocations. If this is still a valid target and the new + // local storage allocations still fit, we'll use it. If it + // isn't a valid target, we'll shrink the set of viable sled + // targets and try again. 
+ let rows_inserted = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { allocations }, + ) + .execute_async(&*conn) + .await?; + + if rows_inserted > 0 { + return Ok(resource); + } + } } sled_targets.remove(&sled_target); @@ -1080,11 +1568,13 @@ impl TransitionError { #[cfg(test)] pub(in crate::db::datastore) mod test { use super::*; + use crate::db; use crate::db::datastore::test_utils::{ Expected, IneligibleSleds, sled_set_policy, sled_set_state, }; use crate::db::model::ByteCount; use crate::db::model::SqlU32; + use crate::db::model::Zpool; use crate::db::model::to_db_typed_uuid; use crate::db::pub_test_utils::TestDatabase; use crate::db::pub_test_utils::helpers::SledUpdateBuilder; @@ -1100,17 +1590,22 @@ pub(in crate::db::datastore) mod test { use nexus_db_model::PhysicalDiskState; use nexus_db_model::{Generation, SledCpuFamily}; use nexus_db_model::{InstanceCpuPlatform, PhysicalDisk}; + use nexus_types::external_api::params; use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external; use omicron_test_utils::dev; use omicron_uuid_kinds::AffinityGroupUuid; use omicron_uuid_kinds::AntiAffinityGroupUuid; + use omicron_uuid_kinds::BlueprintUuid; + use omicron_uuid_kinds::CollectionUuid; + use omicron_uuid_kinds::ExternalZpoolUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use predicates::{BoxPredicate, prelude::*}; use std::collections::HashMap; + use std::net::SocketAddrV6; fn rack_id() -> Uuid { Uuid::parse_str(nexus_test_utils::RACK_UUID).unwrap() @@ -1493,8 +1988,12 @@ pub(in crate::db::datastore) mod test { impl Instance { fn new() -> Self { + Self::new_with_id(InstanceUuid::new_v4()) + } + + fn new_with_id(id: InstanceUuid) -> Self { Self { - id: InstanceUuid::new_v4(), + id, groups: vec![], force_onto_sled: None, resources: small_resource_request(), @@ -1524,7 +2023,12 @@ pub(in crate::db::datastore) mod test { .unwrap() .into_iter() .map(|(id, fits, affinity_policy, anti_affinity_policy)| { - FindTargetsOutput { id: SledUuid::from_untyped_uuid(id), fits, affinity_policy, anti_affinity_policy } + FindTargetsOutput { + id: SledUuid::from_untyped_uuid(id), + fits, + affinity_policy, + anti_affinity_policy, + } }) .collect() } @@ -1548,12 +2052,15 @@ pub(in crate::db::datastore) mod test { self.resources.clone(), ); - sled_insert_resource_query(&resource) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap() + sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ) + .execute_async( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap() > 0 } @@ -1745,9 +2252,9 @@ pub(in crate::db::datastore) mod test { // Anti-Affinity, Policy = Allow // - // Create two sleds, put instances on each belonging to an anti-affinity group. - // We can continue to add instances belonging to that anti-affinity group, because - // it has "AffinityPolicy::Allow". + // Create two sleds, put instances on each belonging to an anti-affinity + // group. We can continue to add instances belonging to that anti-affinity + // group, because it has "AffinityPolicy::Allow". 
#[tokio::test] async fn anti_affinity_policy_allow() { let logctx = dev::test_setup_log("anti_affinity_policy_allow"); @@ -1869,9 +2376,9 @@ pub(in crate::db::datastore) mod test { AllGroups::create(&opctx, &datastore, &authz_project, &groups) .await; - // We are constrained so that an instance belonging to both groups must be - // placed on both sled 0 and sled 1. This cannot be satisfied, and it returns - // an error. + // We are constrained so that an instance belonging to both groups must + // be placed on both sled 0 and sled 1. This cannot be satisfied, and it + // returns an error. let instances = [ Instance::new().group("affinity1").sled(sleds[0].id()), Instance::new().group("affinity2").sled(sleds[1].id()), @@ -2221,8 +2728,8 @@ pub(in crate::db::datastore) mod test { // Sled 0 contains an affinity group, but it's large enough to make it // non-viable for future allocations. // - // Sled 1 contains an affinity and anti-affinity group, so they cancel out. - // This gives it "no priority". + // Sled 1 contains an affinity and anti-affinity group, so they cancel + // out. This gives it "no priority". // // Sled 2 contains nothing. It's a viable target, neither preferred nor // unpreferred. @@ -2345,8 +2852,9 @@ pub(in crate::db::datastore) mod test { AllGroups::create(&opctx, &datastore, &authz_project, &groups) .await; - // Only "sleds[2]" has space, even though it also contains an anti-affinity group. - // However, if we don't belong to this group, we won't care. + // Only "sleds[2]" has space, even though it also contains an + // anti-affinity group. However, if we don't belong to this group, we + // won't care. let instances = [ Instance::new().group("anti-affinity1").sled(sleds[0].id()), @@ -3090,4 +3598,2113 @@ pub(in crate::db::datastore) mod test { db.terminate().await; logctx.cleanup_successful(); } + + // local storage allocation tests + + struct LocalStorageTest { + sleds: Vec, + affinity_groups: Vec, + anti_affinity_groups: Vec, + instances: Vec, + } + + struct LocalStorageTestSled { + sled_id: SledUuid, + sled_serial: String, + u2s: Vec, + } + + struct LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid, + physical_disk_serial: String, + + zpool_id: ZpoolUuid, + control_plane_storage_buffer: external::ByteCount, + + inventory_total_size: external::ByteCount, + + crucible_dataset_id: DatasetUuid, + crucible_dataset_addr: SocketAddrV6, + + local_storage_dataset_id: DatasetUuid, + } + + struct LocalStorageAffinityGroup { + id: AffinityGroupUuid, + name: String, + policy: external::AffinityPolicy, + failure_domain: external::FailureDomain, + } + + struct LocalStorageAntiAffinityGroup { + id: AntiAffinityGroupUuid, + name: String, + policy: external::AffinityPolicy, + failure_domain: external::FailureDomain, + } + + struct LocalStorageTestInstance { + id: InstanceUuid, + name: String, + affinity: Option<(Affinity, usize)>, + disks: Vec, + } + + struct LocalStorageTestInstanceDisk { + id: Uuid, + name: external::Name, + size: external::ByteCount, + } + + async fn setup_local_storage_allocation_test( + opctx: &OpContext, + datastore: &DataStore, + config: &LocalStorageTest, + ) { + for sled_config in &config.sleds { + let sled = SledUpdate::new( + sled_config.sled_id, + "[::1]:0".parse().unwrap(), + 0, + db::model::SledBaseboard { + serial_number: sled_config.sled_serial.clone(), + part_number: "test-pn".to_string(), + revision: 0, + }, + db::model::SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 
30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + cpu_family: SledCpuFamily::AmdMilan, + }, + Uuid::new_v4(), + Generation::new(), + ); + + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + for u2 in &sled_config.u2s { + let physical_disk = db::model::PhysicalDisk::new( + u2.physical_disk_id, + String::from("vendor"), + u2.physical_disk_serial.clone(), + String::from("model"), + db::model::PhysicalDiskKind::U2, + sled_config.sled_id, + ); + + datastore + .physical_disk_insert(opctx, physical_disk) + .await + .unwrap(); + + let zpool = Zpool::new( + u2.zpool_id, + sled_config.sled_id, + u2.physical_disk_id, + u2.control_plane_storage_buffer.into(), + ); + + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + add_inventory_row_for_zpool( + datastore, + u2.zpool_id, + sled_config.sled_id, + u2.inventory_total_size.into(), + ) + .await; + + add_crucible_dataset( + datastore, + u2.crucible_dataset_id, + u2.zpool_id, + u2.crucible_dataset_addr, + ) + .await; + + add_local_storage_dataset( + opctx, + datastore, + u2.local_storage_dataset_id, + u2.zpool_id, + ) + .await; + } + } + + let (authz_project, _project) = + create_project(opctx, datastore, "project").await; + + for group in &config.affinity_groups { + datastore + .affinity_group_create( + &opctx, + &authz_project, + db::model::AffinityGroup { + identity: db::model::AffinityGroupIdentity::new( + group.id.into_untyped_uuid(), + external::IdentityMetadataCreateParams { + name: group.name.parse().unwrap(), + description: String::from("desc"), + }, + ), + project_id: authz_project.id(), + policy: group.policy.into(), + failure_domain: group.failure_domain.into(), + }, + ) + .await + .unwrap(); + } + + for group in &config.anti_affinity_groups { + datastore + .anti_affinity_group_create( + &opctx, + &authz_project, + db::model::AntiAffinityGroup { + identity: db::model::AntiAffinityGroupIdentity::new( + group.id.into_untyped_uuid(), + external::IdentityMetadataCreateParams { + name: group.name.parse().unwrap(), + description: String::from("desc"), + }, + ), + project_id: authz_project.id(), + policy: group.policy.into(), + failure_domain: group.failure_domain.into(), + }, + ) + .await + .unwrap(); + } + + for instance in &config.instances { + let authz_instance = create_test_instance( + datastore, + opctx, + &authz_project, + instance.id, + &instance.name.as_str(), + ) + .await; + + if let Some((affinity, index)) = &instance.affinity { + match affinity { + Affinity::Positive => { + let group = &config.affinity_groups[*index]; + add_instance_to_affinity_group( + &datastore, + group.id, + instance.id, + ) + .await; + } + + Affinity::Negative => { + let group = &config.anti_affinity_groups[*index]; + add_instance_to_anti_affinity_group( + &datastore, + group.id, + instance.id, + ) + .await; + } + } + } + + for disk in &instance.disks { + let params = params::DiskCreate { + identity: external::IdentityMetadataCreateParams { + name: disk.name.clone(), + description: String::from("desc"), + }, + + disk_backend: params::DiskBackend::Local {}, + + size: disk.size, + }; + + datastore + .project_create_disk( + &opctx, + &authz_project, + db::datastore::Disk::LocalStorage( + db::datastore::LocalStorageDisk::new( + db::model::Disk::new( + disk.id, + authz_project.id(), + ¶ms, + db::model::BlockSize::AdvancedFormat, + db::model::DiskRuntimeState::new(), + db::model::DiskType::LocalStorage, + ), + db::model::DiskTypeLocalStorage::new( + disk.id, disk.size, + ) + .unwrap(), 
+ ), + ), + ) + .await + .unwrap(); + + let (.., authz_disk) = LookupPath::new(&opctx, datastore) + .disk_id(disk.id) + .lookup_for(authz::Action::Read) + .await + .unwrap(); + + datastore + .instance_attach_disk( + &opctx, + &authz_instance, + &authz_disk, + MAX_DISKS_PER_INSTANCE, + ) + .await + .unwrap(); + } + } + } + + async fn add_inventory_row_for_zpool( + datastore: &DataStore, + zpool_id: ZpoolUuid, + sled_id: SledUuid, + total_size: ByteCount, + ) { + use nexus_db_schema::schema::inv_zpool::dsl; + + let inv_collection_id = CollectionUuid::new_v4(); + let time_collected = Utc::now(); + let inv_pool = nexus_db_model::InvZpool { + inv_collection_id: inv_collection_id.into(), + time_collected, + id: zpool_id.into(), + sled_id: to_db_typed_uuid(sled_id), + total_size, + }; + + diesel::insert_into(dsl::inv_zpool) + .values(inv_pool) + .execute_async( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + } + + async fn add_crucible_dataset( + datastore: &DataStore, + dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + addr: SocketAddrV6, + ) -> DatasetUuid { + let dataset = + db::model::CrucibleDataset::new(dataset_id, pool_id, addr); + + datastore.crucible_dataset_upsert(dataset).await.unwrap(); + + dataset_id + } + + async fn set_crucible_dataset_size_used( + datastore: &DataStore, + crucible_dataset_id: DatasetUuid, + size_used: ByteCount, + ) { + use nexus_db_schema::schema::crucible_dataset::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::update(dsl::crucible_dataset) + .filter(dsl::id.eq(to_db_typed_uuid(crucible_dataset_id))) + .set(dsl::size_used.eq(size_used)) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn add_local_storage_dataset( + opctx: &OpContext, + datastore: &DataStore, + dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + ) -> DatasetUuid { + let dataset = db::model::RendezvousLocalStorageDataset::new( + dataset_id, + pool_id, + BlueprintUuid::new_v4(), + ); + + datastore + .local_storage_dataset_insert_if_not_exists(opctx, dataset) + .await + .unwrap(); + + dataset_id + } + + async fn set_local_storage_dataset_no_provision( + datastore: &DataStore, + local_storage_dataset_id: DatasetUuid, + ) { + use nexus_db_schema::schema::rendezvous_local_storage_dataset::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::update(dsl::rendezvous_local_storage_dataset) + .filter(dsl::id.eq(to_db_typed_uuid(local_storage_dataset_id))) + .set(dsl::no_provision.eq(true)) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn set_local_storage_allocation( + datastore: &DataStore, + disk_id: Uuid, + local_storage_dataset_id: DatasetUuid, + pool_id: ExternalZpoolUuid, + sled_id: SledUuid, + dataset_size: ByteCount, + ) { + let allocation_id = DatasetUuid::new_v4(); + + let allocation = + db::model::LocalStorageDatasetAllocation::new_for_tests_only( + allocation_id, + Utc::now(), + local_storage_dataset_id, + pool_id, + sled_id, + dataset_size, + ); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + { + use nexus_db_schema::schema::local_storage_dataset_allocation::dsl; + diesel::insert_into(dsl::local_storage_dataset_allocation) + .values(allocation) + .execute_async(&*conn) + .await + .unwrap(); + } + + { + use nexus_db_schema::schema::disk_type_local_storage::dsl; + + let rows_updated = diesel::update(dsl::disk_type_local_storage) + .filter(dsl::disk_id.eq(disk_id)) + .set( + dsl::local_storage_dataset_allocation_id + 
.eq(to_db_typed_uuid(allocation_id)), + ) + .execute_async(&*conn) + .await + .unwrap(); + + assert_eq!(rows_updated, 1); + } + } + + async fn create_test_instance( + datastore: &DataStore, + opctx: &OpContext, + authz_project: &authz::Project, + instance_id: InstanceUuid, + name: &str, + ) -> authz::Instance { + datastore + .project_create_instance( + &opctx, + &authz_project, + db::model::Instance::new( + instance_id, + authz_project.id(), + ¶ms::InstanceCreate { + identity: external::IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: "It's an instance".into(), + }, + ncpus: 2i64.try_into().unwrap(), + memory: external::ByteCount::from_gibibytes_u32(16), + hostname: "myhostname".try_into().unwrap(), + user_data: Vec::new(), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: Vec::new(), + disks: Vec::new(), + boot_disk: None, + cpu_platform: None, + ssh_public_keys: None, + start: false, + auto_restart_policy: Default::default(), + anti_affinity_groups: Vec::new(), + multicast_groups: Vec::new(), + }, + ), + ) + .await + .expect("instance must be created successfully"); + + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + authz_instance + } + + #[tokio::test] + async fn local_storage_allocation() { + let logctx = dev::test_setup_log("local_storage_allocation"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion works + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an existing 
allocation restricts where a VMM can be + #[tokio::test] + async fn local_storage_allocation_existing_allocation() { + let logctx = + dev::test_setup_log("local_storage_allocation_existing_allocation"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // Two sleds, each with one U2 + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + ], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // Add an allocation for this disk to the first sled's zpool + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // insertion should succeed but never be on the second sled + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that `no_provision` on the local storage dataset is respected + #[tokio::test] + async fn local_storage_allocation_fail_no_provision() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_provision"); + let db = 
TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + set_local_storage_dataset_no_provision( + datastore, + config.sleds[0].u2s[0].local_storage_dataset_id, + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that local storage allocation does not run the zpool out of space + #[tokio::test] + async fn local_storage_allocation_fail_no_space() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_space"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + 
setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set a crucible dataset size_used of 300 GiB, then ensure a 512 GiB + // allocation won't work. + + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + // if someone frees up some crucible disks, then insertion should work + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(200).into(), + ) + .await; + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that local storage allocations do not share a U2 + #[tokio::test] + async fn local_storage_allocation_fail_no_share_u2() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_share_u2"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + 
assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations can use multiple U2s + #[tokio::test] + async fn local_storage_allocation_multiple_u2() { + let logctx = + dev::test_setup_log("local_storage_allocation_multiple_u2"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion succeeds + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations will not partially + /// succeed + #[tokio::test] + async fn local_storage_allocation_fail_no_partial_success() 
{ + let logctx = dev::test_setup_log( + "local_storage_allocation_fail_no_partial_success", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set the first U2's crucible dataset size_used of 300 GiB, then + // we ensure one of the 512 GiB allocations won't work. 
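+        // (1024 GiB total - 250 GiB control plane buffer - 300 GiB of Crucible
+        // usage leaves 474 GiB on that pool, which is less than 512 GiB.)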
+ + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + // Assert no partial allocations occurred + { + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance.id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + let disks = datastore + .instance_list_disks( + &opctx, + &authz_instance, + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new( + MAX_DISKS_PER_INSTANCE, + ) + .unwrap(), + }), + ) + .await + .unwrap(); + + for disk in disks { + let db::datastore::Disk::LocalStorage(disk) = disk else { + panic!("wrong disk type"); + }; + + assert!(disk.local_storage_dataset_allocation.is_none()); + } + } + + // if someone frees up some crucible disks, then insertion should work + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(200).into(), + ) + .await; + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations work with one of them + /// existing already + #[tokio::test] + async fn local_storage_allocation_multiple_with_one_already() { + let logctx = dev::test_setup_log( + "local_storage_allocation_multiple_with_one_already", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + 
.parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // One of the local storage have been allocated already. + + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple instances will share slices of a single U2 for + /// local storage allocations + #[tokio::test] + async fn local_storage_allocation_multiple_instances_share_u2() { + let logctx = dev::test_setup_log( + "local_storage_allocation_multiple_instances_share_u2", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure two instances, each 
with two local storage disks + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local1".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local2".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + ], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local21".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local22".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + ], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + for instance in &config.instances { + let instance = Instance::new_with_id(instance.id); + + // the output of the find targets query does not currently take + // required local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 32, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations won't work if one of them + /// exists already but the second would result in the zpool running out of + /// space + #[tokio::test] + async fn local_storage_allocation_fail_multiple_with_existing_no_space() { + let logctx = dev::test_setup_log( + "local_storage_allocation_fail_multiple_with_existing_no_space", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: 
vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // One of the local storage have been allocated already to the first U2 + + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set the second U2's crucible dataset size_used of 300 GiB, then + // we ensure the second of the 512 GiB allocations won't work. + + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[1].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that a full rack can have one VMM take all the U2s on each sled + #[tokio::test] + async fn local_storage_allocation_full_rack() { + let logctx = dev::test_setup_log("local_storage_allocation_full_rack"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut config = LocalStorageTest { + sleds: vec![], + affinity_groups: vec![], + anti_affinity_groups: vec![], + instances: vec![], + }; + + for i in 0..32 { + let mut u2s = vec![]; + + for n in 0..MAX_DISKS_PER_INSTANCE { + u2s.push(LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: format!("phys_{i}_{n}"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: format!( + "[fd00:1122:3344:{i}0{n}::1]:12345" + ) + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }); + } + + config.sleds.push(LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: format!("sled_{i}"), + u2s, + }); + + let mut disks = vec![]; + + for n in 0..MAX_DISKS_PER_INSTANCE { + disks.push(LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from(format!("local-{i}-{n}")) + .unwrap(), + size: 
external::ByteCount::from_gibibytes_u32(512), + }); + } + + config.instances.push(LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: format!("inst{i}"), + affinity: None, + disks, + }); + } + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + let mut vmms = vec![]; + + for (i, config_instance) in config.instances.iter().enumerate() { + let instance = Instance::new_with_id(config_instance.id); + + // the output of the find targets query does not currently take + // required local storage allocations into account, but each VMM + // will occupy all the available threads on a sled. + assert_eq!(instance.find_targets(datastore).await.len(), 32 - i); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 128, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + vmms.push(vmm); + } + + let sleds: HashSet<_> = + vmms.into_iter().map(|vmm| vmm.sled_id).collect(); + + assert_eq!(sleds.len(), 32); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an affinity grouping will cause instances with local storage + /// to fail allocation even if there's space left. + #[tokio::test] + async fn local_storage_allocation_blocked_by_affinity() { + let logctx = + dev::test_setup_log("local_storage_allocation_blocked_by_affinity"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // Two sleds, with one U2 each + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + ], + affinity_groups: vec![LocalStorageAffinityGroup { + id: AffinityGroupUuid::new_v4(), + name: String::from("group-0"), + policy: external::AffinityPolicy::Fail, + failure_domain: external::FailureDomain::Sled, + }], + anti_affinity_groups: vec![], + // Configure two instances with one local storage disk each, both in + // the same affinity group + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: Some((Affinity::Positive, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: 
external::ByteCount::from_gibibytes_u32(600), + }], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: Some((Affinity::Positive, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(600), + }], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // The first instance's sled reservation should succeed, there's enough + // space for the disk + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // make sure that insertion works + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + // The second instance's sled reservation should not succeed, because + // the affinity group's policy is set to Fail, and there isn't enough + // space for the second instance's disk on the one sled. + + let instance = Instance::new_with_id(config.instances[1].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an anti-affinity grouping will be honoured for instances + /// with local storage. 
+ #[tokio::test] + async fn local_storage_allocation_blocked_by_anti_affinity() { + let logctx = dev::test_setup_log( + "local_storage_allocation_blocked_by_anti_affinity", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![LocalStorageAntiAffinityGroup { + id: AntiAffinityGroupUuid::new_v4(), + name: String::from("anti-group-0"), + policy: external::AffinityPolicy::Fail, + failure_domain: external::FailureDomain::Sled, + }], + // Configure two instances with one local storage disk each, both in + // the same anti-affinity group + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: Some((Affinity::Negative, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: Some((Affinity::Negative, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // The first instance's sled reservation should succeed + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion works + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + // The second instance's sled reservation should not succeed, because + // the anti-affinity group's policy is set to Fail. 
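+        // With only one sled available, both instances would have to
+        // co-locate, which the anti-affinity group forbids.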
+ + let instance = Instance::new_with_id(config.instances[1].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that instances with local storage honour the reservation + /// constraints. + #[tokio::test] + async fn local_storage_allocation_blocked_by_constraint() { + let logctx = dev::test_setup_log( + "local_storage_allocation_blocked_by_constraint", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2, and another with two U2s + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: + "[fd00:1122:3344:201::1]:12345".parse().unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys2"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: + "[fd00:1122:3344:202::1]:12345".parse().unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }, + ], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure one instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // If there's a constraint that 
this instance must use sled_0, it cannot + // succeed, as it needs two local storage allocations (which it could + // get if it was allowed to use sled_1). + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // make sure that insertion fails if we are restricted to sled_0 + let constraints = db::model::SledReservationConstraintBuilder::new() + .must_select_from(&[config.sleds[0].sled_id]) + .build(); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + constraints, + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index 2f31fb72dfd..e4f5c0a9e14 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -11,6 +11,7 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::datastore::OpContext; use crate::db::identity::Asset; +use crate::db::model::ByteCount; use crate::db::model::DbTypedUuid; use crate::db::model::PhysicalDisk; use crate::db::model::PhysicalDiskPolicy; @@ -37,12 +38,30 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolKind; use omicron_uuid_kinds::ZpoolUuid; use uuid::Uuid; +#[derive(Debug)] +pub struct ZpoolGetForSledReservationResult { + pub pool: Zpool, + + /// Last reported inventory size for the zpool + pub last_inv_total_size: i64, + + /// The rendezvous local storage dataset for this zpool + pub rendezvous_local_storage_dataset_id: DatasetUuid, + + /// Upper bound on Crucible dataset usage + pub crucible_dataset_usage: i64, + + /// Upper bound on Local Storage dataset usage + pub local_storage_usage: i64, +} + impl DataStore { pub async fn zpool_insert( &self, @@ -348,4 +367,160 @@ impl DataStore { .map(|_| ()) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// For a given sled id, return all zpools for that sled plus: + /// + /// - the total upper bound on usage as reported by their crucible and local + /// storage datasets + /// - their most recent total_size as reported by inventory. 
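+    ///
+    /// Zpools are skipped if they do not yet have a (non-tombstoned)
+    /// rendezvous local storage dataset, if the usage sum for either dataset
+    /// type could not be computed, or if there is no inventory collection
+    /// reporting their total size.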
+ pub async fn zpool_get_for_sled_reservation( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use nexus_db_schema::schema::crucible_dataset; + use nexus_db_schema::schema::inv_zpool; + use nexus_db_schema::schema::physical_disk::dsl as physical_disk_dsl; + use nexus_db_schema::schema::rendezvous_local_storage_dataset; + use nexus_db_schema::schema::zpool::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + let tuples = dsl::zpool + .filter(dsl::sled_id.eq(to_db_typed_uuid(sled_id))) + .filter(dsl::time_deleted.is_null()) + .inner_join( + physical_disk_dsl::physical_disk + .on(dsl::physical_disk_id.eq(physical_disk_dsl::id)), + ) + .filter( + physical_disk_dsl::disk_policy + .eq(PhysicalDiskPolicy::InService), + ) + // Only U2 disks will be considered for local storage allocations. + // This should be redundant when filtering on pools with local + // storage datasets! + .filter(physical_disk_dsl::variant.eq(PhysicalDiskKind::U2)) + .select(( + Zpool::as_select(), + // This and the next result are the upper bound on existing + // usages + crucible_dataset::table + .select(diesel::dsl::sum(crucible_dataset::size_used)) + .filter(crucible_dataset::time_deleted.is_null()) + .filter(crucible_dataset::pool_id.eq(dsl::id)) + .single_value(), + rendezvous_local_storage_dataset::table + .select(diesel::dsl::sum( + rendezvous_local_storage_dataset::size_used, + )) + .filter( + rendezvous_local_storage_dataset::time_tombstoned + .is_null(), + ) + .filter( + rendezvous_local_storage_dataset::pool_id.eq(dsl::id), + ) + .single_value(), + // + rendezvous_local_storage_dataset::table + .select(rendezvous_local_storage_dataset::id) + .filter( + rendezvous_local_storage_dataset::time_tombstoned + .is_null(), + ) + .filter( + rendezvous_local_storage_dataset::pool_id.eq(dsl::id), + ) + .single_value(), + // last reported total size of this pool from inventory + inv_zpool::table + .select(inv_zpool::total_size) + .filter(inv_zpool::id.eq(dsl::id)) + .order_by(inv_zpool::time_collected.desc()) + .limit(1) + .single_value(), + )) + .load_async::<( + Zpool, + Option, + Option, + Option, + Option, + )>(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Sled, + LookupType::by_id(sled_id), + ), + ) + })?; + + let mut converted = Vec::with_capacity(tuples.len()); + + for tuple in tuples { + let ( + pool, + crucible_dataset_usage, + local_storage_usage, + rendezvous_local_storage_dataset_id, + last_inv_total_size, + ) = tuple; + + // If this zpool doesn't have a local storage dataset yet, skip it + let Some(rendezvous_local_storage_dataset_id) = + rendezvous_local_storage_dataset_id + else { + continue; + }; + + // If the usage sum for either dataset type wasn't able to be + // computed, then the associated rendezvous table entry doesn't + // exist yet, so skip this zpool. + let crucible_dataset_usage: ByteCount = + if let Some(crucible_dataset_usage) = crucible_dataset_usage { + crucible_dataset_usage.try_into().map_err( + |e: anyhow::Error| { + Error::internal_error(&e.to_string()) + }, + )? + } else { + continue; + }; + + let local_storage_usage: ByteCount = + if let Some(local_storage_usage) = local_storage_usage { + local_storage_usage.try_into().map_err( + |e: anyhow::Error| { + Error::internal_error(&e.to_string()) + }, + )? 
+ } else { + continue; + }; + + // If there isn't an inventory collection for this zpool, skip it + let Some(last_inv_total_size) = last_inv_total_size else { + continue; + }; + + converted.push(ZpoolGetForSledReservationResult { + pool, + last_inv_total_size, + rendezvous_local_storage_dataset_id: + DatasetUuid::from_untyped_uuid( + rendezvous_local_storage_dataset_id, + ), + crucible_dataset_usage: crucible_dataset_usage.into(), + local_storage_usage: local_storage_usage.into(), + }); + } + + Ok(converted) + } } diff --git a/nexus/db-queries/src/db/queries/sled_reservation.rs b/nexus/db-queries/src/db/queries/sled_reservation.rs index a5fee2ce9c1..12ec80edafd 100644 --- a/nexus/db-queries/src/db/queries/sled_reservation.rs +++ b/nexus/db-queries/src/db/queries/sled_reservation.rs @@ -12,8 +12,29 @@ use diesel::sql_types; use nexus_db_model::SledCpuFamily; use nexus_db_schema::enums::AffinityPolicyEnum; use nexus_db_schema::enums::SledCpuFamilyEnum; +use nonempty::NonEmpty; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; +use uuid::Uuid; + +#[derive(Clone)] +pub struct LocalStorageAllocation { + pub disk_id: Uuid, + pub local_storage_dataset_allocation_id: DatasetUuid, + pub required_dataset_size: i64, + pub local_storage_dataset_id: DatasetUuid, + pub pool_id: ZpoolUuid, + pub sled_id: SledUuid, +} + +#[derive(Clone)] +pub enum LocalStorageAllocationRequired { + No, + Yes { allocations: NonEmpty }, +} fn subquery_our_aa_groups(query: &mut QueryBuilder) { query @@ -252,10 +273,16 @@ pub fn sled_find_targets_query( query.query() } -/// Inserts a sled_resource_vmm record into the database, if it is -/// a valid reservation. +/// Attempts to: +/// +/// 1. Insert a sled_resource_vmm record, if it is still a valid reservation +/// +/// 2. Optionally perform local storage allocation by updating local storage +/// disk records, if those are still valid with respect to the amount of +/// available zpool space. pub fn sled_insert_resource_query( resource: &SledResourceVmm, + local_storage_allocation_required: &LocalStorageAllocationRequired, ) -> TypedSqlQuery<(sql_types::Numeric,)> { let mut query = QueryBuilder::new(); @@ -281,13 +308,24 @@ pub fn sled_insert_resource_query( ).param().sql(" <= sled.usable_physical_ram AND COALESCE(SUM(CAST(sled_resource_vmm.reservoir_ram AS INT8)), 0) + " ).param().sql(" <= sled.reservoir_size - ),"); + ),") + .bind::(resource.sled_id.into_untyped_uuid()) + .bind::(resource.resources.hardware_threads) + .bind::(resource.resources.rss_ram) + .bind::(resource.resources.reservoir_ram); // "our_aa_groups": All the anti-affinity group_ids to which our instance belongs. subquery_our_aa_groups(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); + // "other_aa_instances": All the group_id,instance_ids of instances (other // than our own) belonging to "our_aa_groups". subquery_other_aa_instances(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); // Find instances with a strict anti-affinity policy in the sled failure // domain. We must ensure we do not co-locate with these instances. @@ -317,9 +355,16 @@ pub fn sled_insert_resource_query( // "our_a_groups": All the affinity group_ids to which our instance belongs. 
subquery_our_a_groups(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); + // "other_a_instances": All the group_id,instance_ids of instances (other // than our own) belonging to "our_a_instances"). subquery_other_a_instances(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); // Find instances with a strict affinity policy in the sled failure // domain. We must ensure we co-locate with these instances. @@ -347,11 +392,135 @@ pub fn sled_insert_resource_query( ),", ); + match local_storage_allocation_required { + LocalStorageAllocationRequired::No => {} + + LocalStorageAllocationRequired::Yes { allocations } => { + // Update each local storage disk's allocation id + + query.sql(" UPDATED_LOCAL_STORAGE_DISK_RECORDS AS ("); + query.sql(" UPDATE disk_type_local_storage "); + + query.sql( + " SET local_storage_dataset_allocation_id = CASE disk_id ", + ); + + for allocation in allocations { + let LocalStorageAllocation { + disk_id, + local_storage_dataset_allocation_id, + .. + } = allocation; + + query.sql("WHEN "); + query.param().bind::(*disk_id); + query.sql(" THEN "); + query.param().bind::( + local_storage_dataset_allocation_id.into_untyped_uuid(), + ); + query.sql(" "); + } + + query.sql(" END"); + + query.sql(" WHERE disk_id in ( "); + + for (index, allocation) in allocations.iter().enumerate() { + let LocalStorageAllocation { disk_id, .. } = allocation; + + query.param().bind::(*disk_id); + + if index != (allocations.len() - 1) { + query.sql(","); + } + } + + query.sql(" )"); + + query.sql(" RETURNING *"); + query.sql(" ), "); + + // Create new local storage dataset allocation records. + + query.sql(" NEW_LOCAL_STORAGE_ALLOCATION_RECORDS AS ("); + query.sql(" INSERT INTO local_storage_dataset_allocation VALUES "); + + for (index, allocation) in allocations.iter().enumerate() { + let LocalStorageAllocation { + local_storage_dataset_allocation_id, + local_storage_dataset_id, + pool_id, + sled_id, + required_dataset_size, + .. 
+ } = allocation; + + query.sql("("); + + query.param().bind::( + local_storage_dataset_allocation_id.into_untyped_uuid(), + ); + query.sql(","); + + query.sql("NOW(),"); + query.sql("NULL,"); + + query.param().bind::( + local_storage_dataset_id.into_untyped_uuid(), + ); + query.sql(","); + + query + .param() + .bind::(pool_id.into_untyped_uuid()); + query.sql(","); + + query + .param() + .bind::(sled_id.into_untyped_uuid()); + query.sql(","); + + query + .param() + .bind::(*required_dataset_size); + + query.sql(")"); + + if index != (allocations.len() - 1) { + query.sql(","); + } + } + + query.sql(" RETURNING *), "); + + // Update the rendezvous_local_storage_dataset table's size_used + // column by adding these new rows + + query.sql("UPDATE_RENDEZVOUS_TABLES as ("); + query.sql(" + UPDATE rendezvous_local_storage_dataset + SET size_used = size_used + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size + FROM NEW_LOCAL_STORAGE_ALLOCATION_RECORDS + WHERE + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.local_storage_dataset_id = rendezvous_local_storage_dataset.id AND + rendezvous_local_storage_dataset.time_tombstoned is null and + rendezvous_local_storage_dataset.no_provision = FALSE + RETURNING *"); + query.sql("), "); + } + } + // The insert is only valid if: // - // - The sled still has space for our isntance + // - The sled still has space for our instance // - The sled is not banned (due to anti-affinity rules) // - If the sled is required (due to affinity rules) we're selecting it + // - If there are requested local storage allocations: + // - they all fit on the target zpools: combine the new local storage + // allocation records with the existing ones and test each zpool. + // - the rendezvous local storage dataset is not tombstoned or marked + // no_provision + query .sql( " @@ -362,19 +531,163 @@ pub fn sled_insert_resource_query( NOT(EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = ", ) .param() + .bind::(resource.sled_id.into_untyped_uuid()) .sql( ")) AND ( EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = ", ) .param() + .bind::(resource.sled_id.into_untyped_uuid()) .sql( ") OR NOT EXISTS (SELECT 1 FROM required_sleds) - ) - )", + )", ); + match local_storage_allocation_required { + LocalStorageAllocationRequired::No => {} + + LocalStorageAllocationRequired::Yes { allocations } => { + query.sql("AND ("); + + for (index, allocation) in allocations.iter().enumerate() { + query.sql("("); + + // First, make sure that the additional usage fits in the zpool + + query.sql("("); + + // Add up crucible and local dataset usage, plus the altered + // local_storage_dataset_allocation records + + query.sql("("); + + query.sql("SELECT + SUM( + crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size + ) + FROM + crucible_dataset + JOIN + rendezvous_local_storage_dataset + ON + crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + JOIN + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS + ON + crucible_dataset.pool_id = NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) AND + (crucible_dataset.time_deleted IS NULL) AND + (rendezvous_local_storage_dataset.time_tombstoned IS NULL) AND + (rendezvous_local_storage_dataset.no_provision IS FALSE) AND + (crucible_dataset.pool_id = ") + .param() + .bind::(allocation.pool_id.into_untyped_uuid()) + .sql(") + GROUP BY + crucible_dataset.pool_id"); + + query.sql(") < ("); + + // and compare that to the zpool's available space (minus the + // 
control plane storage buffer) as reported by the latest + // inventory collection. + + query + .sql( + "(SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql(" ORDER BY inv_zpool.time_collected DESC LIMIT 1)"); + + query.sql(" - "); + + query + .sql( + "(SELECT + control_plane_storage_buffer + FROM + zpool + WHERE id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql(")"); + + query.sql(")"); + + query.sql(")"); + + query.sql(") AND "); + + // and, the zpool must be available + + query.sql("("); + + query + .sql( + "SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN + sled ON (zpool.sled_id = sled.id) + JOIN + physical_disk ON (zpool.physical_disk_id = physical_disk.id) + WHERE + zpool.id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ); + + query.sql(") AND "); + + // and the rendezvous local dataset must be available + + query + .sql( + "( + SELECT time_tombstoned IS NULL AND no_provision IS FALSE + FROM rendezvous_local_storage_dataset + WHERE rendezvous_local_storage_dataset.id = ", + ) + .param() + .bind::( + allocation.local_storage_dataset_id.into_untyped_uuid(), + ); + + query.sql(")"); + + if index != (allocations.len() - 1) { + query.sql(" AND "); + } + } + + query.sql(")"); + } + } + + query.sql(")"); + // Finally, perform the INSERT if it's still valid. query.sql(" INSERT INTO sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) @@ -387,16 +700,6 @@ pub fn sled_insert_resource_query( ").param().sql(" WHERE EXISTS(SELECT 1 FROM insert_valid) ") - .bind::(resource.sled_id.into_untyped_uuid()) - .bind::(resource.resources.hardware_threads) - .bind::(resource.resources.rss_ram) - .bind::(resource.resources.reservoir_ram) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.sled_id.into_untyped_uuid()) - .bind::(resource.sled_id.into_untyped_uuid()) .bind::(resource.id.into_untyped_uuid()) .bind::(resource.sled_id.into_untyped_uuid()) .bind::(resource.resources.hardware_threads) @@ -420,6 +723,8 @@ mod test { use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; + use nonempty::nonempty; + #[tokio::test] async fn expectorate_sled_find_targets_query() { let id = InstanceUuid::nil(); @@ -504,12 +809,38 @@ mod test { ), ); - let query = sled_insert_resource_query(&resource); + // with no local storage + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ); + expectorate_query_contents( &query, "tests/output/sled_insert_resource_query.sql", ) .await; + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }], + }, + ); + + expectorate_query_contents( + &query, + "tests/output/sled_insert_resource_query_with_local_storage.sql", + ) + .await; } #[tokio::test] @@ -519,6 +850,8 @@ 
mod test { let pool = db.pool(); let conn = pool.claim().await.unwrap(); + // with no local storage + let resource = SledResourceVmm::new( PropolisUuid::nil(), InstanceUuid::nil(), @@ -534,7 +867,60 @@ mod test { ), ); - let query = sled_insert_resource_query(&resource); + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ); + let _ = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + // with local storage + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }], + }, + ); + + let _ = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![ + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 256 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + ], + }, + ); + let _ = query .explain_async(&conn) .await diff --git a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql new file mode 100644 index 00000000000..31d42158cb6 --- /dev/null +++ b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql @@ -0,0 +1,204 @@ +WITH + sled_has_space + AS ( + SELECT + 1 + FROM + sled LEFT JOIN sled_resource_vmm ON sled_resource_vmm.sled_id = sled.id + WHERE + sled.id = $1 + AND sled.time_deleted IS NULL + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + GROUP BY + sled.id + HAVING + COALESCE(sum(CAST(sled_resource_vmm.hardware_threads AS INT8)), 0) + $2 + <= sled.usable_hardware_threads + AND COALESCE(sum(CAST(sled_resource_vmm.rss_ram AS INT8)), 0) + $3 + <= sled.usable_physical_ram + AND COALESCE(sum(CAST(sled_resource_vmm.reservoir_ram AS INT8)), 0) + $4 + <= sled.reservoir_size + ), + our_aa_groups + AS (SELECT group_id FROM anti_affinity_group_instance_membership WHERE instance_id = $5), + other_aa_instances + AS ( + SELECT + anti_affinity_group_instance_membership.group_id, instance_id + FROM + anti_affinity_group_instance_membership + JOIN our_aa_groups ON + anti_affinity_group_instance_membership.group_id = our_aa_groups.group_id + WHERE + instance_id != $6 + ), + banned_instances + AS ( + SELECT + instance_id + FROM + other_aa_instances + JOIN anti_affinity_group ON + anti_affinity_group.id = other_aa_instances.group_id + AND anti_affinity_group.failure_domain = 'sled' + AND anti_affinity_group.policy = 'fail' + WHERE + anti_affinity_group.time_deleted IS NULL + ), + banned_sleds + AS ( + SELECT + DISTINCT sled_id + FROM + banned_instances + JOIN sled_resource_vmm ON sled_resource_vmm.instance_id = banned_instances.instance_id + ), + 
our_a_groups AS (SELECT group_id FROM affinity_group_instance_membership WHERE instance_id = $7), + other_a_instances + AS ( + SELECT + affinity_group_instance_membership.group_id, instance_id + FROM + affinity_group_instance_membership + JOIN our_a_groups ON affinity_group_instance_membership.group_id = our_a_groups.group_id + WHERE + instance_id != $8 + ), + required_instances + AS ( + SELECT + policy, instance_id + FROM + other_a_instances + JOIN affinity_group ON + affinity_group.id = other_a_instances.group_id + AND affinity_group.failure_domain = 'sled' + AND affinity_group.policy = 'fail' + WHERE + affinity_group.time_deleted IS NULL + ), + required_sleds + AS ( + SELECT + DISTINCT sled_id + FROM + required_instances + JOIN sled_resource_vmm ON sled_resource_vmm.instance_id = required_instances.instance_id + ), + updated_local_storage_disk_records + AS ( + UPDATE + disk_type_local_storage + SET + local_storage_dataset_allocation_id = CASE disk_id WHEN $9 THEN $10 END + WHERE + disk_id IN ($11,) + RETURNING + * + ), + new_local_storage_allocation_records + AS ( + INSERT + INTO + local_storage_dataset_allocation + VALUES + ($12, now(), NULL, $13, $14, $15, $16) + RETURNING + * + ), + update_rendezvous_tables + AS ( + UPDATE + rendezvous_local_storage_dataset + SET + size_used = size_used + new_local_storage_allocation_records.dataset_size + FROM + new_local_storage_allocation_records + WHERE + new_local_storage_allocation_records.local_storage_dataset_id + = rendezvous_local_storage_dataset.id + AND rendezvous_local_storage_dataset.time_tombstoned IS NULL + AND rendezvous_local_storage_dataset.no_provision = false + RETURNING + * + ), + insert_valid + AS ( + SELECT + 1 + WHERE + EXISTS(SELECT 1 FROM sled_has_space) + AND NOT (EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = $17)) + AND ( + EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = $18) + OR NOT EXISTS(SELECT 1 FROM required_sleds) + ) + AND ( + ( + SELECT + sum( + crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + new_local_storage_allocation_records.dataset_size + ) + FROM + crucible_dataset + JOIN rendezvous_local_storage_dataset ON + crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + JOIN new_local_storage_allocation_records ON + crucible_dataset.pool_id = new_local_storage_allocation_records.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) + AND (crucible_dataset.time_deleted IS NULL) + AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) + AND rendezvous_local_storage_dataset.no_provision IS false + AND crucible_dataset.pool_id = $19 + GROUP BY + crucible_dataset.pool_id + ) + < ( + ( + SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = $20 + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $21) + ) + AND ( + SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN sled ON zpool.sled_id = sled.id + JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + zpool.id = $22 + ) + AND ( + SELECT + time_tombstoned IS NULL AND no_provision IS false + FROM + rendezvous_local_storage_dataset + WHERE + rendezvous_local_storage_dataset.id = $23 + ) + ) + ) +INSERT +INTO + sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) +SELECT + $24, $25, $26, $27, $28, $29 +WHERE + EXISTS(SELECT 1 FROM insert_valid) 
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8be6bb768c2..e09755c9ae5 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1850,6 +1850,9 @@ table! { } } +allow_tables_to_appear_in_same_query!(zpool, inv_zpool); +allow_tables_to_appear_in_same_query!(inv_zpool, physical_disk); + table! { inv_dataset (inv_collection_id, sled_id, name) { inv_collection_id -> Uuid, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index a402ee2c1bd..83fb9b85b04 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -14,6 +14,7 @@ use super::MAX_VCPU_PER_INSTANCE; use super::MIN_MEMORY_BYTES_PER_INSTANCE; use crate::app::sagas; use crate::app::sagas::NexusSaga; +use crate::db::datastore::Disk; use crate::external_api::params; use cancel_safe_futures::prelude::*; use futures::future::Fuse; @@ -66,6 +67,7 @@ use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode; use sagas::instance_common::ExternalIpAttach; use sagas::instance_start; use sagas::instance_update; +use sled_agent_client::types::DelegatedZvol; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::VmmPutStateBody; use std::collections::HashSet; @@ -1361,6 +1363,35 @@ impl super::Nexus { ) .await?; + // Each Disk::LocalStorage will require a delegated zvol entry. + let mut delegated_zvols: Vec = + Vec::with_capacity(disks.len()); + + for disk in &disks { + let local_storage_disk = match disk { + Disk::Crucible(_) => { + continue; + } + + Disk::LocalStorage(local_storage_disk) => local_storage_disk, + }; + + let Some(local_storage_dataset_allocation) = + &local_storage_disk.local_storage_dataset_allocation + else { + return Err(Error::internal_error(&format!( + "local storage disk {} allocation is None!", + disk.id() + )) + .into()); + }; + + delegated_zvols.push(DelegatedZvol::LocalStorage { + zpool_id: local_storage_dataset_allocation.pool_id(), + dataset_id: local_storage_dataset_allocation.id(), + }); + } + let nics = self .db_datastore .derive_guest_network_interface_info(&opctx, &authz_instance) @@ -1549,7 +1580,7 @@ impl super::Nexus { host_domain: None, search_domains: Vec::new(), }, - delegated_zvols: vec![], + delegated_zvols, }; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); diff --git a/nexus/src/app/sagas/disk_delete.rs b/nexus/src/app/sagas/disk_delete.rs index 07a9f22a267..87391ffdca7 100644 --- a/nexus/src/app/sagas/disk_delete.rs +++ b/nexus/src/app/sagas/disk_delete.rs @@ -5,6 +5,7 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; +use crate::app::InlineErrorChain; use crate::app::sagas::SagaInitError; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::volume_delete; @@ -12,6 +13,7 @@ use nexus_db_queries::authn; use nexus_db_queries::db; use nexus_db_queries::db::datastore; use omicron_common::api::external::DiskState; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -39,6 +41,12 @@ declare_saga_actions! 
{ + sdd_account_space - sdd_account_space_undo } + DEALLOCATE_LOCAL_STORAGE -> "deallocate_local_storage" { + + sdd_deallocate_local_storage + } + DELETE_LOCAL_STORAGE -> "delete_local_storage" { + + sdd_delete_local_storage + } } // disk delete saga: definition @@ -96,8 +104,8 @@ impl NexusSaga for SagaDiskDelete { } datastore::Disk::LocalStorage(_) => { - // XXX nothing to do here yet, need the delegate_zvol branch to - // merge first. + builder.append(deallocate_local_storage_action()); + builder.append(delete_local_storage_action()); } } @@ -188,6 +196,96 @@ async fn sdd_account_space_undo( Ok(()) } +async fn sdd_deallocate_local_storage( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let datastore::Disk::LocalStorage(disk) = params.disk else { + // Check during make_saga_dag prevents this case. + unreachable!(); + }; + + let Some(allocation) = disk.local_storage_dataset_allocation else { + // Nothing to do! + return Ok(()); + }; + + osagactx + .datastore() + .delete_local_storage_dataset_allocation(&opctx, allocation.id()) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn sdd_delete_local_storage( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let datastore::Disk::LocalStorage(disk) = params.disk else { + // Check during make_saga_dag prevents this case. + unreachable!(); + }; + + let Some(allocation) = disk.local_storage_dataset_allocation else { + // Nothing to do! + return Ok(()); + }; + + let dataset_id = allocation.id(); + let pool_id = allocation.pool_id(); + let sled_id = allocation.sled_id(); + + // Get a sled agent client + + let sled_agent_client = osagactx + .nexus() + .sled_client(&sled_id) + .await + .map_err(ActionError::action_failed)?; + + // Ensure that the local storage is deleted + + let delete_operation = || async { + sled_agent_client + .local_storage_dataset_delete(&pool_id, &dataset_id) + .await + }; + + let gone_check = || async { + osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?; + + // `check_sled_in_service` returns an error if the sled is no longer in + // service; if it succeeds, the sled is not gone. + Ok(false) + }; + + ProgenitorOperationRetry::new(delete_operation, gone_check) + .run(osagactx.log()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete local storage: {}", + InlineErrorChain::new(&e) + )) + })?; + + Ok(()) +} + #[cfg(test)] pub(crate) mod test { use crate::{ diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 9acfb435e36..38c96886060 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -2,7 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use super::{ACTION_GENERATE_ID, NexusActionContext, NexusSaga, SagaInitError}; +use super::ACTION_GENERATE_ID; +use super::NexusActionContext; +use super::NexusSaga; +use super::SagaInitError; +use super::subsaga_append; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::disk_create::{self, SagaDiskCreate}; use crate::app::{ @@ -50,6 +54,7 @@ pub(crate) struct Params { pub create_params: params::InstanceCreate, pub boundary_switches: HashSet, } + // Several nodes in this saga are wrapped in their own subsaga so that they can // have a parameter that denotes which node they are (e.g., which NIC or which // external IP). They also need the outer saga's parameters. @@ -168,34 +173,6 @@ impl NexusSaga for SagaInstanceCreate { builder.append(associate_ssh_keys_action()); - // Helper function for appending subsagas to our parent saga. - fn subsaga_append( - node_basename: String, - subsaga_dag: steno::Dag, - parent_builder: &mut steno::DagBuilder, - params: S, - which: usize, - ) -> Result<(), SagaInitError> { - // The "parameter" node is a constant node that goes into the outer - // saga. Its value becomes the parameters for the one-node subsaga - // (defined below) that actually creates each resource. - let params_node_name = format!("{}_params{}", node_basename, which); - parent_builder.append(Node::constant( - ¶ms_node_name, - serde_json::to_value(¶ms).map_err(|e| { - SagaInitError::SerializeError(params_node_name.clone(), e) - })?, - )); - - let output_name = format!("{}{}", node_basename, which); - parent_builder.append(Node::subsaga( - output_name.as_str(), - subsaga_dag, - params_node_name, - )); - Ok(()) - } - for (i, group) in params.create_params.anti_affinity_groups.iter().enumerate() { diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index d16e064dd01..8ce785bc37c 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -6,10 +6,12 @@ use std::net::Ipv6Addr; -use super::{ - NexusActionContext, NexusSaga, SagaInitError, - instance_common::allocate_vmm_ipv6, -}; +use super::NexusActionContext; +use super::NexusSaga; +use super::SagaInitError; +use super::instance_common::allocate_vmm_ipv6; +use crate::app::InlineErrorChain; +use crate::app::MAX_DISKS_PER_INSTANCE; use crate::app::instance::{ InstanceEnsureRegisteredApiResources, InstanceRegisterReason, InstanceStateChangeError, @@ -17,12 +19,21 @@ use crate::app::instance::{ use crate::app::sagas::declare_saga_actions; use chrono::Utc; use nexus_db_lookup::LookupPath; +use nexus_db_queries::db::datastore::Disk; +use nexus_db_queries::db::datastore::LocalStorageDisk; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::{authn, authz, db}; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::http_pagination::PaginatedBy; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; +use paste::paste; +use seq_macro::seq; use serde::{Deserialize, Serialize}; -use slog::{error, info}; +use sled_agent_client::types::LocalStorageDatasetEnsureRequest; +use slog::error; +use slog::info; use steno::ActionError; /// Parameters to the instance start saga. @@ -80,6 +91,15 @@ declare_saga_actions! 
{ - sis_move_to_starting_undo } + LIST_LOCAL_STORAGE -> "local_storage_records" { + + sis_list_local_storage + } + + ENSURE_LOCAL_STORAGE (0, 1, 2, 3, 4, 5, 6, 7) -> "ensure_local_storage" { + + sis_ensure_local_storage + // No undo action for this, that is handled in the disk delete saga! + } + // TODO(#3879) This can be replaced with an action that triggers the NAT RPW // once such an RPW is available. DPD_ENSURE -> "dpd_ensure" { @@ -144,12 +164,26 @@ impl NexusSaga for SagaInstanceStart { builder.append(alloc_propolis_ip_action()); builder.append(create_vmm_record_action()); builder.append(mark_as_starting_action()); + + // After the instance's state has moved to starting, this should block + // out all disk attach and detach requests. List all the possible local + // storage disks attached to this instance, and ensure that they exist + // so they can be delegated into the propolis zone. + builder.append(list_local_storage_action()); + + // Changing MAX_DISKS_PER_INSTANCE requires changing this saga + static_assertions::const_assert!(MAX_DISKS_PER_INSTANCE == 8); + seq!(N in 0..8 { + builder.append(paste!([]())); + }); + builder.append(dpd_ensure_action()); builder.append(v2p_ensure_action()); builder.append(ensure_registered_action()); builder.append(update_multicast_sled_id_action()); builder.append(add_virtual_resources_action()); builder.append(ensure_running_action()); + Ok(builder.build()?) } } @@ -515,6 +549,159 @@ async fn sis_account_virtual_resources_undo( Ok(()) } +async fn sis_list_local_storage( + sagactx: NexusActionContext, +) -> Result, ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let datastore = osagactx.datastore(); + + let db_instance = + sagactx.lookup::("started_record")?; + let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); + + let (_, _, authz_instance, ..) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch_for(authz::Action::Read) + .await + .map_err(ActionError::action_failed)?; + + let disks = datastore + .instance_list_disks( + &opctx, + &authz_instance, + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(MAX_DISKS_PER_INSTANCE) + .unwrap(), + }), + ) + .await + .map_err(ActionError::action_failed)?; + + let records = disks + .into_iter() + .filter_map(|disk| match disk { + Disk::LocalStorage(disk) => Some(disk), + Disk::Crucible(_) => None, + }) + .collect(); + + Ok(records) +} + +async fn sis_ensure_local_storage( + sagactx: NexusActionContext, + which: usize, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // Get this node's record + + let local_storage_records = + sagactx.lookup::>("local_storage_records")?; + + if local_storage_records.is_empty() + || (which >= local_storage_records.len()) + { + return Ok(()); + } + + let LocalStorageDisk { + disk, + disk_type_local_storage: _, + local_storage_dataset_allocation, + } = &local_storage_records[which]; + + // Make sure this was a complete allocation. 
+ + let Some(local_storage_dataset_allocation) = + local_storage_dataset_allocation + else { + return Err(ActionError::action_failed(format!( + "local storage record {} has a None allocation!", + which + ))); + }; + + // All local storage volumes will be created with 4k blocks. Double check + // here. + + if disk.block_size.to_bytes() != 4096 { + return Err(ActionError::action_failed(format!( + "local storage record {} has block size {}!", + which, + disk.block_size.to_bytes(), + ))); + } + + let dataset_id = local_storage_dataset_allocation.id(); + let pool_id = local_storage_dataset_allocation.pool_id(); + let sled_id = local_storage_dataset_allocation.sled_id(); + let dataset_size = local_storage_dataset_allocation.dataset_size.into(); + let volume_size = disk.size.into(); + + // Get a sled agent client + + let sled_agent_client = osagactx + .nexus() + .sled_client(&sled_id) + .await + .map_err(ActionError::action_failed)?; + + // Ensure that the local storage is created + + let ensure_operation = || async { + sled_agent_client + .local_storage_dataset_ensure( + &pool_id, + &dataset_id, + &LocalStorageDatasetEnsureRequest { dataset_size, volume_size }, + ) + .await + }; + + let gone_check = || async { + osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?; + + // `check_sled_in_service` returns an error if the sled is no longer in + // service; if it succeeds, the sled is not gone. + Ok(false) + }; + + ProgenitorOperationRetry::new(ensure_operation, gone_check) + .run(osagactx.log()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to ensure local storage: {}", + InlineErrorChain::new(&e) + )) + })?; + + Ok(()) +} + +seq!(M in 0..8 { + async fn sis_ensure_local_storage_~M( + sagactx: NexusActionContext, + ) -> Result<(), ActionError> { + sis_ensure_local_storage(sagactx, M).await + } +}); + async fn sis_dpd_ensure( sagactx: NexusActionContext, ) -> Result<(), ActionError> { @@ -921,8 +1108,7 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::identity::Resource; use omicron_common::api::external::{ - ByteCount, DataPageParams, IdentityMetadataCreateParams, - InstanceCpuCount, Name, + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, Name, }; use omicron_common::api::internal::shared::SwitchLocation; use omicron_test_utils::dev::poll; diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 642c7a3947f..ecc35f159ce 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -10,11 +10,13 @@ // easier it will be to test, version, and update in deployed systems. use crate::saga_interface::SagaContext; +use serde::Serialize; use std::sync::Arc; use std::sync::LazyLock; use steno::ActionContext; use steno::ActionError; use steno::DagBuilder; +use steno::Node; use steno::SagaDag; use steno::SagaName; use steno::SagaType; @@ -316,6 +318,161 @@ macro_rules! 
declare_saga_actions { crate::app::sagas::__emit_action!($node, $out); declare_saga_actions!(S = $saga $($nodes,)* $node <> $($tail)*); }; + // Allow for a repetition syntax that looks like + // + // ACTION_NAME (i, j, k) -> "output" { + // + action + // - undo + // } + // + // This is equivalent to + // + // ACTION_NAME_i -> "output_i" { + // + action_i + // - undo_i + // } + // + // ACTION_NAME_j -> "output_j" { + // + action_j + // - undo_j + // } + // + // ACTION_NAME_k -> "output_k" { + // + action_k + // - undo_k + // } + // + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal, $($repeat_tail:literal),*) -> $out:literal { + $a:ident - $u:ident } $($tail:tt)*) => { + paste::paste! { + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>], + [<$u _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $node ($($repeat_tail),+) -> $out { + $a } $($tail)*); + } + }; + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal) -> $out:literal { + $a:ident - $u:ident } $($tail:tt)*) => { + paste::paste! { + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>], + [<$u _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $($tail)*); + } + }; + // Same as the prior match, but without the undo action. + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal, $($repeat_tail:literal),*) -> $out:literal { + $a:ident } $($tail:tt)*) => { + paste::paste! 
{ + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $node ($($repeat_tail),+) -> $out { + $a } $($tail)*); + } + }; + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal) -> $out:literal { + $a:ident } $($tail:tt)*) => { + paste::paste! { + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $($tail)*); + } + }; // The end of the macro, which registers all previous generated saga nodes. // // We generate a new function, rather than implementing @@ -337,3 +494,34 @@ pub(crate) use __action_name; pub(crate) use __emit_action; pub(crate) use __stringify_ident; pub(crate) use declare_saga_actions; + +// Helper function for appending subsagas to parent sagas. +fn subsaga_append( + node_basename: String, + subsaga_dag: steno::Dag, + parent_builder: &mut steno::DagBuilder, + params: S, + which: usize, +) -> Result<(), SagaInitError> { + // The "parameter" node is a constant node that goes into the outer saga. + // Its value becomes the parameters for the one-node subsaga (defined below) + // that actually creates each resource. 
+ let params_node_name = format!("{}_params{}", node_basename, which); + + parent_builder.append(Node::constant( + ¶ms_node_name, + serde_json::to_value(¶ms).map_err(|e| { + SagaInitError::SerializeError(params_node_name.clone(), e) + })?, + )); + + let output_name = format!("{}{}", node_basename, which); + + parent_builder.append(Node::subsaga( + output_name.as_str(), + subsaga_dag, + params_node_name, + )); + + Ok(()) +} diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 6733ab9daae..74514af810a 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -471,3 +471,76 @@ pub async fn run_tuf_artifact_replication_step( assert_eq!(status.last_run_counters.err(), 0); status } + +/// Run the blueprint_loader background task +pub async fn run_blueprint_loader(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_loader").await; + + let LastResult::Completed(_last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_loader task", + last_background_task.last, + ); + }; +} + +/// Run the blueprint_planner background task +pub async fn run_blueprint_planner( + lockstep_client: &ClientTestContext, +) -> BlueprintPlannerStatus { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_planner").await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_planner task", + last_background_task.last, + ); + }; + + serde_json::from_value::( + last_result_completed.details, + ) + .unwrap() +} + +/// Run the blueprint_executor background task +pub async fn run_blueprint_executor(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_executor").await; + + let LastResult::Completed(_last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_executor task", + last_background_task.last, + ); + }; +} + +/// Run the blueprint_rendezvous background task +pub async fn run_blueprint_rendezvous(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_rendezvous") + .await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_rendezvous task", + last_background_task.last, + ); + }; + + let _status = serde_json::from_value::( + last_result_completed.details, + ) + .unwrap(); +} diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 31e37a4be10..e2dd876e1b4 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -117,7 +117,7 @@ fn get_instances_url() -> String { format!("/v1/instances?{}", get_project_selector()) } -fn get_instance_url(instance_name: &str) -> String { +pub fn get_instance_url(instance_name: &str) -> String { format!("/v1/instances/{}?{}", instance_name, get_project_selector()) } From 0dcf0712b7fd71be38f461fb5b5c6653a17adec2 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 01:25:28 +0000 Subject: [PATCH 02/12] live review: use "on_conn" --- nexus/db-queries/src/db/datastore/disk.rs | 4 ++-- nexus/db-queries/src/db/datastore/sled.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) 
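The blueprint task helpers added to `nexus/test-utils/src/background.rs` above are intended to be run in sequence, so a test can drive a blueprint through planning and execution and then let the rendezvous task populate tables such as `rendezvous_local_storage_dataset` that the new allocation query reads. A minimal usage sketch; the wrapper function is hypothetical and assumes the usual lockstep `ClientTestContext` from the nexus test harness:

```
use dropshot::test_util::ClientTestContext;
use nexus_test_utils::background::{
    run_blueprint_executor, run_blueprint_loader, run_blueprint_planner,
    run_blueprint_rendezvous,
};

/// Hypothetical convenience wrapper: load, plan, execute, then rendezvous.
pub async fn run_blueprint_pipeline(lockstep_client: &ClientTestContext) {
    run_blueprint_loader(lockstep_client).await;
    // The planner status is returned in case a caller wants to assert on
    // what the planner decided.
    let _status = run_blueprint_planner(lockstep_client).await;
    run_blueprint_executor(lockstep_client).await;
    run_blueprint_rendezvous(lockstep_client).await;
}
```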
diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index f5fe96ab944..5f4447e1ecc 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -438,7 +438,7 @@ impl DataStore { opctx.authorize(authz::Action::ListChildren, authz_instance).await?; - self.instance_list_disks_impl_unauth( + self.instance_list_disks_on_conn( &conn, authz_instance.id(), pagparams, @@ -447,7 +447,7 @@ impl DataStore { } /// List disks associated with a given instance by name. - pub async fn instance_list_disks_impl_unauth( + pub async fn instance_list_disks_on_conn( &self, conn: &async_bb8_diesel::Connection, instance_id: Uuid, diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 965f1dcbb70..f08c65fc3fc 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -488,7 +488,7 @@ impl DataStore { // Get a list of local storage disks attached to this instance let local_storage_disks: Vec = self - .instance_list_disks_impl_unauth( + .instance_list_disks_on_conn( &conn, instance_id.into_untyped_uuid(), &PaginatedBy::Name(DataPageParams { From e08228d34946d6beb9f3d5341400a1549c4ab6f6 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 04:30:43 +0000 Subject: [PATCH 03/12] fmt --- nexus/db-queries/src/db/datastore/disk.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index 5f4447e1ecc..63cb51887c3 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -438,12 +438,8 @@ impl DataStore { opctx.authorize(authz::Action::ListChildren, authz_instance).await?; - self.instance_list_disks_on_conn( - &conn, - authz_instance.id(), - pagparams, - ) - .await + self.instance_list_disks_on_conn(&conn, authz_instance.id(), pagparams) + .await } /// List disks associated with a given instance by name. 
From e9e9f5790e07d4159047821372b57d03588eaf35 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 13:46:51 +0000 Subject: [PATCH 04/12] sled reservation must block attaching local storage disks --- nexus/db-queries/src/db/datastore/disk.rs | 78 +++++++++++- nexus/db-queries/src/db/datastore/sled.rs | 140 ++++++++++++++++++++++ 2 files changed, 214 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index 63cb51887c3..97993779285 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -38,6 +38,8 @@ use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; use diesel::prelude::*; +use diesel::dsl::not; +use diesel::dsl::exists; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; @@ -758,7 +760,9 @@ impl DataStore { authz_disk: &authz::Disk, max_disks: u32, ) -> Result<(Instance, Disk), Error> { - use nexus_db_schema::schema::{disk, instance}; + use nexus_db_schema::schema::disk; + use nexus_db_schema::schema::instance; + use nexus_db_schema::schema::sled_resource_vmm; opctx.authorize(authz::Action::Modify, authz_instance).await?; opctx.authorize(authz::Action::Modify, authz_disk).await?; @@ -782,6 +786,74 @@ impl DataStore { let attach_update = DiskSetClauseForAttach::new(authz_instance.id()); + let disk = self.disk_get(&opctx, authz_disk.id()).await?; + let resource_query = match disk { + Disk::Crucible(_) => disk::table.into_boxed().filter( + disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels), + ), + + // Attaching a local storage disk to the instance has to be blocked + // if sled reservation has occurred for this instance: local storage + // allocation records are only created during sled reservation, and + // importantly a particular configuration of local storage + // allocations for an instance is only _validated_ during the sled + // reservation. + // + // The instance start saga performs sled reservation, creates the + // corresponding VMM record, and changes the instance's runtime + // state all in _separate saga nodes_. This means that there could + // be an indeterminate amount of time between running the sled + // reservation query and changing the instance's state. + // + // If a client attaches a local storage disk to an instance after + // sled reservation occurs but before the instance's state moves to + // starting, and we do not block it, there are several problems that + // result: + // + // - if an allocation does not already exist for the local storage + // disk, the instance_start saga will fail (and unwind) when + // trying to ensure that the allocation's dataset and zvol exist + // because the allocation_id column is None. + // + // - if an allocation does already exist for the local storage disk, + // _it may not be for the same sled the VMM is on_. The sled + // reservation query would prevent this, but this attach (if not + // blocked) happened afterwards. This would mean Nexus would + // construct an InstanceSledLocalConfig that contains DelegatedZvol + // entries that refer to different sleds. + // + // - if an allocation does exist already, and it's for the same sled + // the VMM is on, it may be colocated on a zpool with another + // local storage disk's allocation. Again, the sled reservation + // query prevents this.
+ // + // - if an allocation does exist already, and it's for the same + // sled the VMM is on, and it's on a distinct zpool, then it's + // probably fine, but it's safer to let the sled reservation query + // validate everything, and it makes a much smaller query to block + // this case as well. + // + // `reserve_on_random_sled` will create an entry in + // `SledResourcesVmm` when the query is successful in finding a VMM + // reservation, so use that here: if there is a `SledResourcesVmm` + // record for this instance, then block attachment. + // + // Note that depending on our implementation, this may be the code + // path responsible for attaching disks to already-running instances + // when we support hot-plug. Local storage disks may never support + // hot-plug because running zones cannot be reconfigured (aka a new + // zvol rdsk device cannot be added to a running propolis zone). + Disk::LocalStorage(_) => disk::table + .into_boxed() + .filter( + disk::dsl::disk_state + .eq_any(ok_to_attach_disk_state_labels), + ) + .filter(not(exists(sled_resource_vmm::table.filter( + sled_resource_vmm::dsl::instance_id.eq(authz_instance.id()), + )))), + }; + let query = Instance::attach_resource( authz_instance.id(), authz_disk.id(), @@ -790,9 +862,7 @@ impl DataStore { .eq_any(ok_to_attach_instance_states) .and(instance::dsl::active_propolis_id.is_null()), ), - disk::table.into_boxed().filter( - disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels), - ), + resource_query, max_disks, diesel::update(disk::dsl::disk).set(attach_update), ); diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index f08c65fc3fc..9148438f04b 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -5707,4 +5707,144 @@ pub(in crate::db::datastore) mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Ensure that after sled reservation, local storage disks cannot be + /// attached to instances, even if they don't yet have a VMM. 
+ #[tokio::test] + async fn local_storage_disk_no_attach_after_reservation() { + let logctx = dev::test_setup_log( + "local_storage_disk_no_attach_after_reservation", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:102::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure one instance with two local storage disks. One will be + // detached before sled reservation. + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + let instance = Instance::new_with_id(config.instances[0].id); + + // Detach the second disk from the instance + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance.id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + let (.., authz_disk) = LookupPath::new(&opctx, datastore) + .disk_id(config.instances[0].disks[1].id) + .lookup_for(authz::Action::Read) + .await + .expect("disk must exist"); + + datastore + .instance_detach_disk( + &opctx, + &authz_instance, + &authz_disk, + ) + .await + .unwrap(); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + // Try to attach the second disk to the instance - it should fail. 
+ + datastore + .instance_attach_disk( + &opctx, + &authz_instance, + &authz_disk, + MAX_DISKS_PER_INSTANCE, + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } } From 8b07137c506e65d8f0de36b1ea75ee99952c07cc Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 13:47:48 +0000 Subject: [PATCH 05/12] remove literal duplicate code --- nexus/db-queries/src/db/datastore/sled.rs | 40 ----------------------- 1 file changed, 40 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 9148438f04b..dc981977c41 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -777,46 +777,6 @@ impl DataStore { continue; } - // If there's an existing local storage allocation on a zpool, - // remove that from the list of candidates. Local storage for - // the same Instance should not share any zpools. - let local_storage_zpools_used: HashSet = - local_storage_disks - .iter() - .filter_map(|disk| { - disk.local_storage_dataset_allocation.as_ref().map( - |allocation| { - ZpoolUuid::from_untyped_uuid( - allocation - .pool_id() - .into_untyped_uuid(), - ) - }, - ) - }) - .collect(); - - let zpools_for_sled: Vec<_> = zpools_for_sled - .into_iter() - .filter(|zpool_get_result| { - !local_storage_zpools_used - .contains(&zpool_get_result.pool.id()) - }) - .collect(); - - if local_storage_allocation_required.len() - > zpools_for_sled.len() - { - // Not enough zpools to satisfy the number of allocations - // required. Find another sled! - sled_targets.remove(&sled_target); - banned.remove(&sled_target); - unpreferred.remove(&sled_target); - preferred.remove(&sled_target); - - continue; - } - let mut allocations_to_perform = Vec::with_capacity(local_storage_allocation_required.len()); From bb4509c6c9a9d9d9b777070c78fbdf6a1548a2ec Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:07:24 +0000 Subject: [PATCH 06/12] word wrap, remove "no_provision = FALSE" when not needed, and double up on expectorate output --- .../src/db/queries/sled_reservation.rs | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/nexus/db-queries/src/db/queries/sled_reservation.rs b/nexus/db-queries/src/db/queries/sled_reservation.rs index 12ec80edafd..f9c80dd179c 100644 --- a/nexus/db-queries/src/db/queries/sled_reservation.rs +++ b/nexus/db-queries/src/db/queries/sled_reservation.rs @@ -497,15 +497,18 @@ pub fn sled_insert_resource_query( // column by adding these new rows query.sql("UPDATE_RENDEZVOUS_TABLES as ("); - query.sql(" + query.sql( + " UPDATE rendezvous_local_storage_dataset - SET size_used = size_used + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size + SET size_used = size_used + \ + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size FROM NEW_LOCAL_STORAGE_ALLOCATION_RECORDS WHERE - NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.local_storage_dataset_id = rendezvous_local_storage_dataset.id AND - rendezvous_local_storage_dataset.time_tombstoned is null and - rendezvous_local_storage_dataset.no_provision = FALSE - RETURNING *"); + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.local_storage_dataset_id = \ + rendezvous_local_storage_dataset.id AND + rendezvous_local_storage_dataset.time_tombstoned IS NULL + RETURNING *", + ); query.sql("), "); } } @@ -563,7 +566,9 @@ pub fn sled_insert_resource_query( query.sql("("); - query.sql("SELECT + query + .sql( + "SELECT SUM( crucible_dataset.size_used + rendezvous_local_storage_dataset.size_used + 
@@ -574,22 +579,28 @@ pub fn sled_insert_resource_query( JOIN rendezvous_local_storage_dataset ON - crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + crucible_dataset.pool_id = \ + rendezvous_local_storage_dataset.pool_id JOIN NEW_LOCAL_STORAGE_ALLOCATION_RECORDS ON - crucible_dataset.pool_id = NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.pool_id + crucible_dataset.pool_id = \ + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.pool_id WHERE (crucible_dataset.size_used IS NOT NULL) AND (crucible_dataset.time_deleted IS NULL) AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) AND - (rendezvous_local_storage_dataset.no_provision IS FALSE) AND - (crucible_dataset.pool_id = ") - .param() - .bind::(allocation.pool_id.into_untyped_uuid()) - .sql(") + (crucible_dataset.pool_id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql( + ") GROUP BY - crucible_dataset.pool_id"); + crucible_dataset.pool_id", + ); query.sql(") < ("); @@ -825,14 +836,24 @@ mod test { let query = sled_insert_resource_query( &resource, &LocalStorageAllocationRequired::Yes { - allocations: nonempty![LocalStorageAllocation { - disk_id: Uuid::nil(), - local_storage_dataset_allocation_id: DatasetUuid::nil(), - required_dataset_size: 128 * 1024 * 1024 * 1024, - local_storage_dataset_id: DatasetUuid::nil(), - pool_id: ZpoolUuid::nil(), - sled_id: SledUuid::nil(), - }], + allocations: nonempty![ + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 64 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + } + ], }, ); From 57b0434aaceac1bb999afb530b9c56293dc1347a Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:11:35 +0000 Subject: [PATCH 07/12] expectorate from query changes --- ...sert_resource_query_with_local_storage.sql | 81 ++++++++++++++++--- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql index 31d42158cb6..59964d59df8 100644 --- a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql +++ b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql @@ -91,9 +91,9 @@ WITH UPDATE disk_type_local_storage SET - local_storage_dataset_allocation_id = CASE disk_id WHEN $9 THEN $10 END + local_storage_dataset_allocation_id = CASE disk_id WHEN $9 THEN $10 WHEN $11 THEN $12 END WHERE - disk_id IN ($11,) + disk_id IN ($13, $14) RETURNING * ), @@ -103,7 +103,7 @@ WITH INTO local_storage_dataset_allocation VALUES - ($12, now(), NULL, $13, $14, $15, $16) + ($15, now(), NULL, $16, $17, $18, $19), ($20, now(), NULL, $21, $22, $23, $24) RETURNING * ), @@ -119,7 +119,6 @@ WITH new_local_storage_allocation_records.local_storage_dataset_id = rendezvous_local_storage_dataset.id AND rendezvous_local_storage_dataset.time_tombstoned IS NULL - AND rendezvous_local_storage_dataset.no_provision = false RETURNING * ), @@ -129,9 +128,9 @@ WITH 1 WHERE EXISTS(SELECT 1 FROM sled_has_space) - AND NOT (EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = 
$17)) + AND NOT (EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = $25)) AND ( - EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = $18) + EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = $26) OR NOT EXISTS(SELECT 1 FROM required_sleds) ) AND ( @@ -152,8 +151,7 @@ WITH (crucible_dataset.size_used IS NOT NULL) AND (crucible_dataset.time_deleted IS NULL) AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) - AND rendezvous_local_storage_dataset.no_provision IS false - AND crucible_dataset.pool_id = $19 + AND crucible_dataset.pool_id = $27 GROUP BY crucible_dataset.pool_id ) @@ -164,13 +162,13 @@ WITH FROM inv_zpool WHERE - inv_zpool.id = $20 + inv_zpool.id = $28 ORDER BY inv_zpool.time_collected DESC LIMIT 1 ) - - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $21) + - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $29) ) AND ( SELECT @@ -183,7 +181,7 @@ WITH JOIN sled ON zpool.sled_id = sled.id JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id WHERE - zpool.id = $22 + zpool.id = $30 ) AND ( SELECT @@ -191,7 +189,64 @@ WITH FROM rendezvous_local_storage_dataset WHERE - rendezvous_local_storage_dataset.id = $23 + rendezvous_local_storage_dataset.id = $31 + ) + AND ( + SELECT + sum( + crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + new_local_storage_allocation_records.dataset_size + ) + FROM + crucible_dataset + JOIN rendezvous_local_storage_dataset ON + crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + JOIN new_local_storage_allocation_records ON + crucible_dataset.pool_id = new_local_storage_allocation_records.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) + AND (crucible_dataset.time_deleted IS NULL) + AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) + AND crucible_dataset.pool_id = $32 + GROUP BY + crucible_dataset.pool_id + ) + < ( + ( + SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = $33 + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $34) + ) + AND ( + SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN sled ON zpool.sled_id = sled.id + JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + zpool.id = $35 + ) + AND ( + SELECT + time_tombstoned IS NULL AND no_provision IS false + FROM + rendezvous_local_storage_dataset + WHERE + rendezvous_local_storage_dataset.id = $36 ) ) ) @@ -199,6 +254,6 @@ INSERT INTO sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) SELECT - $24, $25, $26, $27, $28, $29 + $37, $38, $39, $40, $41, $42 WHERE EXISTS(SELECT 1 FROM insert_valid) From e417fc4d82c36c6a001428de9c4c0f0a1093e538 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:11:48 +0000 Subject: [PATCH 08/12] missed use --- nexus/db-queries/src/db/datastore/disk.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index 97993779285..bd043a6f041 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -37,9 +37,9 @@ use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; -use diesel::prelude::*; -use diesel::dsl::not; use diesel::dsl::exists; +use 
diesel::dsl::not; +use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; From d6e7e1bafd7a4f72825521a4dc0e1a946021a1c9 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:11:59 +0000 Subject: [PATCH 09/12] fmt --- nexus/db-queries/src/db/datastore/sled.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index dc981977c41..421b87d984a 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -5765,11 +5765,7 @@ pub(in crate::db::datastore) mod test { .expect("disk must exist"); datastore - .instance_detach_disk( - &opctx, - &authz_instance, - &authz_disk, - ) + .instance_detach_disk(&opctx, &authz_instance, &authz_disk) .await .unwrap(); From c67d8e9668919136aae567d981fa936f354fb18e Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:27:25 +0000 Subject: [PATCH 10/12] return conflict, case can happen due to client requests --- nexus/db-queries/src/db/datastore/sled.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 421b87d984a..a63e54aba1b 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -538,9 +538,25 @@ impl DataStore { } else { if local_storage_allocation_sleds.len() != 1 { // It's an error for multiple sleds to host local storage - // disks, that makes no sense! + // disks for a single VMM, so return a conflict error here. + // + // This case can happen if a local storage disk was + // allocated on a sled, and then is detached from the + // instance whose VMM was on that sled, and then is attached + // to another instance that has some local storage disks + // that already have some allocations on another sled. + // + // TODO by the time this query has run that detach + attach + // has already occurred. Nexus should disallow attaching + // local storage disks to an instance that already has local + // storage disks if the allocations are on different sleds. + // + // TODO for clients to prevent such a scenario they would + // need to be aware of which sled a local storage disk's + // allocation is on, which means that information has to be + // exposed somehow in the Disk view. 
return Err(SledReservationTransactionError::Connection( - Error::internal_error(&format!( + Error::conflict(&format!( "local storage disks for instance {instance_id} \ allocated on multiple sleds \ {local_storage_allocation_sleds:?}" From f77aa55a4f8f02c755d775f28e89b1cedab920e8 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:55:44 +0000 Subject: [PATCH 11/12] log during sled reservation --- nexus/db-queries/src/db/datastore/sled.rs | 54 ++++++++++++++++--- .../src/db/queries/sled_reservation.rs | 2 +- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index a63e54aba1b..ca724f7ebc0 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -467,6 +467,14 @@ impl DataStore { constraints: db::model::SledReservationConstraints, ) -> Result { + let log = opctx.log.new(o!( + "query" => "sled_reservation", + "instance_id" => instance_id.to_string(), + "propolis_id" => propolis_id.to_string(), + )); + + info!(&log, "sled reservation starting"); + let conn = self.pool_connection_authorized(opctx).await?; // Check if resource ID already exists - if so, return it. @@ -483,6 +491,7 @@ impl DataStore { .await?; if !old_resource.is_empty() { + info!(&log, "sled reservation already occurred, returning"); return Ok(old_resource[0].clone()); } @@ -510,7 +519,9 @@ impl DataStore { // arguments to this function. let maybe_must_use_sleds: Option> = if let Some(must_select_from) = constraints.must_select_from() { - Some(must_select_from.into_iter().cloned().collect()) + let set = must_select_from.into_iter().cloned().collect(); + info!(&log, "reservation constrained to sleds {set:?}"); + Some(set) } else { None }; @@ -534,8 +545,15 @@ impl DataStore { if local_storage_allocation_sleds.is_empty() { // None of this instance's local storage disks have been // allocated yet. + info!(&log, "no existing local storage allocations"); maybe_must_use_sleds } else { + info!( + &log, + "existing local storage allocations on sleds + {local_storage_allocation_sleds:?}" + ); + if local_storage_allocation_sleds.len() != 1 { // It's an error for multiple sleds to host local storage // disks for a single VMM, so return a conflict error here. @@ -583,6 +601,7 @@ impl DataStore { } else { // `local_storage_disks` is empty, so that does not have an impact // on the existing hash set + info!(&log, "no attached local storage disks"); maybe_must_use_sleds }; @@ -590,11 +609,7 @@ impl DataStore { // cannot satisfy this allocation. if let Some(must_use_sleds) = &maybe_must_use_sleds { if must_use_sleds.is_empty() { - error!( - &opctx.log, - "no sleds available after filtering for instance \ - {instance_id}", - ); + error!(&log, "no sleds available after filtering"); // Nothing will satisfy this allocation, return an error here. return Err(SledReservationTransactionError::Reservation( @@ -675,12 +690,20 @@ impl DataStore { } } + info!(&log, "sled targets: {sled_targets:?}"); + let local_storage_allocation_required: Vec<&LocalStorageDisk> = local_storage_disks .iter() .filter(|disk| disk.local_storage_dataset_allocation.is_none()) .collect(); + info!( + &log, + "local_storage_allocation_required: \ + {local_storage_allocation_required:?}" + ); + // We loop here because our attempts to INSERT may be violated by // concurrent operations. We'll respond by looking through a slightly // smaller set of possible sleds. 
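The comment at the end of this hunk describes the retry shape used below: pick a candidate sled, attempt the conditional INSERT, and if it inserts zero rows (usually because a concurrent reservation took the space), drop that sled from the candidate set and try the next one. A schematic, self-contained sketch of that loop; none of these types are the real datastore types:

```
use std::collections::HashSet;

// Stand-ins for the real types; illustrative only.
type SledId = u32;
struct Reservation;

/// Sketch of the "shrinking candidate set" retry loop.
fn reserve(
    mut candidates: HashSet<SledId>,
    mut try_insert: impl FnMut(SledId) -> Option<Reservation>,
) -> Option<Reservation> {
    loop {
        // Pick any remaining candidate; the real code also weighs affinity
        // preferences when choosing a target.
        let target = match candidates.iter().next().copied() {
            Some(target) => target,
            // No candidate sled could satisfy the reservation.
            None => return None,
        };
        if let Some(reservation) = try_insert(target) {
            return Some(reservation);
        }
        // The INSERT found no room or lost a race; don't retry this sled.
        candidates.remove(&target);
    }
}

fn main() {
    let candidates: HashSet<SledId> = [1, 2, 3].into_iter().collect();
    // Pretend only sled 2 has room for the reservation.
    let result =
        reserve(candidates, |sled| (sled == 2).then_some(Reservation));
    assert!(result.is_some());
}
```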
@@ -699,6 +722,8 @@ impl DataStore { &preferred, )?; + info!(&log, "trying sled target {sled_target}"); + // Create a SledResourceVmm record, associate it with the target // sled. let resource = SledResourceVmm::new( @@ -709,6 +734,11 @@ impl DataStore { ); if local_storage_allocation_required.is_empty() { + info!( + &log, + "calling insert (no local storage allocation requred)" + ); + // If no local storage allocation is required, then simply try // allocating a VMM to this sled. // @@ -723,8 +753,10 @@ impl DataStore { .await?; if rows_inserted > 0 { + info!(&log, "reservation succeeded!"); return Ok(resource); } + info!(&log, "reservation failed"); } else { // If local storage allocation is required, match the requests // with all the zpools of this sled that have available space. @@ -780,6 +812,8 @@ impl DataStore { }) .collect(); + info!(&log, "filtered zpools for sled: {zpools_for_sled:?}"); + if local_storage_allocation_required.len() > zpools_for_sled.len() { @@ -1020,6 +1054,12 @@ impl DataStore { for valid_allocation in validated_allocations { let ValidatedAllocations { allocations } = valid_allocation; + info!( + &log, + "calling insert with local storage allocations"; + "allocations" => ?allocations, + ); + // Try to INSERT the record plus the new local storage // allocations. If this is still a valid target and the new // local storage allocations still fit, we'll use it. If it @@ -1033,8 +1073,10 @@ impl DataStore { .await?; if rows_inserted > 0 { + info!(&log, "reservation succeeded!"); return Ok(resource); } + info!(&log, "reservation failed"); } } diff --git a/nexus/db-queries/src/db/queries/sled_reservation.rs b/nexus/db-queries/src/db/queries/sled_reservation.rs index f9c80dd179c..9cf6b6b78d5 100644 --- a/nexus/db-queries/src/db/queries/sled_reservation.rs +++ b/nexus/db-queries/src/db/queries/sled_reservation.rs @@ -20,7 +20,7 @@ use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use uuid::Uuid; -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct LocalStorageAllocation { pub disk_id: Uuid, pub local_storage_dataset_allocation_id: DatasetUuid, From 827bb37a692616810fb3d8fed79166a211be9fe7 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 11 Dec 2025 14:57:45 +0000 Subject: [PATCH 12/12] typo --- nexus/db-queries/src/db/datastore/sled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index ca724f7ebc0..37dc67af701 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -527,7 +527,7 @@ impl DataStore { }; // If any local storage disks have been allocated already, then this - // constraints VMM placement and where other unallocated local storage + // constrains VMM placement and where other unallocated local storage // must be. let maybe_must_use_sleds = if !local_storage_disks.is_empty() { // Any local storage disk that was allocated already will have a