diff --git a/Cargo.lock b/Cargo.lock index af4763cdfc0..acaebab9ce7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1722,7 +1722,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -4874,7 +4874,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.1", "system-configuration", "tokio", "tower-service", @@ -5884,7 +5884,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -6788,6 +6788,7 @@ dependencies = [ "nexus-sled-agent-shared", "nexus-test-utils", "nexus-types", + "nonempty", "omicron-cockroach-metrics", "omicron-common", "omicron-passwords", @@ -7598,6 +7599,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonempty" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9737e026353e5cd0736f98eddae28665118eb6f6600902a7f50db585621fecb6" + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -8381,6 +8388,7 @@ dependencies = [ "scim2-rs", "scim2-test-client", "semver 1.0.27", + "seq-macro", "serde", "serde_json", "serde_urlencoded", @@ -8395,6 +8403,7 @@ dependencies = [ "slog-error-chain", "slog-term", "sp-sim", + "static_assertions", "steno", "strum 0.27.2", "subprocess", @@ -12411,6 +12420,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -16241,7 +16256,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4c38098a7b6..f352b0ed67c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -575,6 +575,7 @@ nexus-test-utils = { path = "nexus/test-utils" } nexus-types = { path = "nexus/types" } nix = { version = "0.30", features = ["fs", "net"] } nom = "7.1.3" +nonempty = "0.12.0" num-integer = "0.1.46" num = { version = "0.4.3", default-features = false, features = [ "libm" ] } omicron-clickhouse-admin = { path = "clickhouse-admin" } @@ -687,6 +688,7 @@ schemars = "0.8.22" scopeguard = "1.2.0" secrecy = "0.10.3" semver = { version = "1.0.26", features = ["std", "serde"] } +seq-macro = "0.3.6" serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } serde_cbor = "0.11.2" serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 1145813c01d..47a8ae8cf4b 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -93,6 +93,7 @@ ring.workspace = true samael.workspace = true schemars = { workspace = true, features = ["chrono", "uuid1"] } semver.workspace = true +seq-macro.workspace = true serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true @@ -106,6 +107,7 @@ slog-dtrace.workspace = true slog-error-chain.workspace = true display-error-chain.workspace = true slog-term.workspace = true 
+static_assertions.workspace = true steno.workspace = true tempfile.workspace = true thiserror.workspace = true diff --git a/nexus/db-model/src/affinity.rs b/nexus/db-model/src/affinity.rs index dbef84e9672..06c223e5941 100644 --- a/nexus/db-model/src/affinity.rs +++ b/nexus/db-model/src/affinity.rs @@ -151,7 +151,7 @@ impl From for AffinityGroupUpdate { #[diesel(table_name = anti_affinity_group)] pub struct AntiAffinityGroup { #[diesel(embed)] - identity: AntiAffinityGroupIdentity, + pub identity: AntiAffinityGroupIdentity, pub project_id: Uuid, pub policy: AffinityPolicy, pub failure_domain: FailureDomain, diff --git a/nexus/db-model/src/local_storage_dataset_allocation.rs b/nexus/db-model/src/local_storage_dataset_allocation.rs index 3b7741d42b4..47cc969154f 100644 --- a/nexus/db-model/src/local_storage_dataset_allocation.rs +++ b/nexus/db-model/src/local_storage_dataset_allocation.rs @@ -40,6 +40,30 @@ pub struct LocalStorageDatasetAllocation { } impl LocalStorageDatasetAllocation { + /// These records are normally created during sled reservation, but for unit + /// tests add this `new` function. + pub fn new_for_tests_only( + id: DatasetUuid, + time_created: DateTime, + local_storage_dataset_id: DatasetUuid, + pool_id: ExternalZpoolUuid, + sled_id: SledUuid, + dataset_size: ByteCount, + ) -> Self { + Self { + id: id.into(), + + time_created, + time_deleted: None, + + local_storage_dataset_id: local_storage_dataset_id.into(), + pool_id: pool_id.into(), + sled_id: sled_id.into(), + + dataset_size, + } + } + pub fn id(&self) -> DatasetUuid { self.id.into() } diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 314f9614a91..98420f54264 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -30,6 +30,7 @@ internal-dns-types.workspace = true ipnetwork.workspace = true itertools.workspace = true macaddr.workspace = true +nonempty.workspace = true oxnet.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index c4ad17943df..bd043a6f041 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -37,6 +37,8 @@ use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; +use diesel::dsl::exists; +use diesel::dsl::not; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; @@ -434,13 +436,24 @@ impl DataStore { authz_instance: &authz::Instance, pagparams: &PaginatedBy<'_>, ) -> ListResultVec { - use nexus_db_schema::schema::disk::dsl; - use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl; - use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl; + let conn = self.pool_connection_authorized(opctx).await?; opctx.authorize(authz::Action::ListChildren, authz_instance).await?; - let conn = self.pool_connection_authorized(opctx).await?; + self.instance_list_disks_on_conn(&conn, authz_instance.id(), pagparams) + .await + } + + /// List disks associated with a given instance by name. 
+ pub async fn instance_list_disks_on_conn( + &self, + conn: &async_bb8_diesel::Connection, + instance_id: Uuid, + pagparams: &PaginatedBy<'_>, + ) -> ListResultVec { + use nexus_db_schema::schema::disk::dsl; + use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl; + use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl; let results = match pagparams { PaginatedBy::Id(pagparams) => { @@ -461,13 +474,13 @@ impl DataStore { .on(dsl::id.eq(disk_type_local_storage_dsl::disk_id)), ) .filter(dsl::time_deleted.is_null()) - .filter(dsl::attach_instance_id.eq(authz_instance.id())) + .filter(dsl::attach_instance_id.eq(instance_id)) .select(( model::Disk::as_select(), Option::::as_select(), Option::::as_select(), )) - .get_results_async(&*conn) + .get_results_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -494,7 +507,7 @@ impl DataStore { let allocation = dsl::local_storage_dataset_allocation .filter(dsl::id.eq(to_db_typed_uuid(allocation_id))) .select(LocalStorageDatasetAllocation::as_select()) - .first_async(&*conn) + .first_async(conn) .await .map_err(|e| { public_error_from_diesel( @@ -747,7 +760,9 @@ impl DataStore { authz_disk: &authz::Disk, max_disks: u32, ) -> Result<(Instance, Disk), Error> { - use nexus_db_schema::schema::{disk, instance}; + use nexus_db_schema::schema::disk; + use nexus_db_schema::schema::instance; + use nexus_db_schema::schema::sled_resource_vmm; opctx.authorize(authz::Action::Modify, authz_instance).await?; opctx.authorize(authz::Action::Modify, authz_disk).await?; @@ -771,6 +786,74 @@ impl DataStore { let attach_update = DiskSetClauseForAttach::new(authz_instance.id()); + let disk = self.disk_get(&opctx, authz_disk.id()).await?; + let resource_query = match disk { + Disk::Crucible(_) => disk::table.into_boxed().filter( + disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels), + ), + + // Attaching a local storage disk to the instance has to be blocked + // if sled reservation has occured for this instance: local storage + // allocation records are only created during sled reservation, and + // importantly a particular configuration of local storage + // allocations for an instance are only _validated_ during the sled + // reservation. + // + // The instance start saga perform sled reservation, creates the + // corresponding VMM record, and changes the instance's runtime + // state all in _separate saga nodes_. This means that there could + // be an indeterminate amount of time between running the sled + // reservation query and changing the instance's state. + // + // If a client attaches a local storage disk to an instance after + // sled reservation occurs but before the instance's start moves to + // starting, and we do not block it, there are several problems that + // result: + // + // - if an allocation does not already exist for the local storage + // disk, the instance_start saga will fail (and unwind) when + // trying to ensure that the allocation's dataset and zvol exist + // because the allocation_id column is None. + // + // - if an allocation does already exist for the local storage disk, + // _it may not be for the same sled the VMM is on_. the sled + // reservation query would prevent this, but this attach (if not + // blocked) happened afterwards. This would mean Nexus would + // construct a InstanceSledLocalConfig that contains DelegatedZvol + // entries that refer to different sleds. 
+ // + // - if an allocation does exist already, and it's for the same sled + // the VMM is on, it may be colocated on a zpool with another + // local storage disk's allocation. again, the sled reservation + // query prevents this. + // + // - if an allocation does exist already, and it's for the same + // sled the VMM is on, and it's on a distinct zpool, then it's + // probably fine, but it's safer to let the sled reservation query + // validate everything, and it makes a much smaller query to block + // this case as well. + // + // `reserve_on_random_sled` will create an entry in + // `SledResourcesVmm` when the query is successful in finding a VMM + // reservation, so use that here: if there is a `SledResourcesVmm` + // record for this instance, then block attachment. + // + // Note that depending on our implementation, this may be the code + // path responsible for attaching disks to already-running instances + // when we support hot-plug. Local storage disks may never support + // hot-plug because running zones cannot be reconfigured (aka a new + // zvol rdsk device cannot be added to a running propolis zone). + Disk::LocalStorage(_) => disk::table + .into_boxed() + .filter( + disk::dsl::disk_state + .eq_any(ok_to_attach_disk_state_labels), + ) + .filter(not(exists(sled_resource_vmm::table.filter( + sled_resource_vmm::dsl::instance_id.eq(authz_instance.id()), + )))), + }; + let query = Instance::attach_resource( authz_instance.id(), authz_disk.id(), @@ -779,9 +862,7 @@ impl DataStore { .eq_any(ok_to_attach_instance_states) .and(instance::dsl::active_propolis_id.is_null()), ), - disk::table.into_boxed().filter( - disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels), - ), + resource_query, max_disks, diesel::update(disk::dsl::disk).set(attach_update), ); diff --git a/nexus/db-queries/src/db/datastore/local_storage.rs b/nexus/db-queries/src/db/datastore/local_storage.rs index 1f2a0503220..09914809b17 100644 --- a/nexus/db-queries/src/db/datastore/local_storage.rs +++ b/nexus/db-queries/src/db/datastore/local_storage.rs @@ -9,7 +9,9 @@ use crate::authz; use crate::context::OpContext; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; +use crate::db::datastore::DbConnection; use crate::db::datastore::SQL_BATCH_SIZE; +use crate::db::model::LocalStorageDatasetAllocation; use crate::db::model::RendezvousLocalStorageDataset; use crate::db::model::Zpool; use crate::db::model::to_db_typed_uuid; @@ -18,6 +20,9 @@ use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use diesel::sql_types::BigInt; +use diesel::sql_types::Nullable; +use diesel::sql_types::Numeric; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use omicron_common::api::external::CreateResult; @@ -31,6 +36,9 @@ use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use uuid::Uuid; +// XXX is it bad that Numeric -> BigInt here? +define_sql_function! 
{ fn coalesce(x: Nullable, y: BigInt) -> BigInt; } + impl DataStore { /// List all LocalStorage datasets, making as many queries as needed to get /// them all @@ -152,4 +160,85 @@ impl DataStore { }) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + pub(super) async fn delete_local_storage_dataset_allocation_in_txn( + conn: &async_bb8_diesel::Connection, + local_storage_dataset_allocation_id: DatasetUuid, + ) -> Result<(), diesel::result::Error> { + use nexus_db_schema::schema::local_storage_dataset_allocation::dsl; + + // If this is deleted already, then return Ok - this function should be + // idempotent, it is called from a saga node. + + let id = to_db_typed_uuid(local_storage_dataset_allocation_id); + + let allocation = dsl::local_storage_dataset_allocation + .filter(dsl::id.eq(id)) + .select(LocalStorageDatasetAllocation::as_select()) + .get_result_async(conn) + .await?; + + if allocation.time_deleted.is_some() { + return Ok(()); + } + + // Set the time_deleted + + diesel::update(dsl::local_storage_dataset_allocation) + .filter(dsl::id.eq(id)) + .set(dsl::time_deleted.eq(Utc::now())) + .execute_async(conn) + .await?; + + // Recompute the associated local storage dataset's size_used + + let dataset_id: DatasetUuid = allocation.local_storage_dataset_id(); + + use nexus_db_schema::schema::rendezvous_local_storage_dataset::dsl as dataset_dsl; + + diesel::update(dataset_dsl::rendezvous_local_storage_dataset) + .filter(dataset_dsl::id.eq(to_db_typed_uuid(dataset_id))) + .set( + dataset_dsl::size_used.eq(coalesce( + dsl::local_storage_dataset_allocation + .filter( + dsl::local_storage_dataset_id + .eq(to_db_typed_uuid(dataset_id)), + ) + .filter(dsl::time_deleted.is_null()) + .select(diesel::dsl::sum(dsl::dataset_size)) + .single_value(), + 0, + )), + ) + .execute_async(conn) + .await?; + + Ok(()) + } + + /// Mark the local storage dataset allocation as deleted, and re-compute the + /// appropriate local storage dataset size_used column. 
+ pub async fn delete_local_storage_dataset_allocation( + &self, + opctx: &OpContext, + local_storage_dataset_allocation_id: DatasetUuid, + ) -> Result<(), Error> { + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "delete_local_storage_dataset_allocation", + ) + .transaction(&conn, |conn| async move { + Self::delete_local_storage_dataset_allocation_in_txn( + &conn, + local_storage_dataset_allocation_id, + ) + .await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 889f0700e99..37dc67af701 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -9,7 +9,9 @@ use super::SQL_BATCH_SIZE; use crate::authz; use crate::context::OpContext; use crate::db; +use crate::db::datastore::LocalStorageDisk; use crate::db::datastore::ValidateTransition; +use crate::db::datastore::zpool::ZpoolGetForSledReservationResult; use crate::db::model::AffinityPolicy; use crate::db::model::Sled; use crate::db::model::SledResourceVmm; @@ -19,6 +21,9 @@ use crate::db::model::to_db_sled_policy; use crate::db::model::to_db_typed_uuid; use crate::db::pagination::Paginator; use crate::db::pagination::paginated; +use crate::db::queries::disk::MAX_DISKS_PER_INSTANCE; +use crate::db::queries::sled_reservation::LocalStorageAllocation; +use crate::db::queries::sled_reservation::LocalStorageAllocationRequired; use crate::db::queries::sled_reservation::sled_find_targets_query; use crate::db::queries::sled_reservation::sled_insert_resource_query; use crate::db::update_and_check::{UpdateAndCheck, UpdateStatus}; @@ -35,6 +40,7 @@ use nexus_types::deployment::SledFilter; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::identity::Asset; +use nonempty::NonEmpty; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -42,13 +48,17 @@ use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::ResourceType; +use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::bail_unless; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; use slog::Logger; use std::collections::HashSet; +use std::collections::VecDeque; use std::fmt; use strum::IntoEnumIterator; use thiserror::Error; @@ -457,6 +467,14 @@ impl DataStore { constraints: db::model::SledReservationConstraints, ) -> Result { + let log = opctx.log.new(o!( + "query" => "sled_reservation", + "instance_id" => instance_id.to_string(), + "propolis_id" => propolis_id.to_string(), + )); + + info!(&log, "sled reservation starting"); + let conn = self.pool_connection_authorized(opctx).await?; // Check if resource ID already exists - if so, return it. 
@@ -471,23 +489,145 @@ impl DataStore { .limit(1) .load_async(&*conn) .await?; + if !old_resource.is_empty() { + info!(&log, "sled reservation already occurred, returning"); return Ok(old_resource[0].clone()); } - let must_use_sleds: HashSet<&SledUuid> = + // Get a list of local storage disks attached to this instance + let local_storage_disks: Vec = self + .instance_list_disks_on_conn( + &conn, + instance_id.into_untyped_uuid(), + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(MAX_DISKS_PER_INSTANCE) + .unwrap(), + }), + ) + .await? + .into_iter() + .filter_map(|disk| match disk { + db::datastore::Disk::LocalStorage(disk) => Some(disk), + db::datastore::Disk::Crucible(_) => None, + }) + .collect(); + + // Gather constraints for the instance placement, starting with the + // arguments to this function. + let maybe_must_use_sleds: Option> = if let Some(must_select_from) = constraints.must_select_from() { - must_select_from.into_iter().collect() + let set = must_select_from.into_iter().cloned().collect(); + info!(&log, "reservation constrained to sleds {set:?}"); + Some(set) } else { - HashSet::default() + None }; + // If any local storage disks have been allocated already, then this + // constrains VMM placement and where other unallocated local storage + // must be. + let maybe_must_use_sleds = if !local_storage_disks.is_empty() { + // Any local storage disk that was allocated already will have a + // sled_id. + let local_storage_allocation_sleds: HashSet = + local_storage_disks + .iter() + .filter_map(|disk| { + disk.local_storage_dataset_allocation + .as_ref() + .map(|allocation| allocation.sled_id()) + }) + .collect(); + + if local_storage_allocation_sleds.is_empty() { + // None of this instance's local storage disks have been + // allocated yet. + info!(&log, "no existing local storage allocations"); + maybe_must_use_sleds + } else { + info!( + &log, + "existing local storage allocations on sleds + {local_storage_allocation_sleds:?}" + ); + + if local_storage_allocation_sleds.len() != 1 { + // It's an error for multiple sleds to host local storage + // disks for a single VMM, so return a conflict error here. + // + // This case can happen if a local storage disk was + // allocated on a sled, and then is detached from the + // instance whose VMM was on that sled, and then is attached + // to another instance that has some local storage disks + // that already have some allocations on another sled. + // + // TODO by the time this query has run that detach + attach + // has already occurred. Nexus should disallow attaching + // local storage disks to an instance that already has local + // storage disks if the allocations are on different sleds. + // + // TODO for clients to prevent such a scenario they would + // need to be aware of which sled a local storage disk's + // allocation is on, which means that information has to be + // exposed somehow in the Disk view. + return Err(SledReservationTransactionError::Connection( + Error::conflict(&format!( + "local storage disks for instance {instance_id} \ + allocated on multiple sleds \ + {local_storage_allocation_sleds:?}" + )), + )); + } + + // If there's already a list of sleds to use, that must be + // filtered further. Otherwise return the one sled that local + // storage disks have already been allocated on. 
+ match maybe_must_use_sleds { + Some(must_use_sleds) => Some( + must_use_sleds + .into_iter() + .filter(|sled| { + local_storage_allocation_sleds.contains(&sled) + }) + .collect(), + ), + + None => Some(local_storage_allocation_sleds), + } + } + } else { + // `local_storage_disks` is empty, so that does not have an impact + // on the existing hash set + info!(&log, "no attached local storage disks"); + maybe_must_use_sleds + }; + + // If filtering has removed all the sleds from the constraint, then we + // cannot satisfy this allocation. + if let Some(must_use_sleds) = &maybe_must_use_sleds { + if must_use_sleds.is_empty() { + error!(&log, "no sleds available after filtering"); + + // Nothing will satisfy this allocation, return an error here. + return Err(SledReservationTransactionError::Reservation( + SledReservationError::NotFound, + )); + } + } + // Query for the set of possible sleds using a CTE. // // Note that this is not transactional, to reduce contention. // However, that lack of transactionality means we need to validate // our constraints again when we later try to INSERT the reservation. - let possible_sleds = sled_find_targets_query(instance_id, &resources, constraints.cpu_families()) + let possible_sleds = sled_find_targets_query( + instance_id, + &resources, + constraints.cpu_families(), + ) .get_results_async::<( // Sled UUID Uuid, @@ -508,6 +648,7 @@ impl DataStore { let mut unpreferred = HashSet::new(); let mut required = HashSet::new(); let mut preferred = HashSet::new(); + for (sled_id, fits, affinity_policy, anti_affinity_policy) in possible_sleds { @@ -517,18 +658,30 @@ impl DataStore { // queries like this. let sled_id = SledUuid::from_untyped_uuid(sled_id); - if fits - && (must_use_sleds.is_empty() - || must_use_sleds.contains(&sled_id)) - { - sled_targets.insert(sled_id); + if fits { + // If there is a Some list of sleds to select from, only add + // this target if it is in that list. A None list means that any + // sled could be a target. + match &maybe_must_use_sleds { + Some(must_use_sleds) => { + if must_use_sleds.contains(&sled_id) { + sled_targets.insert(sled_id); + } + } + + None => { + sled_targets.insert(sled_id); + } + } } + if let Some(policy) = affinity_policy { match policy { AffinityPolicy::Fail => required.insert(sled_id), AffinityPolicy::Allow => preferred.insert(sled_id), }; } + if let Some(policy) = anti_affinity_policy { match policy { AffinityPolicy::Fail => banned.insert(sled_id), @@ -537,13 +690,27 @@ impl DataStore { } } + info!(&log, "sled targets: {sled_targets:?}"); + + let local_storage_allocation_required: Vec<&LocalStorageDisk> = + local_storage_disks + .iter() + .filter(|disk| disk.local_storage_dataset_allocation.is_none()) + .collect(); + + info!( + &log, + "local_storage_allocation_required: \ + {local_storage_allocation_required:?}" + ); + // We loop here because our attempts to INSERT may be violated by // concurrent operations. We'll respond by looking through a slightly // smaller set of possible sleds. // // In the uncontended case, however, we'll only iterate through this // loop once. - loop { + 'outer: loop { // Pick a reservation target, given the constraints we previously // saw in the database. let sled_target = pick_sled_reservation_target( @@ -555,6 +722,8 @@ impl DataStore { &preferred, )?; + info!(&log, "trying sled target {sled_target}"); + // Create a SledResourceVmm record, associate it with the target // sled. 
let resource = SledResourceVmm::new( @@ -564,14 +733,351 @@ impl DataStore { resources.clone(), ); - // Try to INSERT the record. If this is still a valid target, we'll - // use it. If it isn't a valid target, we'll shrink the set of - // viable sled targets and try again. - let rows_inserted = sled_insert_resource_query(&resource) + if local_storage_allocation_required.is_empty() { + info!( + &log, + "calling insert (no local storage allocation requred)" + ); + + // If no local storage allocation is required, then simply try + // allocating a VMM to this sled. + // + // Try to INSERT the record. If this is still a valid target, + // we'll use it. If it isn't a valid target, we'll shrink the + // set of viable sled targets and try again. + let rows_inserted = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ) .execute_async(&*conn) .await?; - if rows_inserted > 0 { - return Ok(resource); + + if rows_inserted > 0 { + info!(&log, "reservation succeeded!"); + return Ok(resource); + } + info!(&log, "reservation failed"); + } else { + // If local storage allocation is required, match the requests + // with all the zpools of this sled that have available space. + // This routine finds all possible configurations that would + // satisfy the requests for local storage and tries them all. + + // First, each request for local storage can possibly be + // satisfied by a number of zpools on the sled. Find this list + // of candidate zpools for each required local storage + // allocation. + + #[derive(Clone, Debug, Hash, PartialEq, Eq)] + struct CandidateDataset { + rendezvous_local_storage_dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + sled_id: SledUuid, + } + + #[derive(Clone, Debug)] + struct PossibleAllocationsForRequest<'a> { + request: &'a LocalStorageDisk, + candidate_datasets: HashSet, + } + + let zpools_for_sled = self + .zpool_get_for_sled_reservation(&opctx, sled_target) + .await?; + + // If there's an existing local storage allocation on a zpool, + // remove that from the list of candidates. Local storage for + // the same Instance should not share any zpools. + let local_storage_zpools_used: HashSet = + local_storage_disks + .iter() + .filter_map(|disk| { + disk.local_storage_dataset_allocation.as_ref().map( + |allocation| { + ZpoolUuid::from_untyped_uuid( + allocation + .pool_id() + .into_untyped_uuid(), + ) + }, + ) + }) + .collect(); + + let zpools_for_sled: Vec<_> = zpools_for_sled + .into_iter() + .filter(|zpool_get_result| { + !local_storage_zpools_used + .contains(&zpool_get_result.pool.id()) + }) + .collect(); + + info!(&log, "filtered zpools for sled: {zpools_for_sled:?}"); + + if local_storage_allocation_required.len() + > zpools_for_sled.len() + { + // Not enough zpools to satisfy the number of allocations + // required. Find another sled! + sled_targets.remove(&sled_target); + banned.remove(&sled_target); + unpreferred.remove(&sled_target); + preferred.remove(&sled_target); + + continue; + } + + let mut allocations_to_perform = + Vec::with_capacity(local_storage_allocation_required.len()); + + for request in &local_storage_allocation_required { + // Find all the zpools that could satisfy this local storage + // request. These will be filtered later. 
+ let candidate_datasets: HashSet = + zpools_for_sled + .iter() + .filter(|zpool_get_result| { + let ZpoolGetForSledReservationResult { + pool, + last_inv_total_size, + rendezvous_local_storage_dataset_id: _, + crucible_dataset_usage, + local_storage_usage, + } = zpool_get_result; + + // The total request size for the local storage + // dataset allocation is the disk size plus the + // required overhead. + let request_size: i64 = + request.size().to_bytes() as i64 + + request + .required_dataset_overhead() + .to_bytes() + as i64; + + let new_size_used: i64 = crucible_dataset_usage + + local_storage_usage + + request_size; + + let control_plane_storage_buffer: i64 = + pool.control_plane_storage_buffer().into(); + let adjusted_total: i64 = last_inv_total_size + - control_plane_storage_buffer; + + // Any zpool that has space for this local + // storage dataset allocation is considered a + // candidate. + new_size_used < adjusted_total + }) + .map(|zpool_get_result| CandidateDataset { + rendezvous_local_storage_dataset_id: + zpool_get_result + .rendezvous_local_storage_dataset_id, + pool_id: zpool_get_result.pool.id(), + sled_id: zpool_get_result.pool.sled_id(), + }) + .collect(); + + if candidate_datasets.is_empty() { + // if there's no local storage datasets on this sled for + // this request's size, then try another sled. + sled_targets.remove(&sled_target); + banned.remove(&sled_target); + unpreferred.remove(&sled_target); + preferred.remove(&sled_target); + + continue 'outer; + } + + allocations_to_perform.push( + PossibleAllocationsForRequest { + request, + candidate_datasets, + }, + ); + } + + // From the list of allocations to perform, and all the + // candidate local storage datasets that could fit those + // allocations, find a list of all valid request -> zpool + // mappings. + + struct PossibleAllocations { + allocations: Vec, + candidates_left: HashSet, + request_index: usize, + } + + struct ValidatedAllocations { + allocations: NonEmpty, + } + + let mut validated_allocations: Vec = + vec![]; + let mut queue = VecDeque::new(); + + // Start from no allocations made yet, a list of allocations to + // perform (stored in `requests`), and all of the available + // zpools on the sled. + + queue.push_back(PossibleAllocations { + allocations: vec![], + candidates_left: zpools_for_sled + .iter() + .map(|zpool_get_result| CandidateDataset { + rendezvous_local_storage_dataset_id: + zpool_get_result + .rendezvous_local_storage_dataset_id, + pool_id: zpool_get_result.pool.id(), + sled_id: zpool_get_result.pool.sled_id(), + }) + .collect(), + request_index: 0, + }); + + // Find each valid allocation by, for each request: + // + // - selecting a local storage dataset from the list of local + // storage datasets that have not been used yet that could + // fulfill that request + // + // - removing that selected local storage dataset from the list + // of candidates + // + // - considering a set of allocations as "valid" if all requests + // have been matched with a local storage dataset. + + while let Some(possible_allocation) = queue.pop_front() { + let PossibleAllocations { + allocations, + candidates_left, + request_index, + } = possible_allocation; + + // If we have an allocation for each possible allocation, + // this one is valid. Add it to the list! 
+ if request_index == allocations_to_perform.len() { + let allocations_len = allocations.len(); + + match NonEmpty::from_vec(allocations) { + Some(allocations) => { + validated_allocations + .push(ValidatedAllocations { allocations }); + } + + None => { + // There should be `request_index` entries in + // the `allocations` vec, this is weird! + error!( + &opctx.log, + "expected {request_index} in the \ + allocations vec, saw {allocations_len}", + ); + } + } + + continue; + } + + // Try to allocate the Nth possible allocation + let request = &allocations_to_perform[request_index]; + + // Create a possible config based on the what datasets are + // left, and the candidate datasets for this request. + for candidate_dataset in &request.candidate_datasets { + if candidates_left.contains(candidate_dataset) { + // This request could be satisfied by this dataset. + // Select it and search further. + + let mut set_allocations = allocations.clone(); + set_allocations.push(LocalStorageAllocation { + disk_id: request.request.id(), + + local_storage_dataset_allocation_id: + DatasetUuid::new_v4(), + + required_dataset_size: { + let request_size: i64 = + request.request.size().to_bytes() + as i64 + + request + .request + .required_dataset_overhead() + .to_bytes() + as i64; + request_size + }, + + local_storage_dataset_id: candidate_dataset + .rendezvous_local_storage_dataset_id, + + pool_id: candidate_dataset.pool_id, + + sled_id: candidate_dataset.sled_id, + }); + + // Note by removing a candidate dataset from the + // list in this way, this step mandates that a + // single local storage dataset is not used for + // multiple local storage allocations for a given + // instance. + + let mut set_candidates_left = + candidates_left.clone(); + set_candidates_left.remove(candidate_dataset); + + queue.push_back(PossibleAllocations { + allocations: set_allocations, + candidates_left: set_candidates_left, + request_index: request_index + 1, + }); + } + + // Else there are no candidate datasets left for this + // request, and therefore the list of requests cannot be + // fulfilled by the current mapping of requests to + // datasets + } + } + + // Loop here over each possible set of local storage allocations + // required, and attempt the sled insert resource query with + // that particular allocation. + // + // If the `validated_allocations` list is empty, control will + // pass to the end of the loop marked with 'outer, which will + // then try the next possible sled target. In the case where + // there were existing local storage allocations there will + // _not_ be any more sleds to try and the user will see a + // capacity error. + + for valid_allocation in validated_allocations { + let ValidatedAllocations { allocations } = valid_allocation; + + info!( + &log, + "calling insert with local storage allocations"; + "allocations" => ?allocations, + ); + + // Try to INSERT the record plus the new local storage + // allocations. If this is still a valid target and the new + // local storage allocations still fit, we'll use it. If it + // isn't a valid target, we'll shrink the set of viable sled + // targets and try again. 
+ let rows_inserted = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { allocations }, + ) + .execute_async(&*conn) + .await?; + + if rows_inserted > 0 { + info!(&log, "reservation succeeded!"); + return Ok(resource); + } + info!(&log, "reservation failed"); + } } sled_targets.remove(&sled_target); @@ -1080,11 +1586,13 @@ impl TransitionError { #[cfg(test)] pub(in crate::db::datastore) mod test { use super::*; + use crate::db; use crate::db::datastore::test_utils::{ Expected, IneligibleSleds, sled_set_policy, sled_set_state, }; use crate::db::model::ByteCount; use crate::db::model::SqlU32; + use crate::db::model::Zpool; use crate::db::model::to_db_typed_uuid; use crate::db::pub_test_utils::TestDatabase; use crate::db::pub_test_utils::helpers::SledUpdateBuilder; @@ -1100,17 +1608,22 @@ pub(in crate::db::datastore) mod test { use nexus_db_model::PhysicalDiskState; use nexus_db_model::{Generation, SledCpuFamily}; use nexus_db_model::{InstanceCpuPlatform, PhysicalDisk}; + use nexus_types::external_api::params; use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external; use omicron_test_utils::dev; use omicron_uuid_kinds::AffinityGroupUuid; use omicron_uuid_kinds::AntiAffinityGroupUuid; + use omicron_uuid_kinds::BlueprintUuid; + use omicron_uuid_kinds::CollectionUuid; + use omicron_uuid_kinds::ExternalZpoolUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use predicates::{BoxPredicate, prelude::*}; use std::collections::HashMap; + use std::net::SocketAddrV6; fn rack_id() -> Uuid { Uuid::parse_str(nexus_test_utils::RACK_UUID).unwrap() @@ -1493,8 +2006,12 @@ pub(in crate::db::datastore) mod test { impl Instance { fn new() -> Self { + Self::new_with_id(InstanceUuid::new_v4()) + } + + fn new_with_id(id: InstanceUuid) -> Self { Self { - id: InstanceUuid::new_v4(), + id, groups: vec![], force_onto_sled: None, resources: small_resource_request(), @@ -1524,7 +2041,12 @@ pub(in crate::db::datastore) mod test { .unwrap() .into_iter() .map(|(id, fits, affinity_policy, anti_affinity_policy)| { - FindTargetsOutput { id: SledUuid::from_untyped_uuid(id), fits, affinity_policy, anti_affinity_policy } + FindTargetsOutput { + id: SledUuid::from_untyped_uuid(id), + fits, + affinity_policy, + anti_affinity_policy, + } }) .collect() } @@ -1548,12 +2070,15 @@ pub(in crate::db::datastore) mod test { self.resources.clone(), ); - sled_insert_resource_query(&resource) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap() + sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ) + .execute_async( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap() > 0 } @@ -1745,9 +2270,9 @@ pub(in crate::db::datastore) mod test { // Anti-Affinity, Policy = Allow // - // Create two sleds, put instances on each belonging to an anti-affinity group. - // We can continue to add instances belonging to that anti-affinity group, because - // it has "AffinityPolicy::Allow". + // Create two sleds, put instances on each belonging to an anti-affinity + // group. We can continue to add instances belonging to that anti-affinity + // group, because it has "AffinityPolicy::Allow". 
#[tokio::test] async fn anti_affinity_policy_allow() { let logctx = dev::test_setup_log("anti_affinity_policy_allow"); @@ -1869,9 +2394,9 @@ pub(in crate::db::datastore) mod test { AllGroups::create(&opctx, &datastore, &authz_project, &groups) .await; - // We are constrained so that an instance belonging to both groups must be - // placed on both sled 0 and sled 1. This cannot be satisfied, and it returns - // an error. + // We are constrained so that an instance belonging to both groups must + // be placed on both sled 0 and sled 1. This cannot be satisfied, and it + // returns an error. let instances = [ Instance::new().group("affinity1").sled(sleds[0].id()), Instance::new().group("affinity2").sled(sleds[1].id()), @@ -2221,8 +2746,8 @@ pub(in crate::db::datastore) mod test { // Sled 0 contains an affinity group, but it's large enough to make it // non-viable for future allocations. // - // Sled 1 contains an affinity and anti-affinity group, so they cancel out. - // This gives it "no priority". + // Sled 1 contains an affinity and anti-affinity group, so they cancel + // out. This gives it "no priority". // // Sled 2 contains nothing. It's a viable target, neither preferred nor // unpreferred. @@ -2345,8 +2870,9 @@ pub(in crate::db::datastore) mod test { AllGroups::create(&opctx, &datastore, &authz_project, &groups) .await; - // Only "sleds[2]" has space, even though it also contains an anti-affinity group. - // However, if we don't belong to this group, we won't care. + // Only "sleds[2]" has space, even though it also contains an + // anti-affinity group. However, if we don't belong to this group, we + // won't care. let instances = [ Instance::new().group("anti-affinity1").sled(sleds[0].id()), @@ -3090,4 +3616,2249 @@ pub(in crate::db::datastore) mod test { db.terminate().await; logctx.cleanup_successful(); } + + // local storage allocation tests + + struct LocalStorageTest { + sleds: Vec, + affinity_groups: Vec, + anti_affinity_groups: Vec, + instances: Vec, + } + + struct LocalStorageTestSled { + sled_id: SledUuid, + sled_serial: String, + u2s: Vec, + } + + struct LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid, + physical_disk_serial: String, + + zpool_id: ZpoolUuid, + control_plane_storage_buffer: external::ByteCount, + + inventory_total_size: external::ByteCount, + + crucible_dataset_id: DatasetUuid, + crucible_dataset_addr: SocketAddrV6, + + local_storage_dataset_id: DatasetUuid, + } + + struct LocalStorageAffinityGroup { + id: AffinityGroupUuid, + name: String, + policy: external::AffinityPolicy, + failure_domain: external::FailureDomain, + } + + struct LocalStorageAntiAffinityGroup { + id: AntiAffinityGroupUuid, + name: String, + policy: external::AffinityPolicy, + failure_domain: external::FailureDomain, + } + + struct LocalStorageTestInstance { + id: InstanceUuid, + name: String, + affinity: Option<(Affinity, usize)>, + disks: Vec, + } + + struct LocalStorageTestInstanceDisk { + id: Uuid, + name: external::Name, + size: external::ByteCount, + } + + async fn setup_local_storage_allocation_test( + opctx: &OpContext, + datastore: &DataStore, + config: &LocalStorageTest, + ) { + for sled_config in &config.sleds { + let sled = SledUpdate::new( + sled_config.sled_id, + "[::1]:0".parse().unwrap(), + 0, + db::model::SledBaseboard { + serial_number: sled_config.sled_serial.clone(), + part_number: "test-pn".to_string(), + revision: 0, + }, + db::model::SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 
30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + cpu_family: SledCpuFamily::AmdMilan, + }, + Uuid::new_v4(), + Generation::new(), + ); + + datastore.sled_upsert(sled).await.expect("failed to upsert sled"); + + for u2 in &sled_config.u2s { + let physical_disk = db::model::PhysicalDisk::new( + u2.physical_disk_id, + String::from("vendor"), + u2.physical_disk_serial.clone(), + String::from("model"), + db::model::PhysicalDiskKind::U2, + sled_config.sled_id, + ); + + datastore + .physical_disk_insert(opctx, physical_disk) + .await + .unwrap(); + + let zpool = Zpool::new( + u2.zpool_id, + sled_config.sled_id, + u2.physical_disk_id, + u2.control_plane_storage_buffer.into(), + ); + + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + + add_inventory_row_for_zpool( + datastore, + u2.zpool_id, + sled_config.sled_id, + u2.inventory_total_size.into(), + ) + .await; + + add_crucible_dataset( + datastore, + u2.crucible_dataset_id, + u2.zpool_id, + u2.crucible_dataset_addr, + ) + .await; + + add_local_storage_dataset( + opctx, + datastore, + u2.local_storage_dataset_id, + u2.zpool_id, + ) + .await; + } + } + + let (authz_project, _project) = + create_project(opctx, datastore, "project").await; + + for group in &config.affinity_groups { + datastore + .affinity_group_create( + &opctx, + &authz_project, + db::model::AffinityGroup { + identity: db::model::AffinityGroupIdentity::new( + group.id.into_untyped_uuid(), + external::IdentityMetadataCreateParams { + name: group.name.parse().unwrap(), + description: String::from("desc"), + }, + ), + project_id: authz_project.id(), + policy: group.policy.into(), + failure_domain: group.failure_domain.into(), + }, + ) + .await + .unwrap(); + } + + for group in &config.anti_affinity_groups { + datastore + .anti_affinity_group_create( + &opctx, + &authz_project, + db::model::AntiAffinityGroup { + identity: db::model::AntiAffinityGroupIdentity::new( + group.id.into_untyped_uuid(), + external::IdentityMetadataCreateParams { + name: group.name.parse().unwrap(), + description: String::from("desc"), + }, + ), + project_id: authz_project.id(), + policy: group.policy.into(), + failure_domain: group.failure_domain.into(), + }, + ) + .await + .unwrap(); + } + + for instance in &config.instances { + let authz_instance = create_test_instance( + datastore, + opctx, + &authz_project, + instance.id, + &instance.name.as_str(), + ) + .await; + + if let Some((affinity, index)) = &instance.affinity { + match affinity { + Affinity::Positive => { + let group = &config.affinity_groups[*index]; + add_instance_to_affinity_group( + &datastore, + group.id, + instance.id, + ) + .await; + } + + Affinity::Negative => { + let group = &config.anti_affinity_groups[*index]; + add_instance_to_anti_affinity_group( + &datastore, + group.id, + instance.id, + ) + .await; + } + } + } + + for disk in &instance.disks { + let params = params::DiskCreate { + identity: external::IdentityMetadataCreateParams { + name: disk.name.clone(), + description: String::from("desc"), + }, + + disk_backend: params::DiskBackend::Local {}, + + size: disk.size, + }; + + datastore + .project_create_disk( + &opctx, + &authz_project, + db::datastore::Disk::LocalStorage( + db::datastore::LocalStorageDisk::new( + db::model::Disk::new( + disk.id, + authz_project.id(), + ¶ms, + db::model::BlockSize::AdvancedFormat, + db::model::DiskRuntimeState::new(), + db::model::DiskType::LocalStorage, + ), + db::model::DiskTypeLocalStorage::new( + disk.id, disk.size, + ) + .unwrap(), 
+ ), + ), + ) + .await + .unwrap(); + + let (.., authz_disk) = LookupPath::new(&opctx, datastore) + .disk_id(disk.id) + .lookup_for(authz::Action::Read) + .await + .unwrap(); + + datastore + .instance_attach_disk( + &opctx, + &authz_instance, + &authz_disk, + MAX_DISKS_PER_INSTANCE, + ) + .await + .unwrap(); + } + } + } + + async fn add_inventory_row_for_zpool( + datastore: &DataStore, + zpool_id: ZpoolUuid, + sled_id: SledUuid, + total_size: ByteCount, + ) { + use nexus_db_schema::schema::inv_zpool::dsl; + + let inv_collection_id = CollectionUuid::new_v4(); + let time_collected = Utc::now(); + let inv_pool = nexus_db_model::InvZpool { + inv_collection_id: inv_collection_id.into(), + time_collected, + id: zpool_id.into(), + sled_id: to_db_typed_uuid(sled_id), + total_size, + }; + + diesel::insert_into(dsl::inv_zpool) + .values(inv_pool) + .execute_async( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + } + + async fn add_crucible_dataset( + datastore: &DataStore, + dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + addr: SocketAddrV6, + ) -> DatasetUuid { + let dataset = + db::model::CrucibleDataset::new(dataset_id, pool_id, addr); + + datastore.crucible_dataset_upsert(dataset).await.unwrap(); + + dataset_id + } + + async fn set_crucible_dataset_size_used( + datastore: &DataStore, + crucible_dataset_id: DatasetUuid, + size_used: ByteCount, + ) { + use nexus_db_schema::schema::crucible_dataset::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::update(dsl::crucible_dataset) + .filter(dsl::id.eq(to_db_typed_uuid(crucible_dataset_id))) + .set(dsl::size_used.eq(size_used)) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn add_local_storage_dataset( + opctx: &OpContext, + datastore: &DataStore, + dataset_id: DatasetUuid, + pool_id: ZpoolUuid, + ) -> DatasetUuid { + let dataset = db::model::RendezvousLocalStorageDataset::new( + dataset_id, + pool_id, + BlueprintUuid::new_v4(), + ); + + datastore + .local_storage_dataset_insert_if_not_exists(opctx, dataset) + .await + .unwrap(); + + dataset_id + } + + async fn set_local_storage_dataset_no_provision( + datastore: &DataStore, + local_storage_dataset_id: DatasetUuid, + ) { + use nexus_db_schema::schema::rendezvous_local_storage_dataset::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::update(dsl::rendezvous_local_storage_dataset) + .filter(dsl::id.eq(to_db_typed_uuid(local_storage_dataset_id))) + .set(dsl::no_provision.eq(true)) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn set_local_storage_allocation( + datastore: &DataStore, + disk_id: Uuid, + local_storage_dataset_id: DatasetUuid, + pool_id: ExternalZpoolUuid, + sled_id: SledUuid, + dataset_size: ByteCount, + ) { + let allocation_id = DatasetUuid::new_v4(); + + let allocation = + db::model::LocalStorageDatasetAllocation::new_for_tests_only( + allocation_id, + Utc::now(), + local_storage_dataset_id, + pool_id, + sled_id, + dataset_size, + ); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + { + use nexus_db_schema::schema::local_storage_dataset_allocation::dsl; + diesel::insert_into(dsl::local_storage_dataset_allocation) + .values(allocation) + .execute_async(&*conn) + .await + .unwrap(); + } + + { + use nexus_db_schema::schema::disk_type_local_storage::dsl; + + let rows_updated = diesel::update(dsl::disk_type_local_storage) + .filter(dsl::disk_id.eq(disk_id)) + .set( + dsl::local_storage_dataset_allocation_id + 
.eq(to_db_typed_uuid(allocation_id)), + ) + .execute_async(&*conn) + .await + .unwrap(); + + assert_eq!(rows_updated, 1); + } + } + + async fn create_test_instance( + datastore: &DataStore, + opctx: &OpContext, + authz_project: &authz::Project, + instance_id: InstanceUuid, + name: &str, + ) -> authz::Instance { + datastore + .project_create_instance( + &opctx, + &authz_project, + db::model::Instance::new( + instance_id, + authz_project.id(), + ¶ms::InstanceCreate { + identity: external::IdentityMetadataCreateParams { + name: name.parse().unwrap(), + description: "It's an instance".into(), + }, + ncpus: 2i64.try_into().unwrap(), + memory: external::ByteCount::from_gibibytes_u32(16), + hostname: "myhostname".try_into().unwrap(), + user_data: Vec::new(), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: Vec::new(), + disks: Vec::new(), + boot_disk: None, + cpu_platform: None, + ssh_public_keys: None, + start: false, + auto_restart_policy: Default::default(), + anti_affinity_groups: Vec::new(), + multicast_groups: Vec::new(), + }, + ), + ) + .await + .expect("instance must be created successfully"); + + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + authz_instance + } + + #[tokio::test] + async fn local_storage_allocation() { + let logctx = dev::test_setup_log("local_storage_allocation"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion works + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an existing 
allocation restricts where a VMM can be + #[tokio::test] + async fn local_storage_allocation_existing_allocation() { + let logctx = + dev::test_setup_log("local_storage_allocation_existing_allocation"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // Two sleds, each with one U2 + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + ], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // Add an allocation for this disk to the first sled's zpool + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // insertion should succeed but never be on the second sled + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that `no_provision` on the local storage dataset is respected + #[tokio::test] + async fn local_storage_allocation_fail_no_provision() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_provision"); + let db = 
TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + set_local_storage_dataset_no_provision( + datastore, + config.sleds[0].u2s[0].local_storage_dataset_id, + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that local storage allocation does not run the zpool out of space + #[tokio::test] + async fn local_storage_allocation_fail_no_space() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_space"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with one local storage disk + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }], + }], + }; + + 
setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set a crucible dataset size_used of 300 GiB, then ensure a 512 GiB + // allocation won't work. + + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + // if someone frees up some crucible disks, then insertion should work + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(200).into(), + ) + .await; + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that local storage allocations do not share a U2 + #[tokio::test] + async fn local_storage_allocation_fail_no_share_u2() { + let logctx = + dev::test_setup_log("local_storage_allocation_fail_no_share_u2"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + 
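+        // (local storage placement constraints are only enforced by
+        // sled_reservation_create below, so the single sled still shows up
+        // as a candidate here)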
assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations can use multiple U2s + #[tokio::test] + async fn local_storage_allocation_multiple_u2() { + let logctx = + dev::test_setup_log("local_storage_allocation_multiple_u2"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion succeeds + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations will not partially + /// succeed + #[tokio::test] + async fn local_storage_allocation_fail_no_partial_success() 
{ + let logctx = dev::test_setup_log( + "local_storage_allocation_fail_no_partial_success", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set the first U2's crucible dataset size_used of 300 GiB, then + // we ensure one of the 512 GiB allocations won't work. 
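+        // (1024 GiB total - 250 GiB buffer - 300 GiB crucible usage leaves
+        // 474 GiB free on that U2, which is less than the 512 GiB required)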
+ + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion fails + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + // Assert no partial allocations occurred + { + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance.id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + let disks = datastore + .instance_list_disks( + &opctx, + &authz_instance, + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new( + MAX_DISKS_PER_INSTANCE, + ) + .unwrap(), + }), + ) + .await + .unwrap(); + + for disk in disks { + let db::datastore::Disk::LocalStorage(disk) = disk else { + panic!("wrong disk type"); + }; + + assert!(disk.local_storage_dataset_allocation.is_none()); + } + } + + // if someone frees up some crucible disks, then insertion should work + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[0].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(200).into(), + ) + .await; + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations work with one of them + /// existing already + #[tokio::test] + async fn local_storage_allocation_multiple_with_one_already() { + let logctx = dev::test_setup_log( + "local_storage_allocation_multiple_with_one_already", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + 
.parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // One of the local storage have been allocated already. + + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple instances will share slices of a single U2 for + /// local storage allocations + #[tokio::test] + async fn local_storage_allocation_multiple_instances_share_u2() { + let logctx = dev::test_setup_log( + "local_storage_allocation_multiple_instances_share_u2", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure two instances, each 
with two local storage disks + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local1".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local2".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + ], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local21".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from( + "local22".to_string(), + ) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(128), + }, + ], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + for instance in &config.instances { + let instance = Instance::new_with_id(instance.id); + + // the output of the find targets query does not currently take + // required local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 32, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that multiple local storage allocations won't work if one of them + /// exists already but the second would result in the zpool running out of + /// space + #[tokio::test] + async fn local_storage_allocation_fail_multiple_with_existing_no_space() { + let logctx = dev::test_setup_log( + "local_storage_allocation_fail_multiple_with_existing_no_space", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: 
vec![], + anti_affinity_groups: vec![], + // Configure an instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + let instance = Instance::new_with_id(config.instances[0].id); + + // One of the local storage have been allocated already to the first U2 + + set_local_storage_allocation( + datastore, + config.instances[0].disks[0].id, + config.sleds[0].u2s[0].local_storage_dataset_id, + ExternalZpoolUuid::from_untyped_uuid( + config.sleds[0].u2s[0].zpool_id.into_untyped_uuid(), + ), + config.sleds[0].sled_id, + external::ByteCount::from_gibibytes_u32(512 + 10).into(), + ) + .await; + + // The zpool size is 1 TiB, and the control plane buffer is 250 GiB. If + // we set the second U2's crucible dataset size_used of 300 GiB, then + // we ensure the second of the 512 GiB allocations won't work. + + set_crucible_dataset_size_used( + datastore, + config.sleds[0].u2s[1].crucible_dataset_id, + external::ByteCount::from_gibibytes_u32(300).into(), + ) + .await; + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that a full rack can have one VMM take all the U2s on each sled + #[tokio::test] + async fn local_storage_allocation_full_rack() { + let logctx = dev::test_setup_log("local_storage_allocation_full_rack"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let mut config = LocalStorageTest { + sleds: vec![], + affinity_groups: vec![], + anti_affinity_groups: vec![], + instances: vec![], + }; + + for i in 0..32 { + let mut u2s = vec![]; + + for n in 0..MAX_DISKS_PER_INSTANCE { + u2s.push(LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: format!("phys_{i}_{n}"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: format!( + "[fd00:1122:3344:{i}0{n}::1]:12345" + ) + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }); + } + + config.sleds.push(LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: format!("sled_{i}"), + u2s, + }); + + let mut disks = vec![]; + + for n in 0..MAX_DISKS_PER_INSTANCE { + disks.push(LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from(format!("local-{i}-{n}")) + .unwrap(), + size: 
external::ByteCount::from_gibibytes_u32(512), + }); + } + + config.instances.push(LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: format!("inst{i}"), + affinity: None, + disks, + }); + } + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + let mut vmms = vec![]; + + for (i, config_instance) in config.instances.iter().enumerate() { + let instance = Instance::new_with_id(config_instance.id); + + // the output of the find targets query does not currently take + // required local storage allocations into account, but each VMM + // will occupy all the available threads on a sled. + assert_eq!(instance.find_targets(datastore).await.len(), 32 - i); + + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 128, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + vmms.push(vmm); + } + + let sleds: HashSet<_> = + vmms.into_iter().map(|vmm| vmm.sled_id).collect(); + + assert_eq!(sleds.len(), 32); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an affinity grouping will cause instances with local storage + /// to fail allocation even if there's space left. + #[tokio::test] + async fn local_storage_allocation_blocked_by_affinity() { + let logctx = + dev::test_setup_log("local_storage_allocation_blocked_by_affinity"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // Two sleds, with one U2 each + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:201::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + ], + affinity_groups: vec![LocalStorageAffinityGroup { + id: AffinityGroupUuid::new_v4(), + name: String::from("group-0"), + policy: external::AffinityPolicy::Fail, + failure_domain: external::FailureDomain::Sled, + }], + anti_affinity_groups: vec![], + // Configure two instances with one local storage disk each, both in + // the same affinity group + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: Some((Affinity::Positive, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: 
external::ByteCount::from_gibibytes_u32(600), + }], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: Some((Affinity::Positive, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(600), + }], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // The first instance's sled reservation should succeed, there's enough + // space for the disk + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // make sure that insertion works + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + // The second instance's sled reservation should not succeed, because + // the affinity group's policy is set to Fail, and there isn't enough + // space for the second instance's disk on the one sled. + + let instance = Instance::new_with_id(config.instances[1].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that an anti-affinity grouping will be honoured for instances + /// with local storage. 
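+    /// With a `Fail` policy and only one sled available, the second instance
+    /// in the group should have nowhere to land.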
+ #[tokio::test] + async fn local_storage_allocation_blocked_by_anti_affinity() { + let logctx = dev::test_setup_log( + "local_storage_allocation_blocked_by_anti_affinity", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2 + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![LocalStorageAntiAffinityGroup { + id: AntiAffinityGroupUuid::new_v4(), + name: String::from("anti-group-0"), + policy: external::AffinityPolicy::Fail, + failure_domain: external::FailureDomain::Sled, + }], + // Configure two instances with one local storage disk each, both in + // the same anti-affinity group + instances: vec![ + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: Some((Affinity::Negative, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }], + }, + LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local2".to_string(), + affinity: Some((Affinity::Negative, 0)), + disks: vec![LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }], + }, + ], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // The first instance's sled reservation should succeed + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + // make sure that insertion works + let vmm = datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + assert_eq!(vmm.sled_id, config.sleds[0].sled_id.into()); + + // The second instance's sled reservation should not succeed, because + // the anti-affinity group's policy is set to Fail. 
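+        // (the first instance already occupies the only sled, so
+        // anti-affinity leaves no valid target)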
+ + let instance = Instance::new_with_id(config.instances[1].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that instances with local storage honour the reservation + /// constraints. + #[tokio::test] + async fn local_storage_allocation_blocked_by_constraint() { + let logctx = dev::test_setup_log( + "local_storage_allocation_blocked_by_constraint", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled, with one U2, and another with two U2s + sleds: vec![ + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }], + }, + LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_1"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: + "[fd00:1122:3344:201::1]:12345".parse().unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys2"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: + "[fd00:1122:3344:202::1]:12345".parse().unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }, + ], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure one instance with two local storage disks + instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(512), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + // If there's a constraint that 
this instance must use sled_0, it cannot + // succeed, as it needs two local storage allocations (which it could + // get if it was allowed to use sled_1). + + let instance = Instance::new_with_id(config.instances[0].id); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 2); + + // make sure that insertion fails if we are restricted to sled_0 + let constraints = db::model::SledReservationConstraintBuilder::new() + .must_select_from(&[config.sleds[0].sled_id]) + .build(); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + constraints, + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + /// Ensure that after sled reservation, local storage disks cannot be + /// attached to instances, even if they don't yet have a VMM. + #[tokio::test] + async fn local_storage_disk_no_attach_after_reservation() { + let logctx = dev::test_setup_log( + "local_storage_disk_no_attach_after_reservation", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let config = LocalStorageTest { + // One sled with two U2s + sleds: vec![LocalStorageTestSled { + sled_id: SledUuid::new_v4(), + sled_serial: String::from("sled_0"), + u2s: vec![ + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys0"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:101::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + LocalStorageTestSledU2 { + physical_disk_id: PhysicalDiskUuid::new_v4(), + physical_disk_serial: String::from("phys1"), + + zpool_id: ZpoolUuid::new_v4(), + control_plane_storage_buffer: + external::ByteCount::from_gibibytes_u32(250), + + inventory_total_size: + external::ByteCount::from_gibibytes_u32(1024), + + crucible_dataset_id: DatasetUuid::new_v4(), + crucible_dataset_addr: "[fd00:1122:3344:102::1]:12345" + .parse() + .unwrap(), + + local_storage_dataset_id: DatasetUuid::new_v4(), + }, + ], + }], + affinity_groups: vec![], + anti_affinity_groups: vec![], + // Configure one instance with two local storage disks. One will be + // detached before sled reservation. 
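+            // Re-attaching the detached disk after the reservation should
+            // then be rejected, even though the instance has no VMM yet.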
+ instances: vec![LocalStorageTestInstance { + id: InstanceUuid::new_v4(), + name: "local".to_string(), + affinity: None, + disks: vec![ + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local1".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + LocalStorageTestInstanceDisk { + id: Uuid::new_v4(), + name: external::Name::try_from("local2".to_string()) + .unwrap(), + size: external::ByteCount::from_gibibytes_u32(64), + }, + ], + }], + }; + + setup_local_storage_allocation_test(&opctx, datastore, &config).await; + + let instance = Instance::new_with_id(config.instances[0].id); + + // Detach the second disk from the instance + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance.id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await + .expect("instance must exist"); + + let (.., authz_disk) = LookupPath::new(&opctx, datastore) + .disk_id(config.instances[0].disks[1].id) + .lookup_for(authz::Action::Read) + .await + .expect("disk must exist"); + + datastore + .instance_detach_disk(&opctx, &authz_instance, &authz_disk) + .await + .unwrap(); + + // the output of the find targets query does not currently take required + // local storage allocations into account + assert_eq!(instance.find_targets(datastore).await.len(), 1); + + datastore + .sled_reservation_create( + opctx, + instance.id, + PropolisUuid::new_v4(), + db::model::Resources::new( + 1, + ByteCount::try_from(1024).unwrap(), + ByteCount::try_from(1024).unwrap(), + ), + db::model::SledReservationConstraints::none(), + ) + .await + .unwrap(); + + // Try to attach the second disk to the instance - it should fail. + + datastore + .instance_attach_disk( + &opctx, + &authz_instance, + &authz_disk, + MAX_DISKS_PER_INSTANCE, + ) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index 2f31fb72dfd..e4f5c0a9e14 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -11,6 +11,7 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::datastore::OpContext; use crate::db::identity::Asset; +use crate::db::model::ByteCount; use crate::db::model::DbTypedUuid; use crate::db::model::PhysicalDisk; use crate::db::model::PhysicalDiskPolicy; @@ -37,12 +38,30 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolKind; use omicron_uuid_kinds::ZpoolUuid; use uuid::Uuid; +#[derive(Debug)] +pub struct ZpoolGetForSledReservationResult { + pub pool: Zpool, + + /// Last reported inventory size for the zpool + pub last_inv_total_size: i64, + + /// The rendezvous local storage dataset for this zpool + pub rendezvous_local_storage_dataset_id: DatasetUuid, + + /// Upper bound on Crucible dataset usage + pub crucible_dataset_usage: i64, + + /// Upper bound on Local Storage dataset usage + pub local_storage_usage: i64, +} + impl DataStore { pub async fn zpool_insert( &self, @@ -348,4 +367,160 @@ impl DataStore { .map(|_| ()) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// For a given sled id, 
return all zpools for that sled plus: + /// + /// - the total upper bound on usage as reported by their crucible and local + /// storage datasets + /// - their most recent total_size as reported by inventory. + pub async fn zpool_get_for_sled_reservation( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + use nexus_db_schema::schema::crucible_dataset; + use nexus_db_schema::schema::inv_zpool; + use nexus_db_schema::schema::physical_disk::dsl as physical_disk_dsl; + use nexus_db_schema::schema::rendezvous_local_storage_dataset; + use nexus_db_schema::schema::zpool::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + let tuples = dsl::zpool + .filter(dsl::sled_id.eq(to_db_typed_uuid(sled_id))) + .filter(dsl::time_deleted.is_null()) + .inner_join( + physical_disk_dsl::physical_disk + .on(dsl::physical_disk_id.eq(physical_disk_dsl::id)), + ) + .filter( + physical_disk_dsl::disk_policy + .eq(PhysicalDiskPolicy::InService), + ) + // Only U2 disks will be considered for local storage allocations. + // This should be redundant when filtering on pools with local + // storage datasets! + .filter(physical_disk_dsl::variant.eq(PhysicalDiskKind::U2)) + .select(( + Zpool::as_select(), + // This and the next result are the upper bound on existing + // usages + crucible_dataset::table + .select(diesel::dsl::sum(crucible_dataset::size_used)) + .filter(crucible_dataset::time_deleted.is_null()) + .filter(crucible_dataset::pool_id.eq(dsl::id)) + .single_value(), + rendezvous_local_storage_dataset::table + .select(diesel::dsl::sum( + rendezvous_local_storage_dataset::size_used, + )) + .filter( + rendezvous_local_storage_dataset::time_tombstoned + .is_null(), + ) + .filter( + rendezvous_local_storage_dataset::pool_id.eq(dsl::id), + ) + .single_value(), + // + rendezvous_local_storage_dataset::table + .select(rendezvous_local_storage_dataset::id) + .filter( + rendezvous_local_storage_dataset::time_tombstoned + .is_null(), + ) + .filter( + rendezvous_local_storage_dataset::pool_id.eq(dsl::id), + ) + .single_value(), + // last reported total size of this pool from inventory + inv_zpool::table + .select(inv_zpool::total_size) + .filter(inv_zpool::id.eq(dsl::id)) + .order_by(inv_zpool::time_collected.desc()) + .limit(1) + .single_value(), + )) + .load_async::<( + Zpool, + Option, + Option, + Option, + Option, + )>(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Sled, + LookupType::by_id(sled_id), + ), + ) + })?; + + let mut converted = Vec::with_capacity(tuples.len()); + + for tuple in tuples { + let ( + pool, + crucible_dataset_usage, + local_storage_usage, + rendezvous_local_storage_dataset_id, + last_inv_total_size, + ) = tuple; + + // If this zpool doesn't have a local storage dataset yet, skip it + let Some(rendezvous_local_storage_dataset_id) = + rendezvous_local_storage_dataset_id + else { + continue; + }; + + // If the usage sum for either dataset type wasn't able to be + // computed, then the associated rendezvous table entry doesn't + // exist yet, so skip this zpool. + let crucible_dataset_usage: ByteCount = + if let Some(crucible_dataset_usage) = crucible_dataset_usage { + crucible_dataset_usage.try_into().map_err( + |e: anyhow::Error| { + Error::internal_error(&e.to_string()) + }, + )? 
+ } else { + continue; + }; + + let local_storage_usage: ByteCount = + if let Some(local_storage_usage) = local_storage_usage { + local_storage_usage.try_into().map_err( + |e: anyhow::Error| { + Error::internal_error(&e.to_string()) + }, + )? + } else { + continue; + }; + + // If there isn't an inventory collection for this zpool, skip it + let Some(last_inv_total_size) = last_inv_total_size else { + continue; + }; + + converted.push(ZpoolGetForSledReservationResult { + pool, + last_inv_total_size, + rendezvous_local_storage_dataset_id: + DatasetUuid::from_untyped_uuid( + rendezvous_local_storage_dataset_id, + ), + crucible_dataset_usage: crucible_dataset_usage.into(), + local_storage_usage: local_storage_usage.into(), + }); + } + + Ok(converted) + } } diff --git a/nexus/db-queries/src/db/queries/sled_reservation.rs b/nexus/db-queries/src/db/queries/sled_reservation.rs index a5fee2ce9c1..9cf6b6b78d5 100644 --- a/nexus/db-queries/src/db/queries/sled_reservation.rs +++ b/nexus/db-queries/src/db/queries/sled_reservation.rs @@ -12,8 +12,29 @@ use diesel::sql_types; use nexus_db_model::SledCpuFamily; use nexus_db_schema::enums::AffinityPolicyEnum; use nexus_db_schema::enums::SledCpuFamilyEnum; +use nonempty::NonEmpty; +use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; +use uuid::Uuid; + +#[derive(Debug, Clone)] +pub struct LocalStorageAllocation { + pub disk_id: Uuid, + pub local_storage_dataset_allocation_id: DatasetUuid, + pub required_dataset_size: i64, + pub local_storage_dataset_id: DatasetUuid, + pub pool_id: ZpoolUuid, + pub sled_id: SledUuid, +} + +#[derive(Clone)] +pub enum LocalStorageAllocationRequired { + No, + Yes { allocations: NonEmpty }, +} fn subquery_our_aa_groups(query: &mut QueryBuilder) { query @@ -252,10 +273,16 @@ pub fn sled_find_targets_query( query.query() } -/// Inserts a sled_resource_vmm record into the database, if it is -/// a valid reservation. +/// Attempts to: +/// +/// 1. Insert a sled_resource_vmm record, if it is still a valid reservation +/// +/// 2. Optionally perform local storage allocation by updating local storage +/// disk records, if those are still valid with respect to the amount of +/// available zpool space. pub fn sled_insert_resource_query( resource: &SledResourceVmm, + local_storage_allocation_required: &LocalStorageAllocationRequired, ) -> TypedSqlQuery<(sql_types::Numeric,)> { let mut query = QueryBuilder::new(); @@ -281,13 +308,24 @@ pub fn sled_insert_resource_query( ).param().sql(" <= sled.usable_physical_ram AND COALESCE(SUM(CAST(sled_resource_vmm.reservoir_ram AS INT8)), 0) + " ).param().sql(" <= sled.reservoir_size - ),"); + ),") + .bind::(resource.sled_id.into_untyped_uuid()) + .bind::(resource.resources.hardware_threads) + .bind::(resource.resources.rss_ram) + .bind::(resource.resources.reservoir_ram); // "our_aa_groups": All the anti-affinity group_ids to which our instance belongs. subquery_our_aa_groups(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); + // "other_aa_instances": All the group_id,instance_ids of instances (other // than our own) belonging to "our_aa_groups". subquery_other_aa_instances(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); // Find instances with a strict anti-affinity policy in the sled failure // domain. We must ensure we do not co-locate with these instances. 
@@ -317,9 +355,16 @@ pub fn sled_insert_resource_query( // "our_a_groups": All the affinity group_ids to which our instance belongs. subquery_our_a_groups(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); + // "other_a_instances": All the group_id,instance_ids of instances (other // than our own) belonging to "our_a_instances"). subquery_other_a_instances(&mut query); + query.bind::( + resource.instance_id.unwrap().into_untyped_uuid(), + ); // Find instances with a strict affinity policy in the sled failure // domain. We must ensure we co-locate with these instances. @@ -347,11 +392,138 @@ pub fn sled_insert_resource_query( ),", ); + match local_storage_allocation_required { + LocalStorageAllocationRequired::No => {} + + LocalStorageAllocationRequired::Yes { allocations } => { + // Update each local storage disk's allocation id + + query.sql(" UPDATED_LOCAL_STORAGE_DISK_RECORDS AS ("); + query.sql(" UPDATE disk_type_local_storage "); + + query.sql( + " SET local_storage_dataset_allocation_id = CASE disk_id ", + ); + + for allocation in allocations { + let LocalStorageAllocation { + disk_id, + local_storage_dataset_allocation_id, + .. + } = allocation; + + query.sql("WHEN "); + query.param().bind::(*disk_id); + query.sql(" THEN "); + query.param().bind::( + local_storage_dataset_allocation_id.into_untyped_uuid(), + ); + query.sql(" "); + } + + query.sql(" END"); + + query.sql(" WHERE disk_id in ( "); + + for (index, allocation) in allocations.iter().enumerate() { + let LocalStorageAllocation { disk_id, .. } = allocation; + + query.param().bind::(*disk_id); + + if index != (allocations.len() - 1) { + query.sql(","); + } + } + + query.sql(" )"); + + query.sql(" RETURNING *"); + query.sql(" ), "); + + // Create new local storage dataset allocation records. + + query.sql(" NEW_LOCAL_STORAGE_ALLOCATION_RECORDS AS ("); + query.sql(" INSERT INTO local_storage_dataset_allocation VALUES "); + + for (index, allocation) in allocations.iter().enumerate() { + let LocalStorageAllocation { + local_storage_dataset_allocation_id, + local_storage_dataset_id, + pool_id, + sled_id, + required_dataset_size, + .. 
+ } = allocation; + + query.sql("("); + + query.param().bind::( + local_storage_dataset_allocation_id.into_untyped_uuid(), + ); + query.sql(","); + + query.sql("NOW(),"); + query.sql("NULL,"); + + query.param().bind::( + local_storage_dataset_id.into_untyped_uuid(), + ); + query.sql(","); + + query + .param() + .bind::(pool_id.into_untyped_uuid()); + query.sql(","); + + query + .param() + .bind::(sled_id.into_untyped_uuid()); + query.sql(","); + + query + .param() + .bind::(*required_dataset_size); + + query.sql(")"); + + if index != (allocations.len() - 1) { + query.sql(","); + } + } + + query.sql(" RETURNING *), "); + + // Update the rendezvous_local_storage_dataset table's size_used + // column by adding these new rows + + query.sql("UPDATE_RENDEZVOUS_TABLES as ("); + query.sql( + " + UPDATE rendezvous_local_storage_dataset + SET size_used = size_used + \ + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size + FROM NEW_LOCAL_STORAGE_ALLOCATION_RECORDS + WHERE + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.local_storage_dataset_id = \ + rendezvous_local_storage_dataset.id AND + rendezvous_local_storage_dataset.time_tombstoned IS NULL + RETURNING *", + ); + query.sql("), "); + } + } + // The insert is only valid if: // - // - The sled still has space for our isntance + // - The sled still has space for our instance // - The sled is not banned (due to anti-affinity rules) // - If the sled is required (due to affinity rules) we're selecting it + // - If there are requested local storage allocations: + // - they all fit on the target zpools: combine the new local storage + // allocation records with the existing ones and test each zpool. + // - the rendezvous local storage dataset is not tombstoned or marked + // no_provision + query .sql( " @@ -362,19 +534,171 @@ pub fn sled_insert_resource_query( NOT(EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = ", ) .param() + .bind::(resource.sled_id.into_untyped_uuid()) .sql( ")) AND ( EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = ", ) .param() + .bind::(resource.sled_id.into_untyped_uuid()) .sql( ") OR NOT EXISTS (SELECT 1 FROM required_sleds) - ) - )", + )", ); + match local_storage_allocation_required { + LocalStorageAllocationRequired::No => {} + + LocalStorageAllocationRequired::Yes { allocations } => { + query.sql("AND ("); + + for (index, allocation) in allocations.iter().enumerate() { + query.sql("("); + + // First, make sure that the additional usage fits in the zpool + + query.sql("("); + + // Add up crucible and local dataset usage, plus the altered + // local_storage_dataset_allocation records + + query.sql("("); + + query + .sql( + "SELECT + SUM( + crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.dataset_size + ) + FROM + crucible_dataset + JOIN + rendezvous_local_storage_dataset + ON + crucible_dataset.pool_id = \ + rendezvous_local_storage_dataset.pool_id + JOIN + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS + ON + crucible_dataset.pool_id = \ + NEW_LOCAL_STORAGE_ALLOCATION_RECORDS.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) AND + (crucible_dataset.time_deleted IS NULL) AND + (rendezvous_local_storage_dataset.time_tombstoned IS NULL) AND + (crucible_dataset.pool_id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql( + ") + GROUP BY + crucible_dataset.pool_id", + ); + + query.sql(") < ("); + + // and compare that to the zpool's available space (minus the + // control plane storage buffer) as reported by the latest + // inventory 
collection. + + query + .sql( + "(SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql(" ORDER BY inv_zpool.time_collected DESC LIMIT 1)"); + + query.sql(" - "); + + query + .sql( + "(SELECT + control_plane_storage_buffer + FROM + zpool + WHERE id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ) + .sql(")"); + + query.sql(")"); + + query.sql(")"); + + query.sql(") AND "); + + // and, the zpool must be available + + query.sql("("); + + query + .sql( + "SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN + sled ON (zpool.sled_id = sled.id) + JOIN + physical_disk ON (zpool.physical_disk_id = physical_disk.id) + WHERE + zpool.id = ", + ) + .param() + .bind::( + allocation.pool_id.into_untyped_uuid(), + ); + + query.sql(") AND "); + + // and the rendezvous local dataset must be available + + query + .sql( + "( + SELECT time_tombstoned IS NULL AND no_provision IS FALSE + FROM rendezvous_local_storage_dataset + WHERE rendezvous_local_storage_dataset.id = ", + ) + .param() + .bind::( + allocation.local_storage_dataset_id.into_untyped_uuid(), + ); + + query.sql(")"); + + if index != (allocations.len() - 1) { + query.sql(" AND "); + } + } + + query.sql(")"); + } + } + + query.sql(")"); + // Finally, perform the INSERT if it's still valid. query.sql(" INSERT INTO sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) @@ -387,16 +711,6 @@ pub fn sled_insert_resource_query( ").param().sql(" WHERE EXISTS(SELECT 1 FROM insert_valid) ") - .bind::(resource.sled_id.into_untyped_uuid()) - .bind::(resource.resources.hardware_threads) - .bind::(resource.resources.rss_ram) - .bind::(resource.resources.reservoir_ram) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.instance_id.unwrap().into_untyped_uuid()) - .bind::(resource.sled_id.into_untyped_uuid()) - .bind::(resource.sled_id.into_untyped_uuid()) .bind::(resource.id.into_untyped_uuid()) .bind::(resource.sled_id.into_untyped_uuid()) .bind::(resource.resources.hardware_threads) @@ -420,6 +734,8 @@ mod test { use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; + use nonempty::nonempty; + #[tokio::test] async fn expectorate_sled_find_targets_query() { let id = InstanceUuid::nil(); @@ -504,12 +820,48 @@ mod test { ), ); - let query = sled_insert_resource_query(&resource); + // with no local storage + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ); + expectorate_query_contents( &query, "tests/output/sled_insert_resource_query.sql", ) .await; + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![ + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 64 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + 
sled_id: SledUuid::nil(), + } + ], + }, + ); + + expectorate_query_contents( + &query, + "tests/output/sled_insert_resource_query_with_local_storage.sql", + ) + .await; } #[tokio::test] @@ -519,6 +871,8 @@ mod test { let pool = db.pool(); let conn = pool.claim().await.unwrap(); + // with no local storage + let resource = SledResourceVmm::new( PropolisUuid::nil(), InstanceUuid::nil(), @@ -534,7 +888,60 @@ mod test { ), ); - let query = sled_insert_resource_query(&resource); + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::No, + ); + let _ = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + // with local storage + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }], + }, + ); + + let _ = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + let query = sled_insert_resource_query( + &resource, + &LocalStorageAllocationRequired::Yes { + allocations: nonempty![ + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 128 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + LocalStorageAllocation { + disk_id: Uuid::nil(), + local_storage_dataset_allocation_id: DatasetUuid::nil(), + required_dataset_size: 256 * 1024 * 1024 * 1024, + local_storage_dataset_id: DatasetUuid::nil(), + pool_id: ZpoolUuid::nil(), + sled_id: SledUuid::nil(), + }, + ], + }, + ); + let _ = query .explain_async(&conn) .await diff --git a/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql new file mode 100644 index 00000000000..59964d59df8 --- /dev/null +++ b/nexus/db-queries/tests/output/sled_insert_resource_query_with_local_storage.sql @@ -0,0 +1,259 @@ +WITH + sled_has_space + AS ( + SELECT + 1 + FROM + sled LEFT JOIN sled_resource_vmm ON sled_resource_vmm.sled_id = sled.id + WHERE + sled.id = $1 + AND sled.time_deleted IS NULL + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + GROUP BY + sled.id + HAVING + COALESCE(sum(CAST(sled_resource_vmm.hardware_threads AS INT8)), 0) + $2 + <= sled.usable_hardware_threads + AND COALESCE(sum(CAST(sled_resource_vmm.rss_ram AS INT8)), 0) + $3 + <= sled.usable_physical_ram + AND COALESCE(sum(CAST(sled_resource_vmm.reservoir_ram AS INT8)), 0) + $4 + <= sled.reservoir_size + ), + our_aa_groups + AS (SELECT group_id FROM anti_affinity_group_instance_membership WHERE instance_id = $5), + other_aa_instances + AS ( + SELECT + anti_affinity_group_instance_membership.group_id, instance_id + FROM + anti_affinity_group_instance_membership + JOIN our_aa_groups ON + anti_affinity_group_instance_membership.group_id = our_aa_groups.group_id + WHERE + instance_id != $6 + ), + banned_instances + AS ( + SELECT + instance_id + FROM + other_aa_instances + JOIN anti_affinity_group ON + anti_affinity_group.id = other_aa_instances.group_id + AND anti_affinity_group.failure_domain = 'sled' + AND anti_affinity_group.policy = 'fail' + WHERE + 
anti_affinity_group.time_deleted IS NULL + ), + banned_sleds + AS ( + SELECT + DISTINCT sled_id + FROM + banned_instances + JOIN sled_resource_vmm ON sled_resource_vmm.instance_id = banned_instances.instance_id + ), + our_a_groups AS (SELECT group_id FROM affinity_group_instance_membership WHERE instance_id = $7), + other_a_instances + AS ( + SELECT + affinity_group_instance_membership.group_id, instance_id + FROM + affinity_group_instance_membership + JOIN our_a_groups ON affinity_group_instance_membership.group_id = our_a_groups.group_id + WHERE + instance_id != $8 + ), + required_instances + AS ( + SELECT + policy, instance_id + FROM + other_a_instances + JOIN affinity_group ON + affinity_group.id = other_a_instances.group_id + AND affinity_group.failure_domain = 'sled' + AND affinity_group.policy = 'fail' + WHERE + affinity_group.time_deleted IS NULL + ), + required_sleds + AS ( + SELECT + DISTINCT sled_id + FROM + required_instances + JOIN sled_resource_vmm ON sled_resource_vmm.instance_id = required_instances.instance_id + ), + updated_local_storage_disk_records + AS ( + UPDATE + disk_type_local_storage + SET + local_storage_dataset_allocation_id = CASE disk_id WHEN $9 THEN $10 WHEN $11 THEN $12 END + WHERE + disk_id IN ($13, $14) + RETURNING + * + ), + new_local_storage_allocation_records + AS ( + INSERT + INTO + local_storage_dataset_allocation + VALUES + ($15, now(), NULL, $16, $17, $18, $19), ($20, now(), NULL, $21, $22, $23, $24) + RETURNING + * + ), + update_rendezvous_tables + AS ( + UPDATE + rendezvous_local_storage_dataset + SET + size_used = size_used + new_local_storage_allocation_records.dataset_size + FROM + new_local_storage_allocation_records + WHERE + new_local_storage_allocation_records.local_storage_dataset_id + = rendezvous_local_storage_dataset.id + AND rendezvous_local_storage_dataset.time_tombstoned IS NULL + RETURNING + * + ), + insert_valid + AS ( + SELECT + 1 + WHERE + EXISTS(SELECT 1 FROM sled_has_space) + AND NOT (EXISTS(SELECT 1 FROM banned_sleds WHERE sled_id = $25)) + AND ( + EXISTS(SELECT 1 FROM required_sleds WHERE sled_id = $26) + OR NOT EXISTS(SELECT 1 FROM required_sleds) + ) + AND ( + ( + SELECT + sum( + crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + new_local_storage_allocation_records.dataset_size + ) + FROM + crucible_dataset + JOIN rendezvous_local_storage_dataset ON + crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + JOIN new_local_storage_allocation_records ON + crucible_dataset.pool_id = new_local_storage_allocation_records.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) + AND (crucible_dataset.time_deleted IS NULL) + AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) + AND crucible_dataset.pool_id = $27 + GROUP BY + crucible_dataset.pool_id + ) + < ( + ( + SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = $28 + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $29) + ) + AND ( + SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN sled ON zpool.sled_id = sled.id + JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + zpool.id = $30 + ) + AND ( + SELECT + time_tombstoned IS NULL AND no_provision IS false + FROM + rendezvous_local_storage_dataset + WHERE + rendezvous_local_storage_dataset.id = $31 + ) + AND ( + SELECT + sum( + 
crucible_dataset.size_used + + rendezvous_local_storage_dataset.size_used + + new_local_storage_allocation_records.dataset_size + ) + FROM + crucible_dataset + JOIN rendezvous_local_storage_dataset ON + crucible_dataset.pool_id = rendezvous_local_storage_dataset.pool_id + JOIN new_local_storage_allocation_records ON + crucible_dataset.pool_id = new_local_storage_allocation_records.pool_id + WHERE + (crucible_dataset.size_used IS NOT NULL) + AND (crucible_dataset.time_deleted IS NULL) + AND (rendezvous_local_storage_dataset.time_tombstoned IS NULL) + AND crucible_dataset.pool_id = $32 + GROUP BY + crucible_dataset.pool_id + ) + < ( + ( + SELECT + total_size + FROM + inv_zpool + WHERE + inv_zpool.id = $33 + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + - (SELECT control_plane_storage_buffer FROM zpool WHERE id = $34) + ) + AND ( + SELECT + sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + FROM + zpool + JOIN sled ON zpool.sled_id = sled.id + JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + zpool.id = $35 + ) + AND ( + SELECT + time_tombstoned IS NULL AND no_provision IS false + FROM + rendezvous_local_storage_dataset + WHERE + rendezvous_local_storage_dataset.id = $36 + ) + ) + ) +INSERT +INTO + sled_resource_vmm (id, sled_id, hardware_threads, rss_ram, reservoir_ram, instance_id) +SELECT + $37, $38, $39, $40, $41, $42 +WHERE + EXISTS(SELECT 1 FROM insert_valid) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8be6bb768c2..e09755c9ae5 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1850,6 +1850,9 @@ table! { } } +allow_tables_to_appear_in_same_query!(zpool, inv_zpool); +allow_tables_to_appear_in_same_query!(inv_zpool, physical_disk); + table! { inv_dataset (inv_collection_id, sled_id, name) { inv_collection_id -> Uuid, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index a402ee2c1bd..83fb9b85b04 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -14,6 +14,7 @@ use super::MAX_VCPU_PER_INSTANCE; use super::MIN_MEMORY_BYTES_PER_INSTANCE; use crate::app::sagas; use crate::app::sagas::NexusSaga; +use crate::db::datastore::Disk; use crate::external_api::params; use cancel_safe_futures::prelude::*; use futures::future::Fuse; @@ -66,6 +67,7 @@ use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode; use sagas::instance_common::ExternalIpAttach; use sagas::instance_start; use sagas::instance_update; +use sled_agent_client::types::DelegatedZvol; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::VmmPutStateBody; use std::collections::HashSet; @@ -1361,6 +1363,35 @@ impl super::Nexus { ) .await?; + // Each Disk::LocalStorage will require a delegated zvol entry. 
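+    //
+    // Each entry names the (zpool, dataset) pair backing a local storage
+    // disk; sled-agent uses it to delegate the corresponding zvol into the
+    // propolis zone when the instance is registered. A missing allocation is
+    // an internal error: sled reservation should have created one for every
+    // local storage disk.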
+ let mut delegated_zvols: Vec = + Vec::with_capacity(disks.len()); + + for disk in &disks { + let local_storage_disk = match disk { + Disk::Crucible(_) => { + continue; + } + + Disk::LocalStorage(local_storage_disk) => local_storage_disk, + }; + + let Some(local_storage_dataset_allocation) = + &local_storage_disk.local_storage_dataset_allocation + else { + return Err(Error::internal_error(&format!( + "local storage disk {} allocation is None!", + disk.id() + )) + .into()); + }; + + delegated_zvols.push(DelegatedZvol::LocalStorage { + zpool_id: local_storage_dataset_allocation.pool_id(), + dataset_id: local_storage_dataset_allocation.id(), + }); + } + let nics = self .db_datastore .derive_guest_network_interface_info(&opctx, &authz_instance) @@ -1549,7 +1580,7 @@ impl super::Nexus { host_domain: None, search_domains: Vec::new(), }, - delegated_zvols: vec![], + delegated_zvols, }; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); diff --git a/nexus/src/app/sagas/disk_delete.rs b/nexus/src/app/sagas/disk_delete.rs index 07a9f22a267..87391ffdca7 100644 --- a/nexus/src/app/sagas/disk_delete.rs +++ b/nexus/src/app/sagas/disk_delete.rs @@ -5,6 +5,7 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; +use crate::app::InlineErrorChain; use crate::app::sagas::SagaInitError; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::volume_delete; @@ -12,6 +13,7 @@ use nexus_db_queries::authn; use nexus_db_queries::db; use nexus_db_queries::db::datastore; use omicron_common::api::external::DiskState; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -39,6 +41,12 @@ declare_saga_actions! { + sdd_account_space - sdd_account_space_undo } + DEALLOCATE_LOCAL_STORAGE -> "deallocate_local_storage" { + + sdd_deallocate_local_storage + } + DELETE_LOCAL_STORAGE -> "delete_local_storage" { + + sdd_delete_local_storage + } } // disk delete saga: definition @@ -96,8 +104,8 @@ impl NexusSaga for SagaDiskDelete { } datastore::Disk::LocalStorage(_) => { - // XXX nothing to do here yet, need the delegate_zvol branch to - // merge first. + builder.append(deallocate_local_storage_action()); + builder.append(delete_local_storage_action()); } } @@ -188,6 +196,96 @@ async fn sdd_account_space_undo( Ok(()) } +async fn sdd_deallocate_local_storage( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let datastore::Disk::LocalStorage(disk) = params.disk else { + // Check during make_saga_dag prevents this case. + unreachable!(); + }; + + let Some(allocation) = disk.local_storage_dataset_allocation else { + // Nothing to do! + return Ok(()); + }; + + osagactx + .datastore() + .delete_local_storage_dataset_allocation(&opctx, allocation.id()) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn sdd_delete_local_storage( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let datastore::Disk::LocalStorage(disk) = params.disk else { + // Check during make_saga_dag prevents this case. 
+ unreachable!(); + }; + + let Some(allocation) = disk.local_storage_dataset_allocation else { + // Nothing to do! + return Ok(()); + }; + + let dataset_id = allocation.id(); + let pool_id = allocation.pool_id(); + let sled_id = allocation.sled_id(); + + // Get a sled agent client + + let sled_agent_client = osagactx + .nexus() + .sled_client(&sled_id) + .await + .map_err(ActionError::action_failed)?; + + // Ensure that the local storage is deleted + + let delete_operation = || async { + sled_agent_client + .local_storage_dataset_delete(&pool_id, &dataset_id) + .await + }; + + let gone_check = || async { + osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?; + + // `check_sled_in_service` returns an error if the sled is no longer in + // service; if it succeeds, the sled is not gone. + Ok(false) + }; + + ProgenitorOperationRetry::new(delete_operation, gone_check) + .run(osagactx.log()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete local storage: {}", + InlineErrorChain::new(&e) + )) + })?; + + Ok(()) +} + #[cfg(test)] pub(crate) mod test { use crate::{ diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 9acfb435e36..38c96886060 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -2,7 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ACTION_GENERATE_ID, NexusActionContext, NexusSaga, SagaInitError}; +use super::ACTION_GENERATE_ID; +use super::NexusActionContext; +use super::NexusSaga; +use super::SagaInitError; +use super::subsaga_append; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::disk_create::{self, SagaDiskCreate}; use crate::app::{ @@ -50,6 +54,7 @@ pub(crate) struct Params { pub create_params: params::InstanceCreate, pub boundary_switches: HashSet, } + // Several nodes in this saga are wrapped in their own subsaga so that they can // have a parameter that denotes which node they are (e.g., which NIC or which // external IP). They also need the outer saga's parameters. @@ -168,34 +173,6 @@ impl NexusSaga for SagaInstanceCreate { builder.append(associate_ssh_keys_action()); - // Helper function for appending subsagas to our parent saga. - fn subsaga_append( - node_basename: String, - subsaga_dag: steno::Dag, - parent_builder: &mut steno::DagBuilder, - params: S, - which: usize, - ) -> Result<(), SagaInitError> { - // The "parameter" node is a constant node that goes into the outer - // saga. Its value becomes the parameters for the one-node subsaga - // (defined below) that actually creates each resource. 
- let params_node_name = format!("{}_params{}", node_basename, which); - parent_builder.append(Node::constant( - ¶ms_node_name, - serde_json::to_value(¶ms).map_err(|e| { - SagaInitError::SerializeError(params_node_name.clone(), e) - })?, - )); - - let output_name = format!("{}{}", node_basename, which); - parent_builder.append(Node::subsaga( - output_name.as_str(), - subsaga_dag, - params_node_name, - )); - Ok(()) - } - for (i, group) in params.create_params.anti_affinity_groups.iter().enumerate() { diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index d16e064dd01..8ce785bc37c 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -6,10 +6,12 @@ use std::net::Ipv6Addr; -use super::{ - NexusActionContext, NexusSaga, SagaInitError, - instance_common::allocate_vmm_ipv6, -}; +use super::NexusActionContext; +use super::NexusSaga; +use super::SagaInitError; +use super::instance_common::allocate_vmm_ipv6; +use crate::app::InlineErrorChain; +use crate::app::MAX_DISKS_PER_INSTANCE; use crate::app::instance::{ InstanceEnsureRegisteredApiResources, InstanceRegisterReason, InstanceStateChangeError, @@ -17,12 +19,21 @@ use crate::app::instance::{ use crate::app::sagas::declare_saga_actions; use chrono::Utc; use nexus_db_lookup::LookupPath; +use nexus_db_queries::db::datastore::Disk; +use nexus_db_queries::db::datastore::LocalStorageDisk; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::{authn, authz, db}; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::http_pagination::PaginatedBy; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; +use paste::paste; +use seq_macro::seq; use serde::{Deserialize, Serialize}; -use slog::{error, info}; +use sled_agent_client::types::LocalStorageDatasetEnsureRequest; +use slog::error; +use slog::info; use steno::ActionError; /// Parameters to the instance start saga. @@ -80,6 +91,15 @@ declare_saga_actions! { - sis_move_to_starting_undo } + LIST_LOCAL_STORAGE -> "local_storage_records" { + + sis_list_local_storage + } + + ENSURE_LOCAL_STORAGE (0, 1, 2, 3, 4, 5, 6, 7) -> "ensure_local_storage" { + + sis_ensure_local_storage + // No undo action for this, that is handled in the disk delete saga! + } + // TODO(#3879) This can be replaced with an action that triggers the NAT RPW // once such an RPW is available. DPD_ENSURE -> "dpd_ensure" { @@ -144,12 +164,26 @@ impl NexusSaga for SagaInstanceStart { builder.append(alloc_propolis_ip_action()); builder.append(create_vmm_record_action()); builder.append(mark_as_starting_action()); + + // After the instance's state has moved to starting, this should block + // out all disk attach and detach requests. List all the possible local + // storage disks attached to this instance, and ensure that they exist + // so they can be delegated into the propolis zone. 
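+        //
+        // Because the saga DAG is declared up front, the ensure step is
+        // fanned out over a fixed number of nodes (one per possible disk
+        // slot, up to MAX_DISKS_PER_INSTANCE); each node is a no-op when its
+        // index is past the end of the listed local storage records.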
+ builder.append(list_local_storage_action()); + + // Changing MAX_DISKS_PER_INSTANCE requires changing this saga + static_assertions::const_assert!(MAX_DISKS_PER_INSTANCE == 8); + seq!(N in 0..8 { + builder.append(paste!([]())); + }); + builder.append(dpd_ensure_action()); builder.append(v2p_ensure_action()); builder.append(ensure_registered_action()); builder.append(update_multicast_sled_id_action()); builder.append(add_virtual_resources_action()); builder.append(ensure_running_action()); + Ok(builder.build()?) } } @@ -515,6 +549,159 @@ async fn sis_account_virtual_resources_undo( Ok(()) } +async fn sis_list_local_storage( + sagactx: NexusActionContext, +) -> Result, ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let datastore = osagactx.datastore(); + + let db_instance = + sagactx.lookup::("started_record")?; + let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); + + let (_, _, authz_instance, ..) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch_for(authz::Action::Read) + .await + .map_err(ActionError::action_failed)?; + + let disks = datastore + .instance_list_disks( + &opctx, + &authz_instance, + &PaginatedBy::Name(DataPageParams { + marker: None, + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(MAX_DISKS_PER_INSTANCE) + .unwrap(), + }), + ) + .await + .map_err(ActionError::action_failed)?; + + let records = disks + .into_iter() + .filter_map(|disk| match disk { + Disk::LocalStorage(disk) => Some(disk), + Disk::Crucible(_) => None, + }) + .collect(); + + Ok(records) +} + +async fn sis_ensure_local_storage( + sagactx: NexusActionContext, + which: usize, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // Get this node's record + + let local_storage_records = + sagactx.lookup::>("local_storage_records")?; + + if local_storage_records.is_empty() + || (which >= local_storage_records.len()) + { + return Ok(()); + } + + let LocalStorageDisk { + disk, + disk_type_local_storage: _, + local_storage_dataset_allocation, + } = &local_storage_records[which]; + + // Make sure this was a complete allocation. + + let Some(local_storage_dataset_allocation) = + local_storage_dataset_allocation + else { + return Err(ActionError::action_failed(format!( + "local storage record {} has a None allocation!", + which + ))); + }; + + // All local storage volumes will be created with 4k blocks. Double check + // here. 
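+    // A mismatched block size fails this saga node before any dataset ensure
+    // request is sent to the sled agent.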
+ + if disk.block_size.to_bytes() != 4096 { + return Err(ActionError::action_failed(format!( + "local storage record {} has block size {}!", + which, + disk.block_size.to_bytes(), + ))); + } + + let dataset_id = local_storage_dataset_allocation.id(); + let pool_id = local_storage_dataset_allocation.pool_id(); + let sled_id = local_storage_dataset_allocation.sled_id(); + let dataset_size = local_storage_dataset_allocation.dataset_size.into(); + let volume_size = disk.size.into(); + + // Get a sled agent client + + let sled_agent_client = osagactx + .nexus() + .sled_client(&sled_id) + .await + .map_err(ActionError::action_failed)?; + + // Ensure that the local storage is created + + let ensure_operation = || async { + sled_agent_client + .local_storage_dataset_ensure( + &pool_id, + &dataset_id, + &LocalStorageDatasetEnsureRequest { dataset_size, volume_size }, + ) + .await + }; + + let gone_check = || async { + osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?; + + // `check_sled_in_service` returns an error if the sled is no longer in + // service; if it succeeds, the sled is not gone. + Ok(false) + }; + + ProgenitorOperationRetry::new(ensure_operation, gone_check) + .run(osagactx.log()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to ensure local storage: {}", + InlineErrorChain::new(&e) + )) + })?; + + Ok(()) +} + +seq!(M in 0..8 { + async fn sis_ensure_local_storage_~M( + sagactx: NexusActionContext, + ) -> Result<(), ActionError> { + sis_ensure_local_storage(sagactx, M).await + } +}); + async fn sis_dpd_ensure( sagactx: NexusActionContext, ) -> Result<(), ActionError> { @@ -921,8 +1108,7 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::identity::Resource; use omicron_common::api::external::{ - ByteCount, DataPageParams, IdentityMetadataCreateParams, - InstanceCpuCount, Name, + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, Name, }; use omicron_common::api::internal::shared::SwitchLocation; use omicron_test_utils::dev::poll; diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 642c7a3947f..ecc35f159ce 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -10,11 +10,13 @@ // easier it will be to test, version, and update in deployed systems. use crate::saga_interface::SagaContext; +use serde::Serialize; use std::sync::Arc; use std::sync::LazyLock; use steno::ActionContext; use steno::ActionError; use steno::DagBuilder; +use steno::Node; use steno::SagaDag; use steno::SagaName; use steno::SagaType; @@ -316,6 +318,161 @@ macro_rules! declare_saga_actions { crate::app::sagas::__emit_action!($node, $out); declare_saga_actions!(S = $saga $($nodes,)* $node <> $($tail)*); }; + // Allow for a repetition syntax that looks like + // + // ACTION_NAME (i, j, k) -> "output" { + // + action + // - undo + // } + // + // This is equivalent to + // + // ACTION_NAME_i -> "output_i" { + // + action_i + // - undo_i + // } + // + // ACTION_NAME_j -> "output_j" { + // + action_j + // - undo_j + // } + // + // ACTION_NAME_k -> "output_k" { + // + action_k + // - undo_k + // } + // + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal, $($repeat_tail:literal),*) -> $out:literal { + $a:ident - $u:ident } $($tail:tt)*) => { + paste::paste! 
{ + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>], + [<$u _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $node ($($repeat_tail),+) -> $out { + $a } $($tail)*); + } + }; + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal) -> $out:literal { + $a:ident - $u:ident } $($tail:tt)*) => { + paste::paste! { + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>], + [<$u _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $($tail)*); + } + }; + // Same as the prior match, but without the undo action. + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal, $($repeat_tail:literal),*) -> $out:literal { + $a:ident } $($tail:tt)*) => { + paste::paste! { + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $node ($($repeat_tail),+) -> $out { + $a } $($tail)*); + } + }; + (S = $saga:ident $($nodes:ident),* <> $node:ident ($repeat:literal) -> $out:literal { + $a:ident } $($tail:tt)*) => { + paste::paste! 
{ + static [<$node _ $repeat>]: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| { + ::steno::new_action_noop_undo( + crate::app::sagas::__action_name!( + $saga, + [<$node _ $repeat>] + ), + [<$a _ $repeat>] + ) + }); + + // Note: this is the inlined contents of `__emit_action`, that macro + // cannot take the `paste!` output as it isn't a literal: + // + // crate::app::sagas::__emit_action!( + // [<$node _ $repeat>], + // [<$out _ $repeat>] <- this triggers a compile error + // ); + #[allow(dead_code)] + fn [<$node:lower _ $repeat _action>]() -> ::steno::Node { + ::steno::Node::action( + crate::app::sagas::__stringify_ident!([<$out _ $repeat>]), + crate::app::sagas::__stringify_ident!([<$node:camel _ $repeat>]), + [<$node _ $repeat>].as_ref(), + ) + } + + declare_saga_actions!(S = $saga $($nodes,)* [<$node _ $repeat>] <> $($tail)*); + } + }; // The end of the macro, which registers all previous generated saga nodes. // // We generate a new function, rather than implementing @@ -337,3 +494,34 @@ pub(crate) use __action_name; pub(crate) use __emit_action; pub(crate) use __stringify_ident; pub(crate) use declare_saga_actions; + +// Helper function for appending subsagas to parent sagas. +fn subsaga_append( + node_basename: String, + subsaga_dag: steno::Dag, + parent_builder: &mut steno::DagBuilder, + params: S, + which: usize, +) -> Result<(), SagaInitError> { + // The "parameter" node is a constant node that goes into the outer saga. + // Its value becomes the parameters for the one-node subsaga (defined below) + // that actually creates each resource. + let params_node_name = format!("{}_params{}", node_basename, which); + + parent_builder.append(Node::constant( + ¶ms_node_name, + serde_json::to_value(¶ms).map_err(|e| { + SagaInitError::SerializeError(params_node_name.clone(), e) + })?, + )); + + let output_name = format!("{}{}", node_basename, which); + + parent_builder.append(Node::subsaga( + output_name.as_str(), + subsaga_dag, + params_node_name, + )); + + Ok(()) +} diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 6733ab9daae..74514af810a 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -471,3 +471,76 @@ pub async fn run_tuf_artifact_replication_step( assert_eq!(status.last_run_counters.err(), 0); status } + +/// Run the blueprint_loader background task +pub async fn run_blueprint_loader(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_loader").await; + + let LastResult::Completed(_last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_loader task", + last_background_task.last, + ); + }; +} + +/// Run the blueprint_planner background task +pub async fn run_blueprint_planner( + lockstep_client: &ClientTestContext, +) -> BlueprintPlannerStatus { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_planner").await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_planner task", + last_background_task.last, + ); + }; + + serde_json::from_value::( + last_result_completed.details, + ) + .unwrap() +} + +/// Run the blueprint_executor background task +pub async fn run_blueprint_executor(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_executor").await; + + let 
LastResult::Completed(_last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_executor task", + last_background_task.last, + ); + }; +} + +/// Run the blueprint_rendezvous background task +pub async fn run_blueprint_rendezvous(lockstep_client: &ClientTestContext) { + let last_background_task = + activate_background_task(&lockstep_client, "blueprint_rendezvous") + .await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from blueprint_rendezvous task", + last_background_task.last, + ); + }; + + let _status = serde_json::from_value::( + last_result_completed.details, + ) + .unwrap(); +} diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 31e37a4be10..e2dd876e1b4 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -117,7 +117,7 @@ fn get_instances_url() -> String { format!("/v1/instances?{}", get_project_selector()) } -fn get_instance_url(instance_name: &str) -> String { +pub fn get_instance_url(instance_name: &str) -> String { format!("/v1/instances/{}?{}", instance_name, get_project_selector()) }
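
Note on the repeated saga nodes: the new declare_saga_actions! arms and the ENSURE_LOCAL_STORAGE fan-out rely on the seq-macro and paste crates to generate one action per index. Below is a minimal, stand-alone sketch of that seq!/paste! pattern, illustrative only and not part of the patch; the function names are hypothetical and it assumes only the seq-macro and paste crates.

use paste::paste;
use seq_macro::seq;

// Shared implementation: the real saga action indexes into the list of local
// storage records and is a no-op when `which` is past the end of the list.
fn ensure_local_storage(which: usize) {
    println!("ensuring local storage slot {which}");
}

// seq! stamps out ensure_local_storage_0 .. ensure_local_storage_3, each one
// forwarding its fixed index to the shared implementation, mirroring the
// sis_ensure_local_storage_~M wrappers above (which use 0..8).
seq!(N in 0..4 {
    fn ensure_local_storage_~N() {
        ensure_local_storage(N)
    }
});

fn main() {
    // A second seq! references the generated functions by pasted name, the
    // way make_saga_dag appends one saga node per index.
    seq!(N in 0..4 {
        paste! { [<ensure_local_storage_ N>](); }
    });
}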