23 changes: 19 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -575,6 +575,7 @@ nexus-test-utils = { path = "nexus/test-utils" }
nexus-types = { path = "nexus/types" }
nix = { version = "0.30", features = ["fs", "net"] }
nom = "7.1.3"
nonempty = "0.12.0"
num-integer = "0.1.46"
num = { version = "0.4.3", default-features = false, features = [ "libm" ] }
omicron-clickhouse-admin = { path = "clickhouse-admin" }
@@ -687,6 +688,7 @@ schemars = "0.8.22"
scopeguard = "1.2.0"
secrecy = "0.10.3"
semver = { version = "1.0.26", features = ["std", "serde"] }
seq-macro = "0.3.6"
serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] }
serde_cbor = "0.11.2"
serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" }
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
@@ -93,6 +93,7 @@ ring.workspace = true
samael.workspace = true
schemars = { workspace = true, features = ["chrono", "uuid1"] }
semver.workspace = true
seq-macro.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_urlencoded.workspace = true
@@ -106,6 +107,7 @@ slog-dtrace.workspace = true
slog-error-chain.workspace = true
display-error-chain.workspace = true
slog-term.workspace = true
static_assertions.workspace = true
steno.workspace = true
tempfile.workspace = true
thiserror.workspace = true
2 changes: 1 addition & 1 deletion nexus/db-model/src/affinity.rs
@@ -151,7 +151,7 @@ impl From<params::AffinityGroupUpdate> for AffinityGroupUpdate {
#[diesel(table_name = anti_affinity_group)]
pub struct AntiAffinityGroup {
#[diesel(embed)]
identity: AntiAffinityGroupIdentity,
pub identity: AntiAffinityGroupIdentity,
pub project_id: Uuid,
pub policy: AffinityPolicy,
pub failure_domain: FailureDomain,
24 changes: 24 additions & 0 deletions nexus/db-model/src/local_storage_dataset_allocation.rs
@@ -40,6 +40,30 @@ pub struct LocalStorageDatasetAllocation {
}

impl LocalStorageDatasetAllocation {
/// These records are normally created during sled reservation; this
/// constructor exists only for unit tests.
pub fn new_for_tests_only(
id: DatasetUuid,
time_created: DateTime<Utc>,
local_storage_dataset_id: DatasetUuid,
pool_id: ExternalZpoolUuid,
sled_id: SledUuid,
dataset_size: ByteCount,
) -> Self {
Self {
id: id.into(),

time_created,
time_deleted: None,

local_storage_dataset_id: local_storage_dataset_id.into(),
pool_id: pool_id.into(),
sled_id: sled_id.into(),

dataset_size,
}
}

pub fn id(&self) -> DatasetUuid {
self.id.into()
}
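For orientation, here is a rough sketch of how a datastore unit test might call the constructor added above. The typed-UUID new_v4() helpers and the pre-built dataset_size value are assumptions about the surrounding omicron APIs, not code from this PR.

// Editor's sketch only: argument order follows the signature in the hunk above.
let id = DatasetUuid::new_v4();        // assumed typed-UUID helper
let allocation = LocalStorageDatasetAllocation::new_for_tests_only(
    id,
    Utc::now(),                        // time_created
    DatasetUuid::new_v4(),             // parent local storage dataset
    ExternalZpoolUuid::new_v4(),       // zpool backing that dataset
    SledUuid::new_v4(),                // sled hosting the zpool
    dataset_size,                      // a db-model ByteCount prepared by the test
);
assert_eq!(allocation.id(), id);       // `id()` accessor shown in the hunk above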
1 change: 1 addition & 0 deletions nexus/db-queries/Cargo.toml
@@ -30,6 +30,7 @@ internal-dns-types.workspace = true
ipnetwork.workspace = true
itertools.workspace = true
macaddr.workspace = true
nonempty.workspace = true
oxnet.workspace = true
paste.workspace = true
# See omicron-rpaths for more about the "pq-sys" dependency.
103 changes: 92 additions & 11 deletions nexus/db-queries/src/db/datastore/disk.rs
@@ -37,6 +37,8 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::DateTime;
use chrono::Utc;
use diesel::dsl::exists;
use diesel::dsl::not;
use diesel::prelude::*;
use nexus_db_errors::ErrorHandler;
use nexus_db_errors::OptionalError;
@@ -434,13 +436,24 @@ impl DataStore {
authz_instance: &authz::Instance,
pagparams: &PaginatedBy<'_>,
) -> ListResultVec<Disk> {
use nexus_db_schema::schema::disk::dsl;
use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl;
use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl;
let conn = self.pool_connection_authorized(opctx).await?;

opctx.authorize(authz::Action::ListChildren, authz_instance).await?;

let conn = self.pool_connection_authorized(opctx).await?;
self.instance_list_disks_on_conn(&conn, authz_instance.id(), pagparams)
.await
}

/// List disks associated with a given instance, using the supplied connection.
pub async fn instance_list_disks_on_conn(
&self,
conn: &async_bb8_diesel::Connection<DbConnection>,
instance_id: Uuid,
pagparams: &PaginatedBy<'_>,
) -> ListResultVec<Disk> {
use nexus_db_schema::schema::disk::dsl;
use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl;
use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl;

let results = match pagparams {
PaginatedBy::Id(pagparams) => {
@@ -461,13 +474,13 @@
.on(dsl::id.eq(disk_type_local_storage_dsl::disk_id)),
)
.filter(dsl::time_deleted.is_null())
.filter(dsl::attach_instance_id.eq(authz_instance.id()))
.filter(dsl::attach_instance_id.eq(instance_id))
.select((
model::Disk::as_select(),
Option::<DiskTypeCrucible>::as_select(),
Option::<DiskTypeLocalStorage>::as_select(),
))
.get_results_async(&*conn)
.get_results_async(conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

@@ -494,7 +507,7 @@
let allocation = dsl::local_storage_dataset_allocation
.filter(dsl::id.eq(to_db_typed_uuid(allocation_id)))
.select(LocalStorageDatasetAllocation::as_select())
.first_async(&*conn)
.first_async(conn)
.await
.map_err(|e| {
public_error_from_diesel(
@@ -747,7 +760,9 @@ impl DataStore {
authz_disk: &authz::Disk,
max_disks: u32,
) -> Result<(Instance, Disk), Error> {
use nexus_db_schema::schema::{disk, instance};
use nexus_db_schema::schema::disk;
use nexus_db_schema::schema::instance;
use nexus_db_schema::schema::sled_resource_vmm;

opctx.authorize(authz::Action::Modify, authz_instance).await?;
opctx.authorize(authz::Action::Modify, authz_disk).await?;
@@ -771,6 +786,74 @@

let attach_update = DiskSetClauseForAttach::new(authz_instance.id());

let disk = self.disk_get(&opctx, authz_disk.id()).await?;
let resource_query = match disk {
Disk::Crucible(_) => disk::table.into_boxed().filter(
disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels),
),

// Attaching a local storage disk to the instance has to be blocked
// if sled reservation has occurred for this instance: local storage
// allocation records are only created during sled reservation, and
// importantly a particular configuration of local storage
Review comment (Member):
nit, sorry:

Suggested change
// importantly a particular configuration of local storage
// importantly, a particular configuration of local storage

// allocations for an instance are only _validated_ during the sled
// reservation.
//
// The instance start saga perform sled reservation, creates the
Review comment (Member):
typo:

Suggested change
// The instance start saga perform sled reservation, creates the
// The instance start saga performs sled reservation, creates the

// corresponding VMM record, and changes the instance's runtime
// state all in _separate saga nodes_. This means that there could
Comment on lines +802 to +804
Review comment (Member):
take it or leave it: it may perhaps be worth noting something here about why this has to happen like that, so the future reader doesn't just think "well why not just change that part?". i recall that we went down that path briefly yesterday.

Reply (Contributor Author):
Definitely, though I can't recall the exact details here from the meeting, and I can't seem to find the recording of the call. I do remember it had something to do with the way the VMM state machine is defined. What were the details here?

// be an indeterminate amount of time between running the sled
// reservation query and changing the instance's state.
//
// If a client attaches a local storage disk to an instance after
// sled reservation occurs but before the instance's start moves to
Review comment (Member):
nit, perhaps:

Suggested change
// sled reservation occurs but before the instance's start moves to
// sled reservation occurs but before the instance's start saga moves to

// starting, and we do not block it, there are several problems that
// result:
//
// - if an allocation does not already exist for the local storage
// disk, the instance_start saga will fail (and unwind) when
// trying to ensure that the allocation's dataset and zvol exist
Review comment (Member):
Suggested change
// trying to ensure that the allocation's dataset and zvol exist
// trying to ensure that the allocation's dataset and zvol exist,

// because the allocation_id column is None.
//
// - if an allocation does already exist for the local storage disk,
// _it may not be for the same sled the VMM is on_. the sled
// reservation query would prevent this, but this attach (if not
// blocked) happened afterwards. This would mean Nexus would
// construct an InstanceSledLocalConfig that contains DelegatedZvol
// entries that refer to different sleds.
Review comment (Member):
should we mayhaps explicitly note that this would result in a totally unstartable instance? or perhaps that's totally obvious.

//
// - if an allocation does exist already, and it's for the same sled
// the VMM is on, it may be colocated on a zpool with another
// local storage disk's allocation. again, the sled reservation
// query prevents this.
//
// - if an allocation does exist already, and it's for the same
// sled the VMM is on, and it's on a distinct zpool, then it's
// probably fine, but it's safer to let the sled reservation query
// validate everything, and it makes a much smaller query to block
// this case as well.
//
// `reserve_on_random_sled` will create an entry in
// `SledResourcesVmm` when the query is successful in finding a VMM
// reservation, so use that here: if there is a `SledResourcesVmm`
// record for this instance, then block attachment.
//
// Note that depending on our implementation, this may be the code
// path responsible for attaching disks to already-running instances
// when we support hot-plug. Local storage disks may never support
// hot-plug because running zones cannot be reconfigured (aka a new
// zvol rdsk device cannot be added to a running propolis zone).
Disk::LocalStorage(_) => disk::table
.into_boxed()
.filter(
disk::dsl::disk_state
.eq_any(ok_to_attach_disk_state_labels),
)
.filter(not(exists(sled_resource_vmm::table.filter(
sled_resource_vmm::dsl::instance_id.eq(authz_instance.id()),
)))),
};

let query = Instance::attach_resource(
authz_instance.id(),
authz_disk.id(),
Expand All @@ -779,9 +862,7 @@ impl DataStore {
.eq_any(ok_to_attach_instance_states)
.and(instance::dsl::active_propolis_id.is_null()),
),
disk::table.into_boxed().filter(
disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels),
),
resource_query,
Review comment (Member):
i believe that if the query to validate that no preexisting local storage allocations are present fails, we will return AttachError::NoUpdate. we will then end up down here, where that error variant is handled:

AttachError::NoUpdate { attached_count, resource, collection } => {
    let disk_state = resource.state().into();
    match disk_state {
        // Idempotent errors: We did not perform an update,
        // because we're already in the process of attaching.
        api::external::DiskState::Attached(id) if id == authz_instance.id() => {
            return Ok((collection, resource));
        }
        api::external::DiskState::Attaching(id) if id == authz_instance.id() => {
            return Ok((collection, resource));
        }
        // Ok-to-attach disk states: Inspect the state to infer
        // why we did not attach.
        api::external::DiskState::Creating |
        api::external::DiskState::Detached => {
            if collection.runtime_state.propolis_id.is_some() {
                return Err(
                    Error::invalid_request(
                        "cannot attach disk: instance is not \
                        fully stopped"
                    )
                );
            }
            match collection.runtime_state.nexus_state.state() {
                // Ok-to-be-attached instance states:
                api::external::InstanceState::Creating |
                api::external::InstanceState::Stopped => {
                    // The disk is ready to be attached, and the
                    // instance is ready to be attached. Perhaps
                    // we are at attachment capacity?
                    if attached_count == i64::from(max_disks) {
                        return Err(Error::invalid_request(&format!(
                            "cannot attach more than {} disks to instance",
                            max_disks
                        )));
                    }
                    // We can't attach, but the error hasn't
                    // helped us infer why.
                    return Err(Error::internal_error(
                        "cannot attach disk"
                    ));
                }
                // Not okay-to-be-attached instance states:
                _ => {
                    Err(Error::invalid_request(&format!(
                        "cannot attach disk to instance in {} state",
                        collection.runtime_state.nexus_state.state(),
                    )))
                }
            }
        },
        // Not-okay-to-attach disk states: The disk is attached elsewhere.
        api::external::DiskState::Attached(_) |
        api::external::DiskState::Attaching(_) |
        api::external::DiskState::Detaching(_) => {
            Err(Error::invalid_request(&format!(
                "cannot attach disk \"{}\": disk is attached to another instance",
                resource.name().as_str(),
            )))
        }
        _ => {
            Err(Error::invalid_request(&format!(
                "cannot attach disk \"{}\": invalid state {}",
                resource.name().as_str(),
                disk_state,
            )))
        }
    }
},

because we don't check for this situation in that match, we end up returning an error that just says "invalid state" and prints the disk state:

Err(Error::invalid_request(&format!(
    "cannot attach disk \"{}\": invalid state {}",
    resource.name().as_str(),
    disk_state,
)))

but, that error won't be very helpful here, as it doesn't actually describe why the disk couldn't be attached. i think we should really be returning something more descriptive to the user here. it would be nice if we could say something like "cannot attach local storage disk to this instance, as local storage has already been allocated on a sled" or something.
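As one illustration of the more descriptive error the reviewer is asking for (a sketch only, not the PR's implementation), the `Creating | Detached` arm could special-case local storage disks when a sled reservation already exists. The `reservation_exists` flag and `resource.is_local_storage()` helper below are hypothetical; the real handler would need the attach query to surface this information somehow.

// Hypothetical sketch: `reservation_exists` and `is_local_storage()` are not
// defined in this PR; they stand in for however the NoUpdate handler learns
// that the not(exists(sled_resource_vmm ...)) filter is what blocked the attach.
api::external::DiskState::Creating |
api::external::DiskState::Detached => {
    if resource.is_local_storage() && reservation_exists {
        return Err(Error::invalid_request(
            "cannot attach local storage disk: a sled has already been \
             reserved for this instance, so its local storage allocations \
             can no longer change",
        ));
    }
    // ...existing propolis_id / capacity / state checks continue here...
}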

max_disks,
diesel::update(disk::dsl::disk).set(attach_update),
);