From 0159e95ac0fd9b4723d4df9519ddde5ed660b7bf Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 4 Oct 2024 17:26:35 +0000 Subject: [PATCH 1/8] allow instance_reconfigure to resize a stopped instance Co-authored-by: Greg Colombo --- nexus/db-model/src/instance.rs | 4 + nexus/db-queries/src/db/datastore/instance.rs | 64 +++++- nexus/src/app/instance.rs | 104 +++++---- nexus/tests/integration_tests/endpoints.rs | 2 + nexus/tests/integration_tests/instances.rs | 202 +++++++++++++++++- nexus/types/src/external_api/params.rs | 6 + 6 files changed, 334 insertions(+), 48 deletions(-) diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index e7aa989971..173e0efd5f 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -542,4 +542,8 @@ pub struct InstanceUpdate { /// set the instance's auto-restart policy to `NULL`. #[diesel(column_name = auto_restart_policy)] pub auto_restart_policy: Option, + + pub ncpus: InstanceCpuCount, + + pub memory: ByteCount, } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 1662504865..49fcebfdd3 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -18,10 +18,12 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Resource; use crate::db::lookup::LookupPath; +use crate::db::model::ByteCount; use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceAutoRestart; use crate::db::model::InstanceAutoRestartPolicy; +use crate::db::model::InstanceCpuCount; use crate::db::model::InstanceRuntimeState; use crate::db::model::InstanceState; use crate::db::model::InstanceUpdate; @@ -1038,8 +1040,12 @@ impl DataStore { .transaction_retry_wrapper("reconfigure_instance") .transaction(&conn, |conn| { let err = err.clone(); - let InstanceUpdate { boot_disk_id, auto_restart_policy } = - update.clone(); + let InstanceUpdate { + boot_disk_id, + auto_restart_policy, + ncpus, + memory, + } = update.clone(); async move { // Set the auto-restart policy. diesel::update(instance_dsl::instance) @@ -1051,6 +1057,16 @@ impl DataStore { .execute_async(&conn) .await?; + // Set vCPUs and memory size. + self.instance_set_size_on_conn( + &conn, + &err, + authz_instance, + ncpus, + memory, + ) + .await?; + // Next, set the boot disk if needed. self.instance_set_boot_disk_on_conn( &conn, @@ -1215,6 +1231,50 @@ impl DataStore { } } + /// Set an instance's CPU count and memory size to the provided values, + /// within an existing transaction. + async fn instance_set_size_on_conn( + &self, + conn: &async_bb8_diesel::Connection, + err: &OptionalError, + authz_instance: &authz::Instance, + ncpus: InstanceCpuCount, + memory: ByteCount, + ) -> Result<(), diesel::result::Error> { + use db::schema::instance::dsl as instance_dsl; + + // Do not allow setting sizes of running instances to ensure that if an + // instance is running, its resource usage matches what we record in the + // database. + const OK_TO_SET_SIZE_STATES: &'static [InstanceState] = &[ + InstanceState::NoVmm, + InstanceState::Failed, + InstanceState::Creating, + ]; + + let r = diesel::update(instance_dsl::instance) + .filter(instance_dsl::id.eq(authz_instance.id())) + .filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES)) + .set(( + instance_dsl::ncpus.eq(ncpus), + instance_dsl::memory.eq(memory), + )) + .check_if_exists::(authz_instance.id()) + .execute_and_check(&conn) + .await?; + match r.status { + UpdateStatus::NotUpdatedButExists => { + // This should be the only reason the query would fail... + debug_assert!(!OK_TO_SET_SIZE_STATES + .contains(&r.found.runtime().nexus_state)); + Err(err.bail(Error::conflict( + "instance must be stopped to be resized", + ))) + } + UpdateStatus::Updated => Ok(()), + } + } + pub async fn project_delete_instance( &self, opctx: &OpContext, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index b4ad778735..95b9771f1e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -41,6 +41,7 @@ use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; +use omicron_common::api::external::InstanceCpuCount; use omicron_common::api::external::InstanceState; use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; @@ -311,6 +312,8 @@ impl super::Nexus { let (.., authz_project, authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; + check_instance_cpu_memory_sizes(params.ncpus, params.memory)?; + let boot_disk_id = match params.boot_disk.clone() { Some(disk) => { let selector = params::DiskSelector { @@ -331,8 +334,11 @@ impl super::Nexus { }; let auto_restart_policy = params.auto_restart_policy.map(Into::into); + let ncpus = params.ncpus.into(); + let memory = params.memory.into(); - let update = InstanceUpdate { boot_disk_id, auto_restart_policy }; + let update = + InstanceUpdate { boot_disk_id, auto_restart_policy, ncpus, memory }; self.datastore() .instance_reconfigure(opctx, &authz_instance, update) .await @@ -347,6 +353,8 @@ impl super::Nexus { let (.., authz_project) = project_lookup.lookup_for(authz::Action::CreateChild).await?; + check_instance_cpu_memory_sizes(params.ncpus, params.memory)?; + let all_disks: Vec<¶ms::InstanceDiskAttachment> = params.boot_disk.iter().chain(params.disks.iter()).collect(); @@ -363,12 +371,6 @@ impl super::Nexus { .await?; } } - if params.ncpus.0 > MAX_VCPU_PER_INSTANCE { - return Err(Error::invalid_request(&format!( - "cannot have more than {} vCPUs per instance", - MAX_VCPU_PER_INSTANCE - ))); - } if params.external_ips.len() > MAX_EXTERNAL_IPS_PER_INSTANCE { return Err(Error::invalid_request(&format!( "An instance may not have more than {} external IP addresses", @@ -413,43 +415,6 @@ impl super::Nexus { } } - // Reject instances where the memory is not at least - // MIN_MEMORY_BYTES_PER_INSTANCE - if params.memory.to_bytes() < u64::from(MIN_MEMORY_BYTES_PER_INSTANCE) { - return Err(Error::invalid_value( - "size", - format!( - "memory must be at least {}", - ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE) - ), - )); - } - - // Reject instances where the memory is not divisible by - // MIN_MEMORY_BYTES_PER_INSTANCE - if (params.memory.to_bytes() % u64::from(MIN_MEMORY_BYTES_PER_INSTANCE)) - != 0 - { - return Err(Error::invalid_value( - "size", - format!( - "memory must be divisible by {}", - ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE) - ), - )); - } - - // Reject instances where the memory is greater than the limit - if params.memory.to_bytes() > MAX_MEMORY_BYTES_PER_INSTANCE { - return Err(Error::invalid_value( - "size", - format!( - "memory must be less than or equal to {}", - ByteCount::try_from(MAX_MEMORY_BYTES_PER_INSTANCE).unwrap() - ), - )); - } - let actor = opctx.authn.actor_required().internal_context( "loading current user's ssh keys for new Instance", )?; @@ -2187,6 +2152,57 @@ pub(crate) async fn process_vmm_update( } } +/// Determines whether the supplied instance sizes (CPU count and memory size) +/// are acceptable. +fn check_instance_cpu_memory_sizes( + ncpus: InstanceCpuCount, + memory: ByteCount, +) -> Result<(), Error> { + if ncpus.0 > MAX_VCPU_PER_INSTANCE { + return Err(Error::invalid_request(&format!( + "cannot have more than {} vCPUs per instance", + MAX_VCPU_PER_INSTANCE + ))); + } + + // Reject instances where the memory is not at least + // MIN_MEMORY_BYTES_PER_INSTANCE + if memory.to_bytes() < u64::from(MIN_MEMORY_BYTES_PER_INSTANCE) { + return Err(Error::invalid_value( + "size", + format!( + "memory must be at least {}", + ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE) + ), + )); + } + + // Reject instances where the memory is not divisible by + // MIN_MEMORY_BYTES_PER_INSTANCE + if (memory.to_bytes() % u64::from(MIN_MEMORY_BYTES_PER_INSTANCE)) != 0 { + return Err(Error::invalid_value( + "size", + format!( + "memory must be divisible by {}", + ByteCount::from(MIN_MEMORY_BYTES_PER_INSTANCE) + ), + )); + } + + // Reject instances where the memory is greater than the limit + if memory.to_bytes() > MAX_MEMORY_BYTES_PER_INSTANCE { + return Err(Error::invalid_value( + "size", + format!( + "memory must be less than or equal to {}", + ByteCount::try_from(MAX_MEMORY_BYTES_PER_INSTANCE).unwrap() + ), + )); + } + + Ok(()) +} + /// Determines the disposition of a request to start an instance given its state /// (and its current VMM's state, if it has one) in the database. fn instance_start_allowed( diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 1aac5a186c..3595943050 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -437,6 +437,8 @@ pub static DEMO_INSTANCE_UPDATE: Lazy = Lazy::new(|| params::InstanceUpdate { boot_disk: None, auto_restart_policy: None, + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(16), }); // The instance needs a network interface, too. diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d5f37d62be..98f7907923 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -4160,7 +4160,12 @@ async fn test_cannot_detach_boot_disk(cptestctx: &ControlPlaneTestContext) { let instance = expect_instance_reconfigure_ok( &client, &instance.identity.id, - params::InstanceUpdate { boot_disk: None, auto_restart_policy: None }, + params::InstanceUpdate { + boot_disk: None, + auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), + }, ) .await; assert_eq!(instance.boot_disk_id, None); @@ -4260,6 +4265,8 @@ async fn test_updating_running_instance_boot_disk_is_conflict( params::InstanceUpdate { boot_disk: Some(alsodata.clone().into()), auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }, http::StatusCode::CONFLICT, ) @@ -4276,6 +4283,8 @@ async fn test_updating_running_instance_boot_disk_is_conflict( // was created. boot_disk: Some(probablydata.clone().into()), auto_restart_policy: Some(InstanceAutoRestartPolicy::BestEffort), + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }, ) .await; @@ -4293,7 +4302,12 @@ async fn test_updating_missing_instance_is_not_found( let error = expect_instance_reconfigure_err( &client, &UUID_THAT_DOESNT_EXIST, - params::InstanceUpdate { boot_disk: None, auto_restart_policy: None }, + params::InstanceUpdate { + boot_disk: None, + auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(0).unwrap(), + memory: ByteCount::from_gibibytes_u32(0), + }, http::StatusCode::NOT_FOUND, ) .await; @@ -4355,6 +4369,182 @@ async fn expect_instance_reconfigure_err( .parsed_body::() .expect("error response should parse successfully") } + +// Test reconfiguring an instance's size in CPUs and memory. +#[nexus_test] +async fn test_size_can_be_changed(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + let instance_name = "downloading-more-ram"; + + create_project_and_pool(&client).await; + + let instance_params = params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: instance_name.parse().unwrap(), + description: String::from("stuff"), + }, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), + hostname: instance_name.parse().unwrap(), + user_data: vec![], + ssh_public_keys: None, + network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, + external_ips: vec![], + boot_disk: None, + disks: Vec::new(), + start: true, + // Start out with None + auto_restart_policy: None, + }; + + let builder = + RequestBuilder::new(client, http::Method::POST, &get_instances_url()) + .body(Some(&instance_params)) + .expect_status(Some(http::StatusCode::CREATED)); + let response = NexusRequest::new(builder) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Expected instance creation to work!"); + + let instance = response.parsed_body::().unwrap(); + let boot_disk_nameorid: Option = + instance.boot_disk_id.map(|x| x.into()); + let auto_restart_policy = instance.auto_restart_status.policy; + + let new_ncpus = InstanceCpuCount::try_from(4).unwrap(); + let new_memory = ByteCount::from_gibibytes_u32(8); + + // Resizing the instance immediately will error; the instance is running. + let err = expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: new_ncpus, + memory: new_memory, + }, + StatusCode::CONFLICT, + ) + .await; + + assert_eq!(err.message, "instance must be stopped to be resized"); + + instance_post(&client, instance_name, InstanceOp::Stop).await; + let nexus = &cptestctx.server.server_context().nexus; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + + // Now that the instance is stopped, we can resize it.. + let instance = expect_instance_reconfigure_ok( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: new_ncpus, + memory: new_memory, + }, + ) + .await; + assert_eq!(instance.ncpus.0, new_ncpus.0); + assert_eq!(instance.memory, new_memory); + + // Now try a few invalid sizes. These all should fail for slightly different + // reasons. + + // Too many CPUs. + let err = expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: InstanceCpuCount(MAX_VCPU_PER_INSTANCE + 1), + memory: instance.memory, + }, + StatusCode::BAD_REQUEST, + ) + .await; + assert_eq!( + err.message, + format!( + "cannot have more than {} vCPUs per instance", + MAX_VCPU_PER_INSTANCE + ) + ); + + // Too little memory. + let err = expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: instance.ncpus, + memory: ByteCount::from_mebibytes_u32(0), + }, + StatusCode::BAD_REQUEST, + ) + .await; + assert!(err.message.contains("memory must be at least")); + + // Enough memory, but not an amount we want to accept. + let err = expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: instance.ncpus, + memory: ByteCount::try_from(MAX_MEMORY_BYTES_PER_INSTANCE - 1) + .unwrap(), + }, + StatusCode::BAD_REQUEST, + ) + .await; + assert!(err.message.contains("memory must be divisible by")); + + // Too much memory, but an otherwise-acceptable amount. + let max_mib = MAX_MEMORY_BYTES_PER_INSTANCE / (1024 * 1024); + let err = expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: instance.ncpus, + memory: ByteCount::from_mebibytes_u32( + (max_mib + 1024).try_into().unwrap(), + ), + }, + StatusCode::BAD_REQUEST, + ) + .await; + assert!(err.message.contains("must be less than or equal")); + + // Now delete the instance; we should not be able to resize (or really + // interact at all!) with a deleted instance.. + expect_instance_delete_ok(client, instance_name).await; + + // Attempt to send a previously-valid update again. The only reason this + // will not work is that we just deleted the instance we'd be updating. + expect_instance_reconfigure_err( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: new_ncpus, + memory: new_memory, + }, + StatusCode::NOT_FOUND + ) + .await; +} + // Test reconfiguring an instance's auto-restart policy. #[nexus_test] async fn test_auto_restart_policy_can_be_changed( @@ -4406,6 +4596,8 @@ async fn test_auto_restart_policy_can_be_changed( dbg!(params::InstanceUpdate { auto_restart_policy, boot_disk: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }), ) .await; @@ -4499,6 +4691,8 @@ async fn test_boot_disk_can_be_changed(cptestctx: &ControlPlaneTestContext) { params::InstanceUpdate { boot_disk: Some(disks[1].identity.id.into()), auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }, ) .await; @@ -4563,6 +4757,8 @@ async fn test_boot_disk_must_be_attached(cptestctx: &ControlPlaneTestContext) { params::InstanceUpdate { boot_disk: Some(disks[0].identity.id.into()), auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }, http::StatusCode::CONFLICT, ) @@ -4594,6 +4790,8 @@ async fn test_boot_disk_must_be_attached(cptestctx: &ControlPlaneTestContext) { params::InstanceUpdate { boot_disk: Some(disks[0].identity.id.into()), auto_restart_policy: None, + ncpus: InstanceCpuCount::try_from(2).unwrap(), + memory: ByteCount::from_gibibytes_u32(4), }, ) .await; diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 7e36d5f1f8..91249f80cf 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -1075,6 +1075,12 @@ pub struct InstanceCreate { /// Parameters of an `Instance` that can be reconfigured after creation. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct InstanceUpdate { + /// The number of CPUs to assign to this instance. + pub ncpus: InstanceCpuCount, + + /// The amount of memory to assign to this instance. + pub memory: ByteCount, + /// Name or ID of the disk the instance should be instructed to boot from. /// /// If not provided, unset the instance's boot disk. From 60afb8cce64251a162152f26f86831e610a50319 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 4 Oct 2024 17:48:19 +0000 Subject: [PATCH 2/8] i cannot add four lines without doing a cargo-fmt violation --- nexus/tests/integration_tests/instances.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 98f7907923..5bae125c61 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -4540,7 +4540,7 @@ async fn test_size_can_be_changed(cptestctx: &ControlPlaneTestContext) { ncpus: new_ncpus, memory: new_memory, }, - StatusCode::NOT_FOUND + StatusCode::NOT_FOUND, ) .await; } From fd7e5a1387475d1ebec9880e3bbb854f69e5fcf6 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 4 Oct 2024 18:12:14 +0000 Subject: [PATCH 3/8] dont forget to commit the updated openapi document --- openapi/nexus.json | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/openapi/nexus.json b/openapi/nexus.json index 714e2a25e7..eaa65f5551 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -15833,8 +15833,28 @@ "$ref": "#/components/schemas/NameOrId" } ] + }, + "memory": { + "description": "The amount of memory to assign to this instance.", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + }, + "ncpus": { + "description": "The number of CPUs to assign to this instance.", + "allOf": [ + { + "$ref": "#/components/schemas/InstanceCpuCount" + } + ] } - } + }, + "required": [ + "memory", + "ncpus" + ] }, "IpNet": { "x-rust-type": { From e76814ff97fad26bec6cd732f8e5e86ef44682a5 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 4 Oct 2024 19:12:15 +0000 Subject: [PATCH 4/8] if a size update would be no change, succeed early --- common/src/api/external/mod.rs | 4 +- nexus/db-model/src/instance_cpu_count.rs | 10 ++++- nexus/db-queries/src/db/datastore/instance.rs | 39 ++++++++++++++++--- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index d43bcbb8d4..3a21534ecb 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1146,7 +1146,9 @@ impl InstanceState { } /// The number of CPUs in an Instance -#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[derive( + Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] pub struct InstanceCpuCount(pub u16); impl TryFrom for InstanceCpuCount { diff --git a/nexus/db-model/src/instance_cpu_count.rs b/nexus/db-model/src/instance_cpu_count.rs index 6c7debc6a5..dfa524a452 100644 --- a/nexus/db-model/src/instance_cpu_count.rs +++ b/nexus/db-model/src/instance_cpu_count.rs @@ -13,7 +13,15 @@ use serde::Serialize; use std::convert::TryFrom; #[derive( - Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, + Copy, + Clone, + Debug, + AsExpression, + FromSqlRow, + Serialize, + Deserialize, + PartialEq, + Eq, )] #[diesel(sql_type = sql_types::BigInt)] pub struct InstanceCpuCount(pub external::InstanceCpuCount); diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 49fcebfdd3..73eb1a17ab 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1157,13 +1157,13 @@ impl DataStore { InstanceState::Creating, ]; - let maybe_instance = instance_dsl::instance + let maybe_old_boot_disk_id = instance_dsl::instance .filter(instance_dsl::id.eq(authz_instance.id())) .filter(instance_dsl::time_deleted.is_null()) - .select(Instance::as_select()) - .first_async::(conn) + .select(instance_dsl::boot_disk_id) + .first_async::>(conn) .await; - let instance = match maybe_instance { + let old_boot_disk_id = match maybe_old_boot_disk_id { Ok(i) => i, Err(diesel::NotFound) => { // If the instance simply doesn't exist, we @@ -1179,7 +1179,7 @@ impl DataStore { // If the desired boot disk is already set, we're good here, and can // elide the check that the instance is in an acceptable state to change // the boot disk. - if instance.boot_disk_id == boot_disk_id { + if old_boot_disk_id == boot_disk_id { return Ok(()); } @@ -1252,6 +1252,35 @@ impl DataStore { InstanceState::Creating, ]; + let maybe_old_sizes = instance_dsl::instance + .filter(instance_dsl::id.eq(authz_instance.id())) + .filter(instance_dsl::time_deleted.is_null()) + .select((instance_dsl::ncpus, instance_dsl::memory)) + .first_async::<(InstanceCpuCount, ByteCount)>(conn) + .await; + + // Might be nice to extract `old_sizes` into an `InstanceDimensions` or + // something? Not processing the tuple `(InstanceCpuCount, ByteCount)` + // elsewhere yet, so maybe that would be premature... + let old_sizes = match maybe_old_sizes { + Ok(i) => i, + Err(diesel::NotFound) => { + // If the instance simply doesn't exist, we + // shouldn't retry. Bail with a useful error. + return Err(err.bail(Error::not_found_by_id( + ResourceType::Instance, + &authz_instance.id(), + ))); + } + Err(e) => return Err(e), + }; + + // Bail out early if the extant vCPU and memory settings match the + // "new values". + if old_sizes == (ncpus, memory) { + return Ok(()); + } + let r = diesel::update(instance_dsl::instance) .filter(instance_dsl::id.eq(authz_instance.id())) .filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES)) From 57087c48fa7a9c3b91bf92a6fce4d0396896a0d7 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 4 Oct 2024 22:39:41 +0000 Subject: [PATCH 5/8] taking &Instance is annoying, use more filter --- nexus/db-queries/src/db/datastore/instance.rs | 164 +++++++----------- 1 file changed, 64 insertions(+), 100 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 73eb1a17ab..5a4199f835 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1061,7 +1061,7 @@ impl DataStore { self.instance_set_size_on_conn( &conn, &err, - authz_instance, + &authz_instance, ncpus, memory, ) @@ -1071,7 +1071,7 @@ impl DataStore { self.instance_set_boot_disk_on_conn( &conn, &err, - authz_instance, + &authz_instance, boot_disk_id, ) .await?; @@ -1093,14 +1093,16 @@ impl DataStore { Ok(instance_and_vmm) } + /// Set the boot disk on an instance, bypassing the rest of an instance + /// update. You probably don't need this; it's only used at the end of + /// instance creation, since the boot disk can't be set until the new + /// instance's disks are all attached. pub async fn instance_set_boot_disk( &self, opctx: &OpContext, authz_instance: &authz::Instance, boot_disk_id: Option, ) -> Result<(), Error> { - opctx.authorize(authz::Action::Modify, authz_instance).await?; - let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("instance_set_boot_disk") @@ -1157,60 +1159,6 @@ impl DataStore { InstanceState::Creating, ]; - let maybe_old_boot_disk_id = instance_dsl::instance - .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::time_deleted.is_null()) - .select(instance_dsl::boot_disk_id) - .first_async::>(conn) - .await; - let old_boot_disk_id = match maybe_old_boot_disk_id { - Ok(i) => i, - Err(diesel::NotFound) => { - // If the instance simply doesn't exist, we - // shouldn't retry. Bail with a useful error. - return Err(err.bail(Error::not_found_by_id( - ResourceType::Instance, - &authz_instance.id(), - ))); - } - Err(e) => return Err(e), - }; - - // If the desired boot disk is already set, we're good here, and can - // elide the check that the instance is in an acceptable state to change - // the boot disk. - if old_boot_disk_id == boot_disk_id { - return Ok(()); - } - - if let Some(disk_id) = boot_disk_id { - // Ensure the disk is currently attached before updating - // the database. - let expected_state = - api::external::DiskState::Attached(authz_instance.id()); - - let attached_disk: Option = disk_dsl::disk - .filter(disk_dsl::id.eq(disk_id)) - .filter(disk_dsl::attach_instance_id.eq(authz_instance.id())) - .filter(disk_dsl::disk_state.eq(expected_state.label())) - .select(disk_dsl::id) - .first_async::(conn) - .await - .optional()?; - - if attached_disk.is_none() { - return Err( - err.bail(Error::conflict("boot disk must be attached")) - ); - } - } - // - // NOTE: from this point forward it is OK if we update the - // instance's `boot_disk_id` column with the updated value - // again. It will have already been assigned with constraint - // checking performed above, so updates will just be - // repetitive, not harmful. - let r = diesel::update(instance_dsl::instance) .filter(instance_dsl::id.eq(authz_instance.id())) .filter(instance_dsl::state.eq_any(OK_TO_SET_BOOT_DISK_STATES)) @@ -1220,14 +1168,46 @@ impl DataStore { .await?; match r.status { UpdateStatus::NotUpdatedButExists => { - // This should be the only reason the query would fail... - debug_assert!(!OK_TO_SET_BOOT_DISK_STATES - .contains(&r.found.runtime().nexus_state)); - Err(err.bail(Error::conflict( - "instance must be stopped to set boot disk", - ))) + if r.found.boot_disk_id == boot_disk_id { + Ok(()) + } else { + // This should be the only other reason the query would + // fail... + debug_assert!(!OK_TO_SET_BOOT_DISK_STATES + .contains(&r.found.runtime().nexus_state)); + Err(err.bail(Error::conflict( + "instance must be stopped to set boot disk", + ))) + } + } + UpdateStatus::Updated => { + if let Some(disk_id) = boot_disk_id { + // Ensure the disk is currently attached before updating + // the database. + let expected_state = + api::external::DiskState::Attached(authz_instance.id()); + + let attached_disk: Option = disk_dsl::disk + .filter(disk_dsl::id.eq(disk_id)) + .filter( + disk_dsl::attach_instance_id + .eq(authz_instance.id()), + ) + .filter(disk_dsl::disk_state.eq(expected_state.label())) + .select(disk_dsl::id) + .first_async::(conn) + .await + .optional()?; + + if attached_disk.is_none() { + return Err(err.bail(Error::conflict( + "boot disk must be attached", + ))); + } + } + + Ok(()) } - UpdateStatus::Updated => Ok(()), } } @@ -1252,38 +1232,14 @@ impl DataStore { InstanceState::Creating, ]; - let maybe_old_sizes = instance_dsl::instance - .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::time_deleted.is_null()) - .select((instance_dsl::ncpus, instance_dsl::memory)) - .first_async::<(InstanceCpuCount, ByteCount)>(conn) - .await; - - // Might be nice to extract `old_sizes` into an `InstanceDimensions` or - // something? Not processing the tuple `(InstanceCpuCount, ByteCount)` - // elsewhere yet, so maybe that would be premature... - let old_sizes = match maybe_old_sizes { - Ok(i) => i, - Err(diesel::NotFound) => { - // If the instance simply doesn't exist, we - // shouldn't retry. Bail with a useful error. - return Err(err.bail(Error::not_found_by_id( - ResourceType::Instance, - &authz_instance.id(), - ))); - } - Err(e) => return Err(e), - }; - - // Bail out early if the extant vCPU and memory settings match the - // "new values". - if old_sizes == (ncpus, memory) { - return Ok(()); - } - let r = diesel::update(instance_dsl::instance) .filter(instance_dsl::id.eq(authz_instance.id())) .filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES)) + .filter( + instance_dsl::ncpus + .ne(ncpus) + .and(instance_dsl::memory.ne(memory)), + ) .set(( instance_dsl::ncpus.eq(ncpus), instance_dsl::memory.eq(memory), @@ -1293,12 +1249,20 @@ impl DataStore { .await?; match r.status { UpdateStatus::NotUpdatedButExists => { - // This should be the only reason the query would fail... - debug_assert!(!OK_TO_SET_SIZE_STATES - .contains(&r.found.runtime().nexus_state)); - Err(err.bail(Error::conflict( - "instance must be stopped to be resized", - ))) + let noop_update = + r.found.ncpus == ncpus && r.found.memory == memory; + + if noop_update { + Ok(()) + } else { + // This should be the only other reason the query would + // fail... + debug_assert!(!OK_TO_SET_SIZE_STATES + .contains(&r.found.runtime().nexus_state)); + Err(err.bail(Error::conflict( + "instance must be stopped to be resized", + ))) + } } UpdateStatus::Updated => Ok(()), } From 62177576c461a7ad7722b5f5063724caad377b03 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 8 Nov 2024 22:13:43 +0000 Subject: [PATCH 6/8] rework boot disk update with better filtering "better" here meaning "if we don't need to update the database, don't update the database". if we would change the boot disk to the same value, or we would change it from null to null, this change means we will instead not even select the row for an update. since we still `check_if_exists` we'll still fetch an Instance for the operation if there's no update performed. so, if the database can detect that the update would have been a no-op and elide it, this change is not particularly useful. if the database would have to commit some state as a result of an ineffectual `set`, then this is helpful! more importantly, this is moderately more legible than smearing the update logic and consistency checking between the Diesel query and handling in the `NotUpdatedButExists` arm. at the same time, Eliza rightfully pointed out that there are now (at least) two copies of a list of states that are both roughly "there is no VMM for this instance, so we can do things to its' database record". extract that out to a const on `InstanceState` and name it appropriately. finally, the instance size change logic incorrectly rejected size changes that only changed one of `(memory, cpus)`. fix that to allow changing one or the other independently and add tests to catch a future dingus (me) who might regress that somehow. --- nexus/db-model/src/instance_state.rs | 5 + nexus/db-queries/src/db/datastore/instance.rs | 140 +++++++++--------- nexus/tests/integration_tests/instances.rs | 36 ++++- 3 files changed, 109 insertions(+), 72 deletions(-) diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index 5925e92ae0..c5424251b3 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -26,6 +26,11 @@ impl_enum_type!( ); impl InstanceState { + /// Instance states where there is not currently a VMM incarnating this + /// instance, but might be in the future. + pub const NO_VMM_STATES: &'static [InstanceState] = + &[InstanceState::NoVmm, InstanceState::Creating, InstanceState::Failed]; + pub fn state(&self) -> external::InstanceState { external::InstanceState::from(*self) } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 5a4199f835..bb2b662424 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1146,22 +1146,43 @@ impl DataStore { use db::schema::disk::dsl as disk_dsl; use db::schema::instance::dsl as instance_dsl; - // * Allow setting the boot disk in NoVmm because there is no VMM to - // contend with. - // * Allow setting the boot disk in Failed to allow changing the boot - // disk of a failed instance and free its boot disk for detach. - // * Allow setting the boot disk in Creating because one of the last - // steps of instance creation, while the instance is still in - // Creating, is to reconfigure the instance to the desired boot disk. - const OK_TO_SET_BOOT_DISK_STATES: &'static [InstanceState] = &[ - InstanceState::NoVmm, - InstanceState::Failed, - InstanceState::Creating, - ]; + if let Some(disk_id) = boot_disk_id { + // Ensure the disk is currently attached before updating the + // database. + let expected_state = + api::external::DiskState::Attached(authz_instance.id()); + + let attached_disk: Option = disk_dsl::disk + .filter(disk_dsl::id.eq(disk_id)) + .filter(disk_dsl::attach_instance_id.eq(authz_instance.id())) + .filter(disk_dsl::disk_state.eq(expected_state.label())) + .select(disk_dsl::id) + .first_async::(conn) + .await + .optional()?; - let r = diesel::update(instance_dsl::instance) + if attached_disk.is_none() { + return Err( + err.bail(Error::conflict("boot disk must be attached")) + ); + } + } + + let query = diesel::update(instance_dsl::instance) + .into_boxed() .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::state.eq_any(OK_TO_SET_BOOT_DISK_STATES)) + .filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES)); + let query = if boot_disk_id.is_some() { + query.filter( + instance_dsl::boot_disk_id + .ne(boot_disk_id) + .or(instance_dsl::boot_disk_id.is_null()), + ) + } else { + query.filter(instance_dsl::boot_disk_id.is_not_null()) + }; + + let r = query .set(instance_dsl::boot_disk_id.eq(boot_disk_id)) .check_if_exists::(authz_instance.id()) .execute_and_check(&conn) @@ -1169,50 +1190,34 @@ impl DataStore { match r.status { UpdateStatus::NotUpdatedButExists => { if r.found.boot_disk_id == boot_disk_id { - Ok(()) - } else { - // This should be the only other reason the query would - // fail... - debug_assert!(!OK_TO_SET_BOOT_DISK_STATES - .contains(&r.found.runtime().nexus_state)); - Err(err.bail(Error::conflict( - "instance must be stopped to set boot disk", - ))) + // Not updated, because the update is no change.. + return Ok(()); } - } - UpdateStatus::Updated => { - if let Some(disk_id) = boot_disk_id { - // Ensure the disk is currently attached before updating - // the database. - let expected_state = - api::external::DiskState::Attached(authz_instance.id()); - - let attached_disk: Option = disk_dsl::disk - .filter(disk_dsl::id.eq(disk_id)) - .filter( - disk_dsl::attach_instance_id - .eq(authz_instance.id()), - ) - .filter(disk_dsl::disk_state.eq(expected_state.label())) - .select(disk_dsl::id) - .first_async::(conn) - .await - .optional()?; - if attached_disk.is_none() { - return Err(err.bail(Error::conflict( - "boot disk must be attached", - ))); - } + if !InstanceState::NO_VMM_STATES + .contains(&r.found.runtime().nexus_state) + { + return Err(err.bail(Error::conflict( + "instance must be stopped to set boot disk", + ))); } - Ok(()) + // There should be no other reason the update fails on an + // existing instance. + return Err(err.bail(Error::internal_error( + "unable to reconfigure instance boot disk", + ))); } + UpdateStatus::Updated => Ok(()), } } /// Set an instance's CPU count and memory size to the provided values, /// within an existing transaction. + /// + /// Does not allow setting sizes of running instances to ensure that if an + /// instance is running, its resource reservation matches what we record in + /// the database. async fn instance_set_size_on_conn( &self, conn: &async_bb8_diesel::Connection, @@ -1223,22 +1228,13 @@ impl DataStore { ) -> Result<(), diesel::result::Error> { use db::schema::instance::dsl as instance_dsl; - // Do not allow setting sizes of running instances to ensure that if an - // instance is running, its resource usage matches what we record in the - // database. - const OK_TO_SET_SIZE_STATES: &'static [InstanceState] = &[ - InstanceState::NoVmm, - InstanceState::Failed, - InstanceState::Creating, - ]; - let r = diesel::update(instance_dsl::instance) .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES)) + .filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES)) .filter( instance_dsl::ncpus .ne(ncpus) - .and(instance_dsl::memory.ne(memory)), + .or(instance_dsl::memory.ne(memory)), ) .set(( instance_dsl::ncpus.eq(ncpus), @@ -1249,20 +1245,24 @@ impl DataStore { .await?; match r.status { UpdateStatus::NotUpdatedButExists => { - let noop_update = - r.found.ncpus == ncpus && r.found.memory == memory; + if (r.found.ncpus, r.found.memory) == (ncpus, memory) { + // Not updated, because the update is no change.. + return Ok(()); + } - if noop_update { - Ok(()) - } else { - // This should be the only other reason the query would - // fail... - debug_assert!(!OK_TO_SET_SIZE_STATES - .contains(&r.found.runtime().nexus_state)); - Err(err.bail(Error::conflict( + if !InstanceState::NO_VMM_STATES + .contains(&r.found.runtime().nexus_state) + { + return Err(err.bail(Error::conflict( "instance must be stopped to be resized", - ))) + ))); } + + // There should be no other reason the update fails on an + // existing instance. + return Err(err.bail(Error::internal_error( + "unable to reconfigure instance size", + ))); } UpdateStatus::Updated => Ok(()), } diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 5bae125c61..9a7fa9c7f1 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -4378,13 +4378,16 @@ async fn test_size_can_be_changed(cptestctx: &ControlPlaneTestContext) { create_project_and_pool(&client).await; + let initial_ncpus = InstanceCpuCount::try_from(2).unwrap(); + let initial_memory = ByteCount::from_gibibytes_u32(4); + let instance_params = params::InstanceCreate { identity: IdentityMetadataCreateParams { name: instance_name.parse().unwrap(), description: String::from("stuff"), }, - ncpus: InstanceCpuCount::try_from(2).unwrap(), - memory: ByteCount::from_gibibytes_u32(4), + ncpus: initial_ncpus, + memory: initial_memory, hostname: instance_name.parse().unwrap(), user_data: vec![], ssh_public_keys: None, @@ -4452,6 +4455,35 @@ async fn test_size_can_be_changed(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.ncpus.0, new_ncpus.0); assert_eq!(instance.memory, new_memory); + // No harm in reverting to the original size one field at a time though: + let instance = expect_instance_reconfigure_ok( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: initial_ncpus, + memory: new_memory, + }, + ) + .await; + assert_eq!(instance.ncpus.0, initial_ncpus.0); + assert_eq!(instance.memory, new_memory); + + let instance = expect_instance_reconfigure_ok( + client, + &instance.identity.id, + params::InstanceUpdate { + auto_restart_policy, + boot_disk: boot_disk_nameorid.clone(), + ncpus: initial_ncpus, + memory: initial_memory, + }, + ) + .await; + assert_eq!(instance.ncpus.0, initial_ncpus.0); + assert_eq!(instance.memory, initial_memory); + // Now try a few invalid sizes. These all should fail for slightly different // reasons. From a6d7544d29187e3333f3d5c46d52f5d5ce31501a Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 11 Nov 2024 18:18:48 +0000 Subject: [PATCH 7/8] NO_VMM -> NOT_INCARNATED avoid ambiguity between "NoVmm the state" and "NO_VMM the states that do not have a VMM" --- nexus/db-model/src/instance_state.rs | 2 +- nexus/db-queries/src/db/datastore/instance.rs | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index e6d04e2d71..a3bc9d7270 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -38,7 +38,7 @@ impl_enum_type!( impl InstanceState { /// Instance states where there is not currently a VMM incarnating this /// instance, but might be in the future. - pub const NO_VMM_STATES: &'static [InstanceState] = + pub const NOT_INCARNATED_STATES: &'static [InstanceState] = &[InstanceState::NoVmm, InstanceState::Creating, InstanceState::Failed]; pub fn state(&self) -> external::InstanceState { diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index f5f5583efe..6f395099a1 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1171,7 +1171,10 @@ impl DataStore { let query = diesel::update(instance_dsl::instance) .into_boxed() .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES)); + .filter( + instance_dsl::state + .eq_any(InstanceState::NOT_INCARNATED_STATES), + ); let query = if boot_disk_id.is_some() { query.filter( instance_dsl::boot_disk_id @@ -1194,7 +1197,7 @@ impl DataStore { return Ok(()); } - if !InstanceState::NO_VMM_STATES + if !InstanceState::NOT_INCARNATED_STATES .contains(&r.found.runtime().nexus_state) { return Err(err.bail(Error::conflict( @@ -1230,7 +1233,10 @@ impl DataStore { let r = diesel::update(instance_dsl::instance) .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::state.eq_any(InstanceState::NO_VMM_STATES)) + .filter( + instance_dsl::state + .eq_any(InstanceState::NOT_INCARNATED_STATES), + ) .filter( instance_dsl::ncpus .ne(ncpus) @@ -1250,7 +1256,7 @@ impl DataStore { return Ok(()); } - if !InstanceState::NO_VMM_STATES + if !InstanceState::NOT_INCARNATED_STATES .contains(&r.found.runtime().nexus_state) { return Err(err.bail(Error::conflict( From 422ae418f407a558ef6c18cbae45c9ff79fa9421 Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 18 Nov 2024 18:27:23 +0000 Subject: [PATCH 8/8] some last comments and logging if we see things go sideways --- nexus/db-queries/src/db/datastore/instance.rs | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 6f395099a1..5894e4dba2 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1132,6 +1132,15 @@ impl DataStore { /// Set an instance's boot disk to the provided `boot_disk_id` (or unset it, /// if `boot_disk_id` is `None`), within an existing transaction. /// + /// The instance must be in an updatable state for this update to succeed. + /// If the instance is not updatable, return `Error::Conflict`. + /// + /// To update the boot disk an instance must not be incarnated by a VMM and, + /// if the boot disk is to be set non-NULL, the disk must already be + /// attached. These constraints together ensure that the boot disk reflected + /// in Nexus is always reflective of the disk a VMM should be allowed to + /// use. + /// /// This is factored out as it is used by both /// [`DataStore::instance_reconfigure`], which mutates many instance fields, /// and [`DataStore::instance_set_boot_disk`], which only touches the boot @@ -1207,6 +1216,12 @@ impl DataStore { // There should be no other reason the update fails on an // existing instance. + warn!( + self.log, "failed to instance_set_boot_disk_on_conn on an \ + instance that should have been updatable"; + "instance_id" => %r.found.id(), + "new boot_disk_id" => ?boot_disk_id + ); return Err(err.bail(Error::internal_error( "unable to reconfigure instance boot disk", ))); @@ -1218,6 +1233,14 @@ impl DataStore { /// Set an instance's CPU count and memory size to the provided values, /// within an existing transaction. /// + /// The instance must be in an updatable state for this update to succeed. + /// If the instance is not updatable, return `Error::Conflict`. + /// + /// To update an instance's CPU or memory sizes an instance must not be + /// incarnated by a VMM. This constraint ensures that the sizes recorded in + /// Nexus sum to the actual peak possible resource usage of running + /// instances. + /// /// Does not allow setting sizes of running instances to ensure that if an /// instance is running, its resource reservation matches what we record in /// the database. @@ -1266,6 +1289,13 @@ impl DataStore { // There should be no other reason the update fails on an // existing instance. + warn!( + self.log, "failed to instance_set_size_on_conn on an \ + instance that should have been updatable"; + "instance_id" => %r.found.id(), + "new ncpus" => ?ncpus, + "new memory" => ?memory, + ); return Err(err.bail(Error::internal_error( "unable to reconfigure instance size", )));