Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow instance_reconfigure to resize a stopped instance #6774

Merged
merged 9 commits into from
Nov 19, 2024
4 changes: 3 additions & 1 deletion common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,9 @@ impl InstanceState {
}

/// The number of CPUs in an Instance
#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[derive(
Copy, Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq,
)]
pub struct InstanceCpuCount(pub u16);

impl TryFrom<i64> for InstanceCpuCount {
Expand Down
4 changes: 4 additions & 0 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,4 +542,8 @@ pub struct InstanceUpdate {
/// set the instance's auto-restart policy to `NULL`.
#[diesel(column_name = auto_restart_policy)]
pub auto_restart_policy: Option<InstanceAutoRestartPolicy>,

pub ncpus: InstanceCpuCount,

pub memory: ByteCount,
hawkw marked this conversation as resolved.
Show resolved Hide resolved
}
10 changes: 9 additions & 1 deletion nexus/db-model/src/instance_cpu_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,15 @@ use serde::Serialize;
use std::convert::TryFrom;

#[derive(
Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize,
Copy,
Clone,
Debug,
AsExpression,
FromSqlRow,
Serialize,
Deserialize,
PartialEq,
Eq,
)]
#[diesel(sql_type = sql_types::BigInt)]
pub struct InstanceCpuCount(pub external::InstanceCpuCount);
Expand Down
173 changes: 113 additions & 60 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::identity::Resource;
use crate::db::lookup::LookupPath;
use crate::db::model::ByteCount;
use crate::db::model::Generation;
use crate::db::model::Instance;
use crate::db::model::InstanceAutoRestart;
use crate::db::model::InstanceAutoRestartPolicy;
use crate::db::model::InstanceCpuCount;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::InstanceState;
use crate::db::model::InstanceUpdate;
Expand Down Expand Up @@ -1038,8 +1040,12 @@ impl DataStore {
.transaction_retry_wrapper("reconfigure_instance")
.transaction(&conn, |conn| {
let err = err.clone();
let InstanceUpdate { boot_disk_id, auto_restart_policy } =
update.clone();
let InstanceUpdate {
boot_disk_id,
auto_restart_policy,
ncpus,
memory,
} = update.clone();
async move {
// Set the auto-restart policy.
diesel::update(instance_dsl::instance)
Expand All @@ -1051,11 +1057,21 @@ impl DataStore {
.execute_async(&conn)
.await?;

// Set vCPUs and memory size.
self.instance_set_size_on_conn(
&conn,
&err,
&authz_instance,
ncpus,
memory,
)
.await?;

// Next, set the boot disk if needed.
self.instance_set_boot_disk_on_conn(
&conn,
&err,
authz_instance,
&authz_instance,
boot_disk_id,
)
.await?;
Expand All @@ -1077,14 +1093,16 @@ impl DataStore {
Ok(instance_and_vmm)
}

/// Set the boot disk on an instance, bypassing the rest of an instance
/// update. You probably don't need this; it's only used at the end of
/// instance creation, since the boot disk can't be set until the new
/// instance's disks are all attached.
pub async fn instance_set_boot_disk(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
boot_disk_id: Option<Uuid>,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, authz_instance).await?;

let err = OptionalError::new();
let conn = self.pool_connection_authorized(opctx).await?;
self.transaction_retry_wrapper("instance_set_boot_disk")
Expand Down Expand Up @@ -1141,75 +1159,110 @@ impl DataStore {
InstanceState::Creating,
];

let maybe_instance = instance_dsl::instance
let r = diesel::update(instance_dsl::instance)
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::time_deleted.is_null())
.select(Instance::as_select())
.first_async::<Instance>(conn)
.await;
let instance = match maybe_instance {
Ok(i) => i,
Err(diesel::NotFound) => {
// If the instance simply doesn't exist, we
// shouldn't retry. Bail with a useful error.
return Err(err.bail(Error::not_found_by_id(
ResourceType::Instance,
&authz_instance.id(),
)));
.filter(instance_dsl::state.eq_any(OK_TO_SET_BOOT_DISK_STATES))
.set(instance_dsl::boot_disk_id.eq(boot_disk_id))
.check_if_exists::<Instance>(authz_instance.id())
.execute_and_check(&conn)
.await?;
match r.status {
UpdateStatus::NotUpdatedButExists => {
if r.found.boot_disk_id == boot_disk_id {
Ok(())
} else {
// This should be the only other reason the query would
// fail...
debug_assert!(!OK_TO_SET_BOOT_DISK_STATES
.contains(&r.found.runtime().nexus_state));
Err(err.bail(Error::conflict(
"instance must be stopped to set boot disk",
)))
}
}
Err(e) => return Err(e),
};

// If the desired boot disk is already set, we're good here, and can
// elide the check that the instance is in an acceptable state to change
// the boot disk.
if instance.boot_disk_id == boot_disk_id {
return Ok(());
}
UpdateStatus::Updated => {
if let Some(disk_id) = boot_disk_id {
// Ensure the disk is currently attached before updating
// the database.
let expected_state =
api::external::DiskState::Attached(authz_instance.id());

let attached_disk: Option<Uuid> = disk_dsl::disk
.filter(disk_dsl::id.eq(disk_id))
.filter(
disk_dsl::attach_instance_id
.eq(authz_instance.id()),
)
.filter(disk_dsl::disk_state.eq(expected_state.label()))
.select(disk_dsl::id)
.first_async::<Uuid>(conn)
.await
.optional()?;

if let Some(disk_id) = boot_disk_id {
// Ensure the disk is currently attached before updating
// the database.
let expected_state =
api::external::DiskState::Attached(authz_instance.id());

let attached_disk: Option<Uuid> = disk_dsl::disk
.filter(disk_dsl::id.eq(disk_id))
.filter(disk_dsl::attach_instance_id.eq(authz_instance.id()))
.filter(disk_dsl::disk_state.eq(expected_state.label()))
.select(disk_dsl::id)
.first_async::<Uuid>(conn)
.await
.optional()?;
if attached_disk.is_none() {
return Err(err.bail(Error::conflict(
"boot disk must be attached",
)));
}
}

if attached_disk.is_none() {
return Err(
err.bail(Error::conflict("boot disk must be attached"))
);
Ok(())
}
}
//
// NOTE: from this point forward it is OK if we update the
// instance's `boot_disk_id` column with the updated value
// again. It will have already been assigned with constraint
// checking performed above, so updates will just be
// repetitive, not harmful.
}

/// Set an instance's CPU count and memory size to the provided values,
/// within an existing transaction.
async fn instance_set_size_on_conn(
&self,
conn: &async_bb8_diesel::Connection<DbConnection>,
err: &OptionalError<Error>,
authz_instance: &authz::Instance,
ncpus: InstanceCpuCount,
memory: ByteCount,
) -> Result<(), diesel::result::Error> {
use db::schema::instance::dsl as instance_dsl;

// Do not allow setting sizes of running instances to ensure that if an
// instance is running, its resource usage matches what we record in the
// database.
const OK_TO_SET_SIZE_STATES: &'static [InstanceState] = &[
InstanceState::NoVmm,
InstanceState::Failed,
InstanceState::Creating,
];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurs to me that OK_TO_SET_SIZE_STATES and OK_TO_SET_BOOT_DISK_STATES are the same list of states, and what we really want in both cases is "states in which there is no VMM actively incarnating this instance". I kind of wonder if, rather than duplicating in both the instance-resize and instance-set-boot-disk queries, we should just have a constant on InstanceState that's like NOT_INCARNATED_STATES or similar, so that these queries and any future ones can all use the same set of states, and if we add new states that have "not incarnated" semantics, we can add them to the one definition of that list of states, instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on one hand i agree (and enough to go just do the thing, there's now InstanceState::NO_VMM_STATES*), but from the conversations we've had we've been pretty close to having caveouts for specific not-incarnated states and fields. namely "is it OK to set the boot disk in Creating", and does allowing that allow correctness issues (no) or a possibly-confusing-but-correct API result (yes). otoh "actions we can take on not-incarnated instances" is a much easier concept than having to think through each kind of action and state combination, so it's nicer in an axiomatic way.

*: after writing this i realize why you said NOT_INCARNATED rather than NO_VMM - because there is a NoVmm state which is very distinct from what this list is getting at. so i should go change that name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, my thinking is that we should avoid having weird exceptions for setting specific parameters unless it's absolutely necessary. I'd be quite surprised if we don't basically just end up being able to divide every possible instance_reconfigure into either "can do this only if there is no Real Life VMM Process incarnating the instance" and "can do this whenever", and I'd prefer to have to burn that bridge only once we come to it...


let r = diesel::update(instance_dsl::instance)
.filter(instance_dsl::id.eq(authz_instance.id()))
.filter(instance_dsl::state.eq_any(OK_TO_SET_BOOT_DISK_STATES))
.set(instance_dsl::boot_disk_id.eq(boot_disk_id))
.filter(instance_dsl::state.eq_any(OK_TO_SET_SIZE_STATES))
.filter(
instance_dsl::ncpus
.ne(ncpus)
.and(instance_dsl::memory.ne(memory)),
)
iximeow marked this conversation as resolved.
Show resolved Hide resolved
.set((
instance_dsl::ncpus.eq(ncpus),
instance_dsl::memory.eq(memory),
))
.check_if_exists::<Instance>(authz_instance.id())
.execute_and_check(&conn)
.await?;
match r.status {
UpdateStatus::NotUpdatedButExists => {
// This should be the only reason the query would fail...
debug_assert!(!OK_TO_SET_BOOT_DISK_STATES
.contains(&r.found.runtime().nexus_state));
Err(err.bail(Error::conflict(
"instance must be stopped to set boot disk",
)))
let noop_update =
r.found.ncpus == ncpus && r.found.memory == memory;

if noop_update {
Ok(())
} else {
// This should be the only other reason the query would
// fail...
debug_assert!(!OK_TO_SET_SIZE_STATES
.contains(&r.found.runtime().nexus_state));
Err(err.bail(Error::conflict(
"instance must be stopped to be resized",
)))
}
}
UpdateStatus::Updated => Ok(()),
}
Expand Down
Loading
Loading