23 changes: 19 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -575,6 +575,7 @@ nexus-test-utils = { path = "nexus/test-utils" }
nexus-types = { path = "nexus/types" }
nix = { version = "0.30", features = ["fs", "net"] }
nom = "7.1.3"
nonempty = "0.12.0"
num-integer = "0.1.46"
num = { version = "0.4.3", default-features = false, features = [ "libm" ] }
omicron-clickhouse-admin = { path = "clickhouse-admin" }
@@ -687,6 +688,7 @@ schemars = "0.8.22"
scopeguard = "1.2.0"
secrecy = "0.10.3"
semver = { version = "1.0.26", features = ["std", "serde"] }
seq-macro = "0.3.6"
serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] }
serde_cbor = "0.11.2"
serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" }
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
@@ -93,6 +93,7 @@ ring.workspace = true
samael.workspace = true
schemars = { workspace = true, features = ["chrono", "uuid1"] }
semver.workspace = true
seq-macro.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_urlencoded.workspace = true
@@ -106,6 +107,7 @@ slog-dtrace.workspace = true
slog-error-chain.workspace = true
display-error-chain.workspace = true
slog-term.workspace = true
static_assertions.workspace = true
steno.workspace = true
tempfile.workspace = true
thiserror.workspace = true
2 changes: 1 addition & 1 deletion nexus/db-model/src/affinity.rs
@@ -151,7 +151,7 @@ impl From<params::AffinityGroupUpdate> for AffinityGroupUpdate {
#[diesel(table_name = anti_affinity_group)]
pub struct AntiAffinityGroup {
#[diesel(embed)]
identity: AntiAffinityGroupIdentity,
pub identity: AntiAffinityGroupIdentity,
pub project_id: Uuid,
pub policy: AffinityPolicy,
pub failure_domain: FailureDomain,
24 changes: 24 additions & 0 deletions nexus/db-model/src/local_storage_dataset_allocation.rs
@@ -40,6 +40,30 @@ pub struct LocalStorageDatasetAllocation {
}

impl LocalStorageDatasetAllocation {
/// These records are normally created during sled reservation; this
/// constructor exists only for unit tests.
pub fn new_for_tests_only(
id: DatasetUuid,
time_created: DateTime<Utc>,
local_storage_dataset_id: DatasetUuid,
pool_id: ExternalZpoolUuid,
sled_id: SledUuid,
dataset_size: ByteCount,
) -> Self {
Self {
id: id.into(),

time_created,
time_deleted: None,

local_storage_dataset_id: local_storage_dataset_id.into(),
pool_id: pool_id.into(),
sled_id: sled_id.into(),

dataset_size,
}
}

pub fn id(&self) -> DatasetUuid {
self.id.into()
}
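For orientation, here is a rough sketch of how a datastore unit test might call the constructor added above. The typed-UUID new_v4() helpers and the pre-built dataset_size value are assumptions about the surrounding omicron APIs, not code from this PR.

// Editor's sketch only: argument order follows the signature in the hunk above.
let id = DatasetUuid::new_v4();        // assumed typed-UUID helper
let allocation = LocalStorageDatasetAllocation::new_for_tests_only(
    id,
    Utc::now(),                        // time_created
    DatasetUuid::new_v4(),             // parent local storage dataset
    ExternalZpoolUuid::new_v4(),       // zpool backing that dataset
    SledUuid::new_v4(),                // sled hosting the zpool
    dataset_size,                      // a db-model ByteCount prepared by the test
);
assert_eq!(allocation.id(), id);       // `id()` accessor shown in the hunk above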
1 change: 1 addition & 0 deletions nexus/db-queries/Cargo.toml
@@ -30,6 +30,7 @@ internal-dns-types.workspace = true
ipnetwork.workspace = true
itertools.workspace = true
macaddr.workspace = true
nonempty.workspace = true
oxnet.workspace = true
paste.workspace = true
# See omicron-rpaths for more about the "pq-sys" dependency.
103 changes: 92 additions & 11 deletions nexus/db-queries/src/db/datastore/disk.rs
@@ -37,6 +37,8 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::DateTime;
use chrono::Utc;
use diesel::dsl::exists;
use diesel::dsl::not;
use diesel::prelude::*;
use nexus_db_errors::ErrorHandler;
use nexus_db_errors::OptionalError;
@@ -434,13 +436,24 @@ impl DataStore {
authz_instance: &authz::Instance,
pagparams: &PaginatedBy<'_>,
) -> ListResultVec<Disk> {
use nexus_db_schema::schema::disk::dsl;
use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl;
use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl;
let conn = self.pool_connection_authorized(opctx).await?;

opctx.authorize(authz::Action::ListChildren, authz_instance).await?;

let conn = self.pool_connection_authorized(opctx).await?;
self.instance_list_disks_on_conn(&conn, authz_instance.id(), pagparams)
.await
}

/// List disks associated with a given instance, using the supplied connection.
pub async fn instance_list_disks_on_conn(
&self,
conn: &async_bb8_diesel::Connection<DbConnection>,
instance_id: Uuid,
pagparams: &PaginatedBy<'_>,
) -> ListResultVec<Disk> {
use nexus_db_schema::schema::disk::dsl;
use nexus_db_schema::schema::disk_type_crucible::dsl as disk_type_crucible_dsl;
use nexus_db_schema::schema::disk_type_local_storage::dsl as disk_type_local_storage_dsl;

let results = match pagparams {
PaginatedBy::Id(pagparams) => {
@@ -461,13 +474,13 @@
.on(dsl::id.eq(disk_type_local_storage_dsl::disk_id)),
)
.filter(dsl::time_deleted.is_null())
.filter(dsl::attach_instance_id.eq(authz_instance.id()))
.filter(dsl::attach_instance_id.eq(instance_id))
.select((
model::Disk::as_select(),
Option::<DiskTypeCrucible>::as_select(),
Option::<DiskTypeLocalStorage>::as_select(),
))
.get_results_async(&*conn)
.get_results_async(conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

@@ -494,7 +507,7 @@
let allocation = dsl::local_storage_dataset_allocation
.filter(dsl::id.eq(to_db_typed_uuid(allocation_id)))
.select(LocalStorageDatasetAllocation::as_select())
.first_async(&*conn)
.first_async(conn)
.await
.map_err(|e| {
public_error_from_diesel(
@@ -747,7 +760,9 @@ impl DataStore {
authz_disk: &authz::Disk,
max_disks: u32,
) -> Result<(Instance, Disk), Error> {
use nexus_db_schema::schema::{disk, instance};
use nexus_db_schema::schema::disk;
use nexus_db_schema::schema::instance;
use nexus_db_schema::schema::sled_resource_vmm;

opctx.authorize(authz::Action::Modify, authz_instance).await?;
opctx.authorize(authz::Action::Modify, authz_disk).await?;
@@ -771,6 +786,74 @@

let attach_update = DiskSetClauseForAttach::new(authz_instance.id());

let disk = self.disk_get(&opctx, authz_disk.id()).await?;
let resource_query = match disk {
Disk::Crucible(_) => disk::table.into_boxed().filter(
disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels),
),

// Attaching a local storage disk to the instance has to be blocked
// if sled reservation has occurred for this instance: local storage
// allocation records are only created during sled reservation, and
// importantly a particular configuration of local storage
Review comment (Member):
nit, sorry:

Suggested change
// importantly a particular configuration of local storage
// importantly, a particular configuration of local storage

// allocations for an instance are only _validated_ during the sled
// reservation.
//
// The instance start saga perform sled reservation, creates the
Review comment (Member):
typo:

Suggested change
// The instance start saga perform sled reservation, creates the
// The instance start saga performs sled reservation, creates the

// corresponding VMM record, and changes the instance's runtime
// state all in _separate saga nodes_. This means that there could
Comment on lines +802 to +804
Review comment (Member):
take it or leave it: it may perhaps be worth noting something here about why this has to happen like that, so the future reader doesn't just think "well why not just change that part?". i recall that we went down that path briefly yesterday.

Reply (Contributor Author):
Definitely, though I can't recall the exact details here from the meeting, and I can't seem to find the recording of the call. I do remember it had something to do with the way the VMM state machine is defined. What were the details here?

// be an indeterminate amount of time between running the sled
// reservation query and changing the instance's state.
//
// If a client attaches a local storage disk to an instance after
// sled reservation occurs but before the instance's start moves to
Review comment (Member):
nit, perhaps:

Suggested change
// sled reservation occurs but before the instance's start moves to
// sled reservation occurs but before the instance's start saga moves to

// starting, and we do not block it, there are several problems that
// result:
//
// - if an allocation does not already exist for the local storage
// disk, the instance_start saga will fail (and unwind) when
// trying to ensure that the allocation's dataset and zvol exist
Review comment (Member):
Suggested change
// trying to ensure that the allocation's dataset and zvol exist
// trying to ensure that the allocation's dataset and zvol exist,

// because the allocation_id column is None.
//
// - if an allocation does already exist for the local storage disk,
// _it may not be for the same sled the VMM is on_. the sled
// reservation query would prevent this, but this attach (if not
// blocked) happened afterwards. This would mean Nexus would
// construct an InstanceSledLocalConfig that contains DelegatedZvol
// entries that refer to different sleds.
Review comment (Member):
should we mayhaps explicitly note that this would result in a totally unstartable instance? or perhaps that's totally obvious.

//
// - if an allocation does exist already, and it's for the same sled
// the VMM is on, it may be colocated on a zpool with another
// local storage disk's allocation. again, the sled reservation
// query prevents this.
//
// - if an allocation does exist already, and it's for the same
// sled the VMM is on, and it's on a distinct zpool, then it's
// probably fine, but it's safer to let the sled reservation query
// validate everything, and it makes a much smaller query to block
// this case as well.
//
// `reserve_on_random_sled` will create an entry in
// `SledResourcesVmm` when the query is successful in finding a VMM
// reservation, so use that here: if there is a `SledResourcesVmm`
// record for this instance, then block attachment.
//
// Note that depending on our implementation, this may be the code
// path responsible for attaching disks to already-running instances
// when we support hot-plug. Local storage disks may never support
// hot-plug because running zones cannot be reconfigured (aka a new
// zvol rdsk device cannot be added to a running propolis zone).
Disk::LocalStorage(_) => disk::table
.into_boxed()
.filter(
disk::dsl::disk_state
.eq_any(ok_to_attach_disk_state_labels),
)
.filter(not(exists(sled_resource_vmm::table.filter(
sled_resource_vmm::dsl::instance_id.eq(authz_instance.id()),
)))),
};

let query = Instance::attach_resource(
authz_instance.id(),
authz_disk.id(),
Expand All @@ -779,9 +862,7 @@ impl DataStore {
.eq_any(ok_to_attach_instance_states)
.and(instance::dsl::active_propolis_id.is_null()),
),
disk::table.into_boxed().filter(
disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels),
),
resource_query,
Review comment (Member):
i believe that if the query to validate that no preexisting local storage allocations are present fails, we will return AttachError::NoUpdate. we will then end up down here, where that error variant is handled:

AttachError::NoUpdate { attached_count, resource, collection } => {
    let disk_state = resource.state().into();
    match disk_state {
        // Idempotent errors: We did not perform an update,
        // because we're already in the process of attaching.
        api::external::DiskState::Attached(id) if id == authz_instance.id() => {
            return Ok((collection, resource));
        }
        api::external::DiskState::Attaching(id) if id == authz_instance.id() => {
            return Ok((collection, resource));
        }
        // Ok-to-attach disk states: Inspect the state to infer
        // why we did not attach.
        api::external::DiskState::Creating |
        api::external::DiskState::Detached => {
            if collection.runtime_state.propolis_id.is_some() {
                return Err(
                    Error::invalid_request(
                        "cannot attach disk: instance is not \
                        fully stopped"
                    )
                );
            }
            match collection.runtime_state.nexus_state.state() {
                // Ok-to-be-attached instance states:
                api::external::InstanceState::Creating |
                api::external::InstanceState::Stopped => {
                    // The disk is ready to be attached, and the
                    // instance is ready to be attached. Perhaps
                    // we are at attachment capacity?
                    if attached_count == i64::from(max_disks) {
                        return Err(Error::invalid_request(&format!(
                            "cannot attach more than {} disks to instance",
                            max_disks
                        )));
                    }
                    // We can't attach, but the error hasn't
                    // helped us infer why.
                    return Err(Error::internal_error(
                        "cannot attach disk"
                    ));
                }
                // Not okay-to-be-attached instance states:
                _ => {
                    Err(Error::invalid_request(&format!(
                        "cannot attach disk to instance in {} state",
                        collection.runtime_state.nexus_state.state(),
                    )))
                }
            }
        },
        // Not-okay-to-attach disk states: The disk is attached elsewhere.
        api::external::DiskState::Attached(_) |
        api::external::DiskState::Attaching(_) |
        api::external::DiskState::Detaching(_) => {
            Err(Error::invalid_request(&format!(
                "cannot attach disk \"{}\": disk is attached to another instance",
                resource.name().as_str(),
            )))
        }
        _ => {
            Err(Error::invalid_request(&format!(
                "cannot attach disk \"{}\": invalid state {}",
                resource.name().as_str(),
                disk_state,
            )))
        }
    }
},

because we don't check for this situation in that match, we end up returning an error that just says "invalid state" and prints the disk state:

Err(Error::invalid_request(&format!(
    "cannot attach disk \"{}\": invalid state {}",
    resource.name().as_str(),
    disk_state,
)))

but, that error won't be very helpful here, as it doesn't actually describe why the disk couldn't be attached. i think we should really be returning something more descriptive to the user here. it would be nice if we could say something like "cannot attach local storage disk to this instance, as local storage has already been allocated on a sled" or something.
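As one illustration of the more descriptive error the reviewer is asking for (a sketch only, not the PR's implementation), the `Creating | Detached` arm could special-case local storage disks when a sled reservation already exists. The `reservation_exists` flag and `resource.is_local_storage()` helper below are hypothetical; the real handler would need the attach query to surface this information somehow.

// Hypothetical sketch: `reservation_exists` and `is_local_storage()` are not
// defined in this PR; they stand in for however the NoUpdate handler learns
// that the not(exists(sled_resource_vmm ...)) filter is what blocked the attach.
api::external::DiskState::Creating |
api::external::DiskState::Detached => {
    if resource.is_local_storage() && reservation_exists {
        return Err(Error::invalid_request(
            "cannot attach local storage disk: a sled has already been \
             reserved for this instance, so its local storage allocations \
             can no longer change",
        ));
    }
    // ...existing propolis_id / capacity / state checks continue here...
}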

max_disks,
diesel::update(disk::dsl::disk).set(attach_update),
);