Skip to content

Commit

Permalink
fix(nexus): fixing child retire during rebuild
Browse files Browse the repository at this point in the history
* Fixed stuck rebuild job when a child is retired during rebuild
* Fixed stuck nexus destroy if the nexus had a rebuild job failed
* A Python test for retire-during rebuild added

Signed-off-by: Dmitry Savitskiy <[email protected]>
  • Loading branch information
dsavitskiy committed Jul 19, 2023
1 parent fa9a41e commit 48c8dc1
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 179 deletions.
32 changes: 9 additions & 23 deletions mayastor/src/bdev/nexus/nexus_bdev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,11 @@ impl<'n> Nexus<'n> {
unsafe { self.bdev().required_alignment() }
}

/// TODO
pub fn children_iter(&self) -> std::slice::Iter<NexusChild<'n>> {
self.children.iter()
}

/// Reconfigures the child event handler.
pub(crate) async fn reconfigure(&self, event: DrEvent) {
info!(
Expand Down Expand Up @@ -883,19 +888,7 @@ impl<'n> Nexus<'n> {
self.cancel_child_rebuild_jobs(child.get_name()).await;
}

unsafe {
for child in self.as_mut().get_unchecked_mut().children.iter_mut() {
info!("Destroying child bdev {}", child.get_name());
if let Err(e) = child.close().await {
// TODO: should an error be returned here?
error!(
"Failed to close child {} with error {}",
child.get_name(),
e.verbose()
);
}
}
}
self.close_children().await;

// Persist the fact that the nexus destruction has completed.
self.persist(PersistOp::Shutdown).await;
Expand Down Expand Up @@ -1283,16 +1276,9 @@ impl<'n> BdevOps for Nexus<'n> {
Reactor::block_on(async move {
let self_ref = unsafe { &mut *self_ptr };

for child in self_ref.children.iter_mut() {
if child.state() == ChildState::Open {
if let Err(e) = child.close().await {
error!(
"{}: child {} failed to close with error {}",
self_ref.name,
child.get_name(),
e.verbose()
);
}
for child in self_ref.children_iter() {
if child.is_healthy() {
child.close().await.ok();
}
}

Expand Down
85 changes: 31 additions & 54 deletions mayastor/src/bdev/nexus/nexus_bdev_children.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

use std::{cmp::min, pin::Pin};

use futures::future::join_all;
use snafu::ResultExt;

use super::{
Expand Down Expand Up @@ -299,17 +298,12 @@ impl<'n> Nexus<'n> {
Some(val) => val,
};

unsafe {
if let Err(e) = self.as_mut().get_unchecked_mut().children[idx]
.close()
.await
{
return Err(Error::CloseChild {
name: self.name.clone(),
child: self.children[idx].get_name().to_string(),
source: e,
});
}
if let Err(e) = self.children[idx].close().await {
return Err(Error::CloseChild {
name: self.name.clone(),
child: self.children[idx].get_name().to_string(),
source: e,
});
}

let child_state = self.children[idx].state();
Expand Down Expand Up @@ -460,18 +454,25 @@ impl<'n> Nexus<'n> {
}
}

/// Close each child that belongs to this nexus.
pub(crate) async fn close_children(mut self: Pin<&mut Self>) {
let futures = unsafe {
self.as_mut()
.get_unchecked_mut()
.children
.iter_mut()
.map(|c| c.close())
};
let results = join_all(futures).await;
if results.iter().any(|c| c.is_err()) {
error!("{}: Failed to close children", self.name);
/// Unconditionally closes all children of this nexus.
pub(crate) async fn close_children(&self) {
info!(
"{:?}: closing {n} children...",
self,
n = self.children.len()
);

let mut failed = 0;
for child in self.children_iter() {
if child.close().await.is_err() {
failed += 1;
}
}

if failed == 0 {
info!("{:?}: all children closed", self);
} else {
error!("{:?}: failed to close some of the children", self);
}
}

Expand Down Expand Up @@ -520,20 +521,9 @@ impl<'n> Nexus<'n> {
// completed yet so we fail the registration all together for now.
if failed {
// Close any children that WERE succesfully opened.
unsafe {
for child in
self.as_mut().get_unchecked_mut().children.iter_mut()
{
if child.state() == ChildState::Open {
if let Err(error) = child.close().await {
error!(
"{}: failed to close child {}: {}",
name,
child.name,
error.verbose()
);
}
}
for child in self.children_iter() {
if child.is_healthy() {
child.close().await.ok();
}
}

Expand Down Expand Up @@ -563,20 +553,7 @@ impl<'n> Nexus<'n> {
}

if let Err(error) = write_ex_err {
unsafe {
for child in
self.as_mut().get_unchecked_mut().children.iter_mut()
{
if let Err(error) = child.close().await {
error!(
"{}: child {} failed to close with error {}",
name,
child.name,
error.verbose()
);
}
}
}
self.close_children().await;
return Err(error);
}

Expand Down Expand Up @@ -609,9 +586,9 @@ impl<'n> Nexus<'n> {
}

/// TODO
pub async fn destroy_child(&self, name: &str) -> Result<(), Error> {
pub async fn close_child(&self, name: &str) -> Result<(), Error> {
if let Some(child) = self.lookup_child(name) {
child.destroy().await.map_err(|source| Error::DestroyChild {
child.close().await.map_err(|source| Error::DestroyChild {
source,
child: name.to_string(),
name: self.name.to_string(),
Expand Down
18 changes: 14 additions & 4 deletions mayastor/src/bdev/nexus/nexus_channel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ pub(crate) fn fault_nexus_child(nexus: Pin<&mut Nexus>, name: &str) -> bool {
nexus
.children
.iter()
.filter(|c| c.state() == ChildState::Open)
.filter(|c| {
matches!(
c.state(),
ChildState::Open | ChildState::Faulted(Reason::OutOfSync)
)
})
.filter(|c| {
// If there were previous retires, we do not have a reference
// to a BlockDevice. We do however, know it can't be the device
Expand All @@ -65,11 +70,16 @@ pub(crate) fn fault_nexus_child(nexus: Pin<&mut Nexus>, name: &str) -> bool {
}
})
.any(|c| {
Ok(ChildState::Open)
Ok(ChildState::Faulted(Reason::OutOfSync))
== c.state.compare_exchange(
ChildState::Open,
ChildState::Faulted(Reason::IoError),
ChildState::Faulted(Reason::OutOfSync),
ChildState::Faulted(Reason::RebuildFailed),
)
|| Ok(ChildState::Open)
== c.state.compare_exchange(
ChildState::Open,
ChildState::Faulted(Reason::IoError),
)
})
}

Expand Down
Loading

0 comments on commit 48c8dc1

Please sign in to comment.