Skip to content

Commit 5b36521

Browse files
mayastor-bors and tiagolobocastro
committed
Merge #1721
1721: [ Cherry-Pick ] Fix stuck rebuilds and stuck nexus subsystems r=tiagolobocastro a=tiagolobocastro fix(nvmx/retire): disconnect failed controllers When we are pausing the nexus, all IO must get flushed before the subsystem pausing completes. If we can't flush the IO then pausing is stuck forever... The issue we have seen is that when IO's are stuck there's nothing which can fail them and allow pause to complete. One way this can happen is when the controller is failed as it seems in this case the io queues are not getting polled. A first fix that can be done is to piggy back on the adminq polling failure and use this to drive the removal of the failed child devices from the nexus per-core channels. A better approach might be needed in the future to be able to timeout the IOs even when no completions are processed in a given I/O qpair. Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com> --- fix(opts): convert adminq poll period to us This seems to have been mistakenly added as ms. In practice this would have caused no harm as this value is not currently being overridden by the helm chart. Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com> --- fix(rebuild): ensure comms channel is drained on drop When the rebuild backend is dropped, we must also drain the async channel. This covers a corner case where a message may be sent at the same time as we're dropping and in this case the message would hang. This is not a hang for prod as there we have timeouts which would eventually cancel the future and allow the drop, though this can still lead to timeouts and confusion. Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com> Co-authored-by: Tiago Castro <tiagolobocastro@gmail.com>
2 parents 6939b15 + acb1462 commit 5b36521

File tree

12 files changed

+138
-22
lines changed

12 files changed

+138
-22
lines changed

io-engine-tests/src/compose/mod.rs

+43-7
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,21 @@ use std::future::Future;
55
use tokio::sync::oneshot::channel;
66

77
use crate::mayastor_test_init_ex;
8-
use io_engine::core::{
9-
mayastor_env_stop,
10-
MayastorCliArgs,
11-
MayastorEnvironment,
12-
Reactor,
13-
Reactors,
14-
GLOBAL_RC,
8+
use io_engine::{
9+
core::{
10+
device_monitor_loop,
11+
mayastor_env_stop,
12+
runtime,
13+
MayastorCliArgs,
14+
MayastorEnvironment,
15+
ProtectedSubsystems,
16+
Reactor,
17+
Reactors,
18+
ResourceLockManager,
19+
ResourceLockManagerConfig,
20+
GLOBAL_RC,
21+
},
22+
grpc,
1523
};
1624
use std::time::Duration;
1725

@@ -99,6 +107,34 @@ impl<'a> MayastorTest<'a> {
99107
tokio::time::sleep(Duration::from_millis(500)).await;
100108
}
101109
}
110+
111+
/// Starts the device monitor loop which is required to fully
112+
/// remove devices when they are not in use.
113+
pub fn start_device_monitor(&self) {
114+
runtime::spawn(device_monitor_loop());
115+
}
116+
117+
/// Start the gRPC server which can be useful to debug tests.
118+
pub fn start_grpc(&self) {
119+
let cfg = ResourceLockManagerConfig::default()
120+
.with_subsystem(ProtectedSubsystems::POOL, 32)
121+
.with_subsystem(ProtectedSubsystems::NEXUS, 512)
122+
.with_subsystem(ProtectedSubsystems::REPLICA, 1024);
123+
ResourceLockManager::initialize(cfg);
124+
125+
let env = MayastorEnvironment::global_or_default();
126+
runtime::spawn(async {
127+
grpc::MayastorGrpcServer::run(
128+
&env.node_name,
129+
&env.node_nqn,
130+
env.grpc_endpoint.unwrap(),
131+
env.rpc_addr,
132+
env.api_versions,
133+
)
134+
.await
135+
.ok();
136+
});
137+
}
102138
}
103139

104140
impl<'a> Drop for MayastorTest<'a> {

io-engine/src/bdev/nexus/nexus_bdev_children.rs

+32-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ use super::{
4141
Nexus,
4242
NexusChild,
4343
NexusOperation,
44+
NexusPauseState,
4445
NexusState,
4546
NexusStatus,
4647
PersistOp,
@@ -787,6 +788,13 @@ impl<'n> DeviceEventListener for Nexus<'n> {
787788
false,
788789
);
789790
}
791+
DeviceEventType::AdminQNoticeCtrlFailed => {
792+
Reactors::master().send_future(Nexus::disconnect_failed_child(
793+
self.name.clone(),
794+
dev_name.to_owned(),
795+
));
796+
}
797+
790798
_ => {
791799
warn!(
792800
"{:?}: ignoring event '{:?}' for device '{}'",
@@ -917,6 +925,28 @@ impl<'n> Nexus<'n> {
917925
}
918926
}
919927

928+
/// Disconnect a failed child from the given nexus.
929+
async fn disconnect_failed_child(nexus_name: String, dev: String) {
930+
let Some(nex) = nexus_lookup_mut(&nexus_name) else {
931+
warn!(
932+
"Nexus '{nexus_name}': retiring failed device '{dev}': \
933+
nexus already gone"
934+
);
935+
return;
936+
};
937+
938+
info!("Nexus '{nexus_name}': disconnect handlers for controller failed device: '{dev}'");
939+
940+
if nex.io_subsystem_state() == Some(NexusPauseState::Pausing) {
941+
nex.traverse_io_channels_async((), |channel, _| {
942+
channel.disconnect_detached_devices(|h| {
943+
h.get_device().device_name() == dev && h.is_ctrlr_failed()
944+
});
945+
})
946+
.await;
947+
}
948+
}
949+
920950
/// Retires a child device for the given nexus.
921951
async fn child_retire_routine(
922952
nexus_name: String,
@@ -981,12 +1011,12 @@ impl<'n> Nexus<'n> {
9811011
// channels, and all I/Os failing due to this device will eventually
9821012
// resubmit and succeeded (if any healthy children are left).
9831013
//
984-
// Device disconnection is done in two steps (detach, than disconnect)
1014+
// Device disconnection is done in two steps (detach, then disconnect)
9851015
// in order to prevent an I/O race when retiring a device.
9861016
self.detach_device(&dev).await;
9871017

9881018
// Disconnect the devices with failed controllers _before_ pause,
989-
// otherwise pause would stuck. Keep all controoled that are _not_
1019+
// otherwise pause would get stuck. Keep all controllers which are _not_
9901020
// failed (e.g., in the case I/O failed due to ENOSPC).
9911021
self.traverse_io_channels_async((), |channel, _| {
9921022
channel.disconnect_detached_devices(|h| h.is_ctrlr_failed());

io-engine/src/bdev/nexus/nexus_channel.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@ impl<'n> Debug for NexusChannel<'n> {
3232
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
3333
write!(
3434
f,
35-
"{io} chan '{nex}' core:{core}({cur}) [R:{r} W:{w} L:{l} C:{c}]",
35+
"{io} chan '{nex}' core:{core}({cur}) [R:{r} W:{w} D:{d} L:{l} C:{c}]",
3636
io = if self.is_io_chan { "I/O" } else { "Aux" },
3737
nex = self.nexus.nexus_name(),
3838
core = self.core,
3939
cur = Cores::current(),
4040
r = self.readers.len(),
4141
w = self.writers.len(),
42+
d = self.detached.len(),
4243
l = self.io_logs.len(),
4344
c = self.nexus.child_count(),
4445
)

io-engine/src/bdev/nvmx/controller.rs

+9-3
Original file line numberDiff line numberDiff line change
@@ -843,13 +843,14 @@ pub extern "C" fn nvme_poll_adminq(ctx: *mut c_void) -> i32 {
843843
if result < 0 {
844844
if context.start_device_destroy() {
845845
error!(
846-
"process adminq: {}: {}",
846+
"process adminq: {}: ctrl failed: {}, error: {}",
847847
context.name,
848+
context.is_failed(),
848849
Errno::from_i32(result.abs())
849850
);
850851
info!("dispatching nexus fault and retire: {}", context.name);
851-
let dev_name = context.name.to_string();
852-
let carc = NVME_CONTROLLERS.lookup_by_name(&dev_name).unwrap();
852+
let dev_name = context.name.as_str();
853+
let carc = NVME_CONTROLLERS.lookup_by_name(dev_name).unwrap();
853854
debug!(
854855
?dev_name,
855856
"notifying listeners of admin command completion failure"
@@ -863,6 +864,11 @@ pub extern "C" fn nvme_poll_adminq(ctx: *mut c_void) -> i32 {
863864
?num_listeners,
864865
"listeners notified of admin command completion failure"
865866
);
867+
} else if context.report_failed() {
868+
if let Some(carc) = NVME_CONTROLLERS.lookup_by_name(&context.name) {
869+
carc.lock()
870+
.notify_listeners(DeviceEventType::AdminQNoticeCtrlFailed);
871+
}
866872
}
867873
return 1;
868874
}

io-engine/src/bdev/nvmx/controller_inner.rs

+15
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ pub(crate) struct TimeoutConfig {
7474
reset_attempts: u32,
7575
next_reset_time: Instant,
7676
destroy_in_progress: AtomicCell<bool>,
77+
report_failed: AtomicCell<bool>,
7778
}
7879

7980
impl Drop for TimeoutConfig {
@@ -94,6 +95,7 @@ impl TimeoutConfig {
9495
reset_attempts: MAX_RESET_ATTEMPTS,
9596
next_reset_time: Instant::now(),
9697
destroy_in_progress: AtomicCell::new(false),
98+
report_failed: AtomicCell::new(true),
9799
}
98100
}
99101

@@ -116,6 +118,19 @@ impl TimeoutConfig {
116118
}
117119
}
118120

121+
/// Check if the SPDK's nvme controller is failed.
122+
pub fn is_failed(&self) -> bool {
123+
self.ctrlr.is_failed
124+
}
125+
/// Check if we need to report the controller failure.
126+
/// We only report this failure once.
127+
pub fn report_failed(&mut self) -> bool {
128+
if !self.is_failed() {
129+
return false;
130+
}
131+
self.report_failed.compare_exchange(true, false).is_ok()
132+
}
133+
119134
fn reset_cb(success: bool, ctx: *mut c_void) {
120135
let timeout_ctx = TimeoutConfig::from_ptr(ctx as *mut TimeoutConfig);
121136

io-engine/src/core/device_events.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,14 @@ pub enum DeviceEventType {
1919
DeviceResized,
2020
/// TODO
2121
MediaManagement,
22-
/// TODO
22+
/// Sent when admin q polling fails for the first time.
2323
AdminCommandCompletionFailed,
24+
/// When the adminq poll fails the first time, the controller may not yet
25+
/// be failed.
26+
/// Next time the admin q poll fails, if the controller is noticed as
27+
/// failed for the first time, this event is sent, allowing further
28+
/// clean up to be performed.
29+
AdminQNoticeCtrlFailed,
2430
}
2531

2632
/// TODO

io-engine/src/core/env.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ type Result<T, E = EnvError> = std::result::Result<T, E>;
382382
#[allow(dead_code)]
383383
pub struct MayastorEnvironment {
384384
pub node_name: String,
385-
node_nqn: Option<String>,
385+
pub node_nqn: Option<String>,
386386
pub grpc_endpoint: Option<std::net::SocketAddr>,
387387
pub registration_endpoint: Option<Uri>,
388388
ps_endpoint: Option<String>,
@@ -421,7 +421,7 @@ pub struct MayastorEnvironment {
421421
nvmf_tgt_interface: Option<String>,
422422
/// NVMF target Command Retry Delay in x100 ms.
423423
pub nvmf_tgt_crdt: [u16; TARGET_CRDT_LEN],
424-
api_versions: Vec<ApiVersion>,
424+
pub api_versions: Vec<ApiVersion>,
425425
skip_sig_handler: bool,
426426
enable_io_all_thrd_nexus_channels: bool,
427427
developer_delay: bool,

io-engine/src/rebuild/rebuild_job.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ impl RebuildJob {
320320
}
321321
}
322322

323-
#[derive(Debug, Clone)]
323+
#[derive(Debug)]
324324
struct RebuildFBendChan {
325325
sender: async_channel::Sender<RebuildJobRequest>,
326326
}

io-engine/src/rebuild/rebuild_job_backend.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ impl RebuildJobBackendManager {
336336
}
337337
}
338338

339-
/// Reply back to the requester with the generic rebuild stats.
339+
/// Reply to the requester with the generic rebuild stats.
340340
async fn reply_stats(
341341
&mut self,
342342
requester: oneshot::Sender<RebuildStats>,
@@ -488,10 +488,27 @@ impl RebuildJobBackendManager {
488488
}
489489

490490
impl Drop for RebuildJobBackendManager {
491+
/// Close and drain comms channel allowing sender to see the cancellation
492+
/// error, should it attempt to communicate.
493+
/// This is required because it seems if a message was already sent then it
494+
/// will not get dropped until both the receivers and the senders are
495+
/// dropped.
491496
fn drop(&mut self) {
497+
// set final stats now so failed stats requesters can still get stats.
492498
let stats = self.stats();
493499
info!("{self}: backend dropped; final stats: {stats:?}");
494-
self.states.write().set_final_stats(stats);
500+
self.states.write().set_final_stats(stats.clone());
501+
502+
// we close before draining, ensuring no new messages can be sent
503+
self.info_chan.receiver.close();
504+
// now we can drain, and we could just ignore, but let's try to
505+
// reply to any stats requests
506+
while let Ok(message) = self.info_chan.receiver.try_recv() {
507+
if let RebuildJobRequest::GetStats(reply) = message {
508+
reply.send(stats.clone()).ok();
509+
}
510+
}
511+
495512
for sender in self.complete_chan.lock().drain(..) {
496513
sender.send(self.state()).ok();
497514
}

io-engine/src/rebuild/rebuild_state.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,10 @@ impl RebuildStates {
7474
}
7575
/// Set the final rebuild statistics.
7676
pub(super) fn set_final_stats(&mut self, mut stats: RebuildStats) {
77-
stats.end_time = Some(Utc::now());
78-
self.final_stats = Some(stats);
77+
if self.final_stats.is_none() {
78+
stats.end_time = Some(Utc::now());
79+
self.final_stats = Some(stats);
80+
}
7981
}
8082

8183
/// Set's the next pending state

io-engine/src/subsys/config/opts.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ impl Default for NvmeBdevOpts {
392392
nvme_adminq_poll_period_us: time_try_from_env(
393393
"NVME_ADMINQ_POLL_PERIOD",
394394
1_000,
395-
TimeUnit::MilliSeconds,
395+
TimeUnit::MicroSeconds,
396396
),
397397
nvme_ioq_poll_period_us: time_try_from_env(
398398
"NVME_IOQ_POLL_PERIOD",

io-engine/tests/block_device_nvmf.rs

+3
Original file line numberDiff line numberDiff line change
@@ -1841,6 +1841,9 @@ async fn nvmf_device_hot_remove() {
18411841

18421842
impl DeviceEventListener for TestEventListener {
18431843
fn handle_device_event(&self, event: DeviceEventType, device: &str) {
1844+
if event == DeviceEventType::AdminQNoticeCtrlFailed {
1845+
return; // Not interested in this one
1846+
}
18441847
// Check event type and device name.
18451848
assert_eq!(event, DeviceEventType::DeviceRemoved);
18461849
assert_eq!(

0 commit comments

Comments
 (0)