Skip to content

Commit 044de92

Browse files
mayastor-bors and dsavitskiy committed
Merge #1672
1672: feat(rebuild): prefer a local replica as a rebuild source r=dsavitskiy a=dsavitskiy

This reduces network traffic when a remote replica is being rebuilt and a healthy local replica is available.

Co-authored-by: Dmitry Savitskiy <dmitry.savitskiy@datacore.com>
2 parents b146fa7 + c3816b9 commit 044de92

File tree

3 files changed

+246
-9
lines changed

3 files changed

+246
-9
lines changed

io-engine-tests/src/nexus.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ impl NexusBuilder {
130130
self.with_bdev(&bdev)
131131
}
132132

133+
/// Adds every replica in `replicas` as a child of the nexus being built.
///
/// Each replica is turned into a child URI via [`Self::replica_uri`], and
/// the resulting collection is handed to `with_children`.
pub fn with_replicas(self, replicas: &[ReplicaBuilder]) -> Self {
    // The URIs must be computed before `self` is consumed by
    // `with_children`, hence the intermediate binding.
    let child_uris = replicas
        .iter()
        .map(|repl| self.replica_uri(repl))
        .collect();

    self.with_children(child_uris)
}
137+
133138
pub fn with_local_replica(self, r: &ReplicaBuilder) -> Self {
134139
if r.rpc() != self.rpc() {
135140
panic!("Replica is not local");
@@ -152,7 +157,7 @@ impl NexusBuilder {
152157
self
153158
}
154159

155-
fn replica_uri(&self, r: &ReplicaBuilder) -> String {
160+
pub fn replica_uri(&self, r: &ReplicaBuilder) -> String {
156161
if r.rpc() == self.rpc() {
157162
r.bdev()
158163
} else {

io-engine/src/bdev/nexus/nexus_bdev_rebuild.rs

+18-8
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,11 @@ impl<'n> Nexus<'n> {
8787
info!("{self:?}: start rebuild request for {child_uri}");
8888

8989
// Find a healthy child to rebuild from.
90-
let src_child_uri = match self
91-
.children_iter()
92-
.find(|c| c.is_healthy() && c.uri() != child_uri)
93-
{
94-
Some(child) => Ok(child.uri().to_owned()),
95-
None => Err(Error::NoRebuildSource {
90+
let Some(src_child_uri) = self.find_src_replica(child_uri) else {
91+
return Err(Error::NoRebuildSource {
9692
name: name.clone(),
97-
}),
98-
}?;
93+
});
94+
};
9995

10096
let dst_child_uri = match self.lookup_child(child_uri) {
10197
Some(c) if c.is_opened_unsync() => {
@@ -157,6 +153,20 @@ impl<'n> Nexus<'n> {
157153
})
158154
}
159155

156+
/// Finds the best suited source replica for the given destination.
157+
fn find_src_replica(&self, dst_uri: &str) -> Option<String> {
158+
let candidates: Vec<_> = self
159+
.children_iter()
160+
.filter(|c| c.is_healthy() && c.uri() != dst_uri)
161+
.collect();
162+
163+
candidates
164+
.iter()
165+
.find(|c| c.is_local().unwrap_or(false))
166+
.or_else(|| candidates.first())
167+
.map(|c| c.uri().to_owned())
168+
}
169+
160170
/// TODO
161171
async fn create_rebuild_job(
162172
&self,
+222
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
pub mod common;
2+
3+
use common::{
4+
compose::{
5+
rpc::v1::{
6+
nexus::{ChildState, ChildStateReason},
7+
GrpcConnect,
8+
SharedRpcHandle,
9+
},
10+
Binary,
11+
Builder,
12+
},
13+
nexus::NexusBuilder,
14+
pool::PoolBuilder,
15+
replica::ReplicaBuilder,
16+
};
17+
18+
// Size passed to `with_malloc` when creating each node's pool
// (unit presumably MiB -- TODO confirm against `PoolBuilder::with_malloc`).
const POOL_SIZE: u64 = 200;
19+
// Size of every test replica; passed to `with_size_mb`.
const REPL_SIZE: u64 = 50;
20+
// Size of the test nexus; passed to `with_size_mb`, same as the replica size.
const NEXUS_SIZE: u64 = REPL_SIZE;
21+
22+
struct TestNode {
23+
idx: usize,
24+
ms: SharedRpcHandle,
25+
pool: PoolBuilder,
26+
replicas: Vec<ReplicaBuilder>,
27+
}
28+
29+
impl TestNode {
30+
async fn next_replica(&mut self) -> ReplicaBuilder {
31+
let mut repl = ReplicaBuilder::new(self.ms.clone())
32+
.with_pool(&self.pool)
33+
.with_name(&format!(
34+
"repl_{i}_{j}",
35+
i = self.idx,
36+
j = self.replicas.len()
37+
))
38+
.with_new_uuid()
39+
.with_size_mb(REPL_SIZE);
40+
41+
repl.create().await.unwrap();
42+
repl.share().await.unwrap();
43+
self.replicas.push(repl.clone());
44+
repl
45+
}
46+
47+
async fn clear(&mut self) {
48+
for i in 0 .. self.replicas.len() {
49+
self.replicas[i].destroy().await.unwrap();
50+
}
51+
self.replicas.clear();
52+
}
53+
}
54+
55+
async fn test_src_selection(
56+
nodes: &mut Vec<TestNode>,
57+
nex_node: usize,
58+
child_cfg: Vec<usize>,
59+
dst: usize,
60+
expected_src_idx: usize,
61+
) {
62+
let to = std::time::Duration::from_secs(1);
63+
64+
let mut replicas = Vec::new();
65+
for i in 0 .. child_cfg.len() {
66+
replicas.push(nodes[child_cfg[i]].next_replica().await);
67+
}
68+
69+
let mut nex = NexusBuilder::new(nodes[nex_node].ms.clone())
70+
.with_name("nexus0")
71+
.with_new_uuid()
72+
.with_size_mb(NEXUS_SIZE)
73+
.with_replicas(&replicas);
74+
75+
nex.create().await.unwrap();
76+
77+
println!("---------");
78+
println!(
79+
"> {child_cfg:?}: expect to rebuild #{dst} from #{expected_src_idx}"
80+
);
81+
let children = nex.get_nexus().await.unwrap().children;
82+
83+
for (idx, child) in children.iter().enumerate() {
84+
println!(" [{idx}] {c:?}", c = child.uri);
85+
}
86+
87+
let r = &replicas[dst];
88+
println!(" rebuilding #{dst}: {uri}", uri = nex.replica_uri(r));
89+
90+
nex.offline_child_replica(r).await.unwrap();
91+
nex.wait_replica_state(
92+
r,
93+
ChildState::Degraded,
94+
Some(ChildStateReason::ByClient),
95+
to,
96+
)
97+
.await
98+
.unwrap();
99+
nex.online_child_replica(r).await.unwrap();
100+
nex.wait_children_online(to).await.unwrap();
101+
102+
let rec = nex
103+
.get_rebuild_history()
104+
.await
105+
.unwrap()
106+
.first()
107+
.unwrap()
108+
.clone();
109+
110+
let dst_idx = children
111+
.iter()
112+
.position(|c| c.uri == rec.child_uri)
113+
.unwrap();
114+
let src_idx = children.iter().position(|c| c.uri == rec.src_uri).unwrap();
115+
116+
println!(
117+
" rebuilt #{dst_idx}: {dst} from #{src_idx}: {src}",
118+
src = rec.src_uri,
119+
dst = rec.child_uri
120+
);
121+
122+
assert_eq!(
123+
src_idx, expected_src_idx,
124+
"Expected child index {expected_src_idx}, got {src_idx}"
125+
);
126+
127+
nex.destroy().await.unwrap();
128+
for node in nodes {
129+
node.clear().await;
130+
}
131+
}
132+
133+
/// Should prefer a local replica for rebuild source.
134+
#[tokio::test]
135+
async fn nexus_rebuild_prefer_local_replica() {
136+
common::composer_init();
137+
138+
let test = Builder::new()
139+
.name("cargo-test")
140+
.network("10.1.0.0/16")
141+
.unwrap()
142+
.add_container_bin(
143+
"ms_0",
144+
Binary::from_dbg("io-engine").with_args(vec![
145+
"-l",
146+
"1,2",
147+
"-Fcolor,compact,host,nodate",
148+
]),
149+
)
150+
.add_container_bin(
151+
"ms_1",
152+
Binary::from_dbg("io-engine").with_args(vec![
153+
"-l",
154+
"3,4",
155+
"-Fcolor,compact,host,nodate",
156+
]),
157+
)
158+
.add_container_bin(
159+
"ms_2",
160+
Binary::from_dbg("io-engine").with_args(vec![
161+
"-l",
162+
"5,6",
163+
"-Fcolor,compact,host,nodate",
164+
]),
165+
)
166+
.with_clean(true)
167+
.build()
168+
.await
169+
.unwrap();
170+
171+
let conn = GrpcConnect::new(&test);
172+
173+
let mut nodes = Vec::new();
174+
175+
for idx in 0 .. 3 {
176+
let ms = conn.grpc_handle_shared(&format!("ms_{idx}")).await.unwrap();
177+
178+
let mut pool = PoolBuilder::new(ms.clone())
179+
.with_name(&format!("pool_{idx}"))
180+
.with_new_uuid()
181+
.with_malloc(&format!("mem_{idx}"), POOL_SIZE);
182+
183+
pool.create().await.unwrap();
184+
185+
nodes.push(TestNode {
186+
idx,
187+
ms,
188+
pool,
189+
replicas: Vec::new(),
190+
});
191+
}
192+
193+
// All local, should select first avail.
194+
test_src_selection(&mut nodes, 0, vec![0, 0, 0], 0, 1).await;
195+
test_src_selection(&mut nodes, 0, vec![0, 0, 0], 1, 0).await;
196+
test_src_selection(&mut nodes, 0, vec![0, 0, 0], 2, 0).await;
197+
198+
// Local-remote-remote, should prefer the local one (here it is #0).
199+
test_src_selection(&mut nodes, 0, vec![0, 1, 2], 0, 1).await;
200+
test_src_selection(&mut nodes, 0, vec![0, 1, 2], 1, 0).await;
201+
test_src_selection(&mut nodes, 0, vec![0, 1, 2], 2, 0).await;
202+
203+
// Remote-local-remote, should prefer the local one (here it is #1).
204+
test_src_selection(&mut nodes, 0, vec![1, 0, 2], 0, 1).await;
205+
test_src_selection(&mut nodes, 0, vec![1, 0, 2], 1, 0).await;
206+
test_src_selection(&mut nodes, 0, vec![1, 0, 2], 2, 1).await;
207+
208+
// Remote-remote-local, should prefer the local one (here it is #2).
209+
test_src_selection(&mut nodes, 0, vec![1, 2, 0], 0, 2).await;
210+
test_src_selection(&mut nodes, 0, vec![1, 2, 0], 1, 2).await;
211+
test_src_selection(&mut nodes, 0, vec![1, 2, 0], 2, 0).await;
212+
213+
// Remote-local-local, should prefer the first avail local one (#1 or #2).
214+
test_src_selection(&mut nodes, 0, vec![1, 0, 0], 0, 1).await;
215+
test_src_selection(&mut nodes, 0, vec![1, 0, 0], 1, 2).await;
216+
test_src_selection(&mut nodes, 0, vec![1, 0, 0], 2, 1).await;
217+
218+
// All remote, should prefer the first avail.
219+
test_src_selection(&mut nodes, 0, vec![1, 1, 1], 0, 1).await;
220+
test_src_selection(&mut nodes, 0, vec![1, 1, 1], 1, 0).await;
221+
test_src_selection(&mut nodes, 0, vec![1, 1, 1], 2, 0).await;
222+
}

0 commit comments

Comments
 (0)