
Commit 61ef768

mayastor-bors and tiagolobocastro committed
chore(bors): merge pull request #887
887: Fix regression for pool creation timeout retry r=tiagolobocastro a=tiagolobocastro

test: use tmp in project workspace

Use a tmp folder from the workspace, allowing us to clean up things like LVM volumes a lot more easily as we can just purge it.

Signed-off-by: Tiago Castro <[email protected]>

---

test(pool): create on very large or very slow disks

Uses LVM Lvols as backend devices for the pool. We suspend these before pool creation, allowing us to simulate slow pool creation. This test ensures that the pool creation is completed by itself and also that a client can complete it by calling create again.

Signed-off-by: Tiago Castro <[email protected]>

---

fix: allow pool creation to complete asynchronously

When the initial create gRPC times out, the data-plane may still be creating the pool in the background, which can happen for very large pools. Rather than assume failure, we allow this to complete in the background for up to a large, arbitrary amount of time. If the pool creation completes before then, we retry the creation flow.

The reason we don't simply use very large timeouts is that the gRPC operations are currently sequential, mostly for historical reasons. Now that the data-plane allows concurrent calls, we should also allow this on the control-plane.
TODO: allow concurrent node operations

Signed-off-by: Tiago Castro <[email protected]>

---

fix: check for correct not-found error code

A previous fix ended up not working correctly because it was merged incorrectly, somehow!

Signed-off-by: Tiago Castro <[email protected]>

---

chore: update terraform node prep

Pull the Release key from a recent k8s version since the old keys are no longer valid. This will have to be updated from time to time.

Co-authored-by: Tiago Castro <[email protected]>
2 parents dd21e4e + 8caf2a6 commit 61ef768
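
The core of the fix described above is a time-bounded retry: after a timed-out create call the pool spec is kept in the Creating state and the reconciler keeps retrying, until a configurable window (15 minutes by default) has elapsed since the first attempt, at which point the spec is marked for deletion and garbage collected. Below is a minimal, self-contained sketch of that decision using only the standard library; the names RetryPolicy and pool_create_policy are illustrative stand-ins, not the control-plane's actual types (the real logic is the on_create_fail helper in pool/operations_helper.rs, shown in the diffs further down).

use std::time::{Duration, SystemTime};

#[derive(Debug, PartialEq)]
enum RetryPolicy {
    /// Keep the spec as `Creating` so the reconciler retries the create call.
    LeaveAsIs,
    /// Give up: mark the spec as deleting so it gets garbage collected.
    SetDeleting,
}

/// Decide what to do with a pool spec that is still `Creating` after a timed-out
/// create call. `creat_tsc` is when the first create attempt was recorded and
/// `async_creat_window` mirrors the new `pool_async_creat_tmo` setting.
fn pool_create_policy(
    creat_tsc: Option<SystemTime>,
    async_creat_window: Duration,
) -> RetryPolicy {
    match creat_tsc.and_then(|t| t.elapsed().ok()) {
        // Still within the window: the io-engine may finish the create on its
        // own, so keep the spec and retry rather than tearing the pool down.
        Some(elapsed) if elapsed <= async_creat_window => RetryPolicy::LeaveAsIs,
        // Past the window, or no timestamp recorded: assume failure.
        _ => RetryPolicy::SetDeleting,
    }
}

fn main() {
    let started = SystemTime::now() - Duration::from_secs(60);
    // With the default 15 minute window, a 1 minute old attempt is still retried.
    assert_eq!(
        pool_create_policy(Some(started), Duration::from_secs(15 * 60)),
        RetryPolicy::LeaveAsIs
    );
    // A 1 minute window has already expired, so the pool would be cleaned up.
    assert_eq!(
        pool_create_policy(Some(started), Duration::from_secs(60)),
        RetryPolicy::SetDeleting
    );
    println!("retry window checks passed");
}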

48 files changed: +495, -378 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -26,3 +26,4 @@ __pycache__
 /rpc/mayastor-api
 /local-fio-0-verify.state
 /report.xml
+/.tmp

control-plane/agents/src/bin/core/controller/reconciler/pool/mod.rs

Lines changed: 10 additions & 0 deletions
@@ -53,6 +53,7 @@ impl TaskPoller for PoolReconciler {
             results.push(Self::squash_results(vec![
                 pool.garbage_collect(context).await,
                 pool.recreate_state(context).await,
+                retry_create_pool_reconciler(&mut pool, context).await,
             ]))
         }
         capacity::remove_larger_replicas(context.registry()).await;
@@ -214,3 +215,12 @@ async fn deleting_pool_spec_reconciler(
     ))
     .await
 }
+
+#[tracing::instrument(skip(pool, context), level = "trace", fields(pool.id = %pool.id(), request.reconcile = true))]
+async fn retry_create_pool_reconciler(
+    pool: &mut OperationGuardArc<PoolSpec>,
+    context: &PollContext,
+) -> PollResult {
+    pool.retry_creating(context.registry()).await?;
+    Ok(PollerState::Idle)
+}

control-plane/agents/src/bin/core/controller/registry.rs

Lines changed: 11 additions & 0 deletions
@@ -86,6 +86,11 @@ pub(crate) struct RegistryInner<S: Store> {
     /// The duration for which the reconciler waits for the replica to
     /// to be healthy again before attempting to online the faulted child.
     faulted_child_wait_period: Option<std::time::Duration>,
+    /// When the pool creation gRPC times out, the actual call in the io-engine
+    /// may still progress.
+    /// We wait up to this period before considering the operation a failure and
+    /// GC'ing the pool.
+    pool_async_creat_tmo: std::time::Duration,
     /// Disable partial rebuild for volume targets.
     disable_partial_rebuild: bool,
     /// Disable nvmf target access control gates.
@@ -122,6 +127,7 @@ impl Registry {
         reconcile_period: std::time::Duration,
         reconcile_idle_period: std::time::Duration,
         faulted_child_wait_period: Option<std::time::Duration>,
+        pool_async_creat_tmo: std::time::Duration,
         disable_partial_rebuild: bool,
         disable_target_acc: bool,
         max_rebuilds: Option<NumRebuilds>,
@@ -165,6 +171,7 @@ impl Registry {
             reconcile_period,
             reconcile_idle_period,
             faulted_child_wait_period,
+            pool_async_creat_tmo,
             disable_partial_rebuild,
             disable_target_acc,
             reconciler: ReconcilerControl::new(),
@@ -298,6 +305,10 @@ impl Registry {
     pub(crate) fn faulted_child_wait_period(&self) -> Option<std::time::Duration> {
         self.faulted_child_wait_period
     }
+    /// Allow for this given time before assuming failure and allowing the pool to get deleted.
+    pub(crate) fn pool_async_creat_tmo(&self) -> std::time::Duration {
+        self.pool_async_creat_tmo
+    }
     /// The maximum number of concurrent create volume requests.
     pub(crate) fn create_volume_limit(&self) -> usize {
         self.create_volume_limit

control-plane/agents/src/bin/core/controller/resources/operations_helper.rs

Lines changed: 23 additions & 2 deletions
@@ -51,9 +51,9 @@ enum SpecError {
 }
 
 /// What to do when creation fails.
+#[derive(Debug)]
 pub(crate) enum OnCreateFail {
     /// Leave object as `Creating`, could allow for frontend retries.
-    #[allow(unused)]
     LeaveAsIs,
     /// When frontend retries don't make sense, set it to deleting so we can clean-up.
     SetDeleting,
@@ -68,7 +68,28 @@ impl OnCreateFail {
     pub(crate) fn eeinval_delete<O>(result: &Result<O, SvcError>) -> Self {
         match result {
             Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Self::Delete,
-            Err(error) if error.tonic_code() == tonic::Code::NotFound => Self::Delete,
+            _ => Self::SetDeleting,
+        }
+    }
+    /// Map errors into `Self` for pool creation requests, specifically.
+    pub(crate) fn on_pool_create_err<O>(result: &Result<O, SvcError>) -> Self {
+        let Err(ref error) = result else {
+            // nonsensical but that's how the api is today...
+            return Self::SetDeleting;
+        };
+        match error.tonic_code() {
+            // 1. the disk is open by another pool or bdev
+            // 2. the disk contains a pool with another name
+            tonic::Code::InvalidArgument => Self::Delete,
+            // 1. the pool disk is not available (ie not found or broken somehow)
+            tonic::Code::NotFound => Self::Delete,
+            // In this case, it's the pool operator's job to attempt re-creation of the pool.
+            // 1. pre-2.6 dataplane, contention on the pool service
+            // 2. pool disk is very slow or extremely large
+            // 3. dataplane core is shared with other processes
+            // TODO: use higher timeout on larger pool sizes or potentially make this
+            // an async operation.
+            tonic::Code::Cancelled => Self::LeaveAsIs,
             _ => Self::SetDeleting,
         }
     }
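
The mapping above drives the retry behaviour: a Cancelled status (what a timed-out gRPC call reports) leaves the spec as Creating so the reconciler can retry, while InvalidArgument and NotFound are treated as permanent and delete the spec. Here is a small standalone sketch of the same mapping, assuming only the tonic crate; the Policy enum and policy_for function are hypothetical stand-ins for the crate's OnCreateFail and on_pool_create_err, which take an SvcError rather than a bare status code.

use tonic::Code;

#[derive(Debug, PartialEq)]
enum Policy {
    LeaveAsIs,
    Delete,
    SetDeleting,
}

fn policy_for(code: Code) -> Policy {
    match code {
        // Disk already used by another pool/bdev, or holds a pool with another name.
        Code::InvalidArgument => Policy::Delete,
        // Pool disk not available.
        Code::NotFound => Policy::Delete,
        // Create timed out; the io-engine may still finish it in the background.
        Code::Cancelled => Policy::LeaveAsIs,
        _ => Policy::SetDeleting,
    }
}

fn main() {
    assert_eq!(policy_for(Code::Cancelled), Policy::LeaveAsIs);
    assert_eq!(policy_for(Code::NotFound), Policy::Delete);
    assert_eq!(policy_for(Code::Internal), Policy::SetDeleting);
    println!("error-code mapping checks passed");
}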

control-plane/agents/src/bin/core/main.rs

Lines changed: 8 additions & 0 deletions
@@ -47,6 +47,13 @@ pub(crate) struct CliArgs {
     #[clap(long)]
     pub(crate) faulted_child_wait_period: Option<humantime::Duration>,
 
+    /// When the pool creation gRPC times out, the actual call in the io-engine
+    /// may still progress.
+    /// We wait up to this period before considering the operation a failure and
+    /// GC'ing the pool.
+    #[clap(long, default_value = "15m")]
+    pub(crate) pool_async_creat_tmo: humantime::Duration,
+
     /// Disable partial rebuild for volume targets.
     #[clap(long, env = "DISABLE_PARTIAL_REBUILD")]
     pub(crate) disable_partial_rebuild: bool,
@@ -194,6 +201,7 @@ async fn server(cli_args: CliArgs) -> anyhow::Result<()> {
         cli_args.reconcile_period.into(),
         cli_args.reconcile_idle_period.into(),
         cli_args.faulted_child_wait_period.map(|t| t.into()),
+        cli_args.pool_async_creat_tmo.into(),
         cli_args.disable_partial_rebuild,
         cli_args.disable_target_acc,
         cli_args.max_rebuilds,
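
The new option is declared as a humantime::Duration, so operators pass human-readable values and the "15m" default maps to 15 minutes. A quick sketch of how such values parse, assuming only the humantime crate the CLI already uses; the flag name --pool-async-creat-tmo is the kebab-case form clap would normally derive for this field, stated here as an assumption rather than taken from the diff.

use std::time::Duration;

fn main() {
    // humantime accepts the same "15m" grammar used for the clap default value.
    let tmo = humantime::parse_duration("15m").unwrap();
    assert_eq!(tmo, Duration::from_secs(15 * 60));

    // e.g. a larger window could be requested as: --pool-async-creat-tmo 30m
    let longer = humantime::parse_duration("30m").unwrap();
    assert!(longer > tmo);
    println!("default async-create window = {tmo:?}");
}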

control-plane/agents/src/bin/core/pool/operations_helper.rs

Lines changed: 68 additions & 5 deletions
@@ -1,14 +1,77 @@
-use crate::controller::{
-    registry::Registry,
-    resources::{operations::ResourceLifecycle, OperationGuardArc},
+use crate::{
+    controller::{
+        io_engine::PoolApi,
+        registry::Registry,
+        resources::{
+            operations::ResourceLifecycle,
+            operations_helper::{GuardedOperationsHelper, OnCreateFail, SpecOperationsHelper},
+            OperationGuardArc,
+        },
+    },
+    node::wrapper::GetterOps,
 };
 use agents::{errors, errors::SvcError};
 use snafu::OptionExt;
+use std::ops::Deref;
 use stor_port::types::v0::{
-    store::replica::{PoolRef, ReplicaSpec},
-    transport::{DestroyReplica, NodeId, ReplicaOwners},
+    store::{
+        pool::PoolSpec,
+        replica::{PoolRef, ReplicaSpec},
+    },
+    transport::{CreatePool, DestroyReplica, NodeId, ReplicaOwners},
 };
 
+impl OperationGuardArc<PoolSpec> {
+    /// Retries the creation of the pool which is being done in the background by the io-engine.
+    /// This may happen if the pool create gRPC times out, for very large pools.
+    /// We could increase the timeout but as things stand today that would block all gRPC
+    /// access to the node.
+    /// TODO: Since the data-plane now allows concurrent gRPC we should also modify the
+    /// control-plane to allow this, which would allows to set large timeouts for some gRPCs.
+    pub(crate) async fn retry_creating(&mut self, registry: &Registry) -> Result<(), SvcError> {
+        let request = {
+            let spec = self.lock();
+            if on_create_fail(&spec, registry).is_some() {
+                return Ok(());
+            }
+            CreatePool::from(spec.deref())
+        };
+
+        let node = registry.node_wrapper(&request.node).await?;
+        if node.pool(&request.id).await.is_none() {
+            return Ok(());
+        }
+
+        let _ = self.start_create(registry, &request).await?;
+        let result = node.create_pool(&request).await;
+        let _state = self
+            .complete_create(result, registry, OnCreateFail::LeaveAsIs)
+            .await?;
+
+        Ok(())
+    }
+
+    /// Ge the `OnCreateFail` policy.
+    /// For more information see [`Self::retry_creating`].
+    pub(crate) fn on_create_fail(&self, registry: &Registry) -> OnCreateFail {
+        let spec = self.lock();
+        on_create_fail(&spec, registry).unwrap_or(OnCreateFail::LeaveAsIs)
+    }
+}
+
+fn on_create_fail(pool: &PoolSpec, registry: &Registry) -> Option<OnCreateFail> {
+    if !pool.status().creating() {
+        return Some(OnCreateFail::LeaveAsIs);
+    }
+    let Some(last_mod_elapsed) = pool.creat_tsc.and_then(|t| t.elapsed().ok()) else {
+        return Some(OnCreateFail::SetDeleting);
+    };
+    if last_mod_elapsed > registry.pool_async_creat_tmo() {
+        return Some(OnCreateFail::SetDeleting);
+    }
+    None
+}
+
 impl OperationGuardArc<ReplicaSpec> {
     /// Destroy the replica from its volume
     pub(crate) async fn destroy_volume_replica(

control-plane/agents/src/bin/core/pool/pool_operations.rs

Lines changed: 6 additions & 10 deletions
@@ -65,8 +65,7 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
         let _ = pool.start_create(registry, request).await?;
 
         let result = node.create_pool(request).await;
-        let on_fail = OnCreateFail::eeinval_delete(&result);
-
+        let on_fail = OnCreateFail::on_pool_create_err(&result);
         let state = pool.complete_create(result, registry, on_fail).await?;
         let spec = pool.lock().clone();
         Ok(Pool::new(spec, Some(CtrlPoolState::new(state))))
@@ -93,14 +92,11 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
             Err(SvcError::PoolNotFound { .. }) => {
                 match node.import_pool(&self.as_ref().into()).await {
                     Ok(_) => node.destroy_pool(request).await,
-                    Err(error)
-                        if allow_not_found
-                            && error.tonic_code() == tonic::Code::InvalidArgument =>
-                    {
-                        Ok(())
-                    }
-                    Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Ok(()),
-                    Err(error) => Err(error),
+                    Err(error) => match error.tonic_code() {
+                        tonic::Code::NotFound if allow_not_found => Ok(()),
+                        tonic::Code::InvalidArgument => Ok(()),
+                        _other => Err(error),
+                    },
                 }
             }
             Err(error) => Err(error),

control-plane/agents/src/bin/core/pool/specs.rs

Lines changed: 3 additions & 1 deletion
@@ -425,7 +425,9 @@ impl ResourceSpecsLocked {
         let pools = self.pools_rsc();
         for pool in pools {
             if let Ok(mut guard) = pool.operation_guard() {
-                if !guard.handle_incomplete_ops(registry).await {
+                let on_fail = guard.on_create_fail(registry);
+
+                if !guard.handle_incomplete_ops_ext(registry, on_fail).await {
                     // Not all pending operations could be handled.
                     pending_ops = true;
                 }

control-plane/agents/src/bin/core/tests/controller/mod.rs

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,7 @@ use stor_port::{
 };
 
 use serde_json::Value;
-use std::str::FromStr;
+use std::{str::FromStr, time::Duration};
 use uuid::Uuid;
 
 /// Test that the content of the registry is correctly loaded from the persistent store on start up.
@@ -225,6 +225,7 @@ async fn etcd_pagination() {
         .with_rest(false)
         .with_jaeger(false)
         .with_store_lease_ttl(lease_ttl)
+        .with_req_timeouts(Duration::from_millis(200), Duration::from_millis(200))
         .build()
         .await
         .unwrap();

control-plane/agents/src/bin/core/tests/deserializer.rs

Lines changed: 1 addition & 0 deletions
@@ -152,6 +152,7 @@ fn test_deserialization_v1_to_v2() {
                 },
                 sequencer: Default::default(),
                 operation: None,
+                creat_tsc: None,
             }),
         },
         TestEntry {

control-plane/agents/src/bin/core/tests/pool/mod.rs

Lines changed: 85 additions & 1 deletion
@@ -21,7 +21,10 @@ use stor_port::{
             VolumePolicy,
         },
     },
-    store::replica::{ReplicaSpec, ReplicaSpecKey},
+    store::{
+        pool::PoolLabel,
+        replica::{ReplicaSpec, ReplicaSpecKey},
+    },
     transport::{
         CreatePool, CreateReplica, DestroyPool, DestroyReplica, Filter, GetSpecs, NexusId,
         NodeId, Protocol, Replica, ReplicaId, ReplicaName, ReplicaOwners, ReplicaShareProtocol,
@@ -1027,3 +1030,84 @@ async fn destroy_after_restart() {
 
     assert_eq!(pool.state().unwrap().id, create.id);
 }
+
+#[tokio::test]
+async fn slow_create() {
+    const POOL_SIZE_BYTES: u64 = 128 * 1024 * 1024;
+
+    let vg = deployer_cluster::lvm::VolGroup::new("slow-pool", POOL_SIZE_BYTES).unwrap();
+    let lvol = vg.create_lvol("lvol0", POOL_SIZE_BYTES / 2).unwrap();
+    lvol.suspend().unwrap();
+    {
+        let cluster = ClusterBuilder::builder()
+            .with_io_engines(1)
+            .with_reconcile_period(Duration::from_millis(250), Duration::from_millis(250))
+            .with_cache_period("200ms")
+            .with_options(|o| o.with_io_engine_devices(vec![lvol.path()]))
+            .with_req_timeouts(Duration::from_millis(500), Duration::from_millis(500))
+            .compose_build(|b| b.with_clean(true))
+            .await
+            .unwrap();
+
+        let client = cluster.grpc_client();
+
+        let create = CreatePool {
+            node: cluster.node(0),
+            id: "bob".into(),
+            disks: vec![lvol.path().into()],
+            labels: Some(PoolLabel::from([("a".into(), "b".into())])),
+        };
+
+        let error = client
+            .pool()
+            .create(&create, None)
+            .await
+            .expect_err("device suspended");
+        assert_eq!(error.kind, ReplyErrorKind::Cancelled);
+
+        lvol.resume().unwrap();
+
+        let start = std::time::Instant::now();
+        let timeout = Duration::from_secs(30);
+        loop {
+            if std::time::Instant::now() > (start + timeout) {
+                panic!("Timeout waiting for the pool");
+            }
+            tokio::time::sleep(Duration::from_millis(100)).await;
+
+            let pools = client
+                .pool()
+                .get(Filter::Pool(create.id.clone()), None)
+                .await
+                .unwrap();
+
+            let Some(pool) = pools.0.first() else {
+                continue;
+            };
+            let Some(pool_spec) = pool.spec() else {
+                continue;
+            };
+            if !pool_spec.status.created() {
+                continue;
+            }
+            break;
+        }
+        let destroy = DestroyPool::from(create.clone());
+        client.pool().destroy(&destroy, None).await.unwrap();
+
+        // Now we try to recreate using an API call, rather than using the reconciler
+        lvol.suspend().unwrap();
+
+        let error = client
+            .pool()
+            .create(&create, None)
+            .await
+            .expect_err("device suspended");
+        assert_eq!(error.kind, ReplyErrorKind::Cancelled);
+
+        lvol.resume().unwrap();
+
+        let pool = client.pool().create(&create, None).await.unwrap();
+        assert!(pool.spec().unwrap().status.created());
+    }
+}

control-plane/agents/src/bin/core/tests/snapshot/fs_cons_snapshot.rs

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ struct DeviceDisconnect(nvmeadm::NvmeTarget);
 impl Drop for DeviceDisconnect {
     fn drop(&mut self) {
         if self.0.disconnect().is_err() {
-            std::process::Command::new("sudo")
+            std::process::Command::new(env!("SUDO"))
                 .args(["nvme", "disconnect-all"])
                 .status()
                 .unwrap();

control-plane/agents/src/bin/core/tests/volume/capacity.rs

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ struct DeviceDisconnect(nvmeadm::NvmeTarget);
 impl Drop for DeviceDisconnect {
     fn drop(&mut self) {
         if self.0.disconnect().is_err() {
-            std::process::Command::new("sudo")
+            std::process::Command::new(env!("SUDO"))
                 .args(["nvme", "disconnect-all"])
                 .status()
                 .unwrap();
