diff --git a/control-plane/csi-driver/src/bin/node/dev.rs b/control-plane/csi-driver/src/bin/node/dev.rs
index db8e1172c..2872a69a5 100644
--- a/control-plane/csi-driver/src/bin/node/dev.rs
+++ b/control-plane/csi-driver/src/bin/node/dev.rs
@@ -110,7 +110,7 @@ impl Device {
                 continue;
             }
 
-            if let Some(devname) = match_dev::match_nvmf_device(&device, &nvmf_key) {
+            if let Some(devname) = match_dev::match_nvmf_device_valid(&device, &nvmf_key)? {
                 let nqn = if std::env::var("MOAC").is_ok() {
                     format!("{}:nexus-{uuid}", nvme_target_nqn_prefix())
                 } else {
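Reviewer note (not part of the patch): the call site above now propagates lookup failures with `?` instead of silently skipping them. The `get_device` loop in `nvmf.rs` below implements the matching policy: keep scanning past broken candidates, return the first valid match, and only surface the first recorded error when nothing matched. A minimal Python sketch of that policy, with illustrative names:

```python
# Sketch of the "first valid match, else first error" scan performed by
# NvmfAttach::get_device below (names here are illustrative only).
def find_first_valid(devices, is_valid):
    first_error = None
    for dev in devices:
        try:
            if is_valid(dev):
                return dev  # first healthy match wins
        except Exception as error:  # sketch only; the driver keeps a typed error
            if first_error is None:
                first_error = error  # remember only the first failure
    if first_error is not None:
        raise first_error  # nothing matched: report why the scan failed
    return None  # nothing matched and nothing failed
```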
diff --git a/control-plane/csi-driver/src/bin/node/dev/nvmf.rs b/control-plane/csi-driver/src/bin/node/dev/nvmf.rs
index c9549f980..93ba6636e 100644
--- a/control-plane/csi-driver/src/bin/node/dev/nvmf.rs
+++ b/control-plane/csi-driver/src/bin/node/dev/nvmf.rs
@@ -1,3 +1,7 @@
+use nvmeadm::{
+    error::NvmeError,
+    nvmf_discovery::{disconnect, ConnectArgsBuilder, TrType},
+};
 use std::{
     collections::HashMap,
     convert::{From, TryFrom},
@@ -5,11 +9,6 @@ use std::{
     str::FromStr,
 };
 
-use nvmeadm::{
-    error::NvmeError,
-    nvmf_discovery::{disconnect, ConnectArgsBuilder, TrType},
-};
-
 use csi_driver::PublishParams;
 use glob::glob;
 use nvmeadm::nvmf_subsystem::Subsystem;
@@ -29,7 +28,7 @@ use crate::{
 use super::{Attach, Detach, DeviceError, DeviceName};
 
 lazy_static::lazy_static! {
-    static ref DEVICE_REGEX: Regex = Regex::new(r"nvme(\d{1,3})n1").unwrap();
+    static ref DEVICE_REGEX: Regex = Regex::new(r"nvme(\d{1,5})n(\d{1,5})").unwrap();
 }
 
 pub(super) struct NvmfAttach {
@@ -43,6 +42,7 @@ pub(super) struct NvmfAttach {
     ctrl_loss_tmo: Option<u32>,
     keep_alive_tmo: Option<u32>,
     hostnqn: Option<String>,
+    warn_bad: std::sync::atomic::AtomicBool,
 }
 
 impl NvmfAttach {
@@ -70,6 +70,7 @@ impl NvmfAttach {
             ctrl_loss_tmo,
             keep_alive_tmo,
             hostnqn,
+            warn_bad: std::sync::atomic::AtomicBool::new(true),
         }
     }
 
@@ -80,13 +81,20 @@ impl NvmfAttach {
         enumerator.match_subsystem("block")?;
         enumerator.match_property("DEVTYPE", "disk")?;
 
+        let mut first_error = Ok(None);
         for device in enumerator.scan_devices()? {
-            if match_nvmf_device(&device, &key).is_some() {
-                return Ok(Some(device));
+            match match_device(&device, &key, &self.warn_bad) {
+                Ok(name) if name.is_some() => {
+                    return Ok(Some(device));
+                }
+                Err(error) if first_error.is_ok() => {
+                    first_error = Err(error);
+                }
+                _ => {}
             }
         }
 
-        Ok(None)
+        first_error
     }
 }
 
@@ -153,6 +161,12 @@ impl Attach for NvmfAttach {
         if let Some(keep_alive_tmo) = nvme_config.keep_alive_tmo() {
             self.keep_alive_tmo = Some(keep_alive_tmo);
         }
+        if self.io_tmo.is_none() {
+            if let Some(io_tmo) = publish_context.io_timeout() {
+                self.io_tmo = Some(*io_tmo);
+            }
+        }
+
         Ok(())
     }
 
@@ -221,18 +235,16 @@
             .get_device()?
             .ok_or_else(|| DeviceError::new("NVMe device not found"))?;
         let dev_name = device.sysname().to_str().unwrap();
-        let major = DEVICE_REGEX
-            .captures(dev_name)
-            .ok_or_else(|| {
-                DeviceError::new(&format!(
-                    "NVMe device \"{}\" does not match \"{}\"",
-                    dev_name, *DEVICE_REGEX,
-                ))
-            })?
-            .get(1)
-            .unwrap()
-            .as_str();
-        let pattern = format!("/sys/class/block/nvme{major}c*n1/queue");
+        let captures = DEVICE_REGEX.captures(dev_name).ok_or_else(|| {
+            DeviceError::new(&format!(
+                "NVMe device \"{}\" does not match \"{}\"",
+                dev_name, *DEVICE_REGEX,
+            ))
+        })?;
+        let major = captures.get(1).unwrap().as_str();
+        let nid = captures.get(2).unwrap().as_str();
+
+        let pattern = format!("/sys/class/block/nvme{major}c*n{nid}/queue");
         let glob = glob(&pattern).unwrap();
         let result = glob
             .into_iter()
@@ -302,6 +314,48 @@ impl Detach for NvmfDetach {
     }
 }
 
+/// Get the sysfs block device queue path for the given udev::Device.
+fn block_dev_q(device: &Device) -> Result<String, DeviceError> {
+    let dev_name = device.sysname().to_str().unwrap();
+    let captures = DEVICE_REGEX.captures(dev_name).ok_or_else(|| {
+        DeviceError::new(&format!(
+            "NVMe device \"{}\" does not match \"{}\"",
+            dev_name, *DEVICE_REGEX,
+        ))
+    })?;
+    let major = captures.get(1).unwrap().as_str();
+    let nid = captures.get(2).unwrap().as_str();
+    Ok(format!("/sys/class/block/nvme{major}c*n{nid}/queue"))
+}
+
+/// Check if the given device is a valid NVMf device.
+/// # NOTE
+/// In older kernels when a device with an existing mount is lost, the nvmf controller
+/// is lost, but the block device remains, in a broken state.
+/// On newer kernels, the block device is also gone.
+pub(crate) fn match_device<'a>(
+    device: &'a Device,
+    key: &str,
+    warn_bad: &std::sync::atomic::AtomicBool,
+) -> Result<Option<&'a str>, DeviceError> {
+    let Some(devname) = match_nvmf_device(device, key) else {
+        return Ok(None);
+    };
+
+    let glob = glob(&block_dev_q(device)?).unwrap();
+    if !glob.into_iter().any(|glob_result| glob_result.is_ok()) {
+        if warn_bad.load(std::sync::atomic::Ordering::Relaxed) {
+            let name = device.sysname().to_string_lossy();
+            warn!("Block device {name} for volume {key} has no controller!");
+            // todo: shoot-down the stale mounts?
+            warn_bad.store(false, std::sync::atomic::Ordering::Relaxed);
+        }
+        return Ok(None);
+    }
+
+    Ok(Some(devname))
+}
+
 /// Check for the presence of nvme tcp kernel module.
 pub(crate) fn check_nvme_tcp_module() -> Result<(), std::io::Error> {
     let path = "/sys/module/nvme_tcp";
diff --git a/control-plane/csi-driver/src/bin/node/match_dev.rs b/control-plane/csi-driver/src/bin/node/match_dev.rs
index 4d42f7476..652159c2d 100644
--- a/control-plane/csi-driver/src/bin/node/match_dev.rs
+++ b/control-plane/csi-driver/src/bin/node/match_dev.rs
@@ -1,5 +1,6 @@
 //! Utility functions for matching a udev record against a known device type.
 
+use crate::error::DeviceError;
 use udev::Device;
 
 macro_rules! require {
@@ -72,3 +73,13 @@ pub(super) fn match_nvmf_device<'a>(device: &'a Device, key: &str) -> Option<&'a str> {
 
     Some(devname)
 }
+
+/// Match the device, if it's an nvmf device, but only if it's a valid block device.
+/// See [`super::dev::nvmf::match_device`].
+pub(super) fn match_nvmf_device_valid<'a>(
+    device: &'a Device,
+    key: &str,
+) -> Result<Option<&'a str>, DeviceError> {
+    let cell = std::sync::atomic::AtomicBool::new(true);
+    super::dev::nvmf::match_device(device, key, &cell)
+}
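Reviewer note (not part of the patch): the validity check added above is easy to miss. The widened `DEVICE_REGEX` now captures both the controller/subsystem instance and the namespace id (the old pattern only matched `nvmeXn1`), and a namespace `nvme<X>n<Y>` is only treated as healthy if at least one per-controller path `nvme<X>c<C>n<Y>` still exists in sysfs. A standalone Python sketch of the same idea, under the assumption that device names follow the kernel's `nvmeXnY` scheme:

```python
import glob
import re

def has_live_controller(dev_name: str) -> bool:
    """Return True if the NVMe namespace still has a controller path in sysfs."""
    # Same pattern as the Rust DEVICE_REGEX: capture 1 is the controller/
    # subsystem instance, capture 2 the namespace id.
    m = re.match(r"nvme(\d{1,5})n(\d{1,5})$", dev_name)
    if m is None:
        raise ValueError(f"{dev_name} is not an NVMe namespace device")
    major, nid = m.group(1), m.group(2)
    # On older kernels a lost controller leaves this glob empty while
    # /dev/<dev_name> lingers in a broken state; match_device() then reports
    # the device as absent instead of returning a stale match.
    return any(glob.glob(f"/sys/class/block/nvme{major}c*n{nid}/queue"))
```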
-z "${io-engine}" ] && export IO_ENGINE_BIN="${io-engine-moth}" export PATH="$PATH:$(pwd)/target/debug" - export SUDO=$(which sudo || echo /run/wrappers/bin/sudo) + export SUDO=$(which sudo 2>/dev/null || echo /run/wrappers/bin/sudo) DOCKER_CONFIG=~/.docker/config.json if [ -f "$DOCKER_CONFIG" ]; then diff --git a/tests/bdd/README.md b/tests/bdd/README.md index 838f9b976..24c6651e6 100644 --- a/tests/bdd/README.md +++ b/tests/bdd/README.md @@ -2,61 +2,77 @@ The BDD tests are written in Python and make use of the pytest-bdd library. -The feature files in the `features` directory define the behaviour expected of the control plane. These behaviours are described using the [Gherkin](https://cucumber.io/docs/gherkin/) syntax. +The feature files in the `features` directory define the behaviour expected of the control plane. These behaviours are +described using the [Gherkin](https://cucumber.io/docs/gherkin/) syntax. -The feature files can be used to auto-generate the test file. For example running `pytest-bdd generate replicas.feature > test_volume_replicas.py` +The feature files can be used to auto-generate the test file. For example +running `pytest-bdd generate replicas.feature > test_volume_replicas.py` generates the `test_volume_replicas.py` test file from the `replicas.feature` file. +When updating the feature file, you can also get some helpe updating the python code. +Example: `pytest --generate-missing --feature node.feature test_node.py` **:warning: Note: Running pytest-bdd generate will overwrite any existing files with the same name** ## Running the Tests by entering the python virtual environment + Before running any tests run the `setup.sh` script. This sets up the necessary environment to run the tests: + ```bash +# NOTE: you should be inside the nix-shell to begin source ./setup.sh ``` To run all the tests: + ```bash pytest . ``` To run individual test files: + ```bash pytest features/volume/create/test_feature.py ``` To run an individual test within a test file use the `-k` option followed by the test name: + ```bash pytest features/volume/create/test_feature.py -k test_sufficient_suitable_pools ``` ## Running the Tests + The script in `../../scripts/python/test.sh` can be used to run the tests without entering the venv. This script will implicitly enter and exit the venv during test execution. To run all the tests: + ```bash ../../scripts/python/test.sh ``` Arguments will be passed directly to pytest. Example running individual tests: + ```bash ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools ``` ## Debugging the Tests + Typically, the test code cleans up after itself and so it's impossible to debug the test cluster. The environmental variable `CLEAN` can be set to `false` to skip tearing down the cluster when a test ends. It should be paired with the pytest option `-x` or `--exitfirst` to exit when the first test fails otherwise the other tests may end up tearing down the cluster anyway. Example: + ```bash CLEAN=false ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools -x ``` The script `../../scripts/python/test.sh` does a lot of repetitive work of regenerating the auto-generated code. 
 This step can be skipped with the `FAST` environment variable to speed up the test cycle:
+
 ```bash
 FAST=true ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools -x
 ```
diff --git a/tests/bdd/common/deployer.py b/tests/bdd/common/deployer.py
index 5b745f14c..e278dd9d2 100644
--- a/tests/bdd/common/deployer.py
+++ b/tests/bdd/common/deployer.py
@@ -194,6 +194,11 @@ def node_name(id: int):
         assert id >= 0
         return f"io-engine-{id + 1}"
 
+    @staticmethod
+    def csi_node_name(id: int):
+        assert id >= 0
+        return f"csi-node-{id + 1}"
+
     @staticmethod
     def create_disks(len=1, size=100 * 1024 * 1024):
         host_tmp = workspace_tmp()
diff --git a/tests/bdd/common/docker.py b/tests/bdd/common/docker.py
index 73377a4a8..beb566252 100644
--- a/tests/bdd/common/docker.py
+++ b/tests/bdd/common/docker.py
@@ -67,6 +67,12 @@ def unpause_container(name):
         container = docker_client.containers.get(name)
         container.unpause()
 
+    @staticmethod
+    def execute(name, commands):
+        docker_client = docker.from_env()
+        container = docker_client.containers.get(name)
+        return container.exec_run(commands)
+
     # Restart a container with the given name.
     def restart_container(name):
         docker_client = docker.from_env()
diff --git a/tests/bdd/common/nvme.py b/tests/bdd/common/nvme.py
index 65864091c..1e6a87f8c 100644
--- a/tests/bdd/common/nvme.py
+++ b/tests/bdd/common/nvme.py
@@ -307,17 +307,22 @@ def identify_namespace(device):
 
 def nvme_set_reconnect_delay_all(nqn, delay=10):
     for controller in nvme_find_controllers(nqn):
-        nvme_controller_set_reconnect_delay(controller, delay)
+        nvme_controller_set_param(controller, "reconnect_delay", delay)
 
 
 def nvme_set_reconnect_delay(uri, delay=10):
     controller = nvme_find_controller(uri)
-    nvme_controller_set_reconnect_delay(controller, delay)
+    nvme_controller_set_param(controller, "reconnect_delay", delay)
 
 
-def nvme_controller_set_reconnect_delay(controller, delay=10):
+def nvme_set_ctrl_loss_tmo(uri, ctrl_loss_tmo=10):
+    controller = nvme_find_controller(uri)
+    nvme_controller_set_param(controller, "ctrl_loss_tmo", ctrl_loss_tmo)
+
+
+def nvme_controller_set_param(controller, name: str, value: int):
     controller = controller.get("Controller")
-    command = f"echo {delay} | sudo tee -a /sys/class/nvme/{controller}/reconnect_delay"
+    command = f"echo {value} | sudo tee -a /sys/class/nvme/{controller}/{name}"
     print(command)
     subprocess.run(command, check=True, shell=True, capture_output=True)
 
@@ -325,3 +330,12 @@ def nvme_controller_set_reconnect_delay(controller, delay=10):
 @retry(wait_fixed=100, stop_max_attempt_number=40)
 def wait_nvme_find_device(uri):
     return nvme_find_device(uri)
+
+
+@retry(wait_fixed=100, stop_max_attempt_number=40)
+def wait_nvme_gone_device(uri, nqn=None):
+    if nqn is None:
+        u = urlparse(uri)
+        nqn = u.path[1:]
+    devs = nvme_find_subsystem_devices(nqn)
+    assert len(devs) == 0
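Reviewer note (not part of the patch): the new helpers above are what the BDD steps below lean on. A quick usage sketch, assuming `uri` is an nvmf device URI and `/mnt` is a hypothetical staging path:

```python
from common.deployer import Deployer
from common.docker import Docker
from common.nvme import (
    nvme_set_reconnect_delay,
    nvme_set_ctrl_loss_tmo,
    wait_nvme_gone_device,
)

def drop_device(uri: str):
    nvme_set_reconnect_delay(uri, 1)  # retry the connection quickly...
    nvme_set_ctrl_loss_tmo(uri, 1)    # ...but give the controller up after 1s
    # (the test then deletes the volume target via the REST API here)
    wait_nvme_gone_device(uri)        # retries until no subsystem devices remain

# Docker.execute runs a command inside a test container, e.g. on the CSI node:
result = Docker.execute(Deployer.csi_node_name(0), "findmnt /mnt -osource -rnf")
print(result.exit_code, result.output)
```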
diff --git a/tests/bdd/features/csi/node/node.feature b/tests/bdd/features/csi/node/node.feature
index 1fa97bf91..7aef8c7b1 100644
--- a/tests/bdd/features/csi/node/node.feature
+++ b/tests/bdd/features/csi/node/node.feature
@@ -106,3 +106,16 @@ Feature: CSI node plugin
   Scenario: publishing a reader only block volume as rw
     Given a block volume staged as "MULTI_NODE_READER_ONLY"
     When publishing the block volume as "rw" should fail
+
+  Scenario: re-staging after controller loss timeout
+    Given a staged volume
+    When the kernel device is removed after a controller loss timeout
+    Then the volume should be unstageable
+    And the volume should be stageable again
+
+  Scenario: re-staging after controller loss timeout without unstaging
+    Given a staged volume
+    When the kernel device is removed after a controller loss timeout
+    Then the mounts become broken
+    But the volume should be stageable on a different path
+    And the nvme device should have a different controller and namespace
\ No newline at end of file
diff --git a/tests/bdd/features/csi/node/test_node.py b/tests/bdd/features/csi/node/test_node.py
index c0146c246..b6f96108a 100644
--- a/tests/bdd/features/csi/node/test_node.py
+++ b/tests/bdd/features/csi/node/test_node.py
@@ -1,3 +1,7 @@
+import http
+import time
+from pathlib import Path
+
 import pytest
 from pytest_bdd import given, scenario, then, when, parsers
 
@@ -6,11 +10,13 @@
 import grpc
 import csi_pb2 as pb
 
+import openapi
 from common import disk_pool_label
 from common.apiclient import ApiClient
 from common.csi import CsiHandle
 from common.deployer import Deployer
+from common.docker import Docker
 from openapi.model.create_pool_body import CreatePoolBody
 from openapi.model.publish_volume_body import PublishVolumeBody
 from common.operations import Volume as VolumeOps
@@ -18,10 +24,18 @@
 from openapi.model.create_volume_body import CreateVolumeBody
 from openapi.model.volume_policy import VolumePolicy
 from openapi.model.volume_share_protocol import VolumeShareProtocol
+from common.nvme import (
+    nvme_find_device,
+    nvme_set_reconnect_delay,
+    nvme_set_ctrl_loss_tmo,
+    wait_nvme_gone_device,
+    nvme_find_controller,
+)
 
 POOL1_UUID = "ec176677-8202-4199-b461-2b68e53a055f"
 NODE1 = "io-engine-1"
 VOLUME_SIZE = 32 * 1024 * 1024
+FS_TYPE = "ext4"
 
 
 class Nexus:
@@ -274,12 +288,25 @@ def test_unstaging_a_single_writer_volume():
     """Unstaging a single writer volume."""
 
 
+@scenario("node.feature", "re-staging after controller loss timeout")
+def test_restaging_after_controller_loss_timeout():
+    """re-staging after controller loss timeout."""
+
+
+@scenario("node.feature", "re-staging after controller loss timeout without unstaging")
+def test_restaging_after_controller_loss_timeout_without_unstaging():
+    """re-staging after controller loss timeout without unstaging."""
+
+
 @pytest.fixture
 def published_nexuses(setup, volumes):
     published = {}
     yield published
     for uuid in published.keys():
-        ApiClient.volumes_api().del_volume_target(uuid)
+        try:
+            ApiClient.volumes_api().del_volume_target(uuid)
+        except openapi.ApiException as e:
+            assert e.status == http.HTTPStatus.PRECONDITION_FAILED
 
 
 @pytest.fixture
@@ -288,7 +315,10 @@ def publish(uuid, protocol):
         volume = ApiClient.volumes_api().put_volume_target(
             uuid,
             publish_volume_body=PublishVolumeBody(
-                {}, VolumeShareProtocol("nvmf"), node=NODE1, frontend_node=""
+                {},
+                VolumeShareProtocol("nvmf"),
+                node=NODE1,
+                frontend_node=Deployer.csi_node_name(0),
             ),
         )
         nexus = Nexus(uuid, protocol, volume.state["target"]["device_uri"])
@@ -427,7 +457,7 @@ def generic_staged_volume(get_published_nexus, stage_volume, staging_target_path
         nexus.uri,
         "MULTI_NODE_SINGLE_WRITER",
         staging_target_path,
-        "ext4",
+        FS_TYPE,
     )
     stage_volume(volume)
     return volume
@@ -467,7 +497,7 @@ def attempt_to_stage_volume_with_missing_staging_target_path(
                 mode=pb.VolumeCapability.AccessMode.Mode.MULTI_NODE_SINGLE_WRITER
             ),
             mount=pb.VolumeCapability.MountVolume(
-                fs_type="ext4", mount_flags=[]
+                fs_type=FS_TYPE, mount_flags=[]
            ),
        ),
        secrets={},
@@ -510,7 +540,7 @@ def attempt_to_stage_volume_with_missing_volume_id(
                 mode=pb.VolumeCapability.AccessMode.Mode.MULTI_NODE_SINGLE_WRITER
             ),
             mount=pb.VolumeCapability.MountVolume(
-                fs_type="ext4", mount_flags=[]
+                fs_type=FS_TYPE, mount_flags=[]
             ),
         ),
         secrets={},
@@ -533,7 +563,7 @@ def attempt_to_stage_volume_with_missing_access_mode(
             staging_target_path=staging_target_path,
             volume_capability=pb.VolumeCapability(
                 mount=pb.VolumeCapability.MountVolume(
-                    fs_type="ext4", mount_flags=[]
+                    fs_type=FS_TYPE, mount_flags=[]
                 )
             ),
             secrets={},
@@ -626,7 +656,7 @@ def attempt_to_stage_different_volume_with_same_staging_target_path(
                 nexus.uri,
                 volume.mode,
                 volume.staging_target_path,
-                "ext4",
+                FS_TYPE,
             )
         )
     assert error.value.code() == grpc.StatusCode.ALREADY_EXISTS
@@ -733,3 +763,76 @@ def publish_block_volume_as_read_or_write(
 @then(parsers.parse("the request should {disposition}"))
 def request_success_expected(disposition):
     return disposition == "succeed"
+
+
+@when("the kernel device is removed after a controller loss timeout")
+def _(generic_staged_volume):
+    """the kernel device is removed after a controller loss timeout."""
+    uri = generic_staged_volume.uri
+    nvme_set_reconnect_delay(uri, 1)
+    nvme_set_ctrl_loss_tmo(uri, 1)
+    ApiClient.volumes_api().del_volume_target(generic_staged_volume.uuid)
+    wait_nvme_gone_device(uri)
+
+
+@then("the volume should be stageable again")
+def _(publish_nexus, generic_staged_volume, stage_volume):
+    """the volume should be stageable again."""
+    publish_nexus(generic_staged_volume.uuid, generic_staged_volume.protocol)
+    stage_volume(generic_staged_volume)
+    uri = generic_staged_volume.uri
+    device = nvme_find_device(uri)
+    print(device)
+
+
+@then("the volume should be unstageable")
+def _(csi_instance, generic_staged_volume, staged_volumes):
+    """the volume should be unstageable."""
+    volume = generic_staged_volume
+    csi_instance.node.NodeUnstageVolume(
+        pb.NodeUnstageVolumeRequest(
+            volume_id=volume.uuid, staging_target_path=volume.staging_target_path
+        )
+    )
+    del staged_volumes[volume.uuid]
+
+
+@then("the mounts become broken", target_fixture="lost_device")
+def _(generic_staged_volume):
+    """the mounts become broken."""
+    result = Docker.execute(
+        Deployer.csi_node_name(0),
+        f"findmnt {generic_staged_volume.staging_target_path} -osource -rnf",
+    )
+    assert result.exit_code == 0, f"{result.output}"
+    device = str(result.output, "utf-8").removesuffix("\n")
+    # on older kernels, the controller is gone, but the device is still present (broken)
+    # assert not Path(device).exists(), "device must be gone"
+    wait_nvme_gone_device(generic_staged_volume.uri)
+    assert not Path(f"/sys/fs/{FS_TYPE}/{device}").exists(), "mount must be broken"
+    yield device
+    Docker.execute(
+        Deployer.csi_node_name(0),
+        f"umount --force {generic_staged_volume.staging_target_path}",
+    )
+
+
+@then("the volume should be stageable on a different path", target_fixture="new_device")
+def _(publish_nexus, generic_staged_volume, stage_volume):
+    """the volume should be stageable on a different path."""
+    volume = generic_staged_volume
+    volume.staging_target_path = f"{volume.staging_target_path}-2"
+    publish_nexus(volume.uuid, volume.protocol)
+    stage_volume(volume)
+    yield nvme_find_device(volume.uri)
+
+
+@then("the nvme device should have a different controller and namespace")
+def _(generic_staged_volume, lost_device, new_device):
+    """the nvme device should have a different controller and namespace."""
+    assert lost_device != new_device
+    print(f"{lost_device} => {new_device}")
+    ctrl = nvme_find_controller(generic_staged_volume.uri)
+    assert not new_device.startswith(ctrl.get("Controller")), "Different controller"
new_device.split("n")[1], "Same subsystem" + assert lost_device.split("n")[2] != new_device.split("n")[2], "Different ns"