Commit 02aa5aa

mayastor-bors and tiagolobocastro committed

chore(bors): merge pull request #892

892: fix(csi-node): handle devices for existing subsystems r=tiagolobocastro a=tiagolobocastro

When a device is broken but the volume is not unstaged properly, the subsystem ends up lingering in the system. If the device is staged again, the device is assigned a different namespace id. Change the regex which is used in the device parameter to account for this.
Also extend the existing csi-node BDD testing.

Co-authored-by: Tiago Castro <[email protected]>

2 parents 00190e9 + d831ac0 commit 02aa5aa
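The crux of the fix is the `DEVICE_REGEX` change in `nvmf.rs` below: the old pattern hard-coded namespace 1, so a device re-staged under a lingering subsystem, to which the kernel assigns a new namespace id, was never matched. A minimal sketch of the behavioural difference, assuming the `regex` crate (the device name used here is illustrative):

use regex::Regex; // assumes regex = "1" in Cargo.toml

fn main() {
    // Patterns copied from the diff below: old vs new DEVICE_REGEX.
    let old = Regex::new(r"nvme(\d{1,3})n1").unwrap();
    let new = Regex::new(r"nvme(\d{1,5})n(\d{1,5})").unwrap();

    // Hypothetical device name: the kernel bumps the namespace id to 2
    // when the volume is staged again under the lingering subsystem.
    let dev_name = "nvme0n2";

    assert!(old.captures(dev_name).is_none()); // old pattern: device not found
    let caps = new.captures(dev_name).unwrap(); // new pattern: matches
    assert_eq!(&caps[1], "0"); // controller instance number
    assert_eq!(&caps[2], "2"); // namespace id
    println!("instance {} namespace {}", &caps[1], &caps[2]);
}

The new pattern also widens the controller instance capture from `\d{1,3}` to `\d{1,5}` and captures the namespace id, so the sysfs queue path can be built per namespace rather than assuming `n1`.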

10 files changed: +258 -36 lines changed


control-plane/csi-driver/src/bin/node/dev.rs (+1 -1)

@@ -110,7 +110,7 @@ impl Device {
                 continue;
             }
 
-            if let Some(devname) = match_dev::match_nvmf_device(&device, &nvmf_key) {
+            if let Some(devname) = match_dev::match_nvmf_device_valid(&device, &nvmf_key)? {
                 let nqn = if std::env::var("MOAC").is_ok() {
                     format!("{}:nexus-{uuid}", nvme_target_nqn_prefix())
                 } else {

control-plane/csi-driver/src/bin/node/dev/nvmf.rs (+75 -21)

@@ -1,15 +1,14 @@
+use nvmeadm::{
+    error::NvmeError,
+    nvmf_discovery::{disconnect, ConnectArgsBuilder, TrType},
+};
 use std::{
     collections::HashMap,
     convert::{From, TryFrom},
     path::Path,
     str::FromStr,
 };
 
-use nvmeadm::{
-    error::NvmeError,
-    nvmf_discovery::{disconnect, ConnectArgsBuilder, TrType},
-};
-
 use csi_driver::PublishParams;
 use glob::glob;
 use nvmeadm::nvmf_subsystem::Subsystem;
@@ -29,7 +28,7 @@ use crate::{
 use super::{Attach, Detach, DeviceError, DeviceName};
 
 lazy_static::lazy_static! {
-    static ref DEVICE_REGEX: Regex = Regex::new(r"nvme(\d{1,3})n1").unwrap();
+    static ref DEVICE_REGEX: Regex = Regex::new(r"nvme(\d{1,5})n(\d{1,5})").unwrap();
 }
 
 pub(super) struct NvmfAttach {
@@ -43,6 +42,7 @@ pub(super) struct NvmfAttach {
     ctrl_loss_tmo: Option<u32>,
     keep_alive_tmo: Option<u32>,
     hostnqn: Option<String>,
+    warn_bad: std::sync::atomic::AtomicBool,
 }
 
 impl NvmfAttach {
@@ -70,6 +70,7 @@ impl NvmfAttach {
             ctrl_loss_tmo,
             keep_alive_tmo,
             hostnqn,
+            warn_bad: std::sync::atomic::AtomicBool::new(true),
         }
     }
 
@@ -80,13 +81,20 @@ impl NvmfAttach {
         enumerator.match_subsystem("block")?;
         enumerator.match_property("DEVTYPE", "disk")?;
 
+        let mut first_error = Ok(None);
         for device in enumerator.scan_devices()? {
-            if match_nvmf_device(&device, &key).is_some() {
-                return Ok(Some(device));
+            match match_device(&device, &key, &self.warn_bad) {
+                Ok(name) if name.is_some() => {
+                    return Ok(Some(device));
+                }
+                Err(error) if first_error.is_ok() => {
+                    first_error = Err(error);
+                }
+                _ => {}
             }
         }
 
-        Ok(None)
+        first_error
     }
 }
 
@@ -153,6 +161,12 @@ impl Attach for NvmfAttach {
         if let Some(keep_alive_tmo) = nvme_config.keep_alive_tmo() {
             self.keep_alive_tmo = Some(keep_alive_tmo);
         }
+        if self.io_tmo.is_none() {
+            if let Some(io_tmo) = publish_context.io_timeout() {
+                self.io_tmo = Some(*io_tmo);
+            }
+        }
+
         Ok(())
     }
 
@@ -221,18 +235,16 @@ impl Attach for NvmfAttach {
             .get_device()?
             .ok_or_else(|| DeviceError::new("NVMe device not found"))?;
         let dev_name = device.sysname().to_str().unwrap();
-        let major = DEVICE_REGEX
-            .captures(dev_name)
-            .ok_or_else(|| {
-                DeviceError::new(&format!(
-                    "NVMe device \"{}\" does not match \"{}\"",
-                    dev_name, *DEVICE_REGEX,
-                ))
-            })?
-            .get(1)
-            .unwrap()
-            .as_str();
-        let pattern = format!("/sys/class/block/nvme{major}c*n1/queue");
+        let captures = DEVICE_REGEX.captures(dev_name).ok_or_else(|| {
+            DeviceError::new(&format!(
+                "NVMe device \"{}\" does not match \"{}\"",
+                dev_name, *DEVICE_REGEX,
+            ))
+        })?;
+        let major = captures.get(1).unwrap().as_str();
+        let nid = captures.get(2).unwrap().as_str();
+
+        let pattern = format!("/sys/class/block/nvme{major}c*n{nid}/queue");
         let glob = glob(&pattern).unwrap();
         let result = glob
             .into_iter()
@@ -302,6 +314,48 @@ impl Detach for NvmfDetach {
     }
 }
 
+/// Get the sysfs block device queue path for the given udev::Device.
+fn block_dev_q(device: &Device) -> Result<String, DeviceError> {
+    let dev_name = device.sysname().to_str().unwrap();
+    let captures = DEVICE_REGEX.captures(dev_name).ok_or_else(|| {
+        DeviceError::new(&format!(
+            "NVMe device \"{}\" does not match \"{}\"",
+            dev_name, *DEVICE_REGEX,
+        ))
+    })?;
+    let major = captures.get(1).unwrap().as_str();
+    let nid = captures.get(2).unwrap().as_str();
+    Ok(format!("/sys/class/block/nvme{major}c*n{nid}/queue"))
+}
+
+/// Check if the given device is a valid NVMf device.
+/// # NOTE
+/// In older kernels when a device with an existing mount is lost, the nvmf controller
+/// is lost, but the block device remains, in a broken state.
+/// On newer kernels, the block device is also gone.
+pub(crate) fn match_device<'a>(
+    device: &'a Device,
+    key: &str,
+    warn_bad: &std::sync::atomic::AtomicBool,
+) -> Result<Option<&'a str>, DeviceError> {
+    let Some(devname) = match_nvmf_device(device, key) else {
+        return Ok(None);
+    };
+
+    let glob = glob(&block_dev_q(device)?).unwrap();
+    if !glob.into_iter().any(|glob_result| glob_result.is_ok()) {
+        if warn_bad.load(std::sync::atomic::Ordering::Relaxed) {
+            let name = device.sysname().to_string_lossy();
+            warn!("Block device {name} for volume {key} has no controller!");
+            // todo: shoot-down the stale mounts?
+            warn_bad.store(false, std::sync::atomic::Ordering::Relaxed);
+        }
+        return Ok(None);
+    }
+
+    Ok(Some(devname))
+}
+
 /// Check for the presence of nvme tcp kernel module.
 pub(crate) fn check_nvme_tcp_module() -> Result<(), std::io::Error> {
     let path = "/sys/module/nvme_tcp";
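The `match_device` function added above treats a udev match as valid only if a controller-backed queue path still exists in sysfs, which filters out the broken block devices that older kernels leave behind after the controller is lost. A hedged, standalone sketch of that check, assuming the `glob` crate as used by the driver (`has_live_controller` is an illustrative helper, not part of the codebase):

use glob::glob; // assumes glob = "0.3" in Cargo.toml

/// Illustrative helper: returns true if any controller-backed queue path
/// such as /sys/class/block/nvme0c0n2/queue exists for the given instance
/// and namespace id, i.e. the block device is not a stale leftover of a
/// lost nvmf controller.
fn has_live_controller(major: &str, nid: &str) -> bool {
    let pattern = format!("/sys/class/block/nvme{major}c*n{nid}/queue");
    glob(&pattern)
        .map(|mut paths| paths.any(|p| p.is_ok()))
        .unwrap_or(false)
}

fn main() {
    // On a host with a healthy nvme0n2 this prints true; in the broken,
    // lingering-subsystem case described in the commit it prints false.
    println!("{}", has_live_controller("0", "2"));
}

Note the `warn_bad` flag threaded through `NvmfAttach`: the warning fires once per attach attempt and is then suppressed, so a retry loop does not flood the log.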

control-plane/csi-driver/src/bin/node/match_dev.rs (+11 -0)

@@ -1,5 +1,6 @@
 //! Utility functions for matching a udev record against a known device type.
 
+use crate::error::DeviceError;
 use udev::Device;
 
 macro_rules! require {
@@ -72,3 +73,13 @@ pub(super) fn match_nvmf_device<'a>(device: &'a Device, key: &str) -> Option<&'a
 
     Some(devname)
 }
+
+/// Match the device, if it's a nvmf device, but only if it's a valid block device.
+/// See [`super::dev::nvmf::match_device`].
+pub(super) fn match_nvmf_device_valid<'a>(
+    device: &'a Device,
+    key: &str,
+) -> Result<Option<&'a str>, DeviceError> {
+    let cell = std::sync::atomic::AtomicBool::new(true);
+    super::dev::nvmf::match_device(device, key, &cell)
+}

shell.nix (+1 -1)

@@ -91,7 +91,7 @@ mkShell {
     [ ! -z "${io-engine}" ] && cowsay "${io-engine-moth}"
     [ ! -z "${io-engine}" ] && export IO_ENGINE_BIN="${io-engine-moth}"
     export PATH="$PATH:$(pwd)/target/debug"
-    export SUDO=$(which sudo || echo /run/wrappers/bin/sudo)
+    export SUDO=$(which sudo 2>/dev/null || echo /run/wrappers/bin/sudo)
 
     DOCKER_CONFIG=~/.docker/config.json
     if [ -f "$DOCKER_CONFIG" ]; then

tests/bdd/README.md (+18 -2)

@@ -2,61 +2,77 @@
 
 The BDD tests are written in Python and make use of the pytest-bdd library.
 
-The feature files in the `features` directory define the behaviour expected of the control plane. These behaviours are described using the [Gherkin](https://cucumber.io/docs/gherkin/) syntax.
+The feature files in the `features` directory define the behaviour expected of the control plane. These behaviours are
+described using the [Gherkin](https://cucumber.io/docs/gherkin/) syntax.
 
-The feature files can be used to auto-generate the test file. For example running `pytest-bdd generate replicas.feature > test_volume_replicas.py`
+The feature files can be used to auto-generate the test file. For example
+running `pytest-bdd generate replicas.feature > test_volume_replicas.py`
 generates the `test_volume_replicas.py` test file from the `replicas.feature` file.
+When updating the feature file, you can also get some help updating the python code.
+Example: `pytest --generate-missing --feature node.feature test_node.py`
 
 **:warning: Note: Running pytest-bdd generate will overwrite any existing files with the same name**
 
 ## Running the Tests by entering the python virtual environment
+
 Before running any tests run the `setup.sh` script. This sets up the necessary environment to run the tests:
+
 ```bash
+# NOTE: you should be inside the nix-shell to begin
 source ./setup.sh
 ```
 
 To run all the tests:
+
 ```bash
 pytest .
 ```
 
 To run individual test files:
+
 ```bash
 pytest features/volume/create/test_feature.py
 ```
 
 To run an individual test within a test file use the `-k` option followed by the test name:
+
 ```bash
 pytest features/volume/create/test_feature.py -k test_sufficient_suitable_pools
 ```
 
 ## Running the Tests
+
 The script in `../../scripts/python/test.sh` can be used to run the tests without entering the venv.
 This script will implicitly enter and exit the venv during test execution.
 
 To run all the tests:
+
 ```bash
 ../../scripts/python/test.sh
 ```
 
 Arguments will be passed directly to pytest. Example running individual tests:
+
 ```bash
 ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools
 ```
 
 ## Debugging the Tests
+
 Typically, the test code cleans up after itself and so it's impossible to debug the test cluster.
 The environmental variable `CLEAN` can be set to `false` to skip tearing down the cluster when a test ends.
 It should be paired with the pytest option `-x` or `--exitfirst` to exit when the first test fails otherwise the other
 tests may end up tearing down the cluster anyway.
 
 Example:
+
 ```bash
 CLEAN=false ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools -x
 ```
 
 The script `../../scripts/python/test.sh` does a lot of repetitive work of regenerating the auto-generated code.
 This step can be skipped with the `FAST` environment variable to speed up the test cycle:
+
 ```bash
 FAST=true ../../scripts/python/test.sh features/volume/create/test_feature.py -k test_sufficient_suitable_pools -x
 ```

tests/bdd/common/deployer.py (+5 -0)

@@ -194,6 +194,11 @@ def node_name(id: int):
         assert id >= 0
         return f"io-engine-{id + 1}"
 
+    @staticmethod
+    def csi_node_name(id: int):
+        assert id >= 0
+        return f"csi-node-{id + 1}"
+
     @staticmethod
     def create_disks(len=1, size=100 * 1024 * 1024):
         host_tmp = workspace_tmp()

tests/bdd/common/docker.py (+6 -0)

@@ -67,6 +67,12 @@ def unpause_container(name):
         container = docker_client.containers.get(name)
         container.unpause()
 
+    @staticmethod
+    def execute(name, commands):
+        docker_client = docker.from_env()
+        container = docker_client.containers.get(name)
+        return container.exec_run(commands)
+
     # Restart a container with the given name.
     def restart_container(name):
         docker_client = docker.from_env()
tests/bdd/common/nvme.py (+18 -4)

@@ -307,21 +307,35 @@ def identify_namespace(device):
 
 def nvme_set_reconnect_delay_all(nqn, delay=10):
     for controller in nvme_find_controllers(nqn):
-        nvme_controller_set_reconnect_delay(controller, delay)
+        nvme_controller_set_param(controller, "reconnect_delay", delay)
 
 
 def nvme_set_reconnect_delay(uri, delay=10):
     controller = nvme_find_controller(uri)
-    nvme_controller_set_reconnect_delay(controller, delay)
+    nvme_controller_set_param(controller, "reconnect_delay", delay)
 
 
-def nvme_controller_set_reconnect_delay(controller, delay=10):
+def nvme_set_ctrl_loss_tmo(uri, ctrl_loss_tmo=10):
+    controller = nvme_find_controller(uri)
+    nvme_controller_set_param(controller, "ctrl_loss_tmo", ctrl_loss_tmo)
+
+
+def nvme_controller_set_param(controller, name: str, value: int):
     controller = controller.get("Controller")
-    command = f"echo {delay} | sudo tee -a /sys/class/nvme/{controller}/reconnect_delay"
+    command = f"echo {value} | sudo tee -a /sys/class/nvme/{controller}/{name}"
     print(command)
     subprocess.run(command, check=True, shell=True, capture_output=True)
 
 
 @retry(wait_fixed=100, stop_max_attempt_number=40)
 def wait_nvme_find_device(uri):
     return nvme_find_device(uri)
+
+
+@retry(wait_fixed=100, stop_max_attempt_number=40)
+def wait_nvme_gone_device(uri, nqn=None):
+    if nqn is None:
+        u = urlparse(uri)
+        nqn = u.path[1:]
+    devs = nvme_find_subsystem_devices(nqn)
+    assert len(devs) == 0

tests/bdd/features/csi/node/node.feature (+13 -0)

@@ -106,3 +106,16 @@ Feature: CSI node plugin
   Scenario: publishing a reader only block volume as rw
     Given a block volume staged as "MULTI_NODE_READER_ONLY"
     When publishing the block volume as "rw" should fail
+
+  Scenario: re-staging after controller loss timeout
+    Given a staged volume
+    When the kernel device is removed after a controller loss timeout
+    Then the volume should be unstageable
+    And the volume should be stageable again
+
+  Scenario: re-staging after controller loss timeout without unstaging
+    Given a staged volume
+    When the kernel device is removed after a controller loss timeout
+    Then the mounts become broken
+    But the volume should be stageable on a different path
+    And the nvme device should have a different controller and namespace
