Skip to content

[SmartSwitch] Add tests for reboot of a smart switch #16566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions tests/common/devices/sonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,42 @@ def is_supervisor_node(self):
inv_files = im._sources
return is_supervisor_node(inv_files, self.hostname)

def is_smartswitch(self):
    """Check if the current node is a SmartSwitch

    The running config facts are fetched for this host and the
    DEVICE_METADATA/localhost entry is inspected: the node counts as a
    SmartSwitch only when its "subtype" is "SmartSwitch" and a "type"
    key is present whose value is not "SmartSwitchDPU".

    Returns:
        True if the current node is a SmartSwitch, else False
    """
    running_facts = self.config_facts(host=self.hostname, source="running")['ansible_facts']
    # Missing intermediate keys collapse to an empty mapping, which fails
    # every check below and therefore yields False.
    localhost_meta = running_facts.get("DEVICE_METADATA", {}).get("localhost", {})

    has_smartswitch_subtype = localhost_meta.get("subtype") == "SmartSwitch"
    # A "type" entry must exist; an absent "type" is treated as "not a SmartSwitch".
    has_non_dpu_type = "type" in localhost_meta and localhost_meta["type"] != "SmartSwitchDPU"
    return has_smartswitch_subtype and has_non_dpu_type

def is_dpu(self):
    """Check if the current node is a DPU

    Looks at DEVICE_METADATA/localhost in the running config facts of
    this host; the node is a DPU when its "type" is "SmartSwitchDPU".

    Returns:
        True if the current node is a DPU, else False
    """
    running_facts = self.config_facts(host=self.hostname, source="running")['ansible_facts']
    # Any missing level (DEVICE_METADATA, localhost, type) means "not a DPU".
    localhost_meta = running_facts.get("DEVICE_METADATA", {}).get("localhost", {})
    return "type" in localhost_meta and localhost_meta["type"] == "SmartSwitchDPU"

def is_frontend_node(self):
"""Check if the current node is a frontend node in case of multi-DUT.

Expand Down
41 changes: 39 additions & 2 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from tests.common.helpers.dut_utils import ignore_t2_syslog_msgs, create_duthost_console, creds_on_dut
from tests.common.fixtures.conn_graph_facts import get_graph_facts


logger = logging.getLogger(__name__)

# Create the waiting power on event
Expand Down Expand Up @@ -142,6 +141,17 @@
}
}

# Reboot control table for SmartSwitch DUTs, keyed by reboot type.
#   command : command to reboot the smartswitch DUT
#   timeout : presumably the time budget for the reboot, in seconds — TODO confirm
#   cause   : regex alternatives, presumably matched against the reported reboot cause — TODO confirm
reboot_ss_ctrl_dict = {
    REBOOT_TYPE_COLD: dict(
        command="reboot",
        timeout=300,
        cause=r"'reboot'|Non-Hardware \(reboot|^reboot",
    ),
}

# Capacity of the reboot-type history queue defined on the next line.
MAX_NUM_REBOOT_CAUSE_HISTORY = 10
# Bounded queue of reboot types: the second deque argument is maxlen, so once
# the queue is full each append discards the entry at the opposite end.
REBOOT_TYPE_HISTOYR_QUEUE = deque([], MAX_NUM_REBOOT_CAUSE_HISTORY)
# presumably the field names of one reboot-cause history record — confirm against the consumer
REBOOT_CAUSE_HISTORY_TITLE = ["name", "cause", "time", "user", "comment"]
Expand Down Expand Up @@ -224,6 +234,28 @@ def execute_reboot_helper():
return [reboot_res, dut_datetime]


@support_ignore_loganalyzer
def reboot_smartswitch(duthost, reboot_type=REBOOT_TYPE_COLD):
    """
    Reboot a SmartSwitch DUT by running the command registered for the
    given reboot type in reboot_ss_ctrl_dict.

    :param duthost: DUT host object
    :param reboot_type: reboot type (cold); must be a key of reboot_ss_ctrl_dict
    :return: [reboot_res, dut_datetime] where reboot_res is the result of the
             reboot command and dut_datetime is the DUT time (UTC) sampled just
             before the command was issued; None when reboot_type is unsupported
    """

    if reboot_type not in reboot_ss_ctrl_dict:
        logger.info("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))
        # NOTE(review): this returns None, so a caller that unpacks the result
        # into two names will raise TypeError for unsupported reboot types.
        return

    hostname = duthost.hostname
    # Sample the DUT clock before issuing the reboot command.
    dut_datetime = duthost.get_now_time(utc_timezone=True)

    # Use the module-level logger (not the root `logging` module) so this
    # message is attributed to the same logger as the rest of the function.
    logger.info("Rebooting the DUT {} with type {}".format(hostname, reboot_type))

    reboot_res = duthost.command(reboot_ss_ctrl_dict[reboot_type]["command"])

    return [reboot_res, dut_datetime]


@support_ignore_loganalyzer
def reboot(duthost, localhost, reboot_type='cold', delay=10,
timeout=0, wait=0, wait_for_ssh=True, wait_warmboot_finalizer=False, warmboot_finalizer_timeout=0,
Expand Down Expand Up @@ -284,7 +316,12 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
console_thread_res = pool.apply_async(
collect_console_log, args=(duthost, localhost, timeout + wait_conlsole_connection))
time.sleep(wait_conlsole_connection)
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)
# Perform reboot
if duthost.is_smartswitch():
reboot_res, dut_datetime = reboot_smartswitch(duthost, reboot_type)
else:
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper,
reboot_kwargs, reboot_type)

wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)

Expand Down
6 changes: 6 additions & 0 deletions tests/platform_tests/test_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def test_fast_reboot(duthosts, enum_rand_one_per_hwsku_hostname,
if duthost.is_multi_asic:
pytest.skip("Multi-ASIC devices not supporting fast reboot")

if duthost.is_smartswitch():
pytest.skip("Smart Switch devices does not support fast reboot")

reboot_and_check(localhost, duthost, conn_graph_facts.get("device_conn", {}).get(duthost.hostname, {}),
xcvr_skip_list, reboot_type=REBOOT_TYPE_FAST, duthosts=duthosts)

Expand All @@ -236,6 +239,9 @@ def test_warm_reboot(duthosts, enum_rand_one_per_hwsku_hostname,
if duthost.is_multi_asic:
pytest.skip("Multi-ASIC devices not supporting warm reboot")

if duthost.is_smartswitch():
pytest.skip("Smart Switch devices does not support warm reboot")

asic_type = duthost.facts["asic_type"]

if asic_type in ["mellanox"]:
Expand Down
11 changes: 5 additions & 6 deletions tests/smartswitch/common/device_utils_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def post_test_switch_check(duthost, localhost,
return


def post_test_dpu_check(duthost, dpuhosts, dpu_name):
def post_test_dpu_check(duthost, dpuhosts, dpu_name, reboot_cause):
"""
Runs all required checks for a given DPU
Args:
Expand Down Expand Up @@ -464,14 +464,13 @@ def post_test_dpu_check(duthost, dpuhosts, dpu_name):
logging.info(f"Checking reboot cause of {dpu_name}")
pytest_assert(
wait_until(REBOOT_CAUSE_TIMEOUT, REBOOT_CAUSE_INT, 0,
check_dpu_reboot_cause, duthost, dpu_name, "Non-Hardware"),
check_dpu_reboot_cause, duthost, dpu_name, reboot_cause),
f"Reboot cause for DPU {dpu_name} is incorrect"
)


def post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules):
def post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,
num_dpu_modules, reboot_cause):
"""
Checks DPU OFF/ON and reboot cause status Post Test
Args:
Expand All @@ -489,7 +488,7 @@ def post_test_dpus_check(duthost, dpuhosts,
logging.info("Post test DPUs check in parallel")
for dpu in dpu_on_list:
executor.submit(post_test_dpu_check, duthost,
dpuhosts, dpu)
dpuhosts, dpu, reboot_cause)

logging.info("Checking all powered on DPUs connectivity")
ping_status = check_dpu_ping_status(duthost, ip_address_list)
Expand Down
62 changes: 62 additions & 0 deletions tests/smartswitch/common/reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
import pytest
from tests.common.reboot import reboot_ss_ctrl_dict as reboot_dict, REBOOT_TYPE_HISTOYR_QUEUE, \
sync_reboot_history_queue_with_dut

logger = logging.getLogger(__name__)

# Reboot type identifiers for the SmartSwitch reboot helpers.
# REBOOT_TYPE_COLD is the only type log_and_perform_reboot acts on; any other
# type makes it skip the test.
# NOTE(review): presumably this must equal the REBOOT_TYPE_COLD key used by
# tests.common.reboot.reboot_ss_ctrl_dict — confirm the two stay in sync.
REBOOT_TYPE_COLD = "cold"
# The identifiers below are not referenced by the helpers in this module;
# presumably they label other reboot causes for callers — TODO confirm.
REBOOT_TYPE_UNKNOWN = "unknown"
REBOOT_TYPE_KERNEL_PANIC = "Kernel Panic"
REBOOT_TYPE_WATCHDOG = "Watchdog"


def log_and_perform_reboot(duthost, reboot_type, dpu_name=None):
    """
    Logs and initiates the reboot process based on the host type.

    Only cold reboot is handled. On a SmartSwitch the whole switch is
    rebooted when dpu_name is None, otherwise only the named DPU is
    rebooted through the switch. The test is skipped when the reboot type
    is not cold, when the host is a DPU, or when the host is neither a
    SmartSwitch nor a DPU.

    @param duthost: DUT host object
    @param reboot_type: Type of reboot to perform
    @param dpu_name: Name of the DPU (optional); None reboots the switch itself
    @return: Result of the reboot command issued on duthost
    """
    hostname = duthost.hostname
    logger.info("Rebooting the DUT {} with type {}".format(hostname, reboot_type))

    if reboot_type != REBOOT_TYPE_COLD:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    if duthost.is_smartswitch():
        if dpu_name is None:
            logger.info("Rebooting the switch {} with cold reboot".format(hostname))
            return duthost.command("sudo reboot")
        # Name the DPU being rebooted, not just the switch that hosts it.
        logger.info("Rebooting the DPU {} on {} with cold reboot".format(dpu_name, hostname))
        return duthost.command("sudo reboot -d {}".format(dpu_name))

    if duthost.is_dpu():
        pytest.skip("Skipping the reboot test as the DUT is a DPU")

    # Neither a SmartSwitch nor a DPU: skip explicitly instead of falling
    # through and returning None, which callers would then try to index.
    pytest.skip("Skipping the reboot test as the DUT {} is not a SmartSwitch".format(hostname))


def perform_reboot(duthost, reboot_type=REBOOT_TYPE_COLD, dpu_name=None):
    """
    Performs a reboot of the switch or of a single DPU and records the
    reboot type in the shared reboot history queue.

    The test is skipped when reboot_type is not a key of the SmartSwitch
    reboot control table, and failed when the reboot command did not
    produce a successful result.

    @param duthost: DUT host object
    @param reboot_type: Reboot type
    @param dpu_name: DPU name; None means the switch itself is rebooted
    """
    if reboot_type not in reboot_dict:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    logger.info("Sync reboot cause history queue with DUT reboot cause history queue")
    sync_reboot_history_queue_with_dut(duthost)

    res = log_and_perform_reboot(duthost, reboot_type, dpu_name)
    # A missing result means no reboot command was issued; treat it as a
    # failure rather than crashing on the subscript. A result without a
    # 'failed' key is treated as not failed.
    if res is None or res.get('failed') is True:
        if dpu_name is None:
            pytest.fail("Failed to reboot the {} with type {}".format(duthost.hostname, reboot_type))
        else:
            pytest.fail("Failed to reboot the DPU {}".format(dpu_name))

    logger.info("Appending the last reboot type to the queue")
    REBOOT_TYPE_HISTOYR_QUEUE.append(reboot_type)
9 changes: 3 additions & 6 deletions tests/smartswitch/platform_tests/test_platform_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def test_pcie_link(duthosts, dpuhosts,
duthost.shell("sudo config chassis modules \
startup %s" % (dpu_on_list[index]))

post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")

logging.info("Verifying output of '{}' on '{}'..."
.format(CMD_PCIE_INFO, duthost.hostname))
Expand Down Expand Up @@ -297,9 +295,8 @@ def test_system_health_summary(duthosts, dpuhosts,
num_dpu_modules)

logging.info("Checking DPU is completely UP")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list,
ip_address_list, num_dpu_modules, "Non-Hardware")

logging.info("Checking show system-health summary on Switch")
output_health_summary = duthost.command("show system-health summary")
Expand Down
82 changes: 73 additions & 9 deletions tests/smartswitch/platform_tests/test_reload_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import re
from tests.common.platform.processes_utils import wait_critical_processes
from tests.common.reboot import reboot, REBOOT_TYPE_COLD
from tests.smartswitch.common.device_utils_dpu import get_dpu_link_status,\
check_dpu_ping_status, check_dpu_link_and_status, check_dpu_module_status,\
from tests.common.helpers.platform_api import module
from tests.smartswitch.common.device_utils_dpu import check_dpu_link_and_status,\
pre_test_check, post_test_switch_check, post_test_dpus_check,\
check_dpu_reboot_cause, num_dpu_modules # noqa: F401
num_dpu_modules # noqa: F401
from tests.common.platform.device_utils import platform_api_conn, start_platform_api_service # noqa: F401,F403
from tests.smartswitch.common.reboot import perform_reboot
from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor

pytestmark = [
pytest.mark.topology('smartswitch')
Expand Down Expand Up @@ -153,9 +155,7 @@ def test_dpu_status_post_dpu_kernel_panic(duthosts, dpuhosts,
dpuhosts[dpu_id].shell(kernel_panic_cmd, executable="/bin/bash")

logging.info("Executing post test dpu check")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")


def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts,
Expand All @@ -182,6 +182,70 @@ def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts,
dpuhosts[dpu_id].shell(memory_exhaustion_cmd, executable="/bin/bash")

logging.info("Executing post test dpu check")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,
num_dpu_modules, "Non-Hardware")


def test_cold_reboot_dpus(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                          platform_api_conn, num_dpu_modules):    # noqa: F811, E501
    """
    Cold reboot every DPU of the selected DUT and verify the DPUs afterwards.

    Steps:
    1. Run the pre-test check to collect the DPU IP addresses and the list
       of DPUs that are on.
    2. Submit one cold reboot per DPU index to a thread pool.
    3. Run the post-test DPU check, expecting the "Non-Hardware" reboot cause.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to reboot
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    # The third value returned by the pre-test check is not needed here.
    ip_address_list, dpu_on_list, _ = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

    def reboot_one_dpu(index):
        # Any Exception raised while rebooting one DPU is logged and not
        # re-raised here.
        try:
            perform_reboot(duthost, REBOOT_TYPE_COLD, module.get_name(platform_api_conn, index))
        except Exception as e:
            logging.error(f"Failed to reboot DPU at index {index}: {e}")

    with SafeThreadPoolExecutor(max_workers=num_dpu_modules) as pool:
        logging.info("Rebooting all DPUs in parallel")
        for dpu_index in range(num_dpu_modules):
            pool.submit(reboot_one_dpu, dpu_index)

    logging.info("Executing post test dpu check")
    post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")


def test_cold_reboot_switch(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                            platform_api_conn, num_dpu_modules):    # noqa: F811, E501
    """
    Cold reboot the switch itself and verify its DPUs afterwards.

    Steps:
    1. Run the pre-test check to collect the DPU IP addresses and the list
       of DPUs that are on.
    2. Cold reboot the switch (no DPU name is passed).
    3. Run the post-test DPU check, expecting the "reboot" reboot cause.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to verify
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    # The third value returned by the pre-test check is not needed here.
    ip_address_list, dpu_on_list, _ = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

    logging.info("Starting switch reboot...")
    # dpu_name=None selects a reboot of the switch rather than of a DPU.
    perform_reboot(duthost, reboot_type=REBOOT_TYPE_COLD, dpu_name=None)

    logging.info("Executing post switch reboot dpu check")
    post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "reboot")
Loading