Skip to content

Commit 0e46b47

Browse files
authored
Enhance Mellanox reboot cause test case (#6944)
Add a new script to cover the following reboot cause scenarios: BIOS - The BIOS upgrade process ended in failure and caused the switch to reset. CPU - The reset was initiated by software running on the CPU; for example, the software hit a catastrophic condition such as a memory leak and the kernel eventually reset the whole switch. ASIC - The reset was caused by the ASIC. - What is the motivation for this PR? Add a test for sonic-net/sonic-platform-common#277 - How did you do it? - How did you verify/test it? Added a test script for the enhanced reboot cause - Any platform specific information? Mellanox platforms, except for SPC1 and SIMX - Supported testbed topology if it's a new test case? Any topology
1 parent 945eac8 commit 0e46b47

File tree

3 files changed

+131
-36
lines changed

3 files changed

+131
-36
lines changed

tests/common/reboot.py

+41-14
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
REBOOT_TYPE_WATCHDOG = "watchdog"
2323
REBOOT_TYPE_UNKNOWN = "Unknown"
2424
REBOOT_TYPE_THERMAL_OVERLOAD = "Thermal Overload"
25+
REBOOT_TYPE_CPU = "cpu"
26+
REBOOT_TYPE_BIOS = "bios"
27+
REBOOT_TYPE_ASIC = "asic"
2528

2629
# Event to signal DUT activeness
2730
DUT_ACTIVE = threading.Event()
@@ -87,6 +90,24 @@
8790
"cause": "warm-reboot",
8891
"test_reboot_cause_only": False
8992
},
93+
REBOOT_TYPE_CPU: {
94+
"timeout": 300,
95+
"wait": 120,
96+
"cause": "CPU",
97+
"test_reboot_cause_only": True
98+
},
99+
REBOOT_TYPE_BIOS: {
100+
"timeout": 300,
101+
"wait": 120,
102+
"cause": "BIOS",
103+
"test_reboot_cause_only": True
104+
},
105+
REBOOT_TYPE_ASIC: {
106+
"timeout": 300,
107+
"wait": 120,
108+
"cause": "ASIC",
109+
"test_reboot_cause_only": True
110+
}
90111
}
91112

92113
MAX_NUM_REBOOT_CAUSE_HISTORY = 10
@@ -187,7 +208,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
187208
pool = ThreadPool()
188209
hostname = duthost.hostname
189210
try:
190-
reboot_ctrl = reboot_ctrl_dict[reboot_type]
211+
reboot_ctrl = reboot_ctrl_dict[reboot_type]
191212
reboot_command = reboot_ctrl['command'] if reboot_type != REBOOT_TYPE_POWEROFF else None
192213
if timeout == 0:
193214
timeout = reboot_ctrl['timeout']
@@ -199,7 +220,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
199220
raise ValueError('invalid reboot type: "{} for {}"'.format(reboot_type, hostname))
200221

201222
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)
202-
223+
203224
wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)
204225
# if wait_for_ssh flag is False, do not wait for dut to boot up
205226
if not wait_for_ssh:
@@ -222,7 +243,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
222243
pool.terminate()
223244
dut_uptime = duthost.get_up_time()
224245
logger.info('DUT {} up since {}'.format(hostname, dut_uptime))
225-
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot".format(hostname)
246+
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot". \
247+
format(hostname)
226248

227249

228250
def get_reboot_cause(dut):
@@ -232,7 +254,7 @@ def get_reboot_cause(dut):
232254
"""
233255
logging.info('Getting reboot cause from dut {}'.format(dut.hostname))
234256
output = dut.shell('show reboot-cause')
235-
cause = output['stdout']
257+
cause = output['stdout']
236258

237259
for type, ctrl in reboot_ctrl_dict.items():
238260
if re.search(ctrl['cause'], cause):
@@ -282,13 +304,13 @@ def sync_reboot_history_queue_with_dut(dut):
282304
dut_reboot_history_queue = dut.show_and_parse("show reboot-cause history")
283305
dut_reboot_history_received = True
284306
break
285-
except Exception as e:
307+
except Exception:
286308
e_type, e_value, e_traceback = sys.exc_info()
287309
logging.info("Exception type: %s" % e_type.__name__)
288310
logging.info("Exception message: %s" % e_value)
289-
logging.info("Backing off for %d seconds before retrying", ((retry_count+1) * RETRY_BACKOFF_TIME))
311+
logging.info("Backing off for %d seconds before retrying", ((retry_count + 1) * RETRY_BACKOFF_TIME))
290312

291-
time.sleep(((retry_count+1) * RETRY_BACKOFF_TIME))
313+
time.sleep(((retry_count + 1) * RETRY_BACKOFF_TIME))
292314
continue
293315

294316
# If retry logic did not yield reboot cause history from DUT,
@@ -349,21 +371,26 @@ def check_reboot_cause_history(dut, reboot_type_history_queue):
349371
logging.info("Verify reboot-cause history title")
350372
if reboot_cause_history_got:
351373
if not set(REBOOT_CAUSE_HISTORY_TITLE) == set(reboot_cause_history_got[0].keys()):
352-
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".format(
353-
REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
374+
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".
375+
format(REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
354376
return False
355377

356-
logging.info("Verify reboot-cause output are sorted in reverse chronological order" )
378+
logging.info("Verify reboot-cause output are sorted in reverse chronological order")
357379
reboot_type_history_len = len(reboot_type_history_queue)
358380
if reboot_type_history_len <= len(reboot_cause_history_got):
359381
for index, reboot_type in enumerate(reboot_type_history_queue):
360382
if reboot_type not in reboot_ctrl_dict:
361-
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".format(reboot_type))
383+
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".
384+
format(reboot_type))
362385
continue
363-
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % (index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]))
364-
if not re.search(reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]):
386+
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" %
387+
(index, reboot_ctrl_dict[reboot_type]["cause"],
388+
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]))
389+
if not re.search(reboot_ctrl_dict[reboot_type]["cause"],
390+
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]):
365391
logging.error("The {} reboot-cause not match. expected_reboot type={}, actual_reboot_cause={}".format(
366-
index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index]["cause"]))
392+
index, reboot_ctrl_dict[reboot_type]["cause"],
393+
reboot_cause_history_got[reboot_type_history_len - index]["cause"]))
367394
return False
368395
return True
369396
logging.error("The number of expected reboot-cause:{} is more than that of actual reboot-cuase:{}".format(

tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py

+48-22
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import logging
55
import time
66
from pkg_resources import parse_version
7-
from tests.platform_tests.thermal_control_test_helper import *
7+
from tests.platform_tests.thermal_control_test_helper import mocker, FanStatusMocker, ThermalStatusMocker, \
8+
SingleFanMocker
89
from tests.common.mellanox_data import get_platform_data
910
from minimum_table import get_min_table
1011

@@ -96,6 +97,7 @@
9697
}
9798
}
9899

100+
99101
class SysfsNotExistError(Exception):
100102
"""
101103
Exception when sys fs not exist.
@@ -138,7 +140,6 @@ def __init__(self, dut):
138140
:param dut: DUT object representing a SONiC switch under test.
139141
"""
140142
self.dut = dut
141-
#self.unlink_file_list = {}
142143
self._extract_num_of_fans_and_fan_drawers()
143144
self.deinit_retry = 5
144145

@@ -289,7 +290,7 @@ def deinit(self):
289290
for file_path, link_target in self.unlink_file_list.items():
290291
try:
291292
self.dut.command('ln -f -s {} {}'.format(link_target, file_path))
292-
except Exception as e:
293+
except Exception:
293294
# Catch any exception for later retry
294295
failed_recover_links[file_path] = link_target
295296

@@ -300,7 +301,7 @@ def deinit(self):
300301
self.dut.shell('rm -f {}'.format(file_path))
301302
else:
302303
self.dut.shell('echo \'{}\' > {}'.format(value, file_path))
303-
except Exception as e:
304+
except Exception:
304305
# Catch any exception for later retry
305306
failed_recover_files[file_path] = value
306307

@@ -417,7 +418,7 @@ def mock_fan_direction_fan_dir_per_fan(self, direction):
417418
"""
418419
try:
419420
_ = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_PER_FAN.format(self.index)))
420-
except SysfsNotExistError as e:
421+
except SysfsNotExistError:
421422
self.mocked_direction = NOT_AVAILABLE
422423
return
423424

@@ -438,7 +439,7 @@ def mock_fan_direction_fan_dir_for_all_fans(self, direction):
438439
"""
439440
try:
440441
fan_dir_bits = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_ALL_FANS))
441-
except SysfsNotExistError as e:
442+
except SysfsNotExistError:
442443
self.mocked_direction = NOT_AVAILABLE
443444
return
444445

@@ -479,6 +480,7 @@ def get_expect_led_color(self):
479480

480481
return 'green'
481482

483+
482484
class FanData:
483485
"""
484486
Data mocker of a FAN.
@@ -713,7 +715,7 @@ def check_result(self, actual_data):
713715
mismatch_in_actual_data = []
714716
for actual_data_item in actual_data:
715717
primary = actual_data_item[self.primary_field]
716-
if not primary in expected:
718+
if primary not in expected:
717719
extra_in_actual_data.append(actual_data_item)
718720
else:
719721
for field in actual_data_item.keys():
@@ -726,16 +728,16 @@ def check_result(self, actual_data):
726728

727729
result = True
728730
if len(extra_in_actual_data) > 0:
729-
logging.error('Found extra data in actual_data: {}'\
730-
.format(json.dumps(extra_in_actual_data, indent=2)))
731+
logging.error('Found extra data in actual_data: {}'
732+
.format(json.dumps(extra_in_actual_data, indent=2)))
731733
result = False
732734
if len(mismatch_in_actual_data) > 0:
733-
logging.error('Found mismatch data in actual_data: {}'\
734-
.format(json.dumps(mismatch_in_actual_data, indent=2)))
735+
logging.error('Found mismatch data in actual_data: {}'
736+
.format(json.dumps(mismatch_in_actual_data, indent=2)))
735737
result = False
736738
if len(expected.keys()) > 0:
737-
logging.error('Expected data not found in actual_data: {}'\
738-
.format(json.dumps(expected, indent=2)))
739+
logging.error('Expected data not found in actual_data: {}'
740+
.format(json.dumps(expected, indent=2)))
739741
result = False
740742

741743
return result
@@ -761,7 +763,7 @@ def __init__(self, dut):
761763
self.expected_data = {}
762764
self.expected_data_headers = ['drawer', 'led', 'fan', 'speed', 'direction', 'presence', 'status']
763765
self.primary_field = 'fan'
764-
self.excluded_fields = ['timestamp',]
766+
self.excluded_fields = ['timestamp', ]
765767

766768
def deinit(self):
767769
"""
@@ -779,12 +781,11 @@ def mock_data(self):
779781
drawer_index = 1
780782
drawer_data = None
781783
presence = 0
782-
direction = NOT_AVAILABLE
783784
naming_rule = FAN_NAMING_RULE['fan']
784785
# All system fan is controlled to have the same speed, so only
785786
# get a random value once here
786787
speed = random.randint(60, 100)
787-
FanData.mock_cooling_cur_state(self.mock_helper, speed/10)
788+
FanData.mock_cooling_cur_state(self.mock_helper, speed / 10)
788789
while fan_index <= MockerHelper.FAN_NUM:
789790
try:
790791
if (fan_index - 1) % MockerHelper.FAN_NUM_PER_DRAWER == 0:
@@ -806,7 +807,7 @@ def mock_data(self):
806807
fan_data.mock_target_speed(speed)
807808
self.expected_data[fan_data.name] = [
808809
drawer_data.name,
809-
'N/A', # update this value later
810+
'N/A', # update this value later
810811
fan_data.name,
811812
'{}%'.format(fan_data.mocked_speed),
812813
drawer_data.mocked_direction,
@@ -894,9 +895,10 @@ def __init__(self, dut):
894895
ThermalStatusMocker.__init__(self, dut)
895896
self.mock_helper = MockerHelper(dut)
896897
self.expected_data = {}
897-
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th', 'warning']
898+
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th',
899+
'warning']
898900
self.primary_field = 'sensor'
899-
self.excluded_fields = ['timestamp',]
901+
self.excluded_fields = ['timestamp', ]
900902

901903
def deinit(self):
902904
"""
@@ -1097,7 +1099,8 @@ def mock_over_speed(self):
10971099
Change the mocked FAN speed to faster than target speed and exceed speed tolerance.
10981100
:return:
10991101
"""
1100-
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
1102+
self.fan_data.mock_speed(
1103+
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
11011104
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
11021105
self.expect_led_color = 'red'
11031106

@@ -1106,7 +1109,8 @@ def mock_under_speed(self):
11061109
Change the mocked FAN speed to slower than target speed and exceed speed tolerance.
11071110
:return:
11081111
"""
1109-
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
1112+
self.fan_data.mock_speed(
1113+
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
11101114
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
11111115
self.expect_led_color = 'red'
11121116

@@ -1237,7 +1241,7 @@ def mock_power_threshold(self, number_psus):
12371241
if not max_power:
12381242
power = int(self.mock_helper.read_value(self.PSU_POWER.format(i + 1)))
12391243
# Round up to 100 watt and then double it to avoid noise when power fluctuate
1240-
max_power = int(round(power/100000000.0)) * 100000000 * 2
1244+
max_power = int(round(power / 100000000.0)) * 100000000 * 2
12411245
self.mock_helper.mock_value(self.PSU_POWER_CAPACITY.format(i + 1), max_power, True)
12421246

12431247
# Also mock ambient temperatures
@@ -1273,3 +1277,25 @@ def read_port_ambient_thermal(self):
12731277

12741278
def read_fan_ambient_thermal(self):
12751279
return int(self.mock_helper.read_value(self.FAN_AMBIENT_TEMP))
1280+
1281+
1282+
@mocker('RebootCauseMocker')
class RebootCauseMocker(object):
    """
    Mocker for the hw-management reset-cause files of a Mellanox switch.

    Each public ``mock_*`` method asks the shared ``MockerHelper`` to mock the
    value ``1`` on one of the reset-cause paths declared below, so that a test
    can make the DUT look as if its last reset came from that source.
    """
    # Paths of the reset-cause files handled by this mocker.
    RESET_RELOAD_BIOS = '/var/run/hw-management/system/reset_reload_bios'
    RESET_FROM_COMEX = '/var/run/hw-management/system/reset_from_comex'
    RESET_FROM_ASIC = '/var/run/hw-management/system/reset_from_asic'

    def __init__(self, dut):
        """
        Constructor of RebootCauseMocker.
        :param dut: DUT object representing a SONiC switch under test.
        """
        self.mock_helper = MockerHelper(dut)

    def deinit(self):
        """
        Delegate cleanup to the underlying MockerHelper.
        :return:
        """
        self.mock_helper.deinit()

    def _raise_reset_flag(self, flag_path):
        """
        Mock the value 1 on the given reset-cause file.
        :param flag_path: Path of the reset-cause file to mock.
        :return:
        """
        self.mock_helper.mock_value(flag_path, 1)

    def mock_reset_reload_bios(self):
        """
        Mock the BIOS reset cause (RESET_RELOAD_BIOS).
        :return:
        """
        self._raise_reset_flag(self.RESET_RELOAD_BIOS)

    def mock_reset_from_comex(self):
        """
        Mock the COMEX reset cause (RESET_FROM_COMEX).
        :return:
        """
        self._raise_reset_flag(self.RESET_FROM_COMEX)

    def mock_reset_from_asic(self):
        """
        Mock the ASIC reset cause (RESET_FROM_ASIC).
        :return:
        """
        self._raise_reset_flag(self.RESET_FROM_ASIC)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import allure
2+
import logging
3+
import pytest
4+
from tests.common.reboot import REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC, check_reboot_cause
5+
from tests.platform_tests.thermal_control_test_helper import mocker_factory # noqa: F401
6+
7+
pytestmark = [
8+
pytest.mark.asic('mellanox'),
9+
pytest.mark.topology('any')
10+
]
11+
12+
logger = logging.getLogger(__name__)
13+
14+
mocker = None
15+
REBOOT_CAUSE_TYPES = [REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC]
16+
17+
18+
@pytest.mark.parametrize("reboot_cause", REBOOT_CAUSE_TYPES)
def test_reboot_cause(rand_selected_dut, mocker_factory, reboot_cause):  # noqa: F811
    """
    Validate that a mocked cpu/bios/asic reset is reported as the reboot cause.

    The test mocks the reset source matching ``reboot_cause``, restarts the
    determine-reboot-cause service and then checks the reported reboot cause.

    :param rand_selected_dut: The fixture returns a randomly selected DUT
    :param mocker_factory: The fixture returns a mocker
    :param reboot_cause: The specific reboot cause
    """
    # Name of the RebootCauseMocker method that mocks each reboot cause.
    mock_method_by_cause = {
        REBOOT_TYPE_CPU: 'mock_reset_from_comex',
        REBOOT_TYPE_BIOS: 'mock_reset_reload_bios',
        REBOOT_TYPE_ASIC: 'mock_reset_from_asic',
    }

    with allure.step('Create mocker - RebootCauseMocker'):
        cause_mocker = mocker_factory(rand_selected_dut, 'RebootCauseMocker')

    with allure.step('Mock reset from {}'.format(reboot_cause)):
        method_name = mock_method_by_cause.get(reboot_cause)
        # An unrecognised cause mocks nothing, exactly as an unmatched if/elif chain would.
        if method_name is not None:
            getattr(cause_mocker, method_name)()

    with allure.step('Restart determine-reboot-cause service'):
        rand_selected_dut.restart_service('determine-reboot-cause')

    with allure.step('Check Reboot cause is {}'.format(reboot_cause)):
        check_reboot_cause(rand_selected_dut, reboot_cause)

0 commit comments

Comments
 (0)