Skip to content

Commit 88bf8ec

Browse files
authored
[chassis][midplane] Modify the chassisd to log expected/unexpected midplane connectivity messages (#480)
* [chassis][midplane] Modify the chassisd to log expected/unexpected midplane connectivity messages. Add a mechanism to get the linecard_reboot_timeout value from the platform_env.conf file. This provides the capability for different platforms to have different timeout values * Add UT test linecard reboot --------- Signed-off-by: mlok <[email protected]>
1 parent 9d0c550 commit 88bf8ec

File tree

2 files changed

+176
-2
lines changed

2 files changed

+176
-2
lines changed

sonic-chassisd/scripts/chassisd

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ CHASSIS_MIDPLANE_INFO_ACCESS_FIELD = 'access'
6969
CHASSIS_MODULE_HOSTNAME_TABLE = 'CHASSIS_MODULE_TABLE'
7070
CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname'
7171

72+
CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE'
73+
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
74+
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
75+
DEFAULT_LINECARD_REBOOT_TIMEOUT = 180
76+
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"
77+
7278
CHASSIS_INFO_UPDATE_PERIOD_SECS = 10
7379
CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD = 30 # Minutes
7480

@@ -198,9 +204,18 @@ class ModuleUpdater(logger.Logger):
198204
CHASSIS_ASIC_INFO_TABLE)
199205

200206
self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE)
207+
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
201208
self.down_modules = {}
202209
self.chassis_app_db_clean_sha = None
203210

211+
self.linecard_reboot_timeout = DEFAULT_LINECARD_REBOOT_TIMEOUT
212+
if os.path.isfile(PLATFORM_ENV_CONF_FILE):
213+
with open(PLATFORM_ENV_CONF_FILE, 'r') as file:
214+
for line in file:
215+
field = line.split('=')[0].strip()
216+
if field == "linecard_reboot_timeout":
217+
self.linecard_reboot_timeout = int(line.split('=')[1].strip())
218+
204219
self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
205220
if not self.midplane_initialized:
206221
self.log_error("Chassisd midplane intialization failed")
@@ -362,6 +377,31 @@ class ModuleUpdater(logger.Logger):
362377
else:
363378
return False
364379

380+
def is_module_reboot_expected(self, key):
    """Report whether the reboot-info entry for ``key`` flags the reboot as expected.

    The reboot-info table lookup is treated as a hit only when it returns a
    list whose first element is ``True``; the last element is then read as
    the field/value pairs of the entry.

    Args:
        key: Reboot-info table key (the caller passes the module key).

    Returns:
        True if the entry exists and its reboot field equals ``"expected"``,
        False otherwise (including when the entry has no reboot field).
    """
    fvs = self.module_reboot_table.get(key)
    # Guard against a miss, a non-list result and an empty list before
    # touching fvs[0].
    if not (isinstance(fvs, list) and fvs and fvs[0] is True):
        return False

    # module_reboot_set_time() writes only the timestamp field, so the reboot
    # field may be absent; .get() avoids a KeyError in that case.
    return dict(fvs[-1]).get(CHASSIS_MODULE_REBOOT_REBOOT_FIELD) == "expected"
387+
388+
def module_reboot_set_time(self, key):
    """Write the current ``time.time()`` value into the reboot-info entry for ``key``.

    The value is stored as a string under the timestamp field.

    Args:
        key: Reboot-info table key (the caller passes the module key).
    """
    stamp = str(time.time())
    entry = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, stamp)]
    )
    self.module_reboot_table.set(key, entry)
392+
393+
def is_module_reboot_system_up_expired(self, key):
    """Check whether the reboot timestamp stored for ``key`` has aged past the timeout.

    Args:
        key: Reboot-info table key (the caller passes the module key).

    Returns:
        True when the entry has a timestamp field and at least
        ``self.linecard_reboot_timeout`` seconds have elapsed since it; the
        entry is deleted from the reboot-info table before returning.
        False when there is no entry, no timestamp field, or the timeout has
        not elapsed yet.
    """
    entry = self.module_reboot_table.get(key)
    if not (isinstance(entry, list) and entry[0] is True):
        return False

    fields = dict(entry[-1])
    if CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD not in fields:
        return False

    started_at = float(fields[CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD])
    elapsed = time.time() - started_at
    if elapsed >= self.linecard_reboot_timeout:
        # Drop the entry once it has expired; presumably this keeps the
        # caller from reporting the same timeout on every poll.
        self.module_reboot_table._del(key)
        return True
    return False
404+
365405
def check_midplane_reachability(self):
366406
if not self.midplane_initialized:
367407
return
@@ -395,10 +435,20 @@ class ModuleUpdater(logger.Logger):
395435
current_midplane_state = fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]
396436

397437
if midplane_access is False and current_midplane_state == 'True':
398-
self.log_warning("Module {} lost midplane connectivity".format(module_key))
438+
if self.is_module_reboot_expected(module_key):
439+
self.module_reboot_set_time(module_key)
440+
self.log_warning("Expected: Module {} lost midplane connectivity".format(module_key))
441+
else:
442+
self.log_warning("Unexpected: Module {} lost midplane connectivity".format(module_key))
399443
elif midplane_access is True and current_midplane_state == 'False':
400444
self.log_notice("Module {} midplane connectivity is up".format(module_key))
401-
445+
# clean up the reboot_info_table
446+
if self.module_reboot_table.get(module_key) is not None:
447+
self.module_reboot_table._del(module_key)
448+
elif midplane_access is False and current_midplane_state == 'False':
449+
if self.is_module_reboot_system_up_expired(module_key):
450+
self.log_warning("Unexpected: Module {} midplane connectivity is not restored in {} seconds".format(module_key, self.linecard_reboot_timeout))
451+
402452
# Update db with midplane information
403453
fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip),
404454
(CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))])

sonic-chassisd/tests/test_chassisd.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import sys
3+
import mock
34
from imp import load_source
45

56
from mock import Mock, MagicMock, patch
@@ -40,6 +41,10 @@
4041
CHASSIS_ASIC_PCI_ADDRESS_FIELD = 'asic_pci_address'
4142
CHASSIS_ASIC_ID_IN_MODULE_FIELD = 'asic_id_in_module'
4243

44+
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
45+
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
46+
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"
47+
4348
def setup_function():
4449
ModuleUpdater.log_notice = MagicMock()
4550
ModuleUpdater.log_warning = MagicMock()
@@ -357,6 +362,125 @@ def test_midplane_presence_modules():
357362
fvs = midplane_table.get(name)
358363
assert fvs == None
359364

365+
builtin_open = open  # save the unpatched version


def mock_open(*args, **kwargs):
    """Replacement for ``builtins.open`` that fakes only the platform env file.

    Opening PLATFORM_ENV_CONF_FILE yields a mock file whose content sets
    ``linecard_reboot_timeout=240``; every other path is forwarded to the
    real ``open`` saved in ``builtin_open``.
    """
    # The path may arrive positionally or as the ``file`` keyword; indexing
    # args[0] unconditionally would raise IndexError for open(file=...).
    path = args[0] if args else kwargs.get("file")
    if path == PLATFORM_ENV_CONF_FILE:
        return mock.mock_open(read_data="dummy=1\nlinecard_reboot_timeout=240\n")(*args, **kwargs)
    # unpatched version for every other path
    return builtin_open(*args, **kwargs)
371+
372+
@patch("builtins.open", mock_open)
@patch('os.path.isfile', MagicMock(return_value=True))
def test_midplane_presence_modules_linecard_reboot():
    """Exercise midplane reachability tracking across a line-card reboot.

    ``open`` and ``os.path.isfile`` are patched so that ModuleUpdater reads
    ``linecard_reboot_timeout=240`` from the mocked platform env file.
    The test then toggles the line card's midplane reachability through
    up -> down (expected reboot) -> up -> down (timestamp aged past the
    timeout) and checks the midplane table entry after every step.
    """
    chassis = MockChassis()

    # Supervisor
    supervisor = MockModule(0, "SUPERVISOR0", "Supervisor card",
                            ModuleBase.MODULE_TYPE_SUPERVISOR, 16, "RP1000101")
    supervisor.set_midplane_ip()
    chassis.module_list.append(supervisor)

    # Linecard
    name = "LINE-CARD0"
    module = MockModule(1, name, "36 port 400G card",
                        ModuleBase.MODULE_TYPE_LINE, 1, "LC1000101")
    module.set_midplane_ip()
    chassis.module_list.append(module)

    # Fabric-card (index 1 is kept as in the original test)
    fabric_slot = 17
    fabric = MockModule(1, "FABRIC-CARD0", "Switch fabric card",
                        ModuleBase.MODULE_TYPE_FABRIC, fabric_slot, "FC1000101")
    chassis.module_list.append(fabric)

    # Run on supervisor. The slot passed to the constructor is the last one
    # assigned above (the fabric card's); my_slot is overridden right after.
    module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, fabric_slot,
                                   module.supervisor_slot)
    # The timeout is read once in the constructor, so verify it immediately:
    # a config-parsing failure is then reported before the state checks.
    assert module_updater.linecard_reboot_timeout == 240

    module_updater.supervisor_slot = supervisor.get_slot()
    module_updater.my_slot = supervisor.get_slot()
    module_updater.modules_num_update()
    module_updater.module_db_update()
    module_updater.check_midplane_reachability()

    midplane_table = module_updater.midplane_table
    module_reboot_table = module_updater.module_reboot_table

    def assert_linecard_midplane_entry():
        """Check the line card's midplane table entry matches the mock module."""
        fvs = midplane_table.get(name)
        assert fvs is not None
        if isinstance(fvs, list):
            fvs = dict(fvs[-1])
        assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD]
        assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]

    def mark_reboot_expected():
        """Flag the line card's reboot as expected in the reboot-info table."""
        expected_fvs = swsscommon.FieldValuePairs(
            [(CHASSIS_MODULE_REBOOT_REBOOT_FIELD, "expected")])
        module_reboot_table.set(name, expected_fvs)

    # Check only one entry in database
    assert 1 == midplane_table.size()

    # Check fields in database
    assert_linecard_midplane_entry()

    # Set access of line-card to Up (midplane connectivity is down initially)
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    assert_linecard_midplane_entry()

    # Set access of line-card to Down (to mock midplane connectivity state change)
    # with the reboot flagged as expected
    module.set_midplane_reachable(False)
    mark_reboot_expected()
    module_updater.check_midplane_reachability()
    assert_linecard_midplane_entry()

    # Set access of line-card to up on time (to mock midplane connectivity state change)
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    assert_linecard_midplane_entry()

    # Test linecard reboot midplane connectivity restored timeout:
    # set access of line-card to Down again with the reboot flagged as expected
    module.set_midplane_reachable(False)
    mark_reboot_expected()
    module_updater.check_midplane_reachability()

    # Back-date the stored timestamp by exactly the timeout so the next poll
    # sees the reboot window as expired.
    time_now = time.time() - module_updater.linecard_reboot_timeout
    stale_fvs = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, str(time_now))])
    module_reboot_table.set(name, stale_fvs)
    module_updater.check_midplane_reachability()
    assert_linecard_midplane_entry()
483+
360484
def test_midplane_presence_supervisor():
361485
chassis = MockChassis()
362486

0 commit comments

Comments
 (0)