Skip to content

Commit 1565a23

Browse files
[thermalctld] Update line card thermal sensor status to DB (sonic-net#211)
Update line card thermal sensor status to DB, including the PSU thermal sensors and SFP thermal sensors on the line card. Depends on sonic-net#8422.

#### Description
In the thermal update function, update the PSU, SFP, and direct thermals of the line card.

#### Motivation and Context
To support modular chassis.

#### How Has This Been Tested?
1. Full platform regression, 100% passed
2. Unit test passed
1 parent 294995b commit 1565a23

File tree

3 files changed

+160
-86
lines changed

3 files changed

+160
-86
lines changed

sonic-thermalctld/scripts/thermalctld

+124-78
Original file line number | Diff line number | Diff line change
@@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger):
538538

539539
self.is_chassis_system = chassis.is_modular_chassis()
540540
if self.is_chassis_system:
541+
self.module_thermals = set()
541542
my_slot = try_get(chassis.get_my_slot, INVALID_SLOT)
542543
if my_slot != INVALID_SLOT:
543-
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
544-
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
545-
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
544+
try:
545+
# Modular chassis does not have to have table CHASSIS_STATE_DB.
546+
# So catch the exception here and ignore it.
547+
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
548+
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
549+
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
550+
except Exception as e:
551+
self.chassis_table = None
546552

547553
def deinit(self):
548554
"""
@@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger):
576582
for index, thermal in enumerate(self.chassis.get_all_thermals()):
577583
if self.task_stopping_event.is_set():
578584
return
579-
try:
580-
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
581-
except Exception as e:
582-
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))
585+
586+
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
583587

584588
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
585589
parent_name = 'PSU {}'.format(psu_index + 1)
586590
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
587591
if self.task_stopping_event.is_set():
588592
return
589-
try:
590-
self._refresh_temperature_status(parent_name, thermal, thermal_index)
591-
except Exception as e:
592-
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))
593+
594+
self._refresh_temperature_status(parent_name, thermal, thermal_index)
593595

594596
for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
595597
parent_name = 'SFP {}'.format(sfp_index + 1)
596598
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
597599
if self.task_stopping_event.is_set():
598600
return
599-
try:
600-
self._refresh_temperature_status(parent_name, thermal, thermal_index)
601-
except Exception as e:
602-
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))
603601

602+
self._refresh_temperature_status(parent_name, thermal, thermal_index)
603+
604+
if self.is_chassis_system:
605+
available_thermals = set()
606+
for module_index, module in enumerate(self.chassis.get_all_modules()):
607+
module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1))
608+
609+
for thermal_index, thermal in enumerate(module.get_all_thermals()):
610+
if self.task_stopping_event.is_set():
611+
return
612+
613+
available_thermals.add((thermal, module_name, thermal_index))
614+
self._refresh_temperature_status(module_name, thermal, thermal_index)
615+
616+
for sfp_index, sfp in enumerate(module.get_all_sfps()):
617+
sfp_name = '{} SFP {}'.format(module_name, sfp_index + 1)
618+
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
619+
if self.task_stopping_event.is_set():
620+
return
621+
622+
available_thermals.add((thermal, sfp_name, thermal_index))
623+
self._refresh_temperature_status(sfp_name, thermal, thermal_index)
624+
625+
for psu_index, psu in enumerate(module.get_all_psus()):
626+
psu_name = '{} PSU {}'.format(module_name, psu_index + 1)
627+
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
628+
if self.task_stopping_event.is_set():
629+
return
630+
631+
available_thermals.add((thermal, psu_name, thermal_index))
632+
self._refresh_temperature_status(psu_name, thermal, thermal_index)
633+
634+
635+
thermals_to_remove = self.module_thermals - available_thermals
636+
self.module_thermals = available_thermals
637+
for thermal, parent_name, thermal_index in thermals_to_remove:
638+
self._remove_thermal_from_db(thermal, parent_name, thermal_index)
639+
604640
self.log_debug("End temperature updating")
605641

606642
def _refresh_temperature_status(self, parent_name, thermal, thermal_index):
@@ -611,72 +647,82 @@ class TemperatureUpdater(logger.Logger):
611647
:param thermal_index: Index of the thermal object in platform chassis
612648
:return:
613649
"""
614-
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
650+
try:
651+
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
652+
653+
# Only save entity info for thermals that belong to chassis and PSU
654+
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
655+
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
656+
# logical port
657+
if 'SFP' not in parent_name:
658+
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)
659+
660+
if name not in self.temperature_status_dict:
661+
self.temperature_status_dict[name] = TemperatureStatus()
662+
663+
temperature_status = self.temperature_status_dict[name]
664+
665+
high_threshold = NOT_AVAILABLE
666+
low_threshold = NOT_AVAILABLE
667+
high_critical_threshold = NOT_AVAILABLE
668+
low_critical_threshold = NOT_AVAILABLE
669+
maximum_temperature = NOT_AVAILABLE
670+
minimum_temperature = NOT_AVAILABLE
671+
temperature = try_get(thermal.get_temperature)
672+
is_replaceable = try_get(thermal.is_replaceable, False)
673+
if temperature != NOT_AVAILABLE:
674+
temperature_status.set_temperature(name, temperature)
675+
minimum_temperature = try_get(thermal.get_minimum_recorded)
676+
maximum_temperature = try_get(thermal.get_maximum_recorded)
677+
high_threshold = try_get(thermal.get_high_threshold)
678+
low_threshold = try_get(thermal.get_low_threshold)
679+
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
680+
low_critical_threshold = try_get(thermal.get_low_critical_threshold)
681+
682+
warning = False
683+
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
684+
self._log_on_status_changed(not temperature_status.over_temperature,
685+
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
686+
format(name, temperature, high_threshold),
687+
'High temperature warning: {} current temperature {}C, high threshold {}C'.
688+
format(name, temperature, high_threshold)
689+
)
690+
warning = warning | temperature_status.over_temperature
691+
692+
if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
693+
self._log_on_status_changed(not temperature_status.under_temperature,
694+
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
695+
format(name, temperature, low_threshold),
696+
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
697+
format(name, temperature, low_threshold)
698+
)
699+
warning = warning | temperature_status.under_temperature
700+
701+
fvs = swsscommon.FieldValuePairs(
702+
[('temperature', str(temperature)),
703+
('minimum_temperature', str(minimum_temperature)),
704+
('maximum_temperature', str(maximum_temperature)),
705+
('high_threshold', str(high_threshold)),
706+
('low_threshold', str(low_threshold)),
707+
('warning_status', str(warning)),
708+
('critical_high_threshold', str(high_critical_threshold)),
709+
('critical_low_threshold', str(low_critical_threshold)),
710+
('is_replaceable', str(is_replaceable)),
711+
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
712+
])
615713

616-
# Only save entity info for thermals that belong to chassis and PSU
617-
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
618-
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
619-
# logical port
620-
if 'SFP' not in parent_name:
621-
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)
622-
623-
if name not in self.temperature_status_dict:
624-
self.temperature_status_dict[name] = TemperatureStatus()
625-
626-
temperature_status = self.temperature_status_dict[name]
627-
628-
high_threshold = NOT_AVAILABLE
629-
low_threshold = NOT_AVAILABLE
630-
high_critical_threshold = NOT_AVAILABLE
631-
low_critical_threshold = NOT_AVAILABLE
632-
maximum_temperature = NOT_AVAILABLE
633-
minimum_temperature = NOT_AVAILABLE
634-
temperature = try_get(thermal.get_temperature)
635-
is_replaceable = try_get(thermal.is_replaceable, False)
636-
if temperature != NOT_AVAILABLE:
637-
temperature_status.set_temperature(name, temperature)
638-
minimum_temperature = try_get(thermal.get_minimum_recorded)
639-
maximum_temperature = try_get(thermal.get_maximum_recorded)
640-
high_threshold = try_get(thermal.get_high_threshold)
641-
low_threshold = try_get(thermal.get_low_threshold)
642-
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
643-
low_critical_threshold = try_get(thermal.get_low_critical_threshold)
644-
645-
warning = False
646-
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
647-
self._log_on_status_changed(not temperature_status.over_temperature,
648-
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
649-
format(name, temperature, high_threshold),
650-
'High temperature warning: {} current temperature {}C, high threshold {}C'.
651-
format(name, temperature, high_threshold)
652-
)
653-
warning = warning | temperature_status.over_temperature
654-
655-
if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
656-
self._log_on_status_changed(not temperature_status.under_temperature,
657-
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
658-
format(name, temperature, low_threshold),
659-
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
660-
format(name, temperature, low_threshold)
661-
)
662-
warning = warning | temperature_status.under_temperature
714+
self.table.set(name, fvs)
715+
if self.is_chassis_system and self.chassis_table is not None:
716+
self.chassis_table.set(name, fvs)
717+
except Exception as e:
718+
self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e)))
663719

664-
fvs = swsscommon.FieldValuePairs(
665-
[('temperature', str(temperature)),
666-
('minimum_temperature', str(minimum_temperature)),
667-
('maximum_temperature', str(maximum_temperature)),
668-
('high_threshold', str(high_threshold)),
669-
('low_threshold', str(low_threshold)),
670-
('warning_status', str(warning)),
671-
('critical_high_threshold', str(high_critical_threshold)),
672-
('critical_low_threshold', str(low_critical_threshold)),
673-
('is_replaceable', str(is_replaceable)),
674-
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
675-
])
720+
def _remove_thermal_from_db(self, thermal, parent_name, thermal_index):
721+
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
722+
self.table._del(name)
676723

677-
self.table.set(name, fvs)
678-
if self.is_chassis_system and self.chassis_table is not None:
679-
self.chassis_table.set(name, fvs)
724+
if self.chassis_table is not None:
725+
self.chassis_table._del(name)
680726

681727

682728
class ThermalMonitor(ProcessTaskBase):

sonic-thermalctld/tests/mock_platform.py

+16
Original file line number | Diff line number | Diff line change
@@ -388,6 +388,17 @@ def make_error_thermal(self):
388388
thermal = MockErrorThermal()
389389
self._thermal_list.append(thermal)
390390

391+
def make_module_thermal(self):
392+
module = MockModule()
393+
self._module_list.append(module)
394+
sfp = MockSfp()
395+
sfp._thermal_list.append(MockThermal())
396+
psu = MockPsu()
397+
psu._thermal_list.append(MockThermal())
398+
module._sfp_list.append(sfp)
399+
module._psu_list.append(psu)
400+
module._thermal_list.append(MockThermal())
401+
391402
def is_modular_chassis(self):
392403
return self._is_chassis_system
393404

@@ -430,3 +441,8 @@ def get_position_in_parent(self):
430441

431442
def is_replaceable(self):
432443
return self._replaceable
444+
445+
446+
class MockModule(module_base.ModuleBase):
447+
def __init__(self):
448+
super(MockModule, self).__init__()

sonic-thermalctld/tests/test_thermalctld.py

+20-8
Original file line number | Diff line number | Diff line change
@@ -470,15 +470,15 @@ def test_update_psu_thermals(self):
470470
temperature_updater.update()
471471
assert temperature_updater.log_warning.call_count == 0
472472

473-
temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
473+
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
474474
temperature_updater.update()
475475
assert temperature_updater.log_warning.call_count == 1
476476

477477
# TODO: Clean this up once we no longer need to support Python 2
478478
if sys.version_info.major == 3:
479-
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
479+
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message')")
480480
else:
481-
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
481+
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message',)")
482482

483483
def test_update_sfp_thermals(self):
484484
chassis = MockChassis()
@@ -490,15 +490,15 @@ def test_update_sfp_thermals(self):
490490
temperature_updater.update()
491491
assert temperature_updater.log_warning.call_count == 0
492492

493-
temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
493+
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
494494
temperature_updater.update()
495495
assert temperature_updater.log_warning.call_count == 1
496496

497497
# TODO: Clean this up once we no longer need to support Python 2
498498
if sys.version_info.major == 3:
499-
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
499+
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message')")
500500
else:
501-
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
501+
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message',)")
502502

503503
def test_update_thermal_with_exception(self):
504504
chassis = MockChassis()
@@ -514,16 +514,28 @@ def test_update_thermal_with_exception(self):
514514
# TODO: Clean this up once we no longer need to support Python 2
515515
if sys.version_info.major == 3:
516516
expected_calls = [
517-
mock.call("Failed to update thermal status - Exception('Failed to get temperature')"),
517+
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature')"),
518518
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
519519
]
520520
else:
521521
expected_calls = [
522-
mock.call("Failed to update thermal status - Exception('Failed to get temperature',)"),
522+
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature',)"),
523523
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
524524
]
525525
assert temperature_updater.log_warning.mock_calls == expected_calls
526526

527+
def test_update_module_thermals(self):
528+
chassis = MockChassis()
529+
chassis.make_module_thermal()
530+
chassis.set_modular_chassis(True)
531+
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
532+
temperature_updater.update()
533+
assert len(temperature_updater.module_thermals) == 3
534+
535+
chassis._module_list = []
536+
temperature_updater.update()
537+
assert len(temperature_updater.module_thermals) == 0
538+
527539

528540
# Modular chassis-related tests
529541

0 commit comments

Comments
 (0)