Skip to content

Commit 97e40ce

Browse files
[thermal fix] 1. Catch exception for each update iteration; 2. add unit test (sonic-net#51)
1 parent fc455a7 commit 97e40ce

File tree

3 files changed

+94
-32
lines changed

3 files changed

+94
-32
lines changed

sonic-thermalctld/scripts/thermalctld

+51-31
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ def try_get(callback, default=NOT_AVAILABLE):
4141
"""
4242
try:
4343
ret = callback()
44+
if ret is None:
45+
ret = default
4446
except NotImplementedError:
4547
ret = default
4648

@@ -174,16 +176,19 @@ class FanUpdater(object):
174176
:return:
175177
"""
176178
logger.log_debug("Start fan updating")
177-
try:
178-
for index, fan in enumerate(self.chassis.get_all_fans()):
179+
for index, fan in enumerate(self.chassis.get_all_fans()):
180+
try:
179181
self._refresh_fan_status(fan, index)
182+
except Exception as e:
183+
logger.log_warning('Failed to update FAN status - {}'.format(e))
180184

181-
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
182-
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
183-
for fan_index, fan in enumerate(psu.get_all_fans()):
185+
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
186+
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
187+
for fan_index, fan in enumerate(psu.get_all_fans()):
188+
try:
184189
self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name))
185-
except Exception as e:
186-
logger.log_warning('Failed to update FAN status - {}'.format(e))
190+
except Exception as e:
191+
logger.log_warning('Failed to update PSU FAN status - {}'.format(e))
187192

188193
logger.log_debug("End fan updating")
189194

@@ -201,10 +206,18 @@ class FanUpdater(object):
201206

202207
fan_status = self.fan_status_dict[fan_name]
203208

209+
speed = NOT_AVAILABLE
210+
speed_tolerance = NOT_AVAILABLE
211+
speed_target = NOT_AVAILABLE
212+
fan_fault_status = NOT_AVAILABLE
213+
fan_direction = NOT_AVAILABLE
204214
presence = try_get(fan.get_presence, False)
205-
speed = try_get(fan.get_speed)
206-
speed_tolerance = try_get(fan.get_speed_tolerance)
207-
speed_target = try_get(fan.get_target_speed)
215+
if presence:
216+
speed = try_get(fan.get_speed)
217+
speed_tolerance = try_get(fan.get_speed_tolerance)
218+
speed_target = try_get(fan.get_target_speed)
219+
fan_fault_status = try_get(fan.get_status, False)
220+
fan_direction = try_get(fan.get_direction)
208221

209222
set_led = False
210223
if fan_status.set_presence(presence):
@@ -215,15 +228,15 @@ class FanUpdater(object):
215228
'the system, potential overheat hazard'.format(fan_name)
216229
)
217230

218-
if fan_status.set_under_speed(speed, speed_target, speed_tolerance):
231+
if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
219232
set_led = True
220233
log_on_status_changed(not fan_status.under_speed,
221234
'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name),
222235
'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'.
223236
format(fan_name, speed, speed_target, speed_tolerance)
224237
)
225238

226-
if fan_status.set_over_speed(speed, speed_target, speed_tolerance):
239+
if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
227240
set_led = True
228241
log_on_status_changed(not fan_status.over_speed,
229242
'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name),
@@ -240,8 +253,8 @@ class FanUpdater(object):
240253
[('presence', str(presence)),
241254
('model', str(try_get(fan.get_model))),
242255
('serial', str(try_get(fan.get_serial))),
243-
('status', str(try_get(fan.get_status, False))),
244-
('direction', str(try_get(fan.get_direction))),
256+
('status', str(fan_fault_status)),
257+
('direction', str(fan_direction)),
245258
('speed', str(speed)),
246259
('speed_tolerance', str(speed_tolerance)),
247260
('speed_target', str(speed_target)),
@@ -378,12 +391,12 @@ class TemperatureUpdater(object):
378391
Update all temperature information to database
379392
:return:
380393
"""
381-
logger.log_debug("Start temperature updating")
382-
try:
383-
for index, thermal in enumerate(self.chassis.get_all_thermals()):
394+
logger.log_debug("Start temperature updating")
395+
for index, thermal in enumerate(self.chassis.get_all_thermals()):
396+
try:
384397
self._refresh_temperature_status(thermal, index)
385-
except Exception as e:
386-
logger.log_warning('Failed to update thermal status - {}'.format(e))
398+
except Exception as e:
399+
logger.log_warning('Failed to update thermal status - {}'.format(e))
387400

388401
logger.log_debug("End temperature updating")
389402

@@ -400,26 +413,33 @@ class TemperatureUpdater(object):
400413

401414
temperature_status = self.temperature_status_dict[name]
402415

416+
high_threshold = NOT_AVAILABLE
417+
low_threshold = NOT_AVAILABLE
418+
high_critical_threshold = NOT_AVAILABLE
419+
low_critical_threshold = NOT_AVAILABLE
403420
temperature = try_get(thermal.get_temperature)
404-
temperature_status.set_temperature(name, temperature)
405-
high_threshold = try_get(thermal.get_high_threshold)
406-
low_threshold = try_get(thermal.get_low_threshold)
407-
421+
if temperature != NOT_AVAILABLE:
422+
temperature_status.set_temperature(name, temperature)
423+
high_threshold = try_get(thermal.get_high_threshold)
424+
low_threshold = try_get(thermal.get_low_threshold)
425+
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
426+
low_critical_threshold = try_get(thermal.get_low_critical_threshold)
427+
408428
warning = False
409-
if temperature_status.set_over_temperature(temperature, high_threshold):
429+
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
410430
log_on_status_changed(not temperature_status.over_temperature,
411-
'High temperature warning: {} current temperature {}C, high threshold {}C'.
412-
format(name, temperature, high_threshold),
413431
'High temperature warning cleared: {} temperature restore to {}C, high threshold {}C.'.
432+
format(name, temperature, high_threshold),
433+
'High temperature warning: {} current temperature {}C, high threshold {}C'.
414434
format(name, temperature, high_threshold)
415435
)
416436
warning = warning | temperature_status.over_temperature
417437

418-
if temperature_status.set_under_temperature(temperature, low_threshold):
438+
if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
419439
log_on_status_changed(not temperature_status.under_temperature,
420-
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
421-
format(name, temperature, low_threshold),
422440
'Low temperature warning cleared: {} temperature restore to {}C, low threshold {}C.'.
441+
format(name, temperature, low_threshold),
442+
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
423443
format(name, temperature, low_threshold)
424444
)
425445
warning = warning | temperature_status.under_temperature
@@ -429,8 +449,8 @@ class TemperatureUpdater(object):
429449
('high_threshold', str(high_threshold)),
430450
('low_threshold', str(low_threshold)),
431451
('warning_status', str(warning)),
432-
('critical_high_threshold', str(try_get(thermal.get_high_critical_threshold))),
433-
('critical_low_threshold', str(try_get(thermal.get_low_critical_threshold))),
452+
('critical_high_threshold', str(high_critical_threshold)),
453+
('critical_low_threshold', str(low_critical_threshold)),
434454
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
435455
])
436456

sonic-thermalctld/tests/mock_platform.py

+18
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ def make_normal_speed(self):
6868
self.speed_tolerance = 0
6969

7070

71+
class MockErrorFan(MockFan):
72+
def get_speed(self):
73+
raise Exception('Fail to get speed')
74+
75+
7176
class MockPsu(MockDevice):
7277
def __init__(self):
7378
self.fan_list = []
@@ -118,6 +123,11 @@ def make_normal_temper(self):
118123
self.temperature = 2
119124
self.low_threshold = 1
120125

126+
127+
class MockErrorThermal(MockThermal):
128+
def get_temperature(self):
129+
raise Exception('Fail to get temperature')
130+
121131

122132
class MockChassis:
123133
def __init__(self):
@@ -149,6 +159,10 @@ def make_over_speed_fan(self):
149159
fan.make_over_speed()
150160
self.fan_list.append(fan)
151161

162+
def make_error_fan(self):
163+
fan = MockErrorFan()
164+
self.fan_list.append(fan)
165+
152166
def make_over_temper_thermal(self):
153167
thermal = MockThermal()
154168
thermal.make_over_temper()
@@ -159,3 +173,7 @@ def make_under_temper_thermal(self):
159173
thermal.make_under_temper()
160174
self.thermal_list.append(thermal)
161175

176+
def make_error_thermal(self):
177+
thermal = MockErrorThermal()
178+
self.thermal_list.append(thermal)
179+

sonic-thermalctld/tests/test_thermalctld.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
from mock import Mock, MagicMock, patch
44
from sonic_daemon_base import daemon_base
5-
from .mock_platform import MockChassis, MockFan
5+
from .mock_platform import MockChassis, MockFan, MockThermal
66

77
daemon_base.db_connect = MagicMock()
88

@@ -198,3 +198,27 @@ def test_temperupdater_under_temper():
198198
temperature_updater.update()
199199
logger.log_notice.assert_called_once()
200200

201+
202+
def test_update_fan_with_exception():
203+
chassis = MockChassis()
204+
chassis.make_error_fan()
205+
fan = MockFan()
206+
fan.make_over_speed()
207+
chassis.get_all_fans().append(fan)
208+
209+
fan_updater = FanUpdater(chassis)
210+
fan_updater.update()
211+
assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
212+
logger.log_warning.assert_called()
213+
214+
215+
def test_update_thermal_with_exception():
216+
chassis = MockChassis()
217+
chassis.make_error_thermal()
218+
thermal = MockThermal()
219+
thermal.make_over_temper()
220+
chassis.get_all_thermals().append(thermal)
221+
222+
temperature_updater = TemperatureUpdater(chassis)
223+
temperature_updater.update()
224+
logger.log_warning.assert_called()

0 commit comments

Comments
 (0)