Skip to content

Commit 69ad1ed

Browse files
Fix system-health hardware_checker to consume fan tolerance details (#16689)
Why I did it

Fan tolerance checking is now done through the new APIs `is_under_speed` and `is_over_speed`, which populate the corresponding fields in the database. `speed_tolerance` is no longer used and was removed, but system-health was not updated and therefore reports failures (ADO: 25279165):

    root@sonic:/# show system-health summary
    System status summary

      System status LED  red_blink
      Services:
        Status: OK
      Hardware:
        Status: Not OK
        Reasons: Failed to get speed tolerance for fantray5.fan1
                 Failed to get speed tolerance for fantray5.fan0
                 Failed to get speed tolerance for fantray4.fan1
                 Failed to get speed tolerance for fantray4.fan0
                 Failed to get speed tolerance for fantray3.fan1
                 Failed to get speed tolerance for fantray3.fan0
                 Failed to get speed tolerance for fantray2.fan1
                 Failed to get speed tolerance for fantray2.fan0
                 Failed to get speed tolerance for fantray1.fan1
                 Failed to get speed tolerance for fantray1.fan0
                 Failed to get speed tolerance for fantray0.fan1
                 Failed to get speed tolerance for fantray0.fan0
                 Failed to get speed tolerance for PSU1.fan0
                 Failed to get speed tolerance for PSU0.fan0

How I did it

Updated hardware_checker.py in system-health to consume the new `is_under_speed` and `is_over_speed` database entries instead of `speed_tolerance` and hard-coded calculations.

How to verify it

    root@sonic:/# show system-health summary
    System status summary

      System status LED  green
      Services:
        Status: OK
      Hardware:
        Status: OK
1 parent 9795166 commit 69ad1ed

File tree

2 files changed

+37
-19
lines changed

2 files changed

+37
-19
lines changed

src/system-health/health_checker/hardware_checker.py

+15-13
Original file line number | Diff line number | Diff line change
@@ -102,37 +102,39 @@ def _check_fan_status(self, config):
102102
if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
103103
speed = data_dict.get('speed', None)
104104
speed_target = data_dict.get('speed_target', None)
105-
speed_tolerance = data_dict.get('speed_tolerance', None)
105+
is_under_speed = data_dict.get('is_under_speed', None)
106+
is_over_speed = data_dict.get('is_over_speed', None)
106107
if not speed:
107108
self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
108109
continue
109110
elif not speed_target:
110111
self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name))
111112
continue
112-
elif not speed_tolerance:
113-
self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
113+
elif is_under_speed is None:
114+
self.set_object_not_ok('Fan', name, 'Failed to get under speed threshold check for {}'.format(name))
115+
continue
116+
elif is_over_speed is None:
117+
self.set_object_not_ok('Fan', name, 'Failed to get over speed threshold check for {}'.format(name))
114118
continue
115119
else:
116120
try:
117121
speed = float(speed)
118122
speed_target = float(speed_target)
119-
speed_tolerance = float(speed_tolerance)
120-
speed_min_th = speed_target * (1 - float(speed_tolerance) / 100)
121-
speed_max_th = speed_target * (1 + float(speed_tolerance) / 100)
122-
if speed < speed_min_th or speed > speed_max_th:
123+
if 'true' in (is_under_speed.lower(), is_over_speed.lower()):
123124
self.set_object_not_ok('Fan', name,
124-
'{} speed is out of range, speed={}, range=[{},{}]'.format(name,
125-
speed,
126-
speed_min_th,
127-
speed_max_th))
125+
'{} speed is out of range, speed={}, target={}'.format(
126+
name,
127+
speed,
128+
speed_target))
128129
continue
129130
except ValueError:
130131
self.set_object_not_ok('Fan', name,
131-
'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
132+
'Invalid fan speed data for {}, speed={}, target={}, is_under_speed={}, is_over_speed={}'.format(
132133
name,
133134
speed,
134135
speed_target,
135-
speed_tolerance))
136+
is_under_speed,
137+
is_over_speed))
136138
continue
137139

138140
if not self._ignore_check(config.ignore_devices, 'fan', name, 'direction'):

src/system-health/tests/test_system_health.py

+22-6
Original file line number | Diff line number | Diff line change
@@ -298,36 +298,49 @@ def test_hardware_checker():
298298
'status': 'True',
299299
'speed': '60',
300300
'speed_target': '60',
301-
'speed_tolerance': '20',
301+
'is_under_speed': 'False',
302+
'is_over_speed': 'False',
302303
'direction': 'intake'
303304
},
304305
'FAN_INFO|fan2': {
305306
'presence': 'False',
306307
'status': 'True',
307308
'speed': '60',
308309
'speed_target': '60',
309-
'speed_tolerance': '20'
310+
'is_under_speed': 'False',
311+
'is_over_speed': 'False',
310312
},
311313
'FAN_INFO|fan3': {
312314
'presence': 'True',
313315
'status': 'False',
314316
'speed': '60',
315317
'speed_target': '60',
316-
'speed_tolerance': '20'
318+
'is_under_speed': 'False',
319+
'is_over_speed': 'False',
317320
},
318321
'FAN_INFO|fan4': {
319322
'presence': 'True',
320323
'status': 'True',
321324
'speed': '20',
322325
'speed_target': '60',
323-
'speed_tolerance': '20'
326+
'is_under_speed': 'True',
327+
'is_over_speed': 'False',
324328
},
325329
'FAN_INFO|fan5': {
330+
'presence': 'True',
331+
'status': 'True',
332+
'speed': '90',
333+
'speed_target': '60',
334+
'is_under_speed': 'False',
335+
'is_over_speed': 'True',
336+
},
337+
'FAN_INFO|fan6': {
326338
'presence': 'True',
327339
'status': 'True',
328340
'speed': '60',
329341
'speed_target': '60',
330-
'speed_tolerance': '20',
342+
'is_under_speed': 'False',
343+
'is_over_speed': 'False',
331344
'direction': 'exhaust'
332345
}
333346
})
@@ -426,7 +439,10 @@ def test_hardware_checker():
426439

427440
assert 'fan5' in checker._info
428441
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
429-
assert checker._info['fan5'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan5 direction exhaust is not aligned with fan1 direction intake'
442+
443+
assert 'fan6' in checker._info
444+
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
445+
assert checker._info['fan6'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'fan6 direction exhaust is not aligned with fan1 direction intake'
430446

431447
assert 'PSU 1' in checker._info
432448
assert checker._info['PSU 1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

0 commit comments

Comments
 (0)