Skip to content

Commit 0191300

Browse files
[Mellanox] Auto correct PSU voltage threshold (WA) (#10394)
- Why I did it There is a hardware bug that causes the PSU voltage threshold sysfs file to return an incorrect value. The workaround is to call "sensors -s" to refresh it. - How I did it Call "sensors -s" when the threshold value is incorrect and the PSU is a "DELTA 1100". - How to verify it Unit test and manual test.
1 parent 812f17d commit 0191300

File tree

10 files changed

+169
-0
lines changed

10 files changed

+169
-0
lines changed

device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf

+6
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
8585
label power2 "PSU-2 12V Rail Pwr (out)"
8686
label curr1 "PSU-2 220V Rail Curr (in)"
8787
label curr2 "PSU-2 12V Rail Curr (out)"
88+
set in3_lcrit in3_crit * 0.662
89+
set in3_min in3_crit * 0.745
90+
set in3_max in3_crit * 0.952
8891
chip "dps460-i2c-*-59"
8992
label in1 "PSU-1 220V Rail (in)"
9093
ignore in2
@@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
99102
label power2 "PSU-1 12V Rail Pwr (out)"
100103
label curr1 "PSU-1 220V Rail Curr (in)"
101104
label curr2 "PSU-1 12V Rail Curr (out)"
105+
set in3_lcrit in3_crit * 0.662
106+
set in3_min in3_crit * 0.745
107+
set in3_max in3_crit * 0.952
102108

103109
# Chassis fans
104110
chip "mlxreg_fan-isa-*"

device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf

+6
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
8585
label power2 "PSU-2 12V Rail Pwr (out)"
8686
label curr1 "PSU-2 220V Rail Curr (in)"
8787
label curr2 "PSU-2 12V Rail Curr (out)"
88+
set in3_lcrit in3_crit * 0.662
89+
set in3_min in3_crit * 0.745
90+
set in3_max in3_crit * 0.952
8891
chip "dps460-i2c-*-59"
8992
label in1 "PSU-1 220V Rail (in)"
9093
ignore in2
@@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
99102
label power2 "PSU-1 12V Rail Pwr (out)"
100103
label curr1 "PSU-1 220V Rail Curr (in)"
101104
label curr2 "PSU-1 12V Rail Curr (out)"
105+
set in3_lcrit in3_crit * 0.662
106+
set in3_min in3_crit * 0.745
107+
set in3_max in3_crit * 0.952
102108

103109
# Chassis fans
104110
chip "mlxreg_fan-isa-*"

device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf

+6
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
106106
label power2 "PSU-2 12V Rail Pwr (out)"
107107
label curr1 "PSU-2 220V Rail Curr (in)"
108108
label curr2 "PSU-2 12V Rail Curr (out)"
109+
set in3_lcrit in3_crit * 0.662
110+
set in3_min in3_crit * 0.745
111+
set in3_max in3_crit * 0.952
109112
chip "dps460-i2c-*-59"
110113
label in1 "PSU-1 220V Rail (in)"
111114
ignore in2
@@ -120,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
120123
label power2 "PSU-1 12V Rail Pwr (out)"
121124
label curr1 "PSU-1 220V Rail Curr (in)"
122125
label curr2 "PSU-1 12V Rail Curr (out)"
126+
set in3_lcrit in3_crit * 0.662
127+
set in3_min in3_crit * 0.745
128+
set in3_max in3_crit * 0.952
123129

124130
# Chassis fans
125131
chip "mlxreg_fan-isa-*"

device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf

+6
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
167167
label power2 "PSU-1(L) 12V Rail Pwr (out)"
168168
label curr1 "PSU-1(L) 220V Rail Curr (in)"
169169
label curr2 "PSU-1(L) 12V Rail Curr (out)"
170+
set in3_lcrit in3_crit * 0.662
171+
set in3_min in3_crit * 0.745
172+
set in3_max in3_crit * 0.952
170173
chip "dps460-i2c-*-59"
171174
label in1 "PSU-2(R) 220V Rail (in)"
172175
ignore in2
@@ -181,6 +184,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
181184
label power2 "PSU-2(R) 12V Rail Pwr (out)"
182185
label curr1 "PSU-2(R) 220V Rail Curr (in)"
183186
label curr2 "PSU-2(R) 12V Rail Curr (out)"
187+
set in3_lcrit in3_crit * 0.662
188+
set in3_min in3_crit * 0.745
189+
set in3_max in3_crit * 0.952
184190

185191
# Chassis fans
186192
chip "mlxreg_fan-isa-*"

device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1

+6
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
123123
label power2 "PSU-1(L) 12V Rail Pwr (out)"
124124
label curr1 "PSU-1(L) 220V Rail Curr (in)"
125125
label curr2 "PSU-1(L) 12V Rail Curr (out)"
126+
set in3_lcrit in3_crit * 0.662
127+
set in3_min in3_crit * 0.745
128+
set in3_max in3_crit * 0.952
126129
chip "dps460-i2c-*-59"
127130
label in1 "PSU-2(R) 220V Rail (in)"
128131
ignore in2
@@ -137,6 +140,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
137140
label power2 "PSU-2(R) 12V Rail Pwr (out)"
138141
label curr1 "PSU-2(R) 220V Rail Curr (in)"
139142
label curr2 "PSU-2(R) 12V Rail Curr (out)"
143+
set in3_lcrit in3_crit * 0.662
144+
set in3_min in3_crit * 0.745
145+
set in3_max in3_crit * 0.952
140146

141147
# Chassis fans
142148
chip "mlxreg_fan-isa-*"

platform/mellanox/mlnx-platform-api/sonic_platform/psu.py

+70
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@
2424

2525
try:
2626
import os
27+
import time
2728
from sonic_platform_base.psu_base import PsuBase
2829
from sonic_py_common.logger import Logger
30+
from .device_data import DeviceDataManager
2931
from .led import PsuLed, SharedLed, ComponentFaultyIndicator
3032
from . import utils
3133
from .vpd_parser import VpdParser
@@ -411,6 +413,7 @@ def get_voltage_high_threshold(self):
411413
capability = utils.read_str_from_file(self.psu_voltage_capability)
412414
if 'max' in capability:
413415
max_voltage = utils.read_int_from_file(self.psu_voltage_max, log_func=logger.log_info)
416+
max_voltage = InvalidPsuVolWA.run(self, max_voltage, self.psu_voltage_max)
414417
return float(max_voltage) / 1000
415418

416419
return None
@@ -431,6 +434,7 @@ def get_voltage_low_threshold(self):
431434
capability = utils.read_str_from_file(self.psu_voltage_capability)
432435
if 'min' in capability:
433436
min_voltage = utils.read_int_from_file(self.psu_voltage_min, log_func=logger.log_info)
437+
min_voltage = InvalidPsuVolWA.run(self, min_voltage, self.psu_voltage_min)
434438
return float(min_voltage) / 1000
435439

436440
return None
@@ -448,3 +452,69 @@ def get_maximum_supplied_power(self):
448452
return float(power_max) / 1000000
449453
else:
450454
return None
455+
456+
457+
class InvalidPsuVolWA:
    """Workaround for a hardware issue where a PSU voltage threshold reads 127998.

    When a threshold sysfs file yields ``INVALID_VOLTAGE_VALUE`` the workaround:
        1. Checks that the platform is one of ``EXPECT_PLATFORMS``.
        2. Checks the PSU VPD: vendor must be DELTA and capacity must be 1100.
           A VPD entry that reads 'N/A' does not block the workaround.
        3. Runs ``sensors -s`` so that lm-sensors evaluates the ``set``
           statements of the sensors configuration.
        4. Polls the threshold file until it no longer reads the invalid value.

    This issue is found on 3700, 3700c, 3800, 4600c.
    """

    # Raw value the threshold sysfs file returns when the hardware issue is hit
    INVALID_VOLTAGE_VALUE = 127998
    EXPECT_VENDOR_NAME = 'DELTA'
    EXPECT_CAPACITY = '1100'
    EXPECT_PLATFORMS = ['x86_64-mlnx_msn3700-r0', 'x86_64-mlnx_msn3700c-r0', 'x86_64-mlnx_msn3800-r0', 'x86_64-mlnx_msn4600c-r0']
    # VPD keys used to identify the PSU
    MFR_FIELD = 'MFR_NAME'
    CAPACITY_FIELD = 'CAPACITY'
    # Number of one-second polling attempts after the command has been issued
    WAIT_TIME = 5
    # The lm-sensors binary is "sensors" (plural); "sensor" does not exist and,
    # because the shell error is swallowed, would silently disable the workaround.
    SENSORS_SET_COMMAND = 'sensors -s'

    @classmethod
    def run(cls, psu, threshold_value, threshold_file):
        """Return a usable threshold value, applying the workaround if needed.

        Args:
            psu: PSU object; its ``index`` and ``vpd_parser`` attributes are used.
            threshold_value: Raw value already read from ``threshold_file``.
            threshold_file: Path of the threshold sysfs file to re-read.

        Returns:
            ``threshold_value`` unchanged when it is valid or when the
            platform/PSU does not match; otherwise the re-read value, or
            None if the file still reads the invalid value after
            ``WAIT_TIME`` attempts. Callers must be prepared for None.
        """
        if threshold_value != cls.INVALID_VOLTAGE_VALUE:
            # Nothing to fix
            return threshold_value

        # Apply the WA to specified platforms only
        platform_name = DeviceDataManager.get_platform_name()
        if platform_name not in cls.EXPECT_PLATFORMS:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but platform is {}'.format(psu.index, threshold_file, threshold_value, platform_name))
            return threshold_value

        # Check PSU vendor, make sure it is DELTA ('N/A' is tolerated)
        vendor_name = psu.vpd_parser.get_entry_value(cls.MFR_FIELD)
        if vendor_name != 'N/A' and vendor_name != cls.EXPECT_VENDOR_NAME:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but its vendor is {}'.format(psu.index, threshold_file, threshold_value, vendor_name))
            return threshold_value

        # Check PSU capacity, make sure it is 1100 ('N/A' is tolerated)
        capacity = psu.vpd_parser.get_entry_value(cls.CAPACITY_FIELD)
        if capacity != 'N/A' and capacity != cls.EXPECT_CAPACITY:
            logger.log_warning('PSU {} threshold file {} value {}, but its capacity is {}'.format(psu.index, threshold_file, threshold_value, capacity))
            return threshold_value

        # Trigger the hardware to report the real threshold value
        utils.run_command(cls.SENSORS_SET_COMMAND)

        # Wait for the threshold value to change
        return cls.wait_set_done(threshold_file)

    @classmethod
    def wait_set_done(cls, threshold_file):
        """Poll ``threshold_file`` once per second until it reads a valid value.

        Args:
            threshold_file: Path of the threshold sysfs file to read.

        Returns:
            The first value read that differs from ``INVALID_VOLTAGE_VALUE``,
            or None (after logging an error) when all ``WAIT_TIME`` attempts
            still read the invalid value.
        """
        for _ in range(cls.WAIT_TIME):
            value = utils.read_int_from_file(threshold_file, log_func=logger.log_info)
            if value != cls.INVALID_VOLTAGE_VALUE:
                return value
            time.sleep(1)

        logger.log_error('{} does not recover PSU threshold sensor after {} seconds'.format(cls.SENSORS_SET_COMMAND, cls.WAIT_TIME))
        return None

platform/mellanox/mlnx-platform-api/sonic_platform/utils.py

+13
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,16 @@ def _impl(*args, **kwargs):
194194
return return_value
195195
return _impl
196196
return wrapper
197+
198+
199+
def run_command(command):
    """
    Execute a command line through the shell and hand back what it printed.

    Only stdout is returned; stderr is captured and discarded, and the exit
    status of the command is not inspected.

    :param command: Shell command string.
    :return: Stripped stdout text of the command, or None if launching or
             reading from the process raised an exception.
    """
    try:
        proc = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout_text, _unused_stderr = proc.communicate()
        return stdout_text.strip()
    except Exception:
        # Best effort: callers treat None as "no output available"
        return None

platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py

+15
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
SN_VPD_FIELD = "SN_VPD_FIELD"
2525
PN_VPD_FIELD = "PN_VPD_FIELD"
2626
REV_VPD_FIELD = "REV_VPD_FIELD"
27+
MFR_VPD_FIELD = "MFR_NAME"
2728

2829

2930
class VpdParser:
@@ -82,3 +83,17 @@ def get_revision(self):
8283
logger.log_error("Fail to read revision: No key {} in VPD {}".format(REV_VPD_FIELD, self.vpd_file))
8384
return 'N/A'
8485
return self.vpd_data.get(REV_VPD_FIELD, 'N/A')
86+
87+
def get_entry_value(self, key):
    """
    Look up an arbitrary entry of the device VPD by its key

    Args:
        key: Name of the VPD entry to fetch

    Returns:
        string: Value stored for the key, or 'N/A' when the key is absent
    """
    key_missing = self._get_data() and key not in self.vpd_data
    if not key_missing:
        return self.vpd_data.get(key, 'N/A')

    logger.log_warning("Fail to read vpd info: No key {} in VPD {}".format(key, self.vpd_file))
    return 'N/A'
98+
99+

platform/mellanox/mlnx-platform-api/tests/test_psu.py

+37
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,40 @@ def test_psu_vpd(self):
116116
assert psu.get_model() == 'MTEF-PSF-AC-C'
117117
assert psu.get_serial() == 'MT1946X07684'
118118
assert psu.get_revision() == 'A3'
119+
120+
assert psu.vpd_parser.get_entry_value('MFR_NAME') == 'DELTA'
121+
122+
@mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=9999))
@mock.patch('sonic_platform.utils.run_command')
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
@mock.patch('sonic_platform.vpd_parser.VpdParser.get_entry_value')
def test_psu_workaround(self, mock_get_entry_value, mock_get_platform_name, mock_run_command):
    """Walk InvalidPsuVolWA.run through each skip branch and then the normal path.

    Every file read is patched to return 9999, so the normal path returns
    9999 on its first poll without sleeping.
    """
    from sonic_platform.psu import InvalidPsuVolWA
    invalid_value = InvalidPsuVolWA.INVALID_VOLTAGE_VALUE
    psu = Psu(0)

    # Threshold value is not InvalidPsuVolWA.INVALID_VOLTAGE_VALUE
    assert InvalidPsuVolWA.run(psu, 9999, '') == 9999

    # Platform name is not in InvalidPsuVolWA.EXPECT_PLATFORMS
    mock_get_platform_name.return_value = 'some platform'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU vendor is not InvalidPsuVolWA.EXPECT_VENDOR_NAME
    vpd_info = {
        InvalidPsuVolWA.MFR_FIELD: 'some psu',
        InvalidPsuVolWA.CAPACITY_FIELD: 'some capacity'
    }
    mock_get_entry_value.side_effect = lambda key: vpd_info[key]
    mock_get_platform_name.return_value = 'x86_64-mlnx_msn3700-r0'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU capacity is not InvalidPsuVolWA.EXPECT_CAPACITY
    vpd_info[InvalidPsuVolWA.MFR_FIELD] = InvalidPsuVolWA.EXPECT_VENDOR_NAME
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # None of the skip branches above may have touched the hardware
    mock_run_command.assert_not_called()

    # Normal: the command is issued exactly once and the re-read value is returned
    vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == 9999
    mock_run_command.assert_called_once()
    # Check the "-s" flag rather than pinning the exact spelling of the binary name
    assert mock_run_command.call_args[0][0].split()[1:] == ['-s']

platform/mellanox/mlnx-platform-api/tests/test_utils.py

+4
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,7 @@ def func():
116116

117117
assert func() == 100
118118
assert mock_log.call_count == 1
119+
120+
def test_run_command(self):
    """run_command returns the stripped stdout of the shell command."""
    # 'echo' gives a fixed output; listing the current directory would make
    # the result depend on where the tests happen to be run from.
    output = utils.run_command('echo hello')
    assert output == 'hello'

0 commit comments

Comments
 (0)