Skip to content

Commit 75f6eb0

Browse files
Junchao-MellanoxCarl Keene
authored and
Carl Keene
committed
[Mellanox] Add bitmap support for SFP error event (sonic-net#7605)
#### Why I did it Currently, SONiC uses a single value to represent an SFP error; however, multiple SFP errors can exist at the same time. This PR adds support for reporting them together. #### How I did it Return a bitmap instead of a single value when an SFP event occurs. Signed-off-by: Stephen Sun <[email protected]>
1 parent 59569ec commit 75f6eb0

File tree

5 files changed

+244
-32
lines changed

5 files changed

+244
-32
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -569,18 +569,22 @@ def get_change_event(self, timeout=0):
569569

570570
wait_for_ever = (timeout == 0)
571571
port_dict = {}
572+
error_dict = {}
572573
if wait_for_ever:
573574
timeout = MAX_SELECT_DELAY
574575
while True:
575-
status = self.sfp_event.check_sfp_status(port_dict, timeout)
576+
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
576577
if bool(port_dict):
577578
break
578579
else:
579-
status = self.sfp_event.check_sfp_status(port_dict, timeout)
580+
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
580581

581582
if status:
582583
self.reinit_sfps(port_dict)
583-
return True, {'sfp':port_dict}
584+
result_dict = {'sfp':port_dict}
585+
if error_dict:
586+
result_dict['sfp_error'] = error_dict
587+
return True, result_dict
584588
else:
585589
return True, {'sfp':{}}
586590

platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py

+91-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
try:
1010
import subprocess
11+
import os
1112
from sonic_platform_base.sfp_base import SfpBase
1213
from sonic_platform_base.sonic_eeprom import eeprom_dts
1314
from sonic_platform_base.sonic_sfp.sff8472 import sff8472InterfaceId
@@ -33,6 +34,18 @@
3334
except ImportError as e:
3435
pass
3536

37+
# The SDK constants cannot be imported under unit test, so they are defined
# here when PLATFORM_API_UNIT_TESTING is set to "1". If the variable is unset
# or has any other value, nothing is defined by this block.
if os.environ.get("PLATFORM_API_UNIT_TESTING") == "1":
    SX_PORT_MODULE_STATUS_INITIALIZING = 0
    SX_PORT_MODULE_STATUS_PLUGGED = 1
    SX_PORT_MODULE_STATUS_UNPLUGGED = 2
    SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR = 3
    SX_PORT_MODULE_STATUS_PLUGGED_DISABLED = 4
48+
3649
# definitions of the offset and width for values in XCVR info eeprom
3750
XCVR_INTFACE_BULK_OFFSET = 0
3851
XCVR_INTFACE_BULK_WIDTH_QSFP = 20
@@ -328,6 +341,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
328341
class SFP(SfpBase):
329342
"""Platform-specific SFP class"""
330343

344+
SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE = 'Long range for non-Mellanox cable or module'
345+
SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST = 'Enforce part number list'
346+
SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled'
347+
SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded'
348+
SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved'
349+
350+
SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000
351+
SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000
352+
SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000
353+
SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000
354+
SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000
355+
331356
def __init__(self, sfp_index, sfp_type, sdk_handle_getter, platform):
332357
SfpBase.__init__(self)
333358
self.index = sfp_index + 1
@@ -386,7 +411,7 @@ def get_presence(self):
386411
# Read out any bytes from any offset
387412
def _read_eeprom_specific_bytes(self, offset, num_bytes):
388413
eeprom_raw = []
389-
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {}".format(self.index, offset, num_bytes)
414+
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {} 2>/dev/null".format(self.index, offset, num_bytes)
390415
try:
391416
output = subprocess.check_output(ethtool_cmd,
392417
shell=True,
@@ -2165,3 +2190,68 @@ def is_replaceable(self):
21652190
bool: True if it is replaceable.
21662191
"""
21672192
return True
2193+
2194+
def _get_error_code(self):
    """
    Query the SDK for the current state of this SFP module

    Reads self.sdk_handle and self.sdk_index.

    Returns:
        A tuple (oper_state, error_type) taken from the module_state of the
        module info returned by sx_mgmt_phy_module_info_get

    Raises:
        AssertionError: if sx_mgmt_phy_module_info_get does not return
                        SX_STATUS_SUCCESS
    """
    # The SDK call takes arrays; one-element arrays are used for both the
    # request (module id) and the response (module info).
    # NOTE(review): neither array is released in this method - confirm
    # whether the SDK bindings require an explicit delete for them.
    module_id_info_list = new_sx_mgmt_module_id_info_t_arr(1)
    module_info_list = new_sx_mgmt_phy_module_info_t_arr(1)

    module_id_info = sx_mgmt_module_id_info_t()
    module_id_info.slot_id = 0
    module_id_info.module_id = self.sdk_index
    sx_mgmt_module_id_info_t_arr_setitem(module_id_info_list, 0, module_id_info)

    rc = sx_mgmt_phy_module_info_get(self.sdk_handle, module_id_info_list, 1, module_info_list)
    assert SX_STATUS_SUCCESS == rc, "sx_mgmt_phy_module_info_get failed, error code {}".format(rc)

    mod_info = sx_mgmt_phy_module_info_t_arr_getitem(module_info_list, 0)
    return mod_info.module_state.oper_state, mod_info.module_state.error_type
2214+
2215+
@classmethod
def _get_error_description_dict(cls):
    """
    Build the mapping from error code to error description

    The keys are the error codes that get_error_description looks up when
    the module is plugged with error. Generic descriptions
    (SFP_ERROR_DESCRIPTION_*) and Mellanox specific ones
    (SFP_MLNX_ERROR_DESCRIPTION_*) are both read from the class.

    Returns:
        A new dict {error code (int): description} on every call
    """
    return {
        0: cls.SFP_ERROR_DESCRIPTION_POWER_BUDGET_EXCEEDED,
        1: cls.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE,
        2: cls.SFP_ERROR_DESCRIPTION_I2C_STUCK,
        3: cls.SFP_ERROR_DESCRIPTION_BAD_EEPROM,
        4: cls.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST,
        5: cls.SFP_ERROR_DESCRIPTION_UNSUPPORTED_CABLE,
        6: cls.SFP_ERROR_DESCRIPTION_HIGH_TEMP,
        7: cls.SFP_ERROR_DESCRIPTION_BAD_CABLE,
        8: cls.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED,
        12: cls.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED,
        255: cls.SFP_MLNX_ERROR_DESCRIPTION_RESERVED,
    }
2229+
2230+
def get_error_description(self):
    """
    Get the description of the current status or error of the SFP module

    The operational status and error code are fetched via _get_error_code.

    Returns:
        One of the SFP_STATUS_* descriptions when the module is initializing,
        plugged, unplugged or disabled; the matching entry of
        _get_error_description_dict (or "Unknown error (<code>)") when the
        module is plugged with error; a message containing the raw status
        value for any other operational status.
    """
    oper_status, error_code = self._get_error_code()

    if oper_status == SX_PORT_MODULE_STATUS_INITIALIZING:
        return self.SFP_STATUS_INITIALIZING
    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED:
        return self.SFP_STATUS_OK
    if oper_status == SX_PORT_MODULE_STATUS_UNPLUGGED:
        return self.SFP_STATUS_UNPLUGGED
    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED_DISABLED:
        return self.SFP_STATUS_DISABLED

    if oper_status == SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR:
        # Only in this state is the error code meaningful
        error_description_dict = self._get_error_description_dict()
        if error_code in error_description_dict:
            return error_description_dict[error_code]
        return "Unknown error ({})".format(error_code)

    # NOTE(review): the spelling of this message is kept as is because it is
    # returned to callers at runtime.
    return "Unknow SFP module status ({})".format(oper_status)

platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py

+58-27
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,16 @@
77
import os
88
import time
99
import select
10-
from python_sdk_api.sx_api import *
10+
if 'MLNX_PLATFORM_API_UNIT_TESTING' not in os.environ:
11+
from python_sdk_api.sx_api import *
12+
else:
13+
from mock import MagicMock
14+
class MockSxFd(object):
15+
fd = 99
16+
new_sx_fd_t_p = MagicMock(return_value=MockSxFd())
17+
new_sx_user_channel_t_p = MagicMock()
1118
from sonic_py_common.logger import Logger
19+
from .sfp import SFP
1220

1321
# SFP status from PMAOS register
1422
# 0x1 plug in
@@ -22,15 +30,6 @@
2230
SDK_SFP_STATE_ERR = 0x3
2331
SDK_SFP_STATE_DIS = 0x4
2432

25-
# SFP status that will be handled by XCVRD
26-
STATUS_PLUGIN = '1'
27-
STATUS_PLUGOUT = '0'
28-
STATUS_ERR_I2C_STUCK = '2'
29-
STATUS_ERR_BAD_EEPROM = '3'
30-
STATUS_ERR_UNSUPPORTED_CABLE = '4'
31-
STATUS_ERR_HIGH_TEMP = '5'
32-
STATUS_ERR_BAD_CABLE = '6'
33-
3433
# SFP status used in this file only, will not expose to XCVRD
3534
# STATUS_ERROR will be mapped to different status according to the error code
3635
STATUS_UNKNOWN = '-1'
@@ -60,19 +59,39 @@
6059
'''
6160

6261
# SFP errors that will block eeprom accessing
63-
sdk_sfp_err_type_dict = {
64-
0x2: STATUS_ERR_I2C_STUCK,
65-
0x3: STATUS_ERR_BAD_EEPROM,
66-
0x5: STATUS_ERR_UNSUPPORTED_CABLE,
67-
0x6: STATUS_ERR_HIGH_TEMP,
68-
0x7: STATUS_ERR_BAD_CABLE
62+
# SDK error types for which the blocking bit is added to the reported state.
# The trailing comment on each entry names the SFP error bit it corresponds to.
SDK_SFP_BLOCKING_ERRORS = [
    0x2,  # SFP.SFP_ERROR_BIT_I2C_STUCK
    0x3,  # SFP.SFP_ERROR_BIT_BAD_EEPROM
    0x5,  # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE
    0x6,  # SFP.SFP_ERROR_BIT_HIGH_TEMP
    0x7,  # SFP.SFP_ERROR_BIT_BAD_CABLE
]
69+
70+
# SDK error type -> error bit reported for the port.
# An SDK error type that is missing from this table is treated as
# unrecognized by check_sfp_status.
SDK_ERRORS_TO_ERROR_BITS = {
    0x0: SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED,
    0x1: SFP.SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE,
    0x2: SFP.SFP_ERROR_BIT_I2C_STUCK,
    0x3: SFP.SFP_ERROR_BIT_BAD_EEPROM,
    0x4: SFP.SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST,
    0x5: SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
    0x6: SFP.SFP_ERROR_BIT_HIGH_TEMP,
    0x7: SFP.SFP_ERROR_BIT_BAD_CABLE,
    0x8: SFP.SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED,
    0xc: SFP.SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED,
}

# SDK error type -> description, for the Mellanox specific errors only.
# These are the errors whose description is returned alongside the state.
SDK_ERRORS_TO_DESCRIPTION = {
    0x1: SFP.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE,
    0x4: SFP.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST,
    0x8: SFP.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED,
    0xc: SFP.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED,
}
7089

7190
sfp_value_status_dict = {
72-
SDK_SFP_STATE_IN: STATUS_PLUGIN,
73-
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
91+
SDK_SFP_STATE_IN: str(SFP.SFP_STATUS_BIT_INSERTED),
92+
SDK_SFP_STATE_OUT: str(SFP.SFP_STATUS_BIT_REMOVED),
7493
SDK_SFP_STATE_ERR: STATUS_ERROR,
75-
SDK_SFP_STATE_DIS: STATUS_PLUGOUT,
94+
SDK_SFP_STATE_DIS: str(SFP.SFP_STATUS_BIT_REMOVED),
7695
}
7796

7897
# system level event/error
@@ -195,7 +214,7 @@ def deinitialize(self):
195214
delete_sx_fd_t_p(self.rx_fd_p)
196215
delete_sx_user_channel_t_p(self.user_channel_p)
197216

198-
def check_sfp_status(self, port_change, timeout):
217+
def check_sfp_status(self, port_change, error_dict, timeout):
199218
"""
200219
the meaning of timeout is aligned with select.select, which has the following meaning:
201220
0: poll, returns without blocked
@@ -233,6 +252,7 @@ def check_sfp_status(self, port_change, timeout):
233252
break
234253

235254
sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN)
255+
error_description = None
236256
if sfp_state == STATUS_UNKNOWN:
237257
# in the following sequence, STATUS_UNKNOWN can be returned.
238258
# so we shouldn't raise exception here.
@@ -247,18 +267,29 @@ def check_sfp_status(self, port_change, timeout):
247267

248268
# If get SFP status error(0x3) from SDK, then need to read the error_type to get the detailed error
249269
if sfp_state == STATUS_ERROR:
250-
if error_type in sdk_sfp_err_type_dict.keys():
251-
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
252-
sfp_state = sdk_sfp_err_type_dict[error_type]
253-
else:
254-
# For errors don't block the eeprom accessing, we don't report it to XCVRD
255-
logger.log_info("SFP error on port but not blocking eeprom read, error_type {}".format(error_type))
256-
found +=1
270+
sfp_state_bits = SDK_ERRORS_TO_ERROR_BITS.get(error_type)
271+
if sfp_state_bits is None:
272+
logger.log_error("Unrecognized error {} detected on ports {}".format(error_type, port_list))
273+
found += 1
257274
continue
258275

276+
if error_type in SDK_SFP_BLOCKING_ERRORS:
277+
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
278+
sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING
279+
280+
# An error should be always set along with 'INSERTED'
281+
sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED
282+
283+
# For vendor specific errors, the description should be returned as well
284+
error_description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
285+
286+
sfp_state = str(sfp_state_bits)
287+
259288
for port in port_list:
260289
logger.log_info("SFP on port {} state {}".format(port, sfp_state))
261290
port_change[port+1] = sfp_state
291+
if error_description:
292+
error_dict[port+1] = error_description
262293
found += 1
263294

264295
return found != 0

platform/mellanox/mlnx-platform-api/tests/test_sfp.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
modules_path = os.path.dirname(test_path)
99
sys.path.insert(0, modules_path)
1010

11+
os.environ["PLATFORM_API_UNIT_TESTING"] = "1"
12+
1113
from sonic_py_common import device_info
12-
from sonic_platform.sfp import SFP
14+
from sonic_platform.sfp import SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
15+
1316
from sonic_platform.chassis import Chassis
1417

1518

@@ -26,8 +29,14 @@ def mock_get_sdk_handle(self):
2629
self.sdk_handle = 1
2730
return self.sdk_handle
2831

32+
33+
def mock_get_sfp_error_code(self):
    """
    Stand-in for SFP._get_error_code

    Returns the (oper_code, error_code) pair that the test has stored on the
    object, so no SDK call is needed.
    """
    return self.oper_code, self.error_code
35+
36+
2937
device_info.get_platform = mock_get_platform
3038
SFP._read_eeprom_specific_bytes = mock_read_eeprom_specific_bytes
39+
SFP._get_error_code = mock_get_sfp_error_code
3140
Chassis.get_sdk_handle = mock_get_sdk_handle
3241

3342

@@ -82,3 +91,35 @@ def test_sfp_full_initialize_without_partial():
8291
# Verify when get_sfp is called, the SFP modules won't be initialized again
8392
sfp1 = allsfp[0]
8493
assert sfp1 == chassis.get_sfp(1)
94+
95+
96+
def test_sfp_get_error_status():
    """Check get_error_description for every module status and error code"""
    chassis = Chassis()

    # Fetch an SFP module to test
    sfp = chassis.get_sfp(1)

    description_dict = sfp._get_error_description_dict()

    # Plugged with error: each known error code yields its own description
    sfp.oper_code = SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR
    for error, expected_description in description_dict.items():
        sfp.error_code = error
        description = sfp.get_error_description()

        assert description == expected_description

    # An error code missing from the dict is reported as unknown
    sfp.error_code = -1
    description = sfp.get_error_description()
    assert description == "Unknown error (-1)"

    # The remaining module statuses do not depend on the error code
    expected_description_list = [
        (SX_PORT_MODULE_STATUS_INITIALIZING, "Initializing"),
        (SX_PORT_MODULE_STATUS_PLUGGED, "OK"),
        (SX_PORT_MODULE_STATUS_UNPLUGGED, "Unplugged"),
        (SX_PORT_MODULE_STATUS_PLUGGED_DISABLED, "Disabled"),
    ]
    for oper_code, expected_description in expected_description_list:
        sfp.oper_code = oper_code
        description = sfp.get_error_description()

        assert description == expected_description
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
import select
3+
import sys
4+
5+
from mock import MagicMock
6+
7+
test_path = os.path.dirname(os.path.abspath(__file__))
8+
modules_path = os.path.dirname(test_path)
9+
sys.path.insert(0, modules_path)
10+
11+
from sonic_platform_base.sfp_base import SfpBase
12+
13+
class TestSfpEvent(object):
    """Tests for sfp_event.check_sfp_status with the SDK replaced by mocks"""

    # Environment variable checked by sonic_platform.sfp_event at import time
    UNIT_TESTING_ENV = "MLNX_PLATFORM_API_UNIT_TESTING"

    # State captured by setup_class and restored by teardown_class
    _saved_state = None

    @classmethod
    def setup_class(cls):
        # Remember what is about to be patched so that it can be undone and
        # does not leak into tests that run after this class
        cls._saved_state = {
            'env': os.environ.get(cls.UNIT_TESTING_ENV),
            'select': select.select,
        }
        os.environ[cls.UNIT_TESTING_ENV] = "1"
        # 99 is the fd of the mocked SDK fd object in sfp_event
        select.select = MagicMock(return_value=([99], None, None))

    @classmethod
    def teardown_class(cls):
        saved = cls._saved_state
        if saved is None:
            return
        select.select = saved['select']
        if saved['env'] is None:
            os.environ.pop(cls.UNIT_TESTING_ENV, None)
        else:
            os.environ[cls.UNIT_TESTING_ENV] = saved['env']
        cls._saved_state = None

    def test_check_sfp_status(self):
        # Imported here so that the import happens after setup_class
        from sonic_platform.sfp_event import SDK_SFP_STATE_IN, SDK_SFP_STATE_OUT, SDK_SFP_STATE_ERR
        from sonic_platform.sfp_event import SDK_ERRORS_TO_ERROR_BITS, SDK_ERRORS_TO_DESCRIPTION, SDK_SFP_BLOCKING_ERRORS

        self.executor(SDK_SFP_STATE_IN, None, SfpBase.SFP_STATUS_BIT_INSERTED)
        self.executor(SDK_SFP_STATE_OUT, None, SfpBase.SFP_STATUS_BIT_REMOVED)
        for error_type, error_status in SDK_ERRORS_TO_ERROR_BITS.items():
            description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
            if error_type in SDK_SFP_BLOCKING_ERRORS:
                error_status |= SfpBase.SFP_ERROR_BIT_BLOCKING
            # An error is always expected together with 'INSERTED'
            error_status |= SfpBase.SFP_STATUS_BIT_INSERTED
            self.executor(SDK_SFP_STATE_ERR, error_type, error_status, description)

    def executor(self, mock_module_state, mock_error_type, expect_status, description=None):
        """
        Run check_sfp_status once against a mocked SDK event and verify it

        Args:
            mock_module_state: module state returned by the mocked on_pmpe
            mock_error_type: error type returned by the mocked on_pmpe
            expect_status: expected status value (compared as a string)
            description: expected error description; not checked when falsy
        """
        from sonic_platform.sfp_event import sfp_event

        event = sfp_event()
        # The mocked event reports SDK ports 0 and 1, which are checked as
        # ports 1 and 2 below
        event.on_pmpe = MagicMock(return_value=(True, [0, 1], mock_module_state, mock_error_type))
        port_change = {}
        error_dict = {}
        found = event.check_sfp_status(port_change, error_dict, 0)
        assert found
        expect_status_str = str(expect_status)
        for port in (1, 2):
            assert port in port_change and port_change[port] == expect_status_str
        if description:
            for port in (1, 2):
                assert port in error_dict and error_dict[port] == description

0 commit comments

Comments
 (0)