Skip to content

Commit 53639de

Browse files
[xcvrd] Add bitmap support for SFP error event (sonic-net#184)
Support an SFP error bitmap. Currently, SONiC uses a single value to represent an SFP error; however, multiple SFP errors can exist at the same time. This PR adds support for reporting them together. Signed-off-by: Stephen Sun <[email protected]>
1 parent 2fc05b2 commit 53639de

File tree

4 files changed

+128
-93
lines changed

4 files changed

+128
-93
lines changed

sonic-xcvrd/tests/test_xcvrd.py

+29-12
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
import os
22
import sys
3-
import subprocess
43

5-
import pytest
64
import unittest
7-
from imp import load_source
85
if sys.version_info >= (3, 3):
96
from unittest.mock import MagicMock, patch
107
else:
118
from mock import MagicMock, patch
129

1310
from sonic_py_common import daemon_base
1411
from swsscommon import swsscommon
12+
from sonic_platform_base.sfp_base import SfpBase
1513
from .mock_swsscommon import Table
1614

1715

@@ -24,13 +22,12 @@
2422
test_path = os.path.dirname(os.path.abspath(__file__))
2523
modules_path = os.path.dirname(test_path)
2624
scripts_path = os.path.join(modules_path, "xcvrd")
27-
helper_file_path = os.path.join(scripts_path, "xcvrd_utilities"+"/y_cable_helper.py")
2825
sys.path.insert(0, modules_path)
2926

3027
os.environ["XCVRD_UNIT_TESTING"] = "1"
31-
load_source('y_cable_helper', scripts_path + '/xcvrd_utilities/y_cable_helper.py')
32-
from y_cable_helper import *
3328
from xcvrd.xcvrd import *
29+
from xcvrd.xcvrd_utilities.y_cable_helper import *
30+
from xcvrd.xcvrd_utilities.sfp_status_helper import *
3431

3532

3633
class TestXcvrdScript(object):
@@ -219,9 +216,9 @@ def test_init_port_sfp_status_tbl(self):
219216
init_port_sfp_status_tbl(stop_event)
220217

221218
@patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0]))
222-
@patch('y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
223-
@patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
224-
@patch('y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self',
219+
@patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
220+
@patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
221+
@patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self',
225222
'mux_direction': 'self',
226223
'manual_switch_count': '7',
227224
'auto_switch_count': '71',
@@ -258,9 +255,9 @@ def test_post_port_mux_info_to_db(self):
258255
assert(rc != -1)
259256

260257
@patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0]))
261-
@patch('y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
262-
@patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
263-
@patch('y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self',
258+
@patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0]))
259+
@patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True))
260+
@patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self',
264261
'nic_lane1_precursor1': '1',
265262
'nic_lane1_precursor2': '-7',
266263
'nic_lane1_maincursor': '-1',
@@ -318,3 +315,23 @@ def test_get_media_settings_key(self):
318315
result = get_media_settings_key(0, xcvr_info_dict)
319316
assert result == ['MOLEX-1064141421', 'QSFP+-*']
320317
# TODO: Ensure that error message was logged
318+
319+
def test_detect_port_in_error_status(self):
    """Check detect_port_in_error_status() against a mocked status table.

    The table's ``get`` is replaced by a MagicMock returning a found entry
    whose ``error`` field is either 'N/A' (expected: not in error) or the
    blocking error description from SfpBase (expected: in error).
    """
    class MockTable:
        def get(self, key):
            pass

    # (value stored in the 'error' field, whether the port must be reported as in error)
    cases = (
        ('N/A', False),
        (SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING, True),
    )

    status_tbl = MockTable()
    for error_field, expect_error in cases:
        status_tbl.get = MagicMock(return_value=(True, {'error': error_field}))
        if expect_error:
            assert detect_port_in_error_status(None, status_tbl)
        else:
            assert not detect_port_in_error_status(None, status_tbl)
330+
331+
def test_is_error_sfp_status(self):
    """Check is_error_block_eeprom_reading() on error bitmaps and plain statuses.

    7, 11, 19 and 35 must all be reported as blocking EEPROM reading, while
    the plain inserted/removed status values must not.
    """
    blocking_bitmaps = (7, 11, 19, 35)
    for bitmap in blocking_bitmaps:
        assert is_error_block_eeprom_reading(bitmap)

    # The status constants are strings, so convert them before testing bits.
    for plain_status in (SFP_STATUS_INSERTED, SFP_STATUS_REMOVED):
        assert not is_error_block_eeprom_reading(int(plain_status))

sonic-xcvrd/xcvrd/xcvrd.py

+51-58
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
import threading
1616
import time
1717

18-
from enum import Enum
1918
from sonic_py_common import daemon_base, device_info, logger
2019
from sonic_py_common import multi_asic
2120
from swsscommon import swsscommon
2221

22+
from .xcvrd_utilities import sfp_status_helper
2323
from .xcvrd_utilities import y_cable_helper
2424
except ImportError as e:
2525
raise ImportError(str(e) + " - required module not found")
@@ -43,18 +43,6 @@
4343
TIME_FOR_SFP_READY_SECS = 1
4444
XCVRD_MAIN_THREAD_SLEEP_SECS = 60
4545

46-
# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
47-
SFP_STATUS_REMOVED = '0'
48-
SFP_STATUS_INSERTED = '1'
49-
50-
# SFP error code enum, new elements can be added to the enum if new errors need to be supported.
51-
SFP_STATUS_ERR_ENUM = Enum('SFP_STATUS_ERR_ENUM', ['SFP_STATUS_ERR_I2C_STUCK', 'SFP_STATUS_ERR_BAD_EEPROM',
52-
'SFP_STATUS_ERR_UNSUPPORTED_CABLE', 'SFP_STATUS_ERR_HIGH_TEMP',
53-
'SFP_STATUS_ERR_BAD_CABLE'], start=2)
54-
55-
# Convert the error code to string and store them in a set for convenience
56-
errors_block_eeprom_reading = set(str(error_code.value) for error_code in SFP_STATUS_ERR_ENUM)
57-
5846
EVENT_ON_ALL_SFP = '-1'
5947
# events definition
6048
SYSTEM_NOT_READY = 'system_not_ready'
@@ -188,11 +176,13 @@ def _wrapper_get_transceiver_change_event(timeout):
188176
if platform_chassis is not None:
189177
try:
190178
status, events = platform_chassis.get_change_event(timeout)
191-
sfp_events = events['sfp']
192-
return status, sfp_events
179+
sfp_events = events.get('sfp')
180+
sfp_errors = events.get('sfp_error')
181+
return status, sfp_events, sfp_errors
193182
except NotImplementedError:
194183
pass
195-
return platform_sfputil.get_transceiver_change_event(timeout)
184+
status, events = platform_sfputil.get_transceiver_change_event(timeout)
185+
return status, events, None
196186

197187

198188
def _wrapper_get_sfp_type(physical_port):
@@ -203,6 +193,14 @@ def _wrapper_get_sfp_type(physical_port):
203193
pass
204194
return None
205195

196+
197+
def _wrapper_get_sfp_error_description(physical_port):
    """Return the error description reported by the SFP on ``physical_port``.

    Asks ``platform_chassis`` for the SFP object and calls its
    ``get_error_description()``. Returns None when ``platform_chassis`` is
    falsy or when the platform raises NotImplementedError.

    NOTE(review): a caller that appends this result to a list later joined
    with '|' would hit a TypeError on None -- confirm callers handle it.
    """
    if not platform_chassis:
        return None

    try:
        sfp = platform_chassis.get_sfp(physical_port)
        return sfp.get_error_description()
    except NotImplementedError:
        # Platform does not implement the API; report "no description".
        return None
206204
# Remove unnecessary unit from the raw data
207205

208206

@@ -553,7 +551,7 @@ def recover_missing_sfp_table_entries(sfp_util, int_tbl, status_tbl, stop_event)
553551
continue
554552

555553
keys = int_tbl[asic_index].getKeys()
556-
if logical_port_name not in keys and not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
554+
if logical_port_name not in keys and not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
557555
post_port_sfp_info_to_db(logical_port_name, int_tbl[asic_index], transceiver_dict, stop_event)
558556

559557

@@ -791,30 +789,17 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait):
791789
# Update port SFP status table on receiving SFP change event
792790

793791

794-
def update_port_transceiver_status_table(logical_port_name, status_tbl, status):
795-
fvs = swsscommon.FieldValuePairs([('status', status)])
792+
def update_port_transceiver_status_table(logical_port_name, status_tbl, status, error_descriptions='N/A'):
793+
fvs = swsscommon.FieldValuePairs([('status', status), ('error', error_descriptions)])
796794
status_tbl.set(logical_port_name, fvs)
797795

796+
798797
# Delete port from SFP status table
799798

800799

801800
def delete_port_from_status_table(logical_port_name, status_tbl):
802801
status_tbl._del(logical_port_name)
803802

804-
# Check whether port in error status
805-
806-
807-
def detect_port_in_error_status(logical_port_name, status_tbl):
808-
rec, fvp = status_tbl.get(logical_port_name)
809-
if rec:
810-
status_dict = dict(fvp)
811-
if status_dict['status'] in errors_block_eeprom_reading:
812-
return True
813-
else:
814-
return False
815-
else:
816-
return False
817-
818803
# Init TRANSCEIVER_STATUS table
819804

820805

@@ -844,16 +829,16 @@ def init_port_sfp_status_tbl(stop_event=threading.Event()):
844829
physical_port_list = logical_port_name_to_physical_port_list(logical_port_name)
845830
if physical_port_list is None:
846831
helper_logger.log_error("No physical ports found for logical port '{}'".format(logical_port_name))
847-
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED)
832+
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
848833

849834
for physical_port in physical_port_list:
850835
if stop_event.is_set():
851836
break
852837

853838
if not _wrapper_get_presence(physical_port):
854-
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED)
839+
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
855840
else:
856-
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_INSERTED)
841+
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED)
857842

858843
#
859844
# Helper classes ===============================================================
@@ -892,7 +877,7 @@ def task_worker(self, y_cable_presence):
892877
logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name))
893878
continue
894879

895-
if not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
880+
if not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]):
896881
post_port_dom_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event)
897882
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event)
898883
if y_cable_presence[0] is True:
@@ -1035,7 +1020,7 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
10351020
while not stopping_event.is_set():
10361021
next_state = state
10371022
time_start = time.time()
1038-
status, port_dict = _wrapper_get_transceiver_change_event(timeout)
1023+
status, port_dict, error_dict = _wrapper_get_transceiver_change_event(timeout)
10391024
if not port_dict:
10401025
continue
10411026
helper_logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state))
@@ -1095,11 +1080,11 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
10951080
logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port))
10961081
continue
10971082

1098-
if value == SFP_STATUS_INSERTED:
1083+
if value == sfp_status_helper.SFP_STATUS_INSERTED:
10991084
helper_logger.log_info("Got SFP inserted event")
11001085
# A plugin event will clear the error state.
11011086
update_port_transceiver_status_table(
1102-
logical_port, status_tbl[asic_index], SFP_STATUS_INSERTED)
1087+
logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED)
11031088
helper_logger.log_info("receive plug in and update port sfp status table.")
11041089
rc = post_port_sfp_info_to_db(logical_port, int_tbl[asic_index], transceiver_dict)
11051090
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
@@ -1111,28 +1096,36 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence):
11111096
post_port_dom_threshold_info_to_db(logical_port, dom_tbl[asic_index])
11121097
notify_media_setting(logical_port, transceiver_dict, app_port_tbl[asic_index])
11131098
transceiver_dict.clear()
1114-
elif value == SFP_STATUS_REMOVED:
1099+
elif value == sfp_status_helper.SFP_STATUS_REMOVED:
11151100
helper_logger.log_info("Got SFP removed event")
11161101
update_port_transceiver_status_table(
1117-
logical_port, status_tbl[asic_index], SFP_STATUS_REMOVED)
1118-
helper_logger.log_info("receive plug out and pdate port sfp status table.")
1102+
logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED)
1103+
helper_logger.log_info("receive plug out and update port sfp status table.")
11191104
del_port_sfp_dom_info_from_db(logical_port, int_tbl[asic_index], dom_tbl[asic_index])
1120-
elif value in errors_block_eeprom_reading:
1121-
helper_logger.log_info("Got SFP Error event")
1122-
# Add port to error table to stop accessing eeprom of it
1123-
# If the port already in the error table, the stored error code will
1124-
# be updated to the new one.
1125-
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value)
1126-
helper_logger.log_info("receive error update port sfp status table.")
1127-
# In this case EEPROM is not accessible, so remove the DOM info
1128-
# since it will be outdated if long time no update.
1129-
# but will keep the interface info in the DB since it static.
1130-
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])
1131-
11321105
else:
1133-
# SFP return unkown event, just ignore for now.
1134-
helper_logger.log_warning("Got unknown event {}, ignored".format(value))
1135-
continue
1106+
try:
1107+
error_bits = int(value)
1108+
helper_logger.log_info("Got SFP error event {}".format(value))
1109+
1110+
error_descriptions = sfp_status_helper.fetch_generic_error_description(error_bits)
1111+
1112+
if sfp_status_helper.has_vendor_specific_error(error_bits):
1113+
if error_dict:
1114+
vendor_specific_error_description = error_dict.get(key)
1115+
else:
1116+
vendor_specific_error_description = _wrapper_get_sfp_error_description(key)
1117+
error_descriptions.append(vendor_specific_error_description)
1118+
1119+
# Add error info to database
1120+
# Any existing error will be replaced by the new one.
1121+
update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value, '|'.join(error_descriptions))
1122+
helper_logger.log_info("Receive error update port sfp status table.")
1123+
# In this case EEPROM is not accessible. The DOM info will be removed since it can be out-of-date.
1124+
# The interface info remains in the DB since it is static.
1125+
if sfp_status_helper.is_error_block_eeprom_reading(error_bits):
1126+
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index])
1127+
except (TypeError, ValueError) as e:
1128+
logger.log_error("Got unrecognized event {}, ignored".format(value))
11361129

11371130
# Since ports could be connected to a mux cable, if there is a change event process the change for being on a Y cable Port
11381131
y_cable_helper.change_ports_status_for_y_cable_change_event(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from sonic_platform_base.sfp_base import SfpBase
2+
3+
# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
4+
SFP_STATUS_REMOVED = '0'
5+
SFP_STATUS_INSERTED = '1'
6+
7+
# SFP error code dictionary; new elements can be added if new errors need to be supported.
8+
SFP_ERRORS_BLOCKING_MASK = 0x02
9+
SFP_ERRORS_GENERIC_MASK = 0x0000FFFE
10+
SFP_ERRORS_VENDOR_SPECIFIC_MASK = 0xFFFF0000
11+
12+
def is_error_block_eeprom_reading(error_bits):
    """Return True if ``error_bits`` has any bit of SFP_ERRORS_BLOCKING_MASK set.

    Args:
        error_bits: Integer SFP error bitmap.
    """
    blocking_bits = error_bits & SFP_ERRORS_BLOCKING_MASK
    return blocking_bits != 0
14+
15+
16+
def has_vendor_specific_error(error_bits):
    """Return True if ``error_bits`` has any bit of SFP_ERRORS_VENDOR_SPECIFIC_MASK set.

    Args:
        error_bits: Integer SFP error bitmap.
    """
    vendor_bits = error_bits & SFP_ERRORS_VENDOR_SPECIFIC_MASK
    return vendor_bits != 0
18+
19+
20+
def fetch_generic_error_description(error_bits):
    """Collect the descriptions of the generic errors set in ``error_bits``.

    Only bits inside SFP_ERRORS_GENERIC_MASK are considered. Each set bit is
    translated through ``SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT``.

    Args:
        error_bits: Integer SFP error bitmap.

    Returns:
        A new list of description strings, in the dictionary's iteration
        order; empty when no generic error bit is set.
    """
    masked_bits = error_bits & SFP_ERRORS_GENERIC_MASK
    if not masked_bits:
        return []

    bit_to_description = SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT
    return [
        description
        for bit, description in bit_to_description.items()
        if bit & masked_bits
    ]
28+
29+
30+
def detect_port_in_error_status(logical_port_name, status_tbl):
    """Report whether a port's recorded SFP error blocks EEPROM reading.

    Looks up ``logical_port_name`` in ``status_tbl`` and checks whether the
    stored ``error`` field contains ``SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING``.

    Args:
        logical_port_name: Key of the port entry in the status table.
        status_tbl: Table object whose ``get(key)`` returns a
            ``(found, field_value_pairs)`` tuple.

    Returns:
        True if the entry exists and its ``error`` field contains the
        blocking error description. False otherwise, including when the
        entry is absent or has no (or an empty) ``error`` field.
    """
    found, fvp = status_tbl.get(logical_port_name)
    if not found:
        return False

    # An entry may lack the 'error' field (e.g. one written with only a
    # 'status' field). Treat a missing or empty value as "no error" rather
    # than raising TypeError on `... in None`.
    error = dict(fvp).get('error')
    if not error:
        return False

    return SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING in error
37+

0 commit comments

Comments
 (0)