Skip to content

Commit 238fc06

Browse files
authored
[xcvrd] Extend xcvrd with SFP error event handling (sonic-net#52)
* extend xcvrd with SFP error event handling * change sfp error table to status table, store plug in/out and error status using enum for sfp error code
1 parent 97e40ce commit 238fc06

File tree

1 file changed

+95
-9
lines changed

1 file changed

+95
-9
lines changed

sonic-xcvrd/scripts/xcvrd

+95-9
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ try:
1919
from sonic_daemon_base import daemon_base
2020
from sonic_daemon_base.daemon_base import Logger
2121
from sonic_daemon_base.daemon_base import DaemonBase
22+
from enum import Enum
2223
except ImportError, e:
2324
raise ImportError (str(e) + " - required module not found")
2425

@@ -33,15 +34,25 @@ PLATFORM_SPECIFIC_CLASS_NAME = "SfpUtil"
3334

3435
TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO'
3536
TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR'
37+
TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS'
3638

3739
SELECT_TIMEOUT_MSECS = 1000
3840

3941
DOM_INFO_UPDATE_PERIOD_SECS = 60
4042
TIME_FOR_SFP_READY_SECS = 1
4143
XCVRD_MAIN_THREAD_SLEEP_SECS = 60
4244

43-
SFP_STATUS_INSERTED = '1'
45+
# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase
4446
SFP_STATUS_REMOVED = '0'
47+
SFP_STATUS_INSERTED = '1'
48+
49+
# SFP error code enum, new elements can be added to the enum if new errors need to be supported.
50+
SFP_STATUS_ERR_ENUM = Enum('SFP_STATUS_ERR_ENUM', ['SFP_STATUS_ERR_I2C_STUCK', 'SFP_STATUS_ERR_BAD_EEPROM',
51+
'SFP_STATUS_ERR_UNSUPPORTED_CABLE', 'SFP_STATUS_ERR_HIGH_TEMP',
52+
'SFP_STATUS_ERR_BAD_CABLE'], start=2)
53+
54+
# Convert the error code to string and store them in a set for convenience
55+
errors_block_eeprom_reading = set(str(error_code.value) for error_code in SFP_STATUS_ERR_ENUM)
4556

4657
EVENT_ON_ALL_SFP = '-1'
4758
# events definition
@@ -411,23 +422,25 @@ def del_port_sfp_dom_info_from_db(logical_port_name, int_tbl, dom_tbl):
411422
ganged_member_num += 1
412423

413424
try:
414-
int_tbl._del(port_name)
415-
dom_tbl._del(port_name)
425+
if int_tbl != None:
426+
int_tbl._del(port_name)
427+
if dom_tbl != None:
428+
dom_tbl._del(port_name)
416429

417430
except NotImplementedError:
418431
logger.log_error("This functionality is currently not implemented for this platform")
419432
sys.exit(NOT_IMPLEMENTED_ERROR)
420433

421434
# recover missing sfp table entries if any
422-
def recover_missing_sfp_table_entries(sfp_util, int_tbl, stop_event):
435+
def recover_missing_sfp_table_entries(sfp_util, int_tbl, status_tbl, stop_event):
423436
transceiver_dict = {}
424437

425438
keys = int_tbl.getKeys()
426439
logical_port_list = sfp_util.logical
427440
for logical_port_name in logical_port_list:
428441
if stop_event.is_set():
429442
break
430-
if logical_port_name not in keys:
443+
if logical_port_name not in keys and not detect_port_in_error_status(logical_port_name, status_tbl):
431444
post_port_sfp_info_to_db(logical_port_name, int_tbl, transceiver_dict, stop_event)
432445

433446

@@ -641,6 +654,53 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait):
641654
if time_diff < time_to_wait:
642655
time.sleep(time_to_wait - time_diff)
643656

657+
# Update port SFP status table on receiving SFP change event
658+
def update_port_transceiver_status_table(logical_port_name, status_tbl, status):
    """Store `status` in the 'status' field of the port's status-table entry.

    Args:
        logical_port_name: key under which the entry is written.
        status_tbl: table object exposing set(key, field_value_pairs).
        status: value written to the 'status' field.
    """
    status_tbl.set(logical_port_name,
                   swsscommon.FieldValuePairs([('status', status)]))
661+
662+
# Delete port from SFP status table
663+
def delete_port_from_status_table(logical_port_name, status_tbl):
    """Remove the entry keyed by `logical_port_name` from `status_tbl`.

    Args:
        logical_port_name: key of the entry to delete.
        status_tbl: table object exposing _del(key).
    """
    remove_entry = status_tbl._del
    remove_entry(logical_port_name)
665+
666+
# Check whether port in error status
667+
def detect_port_in_error_status(logical_port_name, status_tbl):
    """Tell whether the port's recorded status is an EEPROM-blocking error.

    Args:
        logical_port_name: key of the port in the status table.
        status_tbl: table object whose get(key) returns a
            (found, field_value_pairs) tuple.

    Returns:
        True when an entry exists and its 'status' field is one of
        errors_block_eeprom_reading; False when there is no entry, the
        entry has no 'status' field, or the status is not an error code.
    """
    rec, fvp = status_tbl.get(logical_port_name)
    if not rec:
        return False

    # Use .get() so that an entry lacking the 'status' field is reported as
    # "not in error" rather than raising KeyError in the caller's loop.
    return dict(fvp).get('status') in errors_block_eeprom_reading
677+
678+
# Init TRANSCEIVER_STATUS table
679+
def init_port_sfp_status_tbl(stop_event=None):
    """Populate the TRANSCEIVER_STATUS table with each port's presence state.

    For every logical port in platform_sfputil.logical, writes
    SFP_STATUS_INSERTED or SFP_STATUS_REMOVED according to the presence
    reported for its physical port(s). A logical port with no physical
    port mapping is logged as an error and recorded as removed.

    Args:
        stop_event: optional threading.Event; when set, initialization
            stops early. A fresh, unset event is used when omitted.
    """
    # Create the default per call; a default built in the signature would be
    # a single Event object shared by every call of this function.
    if stop_event is None:
        stop_event = threading.Event()

    # Connect to STATE_DB and create transceiver status table
    state_db = daemon_base.db_connect(swsscommon.STATE_DB)
    status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)

    # Init TRANSCEIVER_STATUS table
    for logical_port_name in platform_sfputil.logical:
        if stop_event.is_set():
            break

        physical_port_list = logical_port_name_to_physical_port_list(logical_port_name)
        if physical_port_list is None:
            logger.log_error("No physical ports found for logical port '%s'" % logical_port_name)
            update_port_transceiver_status_table(logical_port_name, status_tbl, SFP_STATUS_REMOVED)
            # Nothing to probe for this port; without this the loop below
            # would try to iterate None and raise TypeError.
            continue

        for physical_port in physical_port_list:
            if stop_event.is_set():
                break

            if _wrapper_get_presence(physical_port):
                port_status = SFP_STATUS_INSERTED
            else:
                port_status = SFP_STATUS_REMOVED
            update_port_transceiver_status_table(logical_port_name, status_tbl, port_status)
702+
703+
644704
#
645705
# Helper classes ===============================================================
646706
#
@@ -657,13 +717,15 @@ class dom_info_update_task:
657717
# Connect to STATE_DB and create transceiver dom info table
658718
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
659719
dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
720+
status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)
660721

661722
# Start loop to update dom info in DB periodically
662723
while not self.task_stopping_event.wait(DOM_INFO_UPDATE_PERIOD_SECS):
663724
logical_port_list = platform_sfputil.logical
664725
for logical_port_name in logical_port_list:
665-
post_port_dom_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
666-
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
726+
if not detect_port_in_error_status(logical_port_name, status_tbl):
727+
post_port_dom_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
728+
post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl, self.task_stopping_event)
667729

668730
logger.log_info("Stop DOM monitoring loop")
669731

@@ -716,6 +778,7 @@ class sfp_state_update_task:
716778
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
717779
int_tbl = swsscommon.Table(state_db, TRANSCEIVER_INFO_TABLE)
718780
dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
781+
status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)
719782

720783
# Connect to APPL_DB to notify Media notifications
721784
appl_db = daemon_base.db_connect(swsscommon.APPL_DB)
@@ -846,6 +909,9 @@ class sfp_state_update_task:
846909
for logical_port in logical_port_list:
847910
if value == SFP_STATUS_INSERTED:
848911
logger.log_info("Got SFP inserted event")
912+
# A plugin event will clear the error state.
913+
update_port_transceiver_status_table(logical_port, status_tbl, SFP_STATUS_INSERTED)
914+
logger.log_info("receive plug in and update port sfp status table.")
849915
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
850916
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
851917
if rc == SFP_EEPROM_NOT_READY:
@@ -858,9 +924,23 @@ class sfp_state_update_task:
858924
transceiver_dict.clear()
859925
elif value == SFP_STATUS_REMOVED:
860926
logger.log_info("Got SFP removed event")
927+
update_port_transceiver_status_table(logical_port, status_tbl, SFP_STATUS_REMOVED)
928+
logger.log_info("receive plug out and pdate port sfp status table.")
861929
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl)
930+
elif value in errors_block_eeprom_reading:
931+
logger.log_info("Got SFP Error event")
932+
# Record the error in the status table to stop accessing this port's EEPROM.
933+
# If the port already has an error recorded, the stored error code will
934+
# be updated to the new one.
935+
update_port_transceiver_status_table(logical_port, status_tbl, value)
936+
logger.log_info("receive error update port sfp status table.")
937+
# In this case EEPROM is not accessible, so remove the DOM info
938+
# since it would become outdated without further updates,
939+
# but keep the interface info in the DB since it is static.
940+
del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl)
941+
862942
else:
863-
# TODO, SFP return error code, need handle accordingly.
943+
# SFP returned an unknown event; just ignore it for now.
864944
logger.log_warning("Got unknown event {}, ignored".format(value))
865945
continue
866946
else:
@@ -1012,6 +1092,7 @@ class DaemonXcvrd(DaemonBase):
10121092
state_db = daemon_base.db_connect(swsscommon.STATE_DB)
10131093
self.int_tbl = swsscommon.Table(state_db, TRANSCEIVER_INFO_TABLE)
10141094
self.dom_tbl = swsscommon.Table(state_db, TRANSCEIVER_DOM_SENSOR_TABLE)
1095+
self.status_tbl = swsscommon.Table(state_db, TRANSCEIVER_STATUS_TABLE)
10151096

10161097
self.load_media_settings()
10171098
warmstart = swsscommon.WarmStart()
@@ -1027,6 +1108,10 @@ class DaemonXcvrd(DaemonBase):
10271108
logger.log_info("Post all port DOM/SFP info to DB")
10281109
post_port_sfp_dom_info_to_db(is_warm_start, self.stop_event)
10291110

1111+
# Init port sfp status table
1112+
logger.log_info("Init port sfp status table")
1113+
init_port_sfp_status_tbl(self.stop_event)
1114+
10301115
# Deinitialize daemon
10311116
def deinit(self):
10321117
logger.log_info("Start daemon deinit...")
@@ -1035,6 +1120,7 @@ class DaemonXcvrd(DaemonBase):
10351120
logical_port_list = platform_sfputil.logical
10361121
for logical_port_name in logical_port_list:
10371122
del_port_sfp_dom_info_from_db(logical_port_name, self.int_tbl, self.dom_tbl)
1123+
delete_port_from_status_table(logical_port_name, self.status_tbl)
10381124

10391125
# Run daemon
10401126
def run(self):
@@ -1056,7 +1142,7 @@ class DaemonXcvrd(DaemonBase):
10561142

10571143
while not self.stop_event.wait(self.timeout):
10581144
# Check the integrity of the sfp info table and recover the missing entries if any
1059-
recover_missing_sfp_table_entries(platform_sfputil, self.int_tbl, self.stop_event)
1145+
recover_missing_sfp_table_entries(platform_sfputil, self.int_tbl, self.status_tbl, self.stop_event)
10601146

10611147
logger.log_info("Stop daemon main loop")
10621148

0 commit comments

Comments
 (0)