Skip to content

Commit 7543993

Browse files
[202012] [Mellanox] Fix issue: SFP eeprom corrupted after replacing cable with different sfp type (#13543)
- Why I did it There are 3 tasks in xcvrd: main task, run a loop to recover missing SFP static information to DB every 1 minute SFP state task, a process which listens cable plug in/out event, insert SFP static information to DB while a cable is inserted SFP DOM update task, a thread which handles cable DOM information update every 1 minute Let assume user replaces QSFP with QSFP-DD. There are two issues: Only SFP state task listens cable plug in/out event, main task and SFP DOM update task does not know SFP type has changed, they still “think” the SFP type is QSFP. So, main task and SFP DOM update task uses QSFP standard to parse QSFP-DD EEPROM which causes corrupted data. There is a race condition between main task and SFP state task. They both insert SFP static information to DB. Depends on timing, it is possible that main task using wrong SFP type to override SFP static information. The PR is to fix these two issues. There is no such issue on 202205 and above because there is a refactor for xcvrd: SFP state task was changed from process to thread, so that all 3 tasks share the same memory space, they always have correct SFP type. Recover missing SFP information logical was moved from main task to SFP state task. There is no race condition anymore. - How I did it It is difficult to back port latest xcvrd because there are many refactor/new features in xcvrd after 202012 release. It will be huge effort to do so. Based on that, we decided to fix the issue on Nvidia platform API side. The fix is that: refreshing SFP type before any SFP API which accessing SFP EEPROM. Refreshing SFP type before any SFP API would cause a small performance down: Due to my test on 202012 branch, accessing transceiver INFO and DOM INFO for 32 ports takes 1.7 seconds before the change. The number changes to 2.4 seconds after the change. I suppose the performance down is acceptable. - How to verify it Manual test Regression
1 parent 5b88954 commit 7543993

File tree

1 file changed

+65
-36
lines changed
  • platform/mellanox/mlnx-platform-api/sonic_platform

1 file changed

+65
-36
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py

+65-36
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#############################################################################
88

99
try:
10+
import functools
1011
import time
1112
from sonic_platform_base.sfp_base import SfpBase
1213
from sonic_platform_base.sonic_eeprom import eeprom_dts
@@ -323,6 +324,24 @@ def deinitialize_sdk_handle(sdk_handle):
323324
logger.log_warning("Sdk handle is none")
324325
return False
325326

327+
328+
def refresh_sfp_type(refresh_dom_capability=True):
329+
"""Decorator to force refreshing sfp type
330+
331+
Args:
332+
refresh_dom_capability (bool, optional): refresh DOM capability. Defaults to True.
333+
"""
334+
def decorator(method):
335+
@functools.wraps(method)
336+
def _impl(self, *args, **kwargs):
337+
self._refresh_sfp_type()
338+
if refresh_dom_capability:
339+
self._dom_capability_detect()
340+
return method(self, *args, **kwargs)
341+
return _impl
342+
return decorator
343+
344+
326345
class SFP(SfpBase):
327346
"""Platform-specific SFP class"""
328347

@@ -332,9 +351,9 @@ def __init__(self, sfp_index, sfp_type, sdk_handle_getter, platform):
332351
self.sdk_sysfs_page_path_header = SFP_SYSFS_PATH.format(sfp_index)
333352
self.sfp_eeprom_path = "qsfp{}".format(self.index)
334353
self.sfp_status_path = "qsfp{}_status".format(self.index)
335-
self._detect_sfp_type(sfp_type)
354+
self._sfp_type = None
355+
self._default_sfp_type = sfp_type
336356
self.dom_tx_disable_supported = False
337-
self._dom_capability_detect()
338357
self.sdk_handle_getter = sdk_handle_getter
339358
self.sdk_index = sfp_index
340359

@@ -352,7 +371,7 @@ def reinit(self):
352371
Re-initialize this SFP object when a new SFP inserted
353372
:return:
354373
"""
355-
self._detect_sfp_type(self.sfp_type)
374+
self._refresh_sfp_type()
356375
self._dom_capability_detect()
357376

358377
def get_presence(self):
@@ -382,7 +401,8 @@ def _read_eeprom_specific_bytes(self, offset, num_bytes):
382401
if page_num == 0:
383402
page = self.sdk_sysfs_page_path_header + PATH_PAGE00
384403
else:
385-
if self.sfp_type == SFP_TYPE and page_num == 1:
404+
self._refresh_sfp_type()
405+
if page_num == 1 and self.sfp_type == SFP_TYPE:
386406
page = self.sdk_sysfs_page_path_header + PATH_PAGE00_A2
387407
else:
388408
page = self.sdk_sysfs_page_path_header + PATH_PAGE.format(page_num)
@@ -394,26 +414,33 @@ def _read_eeprom_specific_bytes(self, offset, num_bytes):
394414
except (OSError, IOError):
395415
return None
396416

397-
def _detect_sfp_type(self, sfp_type):
398-
eeprom_raw = []
399-
eeprom_raw = self._read_eeprom_specific_bytes(XCVR_TYPE_OFFSET, XCVR_TYPE_WIDTH)
400-
if eeprom_raw:
401-
if eeprom_raw[0] in SFP_TYPE_CODE_LIST:
402-
self.sfp_type = SFP_TYPE
403-
elif eeprom_raw[0] in QSFP_TYPE_CODE_LIST:
404-
self.sfp_type = QSFP_TYPE
405-
elif eeprom_raw[0] in QSFP_DD_TYPE_CODE_LIST:
406-
self.sfp_type = QSFP_DD_TYPE
417+
@property
418+
def sfp_type(self):
419+
if self._sfp_type is None:
420+
eeprom_raw = []
421+
eeprom_raw = self._read_eeprom_specific_bytes(XCVR_TYPE_OFFSET, XCVR_TYPE_WIDTH)
422+
if eeprom_raw:
423+
if eeprom_raw[0] in SFP_TYPE_CODE_LIST:
424+
self._sfp_type = SFP_TYPE
425+
elif eeprom_raw[0] in QSFP_TYPE_CODE_LIST:
426+
self._sfp_type = QSFP_TYPE
427+
elif eeprom_raw[0] in QSFP_DD_TYPE_CODE_LIST:
428+
self._sfp_type = QSFP_DD_TYPE
429+
else:
430+
# we don't recognize this identifier value, treat the xSFP module as the default type
431+
self._sfp_type = self._default_sfp_type
432+
logger.log_info("Identifier value of {} module {} is {} which isn't recognized and will be treated as default type ({})".format(
433+
self._default_sfp_type, self.index, eeprom_raw[0], self._default_sfp_type
434+
))
407435
else:
408-
# we don't regonize this identifier value, treat the xSFP module as the default type
409-
self.sfp_type = sfp_type
410-
logger.log_info("Identifier value of {} module {} is {} which isn't regonized and will be treated as default type ({})".format(
411-
sfp_type, self.index, eeprom_raw[0], sfp_type
412-
))
413-
else:
414-
# eeprom_raw being None indicates the module is not present.
415-
# in this case we treat it as the default type according to the SKU
416-
self.sfp_type = sfp_type
436+
# eeprom_raw being None indicates the module is not present.
437+
# in this case we treat it as the default type according to the SKU
438+
self._sfp_type = self._default_sfp_type
439+
self._default_sfp_type = self._sfp_type
440+
return self._sfp_type
441+
442+
def _refresh_sfp_type(self):
443+
self._sfp_type = None
417444

418445
def _is_qsfp_copper(self):
419446
# This function read the specification compliance byte and
@@ -537,6 +564,7 @@ def _dom_capability_detect(self):
537564
self.dom_supported = False
538565
self.dom_temp_supported = False
539566
self.dom_volt_supported = False
567+
self.second_application_list = False
540568
self.dom_rx_power_supported = False
541569
self.dom_tx_power_supported = False
542570
self.dom_tx_bias_power_supported = False
@@ -595,7 +623,7 @@ def _convert_string_to_num(self, value_str):
595623
else:
596624
return 'N/A'
597625

598-
626+
@refresh_sfp_type()
599627
def get_transceiver_info(self):
600628
"""
601629
Retrieves transceiver info of this SFP
@@ -900,7 +928,7 @@ def get_transceiver_info(self):
900928

901929
return transceiver_info_dict
902930

903-
931+
@refresh_sfp_type()
904932
def get_transceiver_bulk_status(self):
905933
"""
906934
Retrieves transceiver bulk status of this SFP
@@ -1100,7 +1128,7 @@ def get_transceiver_bulk_status(self):
11001128

11011129
return transceiver_dom_info_dict
11021130

1103-
1131+
@refresh_sfp_type()
11041132
def get_transceiver_threshold_info(self):
11051133
"""
11061134
Retrieves transceiver threshold info of this SFP
@@ -1279,7 +1307,7 @@ def get_transceiver_threshold_info(self):
12791307

12801308
return transceiver_dom_threshold_info_dict
12811309

1282-
1310+
@refresh_sfp_type()
12831311
def get_reset_status(self):
12841312
"""
12851313
Retrieves the reset status of SFP
@@ -1332,6 +1360,7 @@ def get_reset_status(self):
13321360
dom_channel_status_data = sfpd_obj.parse_dom_channel_status(dom_channel_status_raw, 0)
13331361
return dom_channel_status_data['data']['Status']['value'] == 'On'
13341362

1363+
@refresh_sfp_type()
13351364
def get_rx_los(self):
13361365
"""
13371366
Retrieves the RX LOS (lost-of-signal) status of SFP
@@ -1364,7 +1393,7 @@ def get_rx_los(self):
13641393
rx_los_list.append(False)
13651394
return rx_los_list
13661395

1367-
1396+
@refresh_sfp_type()
13681397
def get_tx_fault(self):
13691398
"""
13701399
Retrieves the TX fault status of SFP
@@ -1397,7 +1426,7 @@ def get_tx_fault(self):
13971426
tx_fault_list.append(False)
13981427
return tx_fault_list
13991428

1400-
1429+
@refresh_sfp_type()
14011430
def get_tx_disable(self):
14021431
"""
14031432
Retrieves the tx_disable status of this SFP
@@ -1500,7 +1529,7 @@ def get_lpmode(self):
15001529

15011530
return oper_pwr_mode == SX_MGMT_PHY_MOD_PWR_MODE_LOW_E
15021531

1503-
1532+
@refresh_sfp_type(refresh_dom_capability=True)
15041533
def get_power_override(self):
15051534
"""
15061535
Retrieves the power-override status of this SFP
@@ -1521,7 +1550,7 @@ def get_power_override(self):
15211550
else:
15221551
return NotImplementedError
15231552

1524-
1553+
@refresh_sfp_type()
15251554
def get_temperature(self):
15261555
"""
15271556
Retrieves the temperature of this SFP
@@ -1579,7 +1608,7 @@ def get_temperature(self):
15791608
else:
15801609
return None
15811610

1582-
1611+
@refresh_sfp_type()
15831612
def get_voltage(self):
15841613
"""
15851614
Retrieves the supply voltage of this SFP
@@ -1638,7 +1667,7 @@ def get_voltage(self):
16381667
else:
16391668
return None
16401669

1641-
1670+
@refresh_sfp_type()
16421671
def get_tx_bias(self):
16431672
"""
16441673
Retrieves the TX bias current of this SFP
@@ -1672,7 +1701,7 @@ def get_tx_bias(self):
16721701
if sfpd_obj is None:
16731702
return None
16741703

1675-
if dom_tx_bias_power_supported:
1704+
if self.dom_tx_bias_power_supported:
16761705
dom_tx_bias_raw = self._read_eeprom_specific_bytes((offset + QSFP_DD_TX_BIAS_OFFSET), QSFP_DD_TX_BIAS_WIDTH)
16771706
if dom_tx_bias_raw is not None:
16781707
dom_tx_bias_data = sfpd_obj.parse_dom_tx_bias(dom_tx_bias_raw, 0)
@@ -1705,7 +1734,7 @@ def get_tx_bias(self):
17051734

17061735
return tx_bias_list
17071736

1708-
1737+
@refresh_sfp_type()
17091738
def get_rx_power(self):
17101739
"""
17111740
Retrieves the received optical power for this SFP
@@ -1781,7 +1810,7 @@ def get_rx_power(self):
17811810
return None
17821811
return rx_power_list
17831812

1784-
1813+
@refresh_sfp_type()
17851814
def get_tx_power(self):
17861815
"""
17871816
Retrieves the TX power of this SFP

0 commit comments

Comments
 (0)