Skip to content

Commit 8d65e2c

Browse files
Junchao-Mellanoxmssonicbld
authored andcommitted
[Mellanox] Fix issues found for CMIS host management (#17637)
- Why I did it 1. Thermal updater should wait more time for module to be initialized 2. sfp should get temperature threshold from EEPROM because SDK sysfs is not yet supported 3. Rename sfp function to fix typo 4. sfp.get_presence should return False if module is under initialization - How I did it 1. Thermal updater should wait more time for module to be initialized 2. sfp should get temperature threshold from EEPROM because SDK sysfs is not yet supported 3. Rename sfp function to fix typo 4. sfp.get_presence should return False if module is under initialization - How to verify it Manual test Unit test
1 parent 3f29b28 commit 8d65e2c

File tree

7 files changed

+109
-80
lines changed

7 files changed

+109
-80
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py

Lines changed: 58 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -327,17 +327,10 @@ def get_presence(self):
327327
Returns:
328328
bool: True if device is present, False if not
329329
"""
330-
if DeviceDataManager.is_independent_mode():
331-
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') != 0:
332-
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present'):
333-
return False
334-
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_good'):
335-
return False
336-
if not utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/power_on'):
337-
return False
338-
if utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_reset') == 1:
339-
return False
340-
330+
try:
331+
self.is_sw_control()
332+
except:
333+
return False
341334
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
342335
return eeprom_raw is not None
343336

@@ -877,6 +870,13 @@ def get_tx_fault(self):
877870
return [False] * api.NUM_CHANNELS if api else None
878871

879872
def get_temperature(self):
873+
"""Get SFP temperature
874+
875+
Returns:
876+
None if there is an error (sysfs does not exist or sysfs return None or module EEPROM not readable)
877+
0.0 if module temperature is not supported or module is under initialization
878+
other float value if module temperature is available
879+
"""
880880
try:
881881
if not self.is_sw_control():
882882
temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
@@ -893,59 +893,68 @@ def get_temperature(self):
893893
temperature = super().get_temperature()
894894
return temperature if temperature is not None else None
895895

896-
def get_temperature_warning_threashold(self):
896+
def get_temperature_warning_threshold(self):
897897
"""Get temperature warning threshold
898898
899899
Returns:
900-
int: temperature warning threshold
900+
None if there is an error (module EEPROM not readable)
901+
0.0 if warning threshold is not supported or module is under initialization
902+
other float value if warning threshold is available
901903
"""
902904
try:
903-
if not self.is_sw_control():
904-
emergency = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
905-
log_func=None,
906-
default=None)
907-
return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
905+
self.is_sw_control()
908906
except:
909-
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
910-
911-
thresh = self._get_temperature_threshold()
912-
if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
913-
return thresh[consts.TEMP_HIGH_WARNING_FIELD]
914-
return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
907+
return 0.0
908+
909+
support, thresh = self._get_temperature_threshold()
910+
if support is None or thresh is None:
911+
# Failed to read from EEPROM
912+
return None
913+
if support is False:
914+
# Do not support
915+
return 0.0
916+
return thresh.get(consts.TEMP_HIGH_WARNING_FIELD, SFP_DEFAULT_TEMP_WARNNING_THRESHOLD)
915917

916-
def get_temperature_critical_threashold(self):
918+
def get_temperature_critical_threshold(self):
917919
"""Get temperature critical threshold
918920
919921
Returns:
920-
int: temperature critical threshold
922+
None if there is an error (module EEPROM not readable)
923+
0.0 if critical threshold is not supported or module is under initialization
924+
other float value if critical threshold is available
921925
"""
922926
try:
923-
if not self.is_sw_control():
924-
critical = utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
925-
log_func=None,
926-
default=None)
927-
return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
927+
self.is_sw_control()
928928
except:
929-
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
929+
return 0.0
930930

931-
thresh = self._get_temperature_threshold()
932-
if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
933-
return thresh[consts.TEMP_HIGH_ALARM_FIELD]
934-
return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
931+
support, thresh = self._get_temperature_threshold()
932+
if support is None or thresh is None:
933+
# Failed to read from EEPROM
934+
return None
935+
if support is False:
936+
# Do not support
937+
return 0.0
938+
return thresh.get(consts.TEMP_HIGH_ALARM_FIELD, SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD)
935939

936940
def _get_temperature_threshold(self):
941+
"""Get temperature thresholds data from EEPROM
942+
943+
Returns:
944+
tuple: (support, thresh_dict)
945+
"""
937946
self.reinit()
938947
api = self.get_xcvr_api()
939948
if not api:
940-
return None
949+
return None, None
941950

942951
thresh_support = api.get_transceiver_thresholds_support()
943952
if thresh_support:
944953
if isinstance(api, sff8636.Sff8636Api) or isinstance(api, sff8436.Sff8436Api):
945-
return api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
946-
return api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
954+
return thresh_support, api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
955+
return thresh_support, api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)
947956
else:
948-
return None
957+
return thresh_support, {}
949958

950959
def get_xcvr_api(self):
951960
"""
@@ -964,17 +973,22 @@ def get_xcvr_api(self):
964973
def is_sw_control(self):
965974
if not DeviceDataManager.is_independent_mode():
966975
return False
967-
976+
968977
db = utils.DbUtils.get_db_instance('STATE_DB')
969978
logical_port = NvidiaSFPCommon.get_logical_port_by_sfp_index(self.sdk_index)
970979
if not logical_port:
971-
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
980+
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
972981

973982
initialized = db.exists('STATE_DB', f'TRANSCEIVER_STATUS|{logical_port}')
974983
if not initialized:
975-
raise Exception(f'Module {self.sdk_index} is not present or in initialization')
984+
raise Exception(f'Module {self.sdk_index} is not present or under initialization')
976985

977-
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control') == 1
986+
try:
987+
return utils.read_int_from_file(f'/sys/module/sx_core/asic0/module{self.sdk_index}/control',
988+
raise_exception=True, log_func=None) == 1
989+
except:
990+
# just in case control file does not exist
991+
raise Exception(f'Module {self.sdk_index} is under initialization')
978992

979993

980994
class RJ45Port(NvidiaSFPCommon):

platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,8 @@ def get_temperature(self):
431431
A float number of current temperature in Celsius up to nearest thousandth
432432
of one degree Celsius, e.g. 30.125
433433
"""
434-
return self.sfp.get_temperature()
434+
value = self.sfp.get_temperature()
435+
return value if (value != 0.0 and value is not None) else None
435436

436437
def get_high_threshold(self):
437438
"""
@@ -441,7 +442,8 @@ def get_high_threshold(self):
441442
A float number, the high threshold temperature of thermal in Celsius
442443
up to nearest thousandth of one degree Celsius, e.g. 30.125
443444
"""
444-
return self.sfp.get_temperature_warning_threashold()
445+
value = self.sfp.get_temperature_warning_threshold()
446+
return value if (value != 0.0 and value is not None) else None
445447

446448
def get_high_critical_threshold(self):
447449
"""
@@ -451,7 +453,8 @@ def get_high_critical_threshold(self):
451453
A float number, the high critical threshold temperature of thermal in Celsius
452454
up to nearest thousandth of one degree Celsius, e.g. 30.125
453455
"""
454-
return self.sfp.get_temperature_critical_threashold()
456+
value = self.sfp.get_temperature_critical_threshold()
457+
return value if (value != 0.0 and value is not None) else None
455458

456459
def get_position_in_parent(self):
457460
"""

platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,5 @@ def deinitialize(cls):
4646
is a no-op.
4747
:return:
4848
"""
49-
if DeviceDataManager.is_independent_mode():
49+
if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task:
5050
cls.thermal_updater_task.stop()

platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def __init__(self, sfp_list):
5656
def load_tc_config(self):
5757
asic_poll_interval = 1
5858
sfp_poll_interval = 10
59-
data = utils.load_json_file(TC_CONFIG_FILE)
59+
data = utils.load_json_file(TC_CONFIG_FILE, log_func=None)
6060
if not data:
6161
logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')
6262

@@ -108,7 +108,7 @@ def clean_thermal_data(self):
108108

109109
def wait_all_sfp_ready(self):
110110
logger.log_notice('Waiting for all SFP modules ready...')
111-
max_wait_time = 60
111+
max_wait_time = 300
112112
ready_set = set()
113113
while len(ready_set) != len(self._sfp_list):
114114
for sfp in self._sfp_list:
@@ -129,11 +129,11 @@ def get_asic_temp(self):
129129
temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
130130
return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None
131131

132-
def get_asic_temp_warning_threashold(self):
132+
def get_asic_temp_warning_threshold(self):
133133
emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
134134
return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
135135

136-
def get_asic_temp_critical_threashold(self):
136+
def get_asic_temp_critical_threshold(self):
137137
critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
138138
return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
139139

@@ -148,19 +148,19 @@ def update_single_module(self, sfp):
148148
critical_thresh = 0
149149
fault = 0
150150
else:
151-
warning_thresh = sfp.get_temperature_warning_threashold()
152-
critical_thresh = sfp.get_temperature_critical_threashold()
151+
warning_thresh = sfp.get_temperature_warning_threshold()
152+
critical_thresh = sfp.get_temperature_critical_threshold()
153153
fault = ERROR_READ_THERMAL_DATA if (temperature is None or warning_thresh is None or critical_thresh is None) else 0
154-
temperature = 0 if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
155-
warning_thresh = 0 if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
156-
critical_thresh = 0 if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)
154+
temperature = 0 if temperature is None else temperature * SFP_TEMPERATURE_SCALE
155+
warning_thresh = 0 if warning_thresh is None else warning_thresh * SFP_TEMPERATURE_SCALE
156+
critical_thresh = 0 if critical_thresh is None else critical_thresh * SFP_TEMPERATURE_SCALE
157157

158158
hw_management_independent_mode_update.thermal_data_set_module(
159159
0, # ASIC index always 0 for now
160160
sfp.sdk_index + 1,
161-
temperature,
162-
critical_thresh,
163-
warning_thresh,
161+
int(temperature),
162+
int(critical_thresh),
163+
int(warning_thresh),
164164
fault
165165
)
166166
else:
@@ -170,7 +170,7 @@ def update_single_module(self, sfp):
170170
if pre_presence != presence:
171171
self._sfp_status[sfp.sdk_index] = presence
172172
except Exception as e:
173-
logger.log_error('Failed to update module {sfp.sdk_index} thermal data - {e}')
173+
logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
174174
hw_management_independent_mode_update.thermal_data_set_module(
175175
0, # ASIC index always 0 for now
176176
sfp.sdk_index + 1,
@@ -187,8 +187,8 @@ def update_module(self):
187187
def update_asic(self):
188188
try:
189189
asic_temp = self.get_asic_temp()
190-
warn_threshold = self.get_asic_temp_warning_threashold()
191-
critical_threshold = self.get_asic_temp_critical_threashold()
190+
warn_threshold = self.get_asic_temp_warning_threshold()
191+
critical_threshold = self.get_asic_temp_critical_threshold()
192192
fault = 0
193193
if asic_temp is None:
194194
logger.log_error('Failed to read ASIC temperature, send fault to hw-management-tc')
@@ -203,7 +203,7 @@ def update_asic(self):
203203
fault
204204
)
205205
except Exception as e:
206-
logger.log_error('Failed to update ASIC thermal data - {e}')
206+
logger.log_error(f'Failed to update ASIC thermal data - {e}')
207207
hw_management_independent_mode_update.thermal_data_set_asic(
208208
0, # ASIC index always 0 for now
209209
0,

platform/mellanox/mlnx-platform-api/tests/test_sfp.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,18 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc
230230
assert page == '/tmp/1/data'
231231
assert page_offset is 0
232232

233+
@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
233234
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
234-
def test_sfp_get_presence(self, mock_read):
235+
def test_sfp_get_presence(self, mock_read, mock_control):
235236
sfp = SFP(0)
236237
mock_read.return_value = None
237238
assert not sfp.get_presence()
238239

239240
mock_read.return_value = 0
240241
assert sfp.get_presence()
242+
243+
mock_control.side_effect = RuntimeError('')
244+
assert not sfp.get_presence()
241245

242246
@mock.patch('sonic_platform.utils.read_int_from_file')
243247
def test_rj45_get_presence(self, mock_read_int):
@@ -318,14 +322,16 @@ def test_get_temperature(self, mock_read, mock_exists):
318322
def test_get_temperature_threshold(self):
319323
sfp = SFP(0)
320324
sfp.is_sw_control = mock.MagicMock(return_value=True)
321-
assert sfp.get_temperature_warning_threashold() == 70.0
322-
assert sfp.get_temperature_critical_threashold() == 80.0
323325

324326
mock_api = mock.MagicMock()
325327
mock_api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
326-
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
327-
assert sfp.get_temperature_warning_threashold() == 70.0
328-
assert sfp.get_temperature_critical_threashold() == 80.0
328+
sfp.get_xcvr_api = mock.MagicMock(return_value=None)
329+
assert sfp.get_temperature_warning_threshold() is None
330+
assert sfp.get_temperature_critical_threshold() is None
331+
332+
sfp.get_xcvr_api.return_value = mock_api
333+
assert sfp.get_temperature_warning_threshold() == 0.0
334+
assert sfp.get_temperature_critical_threshold() == 0.0
329335

330336
from sonic_platform_base.sonic_xcvr.fields import consts
331337
mock_api.get_transceiver_thresholds_support.return_value = True
@@ -334,8 +340,8 @@ def test_get_temperature_threshold(self):
334340
consts.TEMP_HIGH_ALARM_FIELD: 85.0,
335341
consts.TEMP_HIGH_WARNING_FIELD: 75.0
336342
})
337-
assert sfp.get_temperature_warning_threashold() == 75.0
338-
assert sfp.get_temperature_critical_threashold() == 85.0
343+
assert sfp.get_temperature_warning_threshold() == 75.0
344+
assert sfp.get_temperature_critical_threshold() == 85.0
339345

340346
@mock.patch('sonic_platform.sfp.NvidiaSFPCommon.get_logical_port_by_sfp_index')
341347
@mock.patch('sonic_platform.utils.read_int_from_file')

platform/mellanox/mlnx-platform-api/tests/test_thermal.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,17 @@ def test_sfp_thermal(self):
160160
assert thermal.get_position_in_parent() == 1
161161
assert thermal.is_replaceable() == False
162162
sfp.get_temperature = mock.MagicMock(return_value=35.4)
163-
sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70)
164-
sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80)
163+
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70)
164+
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80)
165165
assert thermal.get_temperature() == 35.4
166166
assert thermal.get_high_threshold() == 70
167167
assert thermal.get_high_critical_threshold() == 80
168+
sfp.get_temperature = mock.MagicMock(return_value=0)
169+
sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=0)
170+
sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=None)
171+
assert thermal.get_temperature() is None
172+
assert thermal.get_high_threshold() is None
173+
assert thermal.get_high_critical_threshold() is None
168174

169175
@mock.patch('sonic_platform.utils.read_float_from_file')
170176
def test_get_temperature(self, mock_read):

platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,23 +97,23 @@ def test_update_asic(self, mock_read):
9797
mock_read.return_value = 8
9898
updater = ThermalUpdater(None)
9999
assert updater.get_asic_temp() == 1000
100-
assert updater.get_asic_temp_warning_threashold() == 1000
101-
assert updater.get_asic_temp_critical_threashold() == 1000
100+
assert updater.get_asic_temp_warning_threshold() == 1000
101+
assert updater.get_asic_temp_critical_threshold() == 1000
102102
updater.update_asic()
103103
hw_management_independent_mode_update.thermal_data_set_asic.assert_called_once()
104104

105105
mock_read.return_value = None
106106
assert updater.get_asic_temp() is None
107-
assert updater.get_asic_temp_warning_threashold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
108-
assert updater.get_asic_temp_critical_threashold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
107+
assert updater.get_asic_temp_warning_threshold() == ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD
108+
assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD
109109

110110
def test_update_module(self):
111111
mock_sfp = mock.MagicMock()
112112
mock_sfp.sdk_index = 10
113113
mock_sfp.get_presence = mock.MagicMock(return_value=True)
114114
mock_sfp.get_temperature = mock.MagicMock(return_value=55.0)
115-
mock_sfp.get_temperature_warning_threashold = mock.MagicMock(return_value=70.0)
116-
mock_sfp.get_temperature_critical_threashold = mock.MagicMock(return_value=80.0)
115+
mock_sfp.get_temperature_warning_threshold = mock.MagicMock(return_value=70.0)
116+
mock_sfp.get_temperature_critical_threshold = mock.MagicMock(return_value=80.0)
117117
updater = ThermalUpdater([mock_sfp])
118118
updater.update_module()
119119
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 55000, 80000, 70000, 0)

0 commit comments

Comments
 (0)