From b89cc032b3ed5ac7963538fbe09e3506be1176a1 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Thu, 11 Jul 2019 12:14:57 +0300 Subject: [PATCH 1/3] support new platform api, thermal and psu part for psu, all APIs are supported. for thermal, we support get_temperature, get_high_threshold for the thermal sensors of cpu core, cpu pack, psu and sfp module and get_temperature for the ambient thermal sensors around the asic, port, fan, comex and board. --- .../sonic_platform/chassis.py | 20 +- .../mlnx-platform-api/sonic_platform/psu.py | 135 +++++-- .../sonic_platform/thermal.py | 351 ++++++++++++++++++ 3 files changed, 478 insertions(+), 28 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index f9875a296d35..af15632c4e3a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -16,6 +16,7 @@ from sonic_platform.fan import Fan from sonic_platform.fan import FAN_PATH from sonic_platform.sfp import SFP + from sonic_platform.thermal import Thermal, initialize_thermals from sonic_platform.watchdog import get_watchdog from sonic_daemon_base.daemon_base import Logger from eeprom import Eeprom @@ -69,7 +70,7 @@ # magic code defnition for port number, qsfp port position of each hwsku # port_position_tuple = (PORT_START, QSFP_PORT_START, PORT_END, PORT_IN_BLOCK, EEPROM_OFFSET) -hwsku_dict = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 0, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 3, 'ACS-MSN3700': 0, 'ACS-MSN3700C': 0, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0} +hwsku_dict_port = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 0, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 3, 'ACS-MSN3700': 0, 'ACS-MSN3700C': 0, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0} 
port_position_tuple_list = [(0, 0, 31, 32, 1), (0, 0, 15, 16, 1), (0, 48, 55, 56, 1),(0, 18, 21, 22, 1)] class Chassis(ChassisBase): @@ -78,9 +79,12 @@ class Chassis(ChassisBase): def __init__(self): super(Chassis, self).__init__() + # Initialize SKU name + self.sku = self._get_sku_name() + # Initialize PSU list for index in range(MLNX_NUM_PSU): - psu = Psu(index) + psu = Psu(index, self.sku) self._psu_list.append(psu) # Initialize watchdog @@ -99,7 +103,7 @@ def __init__(self): self._fan_list.append(fan) # Initialize SFP list - port_position_tuple = self._get_port_position_tuple_by_sku_name() + port_position_tuple = self._get_port_position_tuple_by_sku_name(self.sku) self.PORT_START = port_position_tuple[0] self.QSFP_PORT_START = port_position_tuple[1] self.PORT_END = port_position_tuple[2] @@ -112,6 +116,9 @@ def __init__(self): sfp_module = SFP(index, 'SFP') self._sfp_list.append(sfp_module) + # Initialize thermals + initialize_thermals(self.sku, self._thermal_list, self._psu_list) + # Initialize EEPROM self.eeprom = Eeprom() @@ -137,10 +144,13 @@ def _extract_num_of_fans_and_fan_drawers(self): return num_of_fan, num_of_drawer - def _get_port_position_tuple_by_sku_name(self): + def _get_sku_name(self): p = subprocess.Popen(GET_HWSKU_CMD, shell=True, stdout=subprocess.PIPE) out, err = p.communicate() - position_tuple = port_position_tuple_list[hwsku_dict[out.rstrip('\n')]] + return out.rstrip('\n') + + def _get_port_position_tuple_by_sku_name(self, sku): + position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku]] return position_tuple def get_base_mac(self): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index bcbd643eb005..68650d2e25fd 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -12,44 +12,100 @@ try: from sonic_platform_base.psu_base import PsuBase + from 
sonic_daemon_base.daemon_base import Logger from sonic_platform.fan import Fan except ImportError as e: raise ImportError (str(e) + "- required module not found") +# Global logger class instance +SYSLOG_IDENTIFIER = "mlnx-psu" +logger = Logger(SYSLOG_IDENTIFIER) + psu_list = [] +PSU_CURRENT = "current" +PSU_VOLTAGE = "voltage" +PSU_POWER = "power" +# in most SKUs the file psuX_curr, psuX_volt and psuX_power contain current, voltage and power data respectively. +# but there are exceptions which will be handled by the following dictionary +hwsku_dict_psu = {'ACS-MSN3700': 1, 'ACS-MSN3700C': 1, 'ACS-MSN3800': 1} +psu_profile_list = [ + # default filename convention + { + PSU_CURRENT : "power/psu{}_curr", + PSU_VOLTAGE : "power/psu{}_volt", + PSU_POWER : "power/psu{}_power" + }, + # for 3700, 3700c, 3800 + { + PSU_CURRENT : "power/psu{}_curr", + PSU_VOLTAGE : "power/psu{}_volt_out2", + PSU_POWER : "power/psu{}_power" + } +] + class Psu(PsuBase): """Platform-specific Psu class""" - def __init__(self, psu_index): + def __init__(self, psu_index, sku): global psu_list PsuBase.__init__(self) # PSU is 1-based on Mellanox platform self.index = psu_index + 1 psu_list.append(self.index) - self.psu_path = "/var/run/hw-management/thermal/" - self.psu_oper_status = "psu{}_pwr_status".format(self.index) - self.psu_presence = "psu{}_status".format(self.index) - if os.path.exists(os.path.join(self.psu_path, self.psu_presence)): - self.presence_file_exists = True + self.psu_path = "/var/run/hw-management/" + self.psu_oper_status = "thermal/psu{}_pwr_status".format(self.index) + + if sku in hwsku_dict_psu: + filemap = psu_profile_list[hwsku_dict_psu[sku]] + else: + filemap = psu_profile_list[0] + + print "sku {} filemap {}".format(sku, filemap) + psu_voltage = filemap[PSU_VOLTAGE].format(self.index) + if os.path.exists(os.path.join(self.psu_path, psu_voltage)): + self.psu_voltage = psu_voltage + else: + self.psu_voltage = None + psu_curr = filemap[PSU_CURRENT].format(self.index) + if 
os.path.exists(os.path.join(self.psu_path, psu_curr)): + self.psu_current = psu_curr + else: + self.psu_current = None + psu_power = filemap[PSU_POWER].format(self.index) + if os.path.exists(os.path.join(self.psu_path, psu_power)): + self.psu_power = psu_power + else: + self.psu_power = None + psu_presence = "thermal/psu{}_status".format(self.index) + if os.path.exists(os.path.join(self.psu_path, psu_presence)): + self.psu_presence = psu_presence else: - self.presence_file_exists = False + self.psu_presence = None fan = Fan(psu_index, psu_index, True) if fan.get_presence(): self._fan = fan - def get_status(self): + def _read_generic_file(self, filename, len): + """ + Read a generic file, returns the contents of the file + """ + result = 0 + try: + with open(filename, 'r') as fileobj: + result = int(fileobj.read()) + except: + logger.log_info("Fail to read file {}, maybe it doesn't exist".format(filename)) + result = 0 + return result + + def get_powergood_status(self): """ Retrieves the operational status of power supply unit (PSU) defined Returns: bool: True if PSU is operating properly, False if not """ - status = 0 - try: - with open(os.path.join(self.psu_path, self.psu_oper_status), 'r') as power_status: - status = int(power_status.read()) - except (ValueError, IOError): - status = 0 + status = self._read_generic_file(os.path.join(self.psu_path, self.psu_oper_status), 0) return status == 1 @@ -60,15 +116,48 @@ def get_presence(self): Returns: bool: True if PSU is present, False if not """ - status = 0 - if self.presence_file_exists: - try: - with open(os.path.join(self.psu_path, self.psu_presence), 'r') as presence_status: - status = int(presence_status.read()) - except (ValueError, IOError): - status = 0 + if self.psu_presence is not None: + status = self._read_generic_file(os.path.join(self.psu_path, self.psu_presence), 0) + return status == 1 else: - status = self.index in psu_list + return True - return status == 1 + def get_voltage(self): + """ + Retrieves 
current PSU voltage output + Returns: + A float number, the output voltage in volts, + e.g. 12.1 + """ + if self.psu_voltage is not None: + voltage = self._read_generic_file(os.path.join(self.psu_path, self.psu_voltage), 0) + return float(voltage) / 1000 + else: + return None + + def get_current(self): + """ + Retrieves present electric current supplied by PSU + + Returns: + A float number, the electric current in amperes, e.g 15.4 + """ + if self.psu_current is not None: + amperes = self._read_generic_file(os.path.join(self.psu_path, self.psu_current), 0) + return float(amperes) / 1000 + else: + return None + + def get_power(self): + """ + Retrieves current energy supplied by PSU + + Returns: + A float number, the power in watts, e.g. 302.6 + """ + if self.psu_power is not None: + power = self._read_generic_file(os.path.join(self.psu_path, self.psu_power), 0) + return float(power) / 1000000 + else: + return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py new file mode 100644 index 000000000000..dec25d36d89b --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python + +############################################################################# +# Mellanox +# +# Module contains an implementation of SONiC Platform Base API and +# provides the thermals data which are available in the platform +# +############################################################################# + +import os.path + +try: + from sonic_platform_base.thermal_base import ThermalBase + from sonic_daemon_base.daemon_base import Logger + from os import listdir + from os.path import isfile, join + import io +except ImportError as e: + raise ImportError (str(e) + "- required module not found") + +# Global logger class instance +SYSLOG_IDENTIFIER = "mlnx-thermal" +logger = Logger(SYSLOG_IDENTIFIER) + +THERMAL_DEV_CATEGORY_CPU_CORE = 
"cpu_core" +THERMAL_DEV_CATEGORY_CPU_PACK = "cpu_pack" +THERMAL_DEV_CATEGORY_MODULE = "module" +THERMAL_DEV_CATEGORY_PSU = "psu" +THERMAL_DEV_CATEGORY_GEARBOX = "gearbox" +THERMAL_DEV_CATEGORY_AMBIENT = "ambient" + +THERMAL_DEV_ASIC_AMBIENT = "asic_amb" +THERMAL_DEV_FAN_AMBIENT = "fan_amb" +THERMAL_DEV_PORT_AMBIENT = "port_amb" +THERMAL_DEV_COMEX_AMBIENT = "comex_amb" +THERMAL_DEV_BOARD_AMBIENT = "board_amb" + +THERMAL_API_GET_TEMPERATURE = "get_temperature" +THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold" + +HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" + +thermal_api_handler_cpu_core = { + THERMAL_API_GET_TEMPERATURE:"cpu_core{}", + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max" +} +thermal_api_handler_cpu_pack = { + THERMAL_API_GET_TEMPERATURE:"cpu_pack", + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max" +} +thermal_api_handler_module = { + THERMAL_API_GET_TEMPERATURE:"temp_input_module{}", + THERMAL_API_GET_HIGH_THRESHOLD:"temp_crit_module{}" +} +thermal_api_handler_psu = { + THERMAL_API_GET_TEMPERATURE:"psu{}", + THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_max" +} +thermal_api_handler_gearbox = { + THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}", + THERMAL_API_GET_HIGH_THRESHOLD:None +} +thermal_ambient_apis = { + THERMAL_DEV_ASIC_AMBIENT : "asic", + THERMAL_DEV_PORT_AMBIENT : "port_amb", + THERMAL_DEV_FAN_AMBIENT : "fan_amb", + THERMAL_DEV_COMEX_AMBIENT : "comex_amb", + THERMAL_DEV_BOARD_AMBIENT : "board_amb" +} +thermal_ambient_name = { + THERMAL_DEV_ASIC_AMBIENT : "Ambient ASIC Temp", + THERMAL_DEV_PORT_AMBIENT : "Ambient Port Side Temp", + THERMAL_DEV_FAN_AMBIENT : "Ambient Fan Side Temp", + THERMAL_DEV_COMEX_AMBIENT : "Ambient COMEX Temp", + THERMAL_DEV_BOARD_AMBIENT : "Ambient Board Temp" +} +thermal_api_handlers = { + THERMAL_DEV_CATEGORY_CPU_CORE : thermal_api_handler_cpu_core, + THERMAL_DEV_CATEGORY_CPU_PACK : thermal_api_handler_cpu_pack, + THERMAL_DEV_CATEGORY_MODULE : thermal_api_handler_module, + THERMAL_DEV_CATEGORY_PSU : 
thermal_api_handler_psu, + THERMAL_DEV_CATEGORY_GEARBOX : thermal_api_handler_gearbox +} +thermal_name = { + THERMAL_DEV_CATEGORY_CPU_CORE : "CPU Core {} Temp", + THERMAL_DEV_CATEGORY_CPU_PACK : "CPU Pack Temp", + THERMAL_DEV_CATEGORY_MODULE : "xSFP module {} Temp", + THERMAL_DEV_CATEGORY_PSU : "PSU-{} Temp", + THERMAL_DEV_CATEGORY_GEARBOX : "Gearbox {} Temp" +} + +thermal_device_categories_all = [ + THERMAL_DEV_CATEGORY_CPU_CORE, + THERMAL_DEV_CATEGORY_CPU_PACK, + THERMAL_DEV_CATEGORY_MODULE, + THERMAL_DEV_CATEGORY_PSU, + THERMAL_DEV_CATEGORY_AMBIENT, + THERMAL_DEV_CATEGORY_GEARBOX +] + +thermal_device_categories_singleton = [ + THERMAL_DEV_CATEGORY_CPU_PACK, + THERMAL_DEV_CATEGORY_AMBIENT +] +thermal_api_names = [ + THERMAL_API_GET_TEMPERATURE, + THERMAL_API_GET_HIGH_THRESHOLD +] + +hwsku_dict_thermal = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 3, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 4, 'ACS-MSN3700': 5, 'ACS-MSN3700C': 6, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0, 'ACS-MSN3800': 7} +thermal_profile_list = [ + # 2700 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2), + THERMAL_DEV_CATEGORY_MODULE:(1, 32), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,1), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT + ] + ) + }, + # 2100 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4), + THERMAL_DEV_CATEGORY_MODULE:(1, 16), + THERMAL_DEV_CATEGORY_PSU:(0, 0), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,0), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT, + ] + ) + }, + # 2410 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2), + THERMAL_DEV_CATEGORY_MODULE:(1, 56), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,1), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + 
THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT, + ] + ) + }, + # 2740 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4), + THERMAL_DEV_CATEGORY_MODULE:(1, 32), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,0), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT, + ] + ) + }, + # 2010 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4), + THERMAL_DEV_CATEGORY_MODULE:(1, 22), + THERMAL_DEV_CATEGORY_PSU:(0, 0), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,0), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT, + ] + ) + }, + # 3700 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4), + THERMAL_DEV_CATEGORY_MODULE:(1, 32), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,1), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_COMEX_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT + ] + ) + }, + # 3700c + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 2), + THERMAL_DEV_CATEGORY_MODULE:(1, 32), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,1), + THERMAL_DEV_CATEGORY_GEARBOX:(0,0), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_COMEX_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT + ] + ) + }, + # 3800 + { + THERMAL_DEV_CATEGORY_CPU_CORE:(0, 4), + THERMAL_DEV_CATEGORY_MODULE:(1, 64), + THERMAL_DEV_CATEGORY_PSU:(1, 2), + THERMAL_DEV_CATEGORY_CPU_PACK:(0,1), + THERMAL_DEV_CATEGORY_GEARBOX:(1,32), + THERMAL_DEV_CATEGORY_AMBIENT:(0, + [ + THERMAL_DEV_ASIC_AMBIENT, + THERMAL_DEV_COMEX_AMBIENT, + THERMAL_DEV_PORT_AMBIENT, + THERMAL_DEV_FAN_AMBIENT + ] + ) + }, +] + +def initialize_thermals(sku, thermal_list, psu_list): + tp_index = hwsku_dict_thermal[sku] + thermal_profile = 
thermal_profile_list[tp_index] + for category in thermal_device_categories_all: + if category == THERMAL_DEV_CATEGORY_AMBIENT: + count, ambient_list = thermal_profile[category] + for ambient in ambient_list: + thermal = Thermal(category, ambient, True) + thermal_list.append(thermal) + else: + start, count = 0, 0 + if category in thermal_profile: + start, count = thermal_profile[category] + if count == 0: + continue + if count == 1: + thermal = Thermal(category, 0, False) + thermal_list.append(thermal) + else: + if category == THERMAL_DEV_CATEGORY_PSU: + for index in range(count): + thermal = Thermal(category, start + index, True, psu_list[index]) + thermal_list.append(thermal) + else: + for index in range(count): + thermal = Thermal(category, start + index, True) + thermal_list.append(thermal) + +class Thermal(ThermalBase): + def __init__(self, category, index, has_index, dependency = None): + """ + index should be a string for category ambient and int for other categories + """ + if category == THERMAL_DEV_CATEGORY_AMBIENT: + self.name = thermal_ambient_name[index] + self.index = index + elif has_index: + self.name = thermal_name[category].format(index) + self.index = index + else: + self.name = thermal_name[category] + self.index = 0 + + self.category = category + self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE) + self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) + self.dependency = dependency + + def get_name(self): + """ + Retrieves the name of the device + + Returns: + string: The name of the device + """ + return self.name + + def _read_generic_file(self, filename, len): + """ + Read a generic file, returns the contents of the file + """ + result = None + try: + with open(filename, 'r') as fileobj: + result = fileobj.read() + except: + if self.dependency is None or self.dependency.get_powergood_status(): + logger.log_warning("Fail to read file {}, maybe it doesn't exist".format(filename)) + result = None + 
return result + + def _get_file_from_api(self, api_name): + if self.category == THERMAL_DEV_CATEGORY_AMBIENT: + if api_name == THERMAL_API_GET_TEMPERATURE: + filename = thermal_ambient_apis[self.index] + else: + return None + else: + handler = thermal_api_handlers[self.category][api_name] + if self.category in thermal_device_categories_singleton: + filename = handler + else: + filename = handler.format(self.index) + return join(HW_MGMT_THERMAL_ROOT, filename) + + def get_temperature(self): + """ + Retrieves current temperature reading from thermal + + Returns: + A float number of current temperature in Celsius up to nearest thousandth + of one degree Celsius, e.g. 30.125 + """ + value_str = self._read_generic_file(self.temperature, 0) + if value_str is None: + return None + value_float = float(value_str) + return value_float / 1000.0 + + def get_high_threshold(self): + """ + Retrieves the high threshold temperature of thermal + + Returns: + A float number, the high threshold temperature of thermal in Celsius + up to nearest thousandth of one degree Celsius, e.g. 30.125 + """ + if self.high_threshold is None: + return None + value_str = self._read_generic_file(self.high_threshold, 0) + if value_str is None: + return None + value_float = float(value_str) + return value_float / 1000.0 From 70322a94b4214b68b8c2de67979b05e05f5ae52b Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Sat, 20 Jul 2019 03:04:23 +0300 Subject: [PATCH 2/3] 1. address review comments 2. improve the handling of PSU inserting/removal 3. 
tolerance diverse psu thermal sensor file name conventions --- .../sonic_platform/chassis.py | 21 +++-- .../mlnx-platform-api/sonic_platform/psu.py | 77 +++++++++++-------- .../sonic_platform/thermal.py | 41 +++++++--- 3 files changed, 83 insertions(+), 56 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index af15632c4e3a..01f5fb154778 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -8,8 +8,6 @@ # ############################################################################# -import sys - try: from sonic_platform_base.chassis_base import ChassisBase from sonic_platform.psu import Psu @@ -22,6 +20,7 @@ from eeprom import Eeprom from os import listdir from os.path import isfile, join + import sys import io import re import subprocess @@ -65,7 +64,7 @@ COMPONENT_CPLD2 = "CPLD2" # Global logger class instance -SYSLOG_IDENTIFIER = "mlnx-chassis" +SYSLOG_IDENTIFIER = "mlnx-chassis-api" logger = Logger(SYSLOG_IDENTIFIER) # magic code defnition for port number, qsfp port position of each hwsku @@ -80,11 +79,11 @@ def __init__(self): super(Chassis, self).__init__() # Initialize SKU name - self.sku = self._get_sku_name() + self.sku_name = self._get_sku_name() # Initialize PSU list for index in range(MLNX_NUM_PSU): - psu = Psu(index, self.sku) + psu = Psu(index, self.sku_name) self._psu_list.append(psu) # Initialize watchdog @@ -103,7 +102,7 @@ def __init__(self): self._fan_list.append(fan) # Initialize SFP list - port_position_tuple = self._get_port_position_tuple_by_sku_name(self.sku) + port_position_tuple = self._get_port_position_tuple_by_sku_name() self.PORT_START = port_position_tuple[0] self.QSFP_PORT_START = port_position_tuple[1] self.PORT_END = port_position_tuple[2] @@ -117,7 +116,7 @@ def __init__(self): self._sfp_list.append(sfp_module) # Initialize thermals - 
initialize_thermals(self.sku, self._thermal_list, self._psu_list) + initialize_thermals(self.sku_name, self._thermal_list, self._psu_list) # Initialize EEPROM self.eeprom = Eeprom() @@ -149,8 +148,8 @@ def _get_sku_name(self): out, err = p.communicate() return out.rstrip('\n') - def _get_port_position_tuple_by_sku_name(self, sku): - position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku]] + def _get_port_position_tuple_by_sku_name(self): + position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]] return position_tuple def get_base_mac(self): @@ -193,8 +192,8 @@ def _read_generic_file(self, filename, len): result = fileobj.read(len) fileobj.close() return result - except: - logger.log_warning("Fail to read file {}, maybe it doesn't exist".format(filename)) + except Exception as e: + logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return '' def _verify_reboot_cause(self, filename): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 68650d2e25fd..21fd6dc38f06 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -8,9 +8,8 @@ # ############################################################################# -import os.path - try: + import os.path from sonic_platform_base.psu_base import PsuBase from sonic_daemon_base.daemon_base import Logger from sonic_platform.fan import Fan @@ -18,7 +17,7 @@ raise ImportError (str(e) + "- required module not found") # Global logger class instance -SYSLOG_IDENTIFIER = "mlnx-psu" +SYSLOG_IDENTIFIER = "mlnx-psu-api" logger = Logger(SYSLOG_IDENTIFIER) psu_list = [] @@ -26,6 +25,12 @@ PSU_CURRENT = "current" PSU_VOLTAGE = "voltage" PSU_POWER = "power" + +# SKUs with unplugable PSUs: +# 1. don't have psuX_status and should be treated as always present +# 2. 
don't have voltage, current and power values +hwsku_dict_with_unplugable_psu = ['ACS-MSN2010', 'ACS-MSN2100'] + # in most SKUs the file psuX_curr, psuX_volt and psuX_power contain current, voltage and power data respectively. # but there are exceptions which will be handled by the following dictionary hwsku_dict_psu = {'ACS-MSN3700': 1, 'ACS-MSN3700C': 1, 'ACS-MSN3800': 1} @@ -53,34 +58,39 @@ def __init__(self, psu_index, sku): self.index = psu_index + 1 psu_list.append(self.index) self.psu_path = "/var/run/hw-management/" - self.psu_oper_status = "thermal/psu{}_pwr_status".format(self.index) + psu_oper_status = "thermal/psu{}_pwr_status".format(self.index) + #psu_oper_status should always be present for all SKUs + self.psu_oper_status = os.path.join(self.psu_path, psu_oper_status) if sku in hwsku_dict_psu: filemap = psu_profile_list[hwsku_dict_psu[sku]] else: filemap = psu_profile_list[0] - print "sku {} filemap {}".format(sku, filemap) - psu_voltage = filemap[PSU_VOLTAGE].format(self.index) - if os.path.exists(os.path.join(self.psu_path, psu_voltage)): - self.psu_voltage = psu_voltage - else: + if sku in hwsku_dict_with_unplugable_psu: + self.always_presence = True self.psu_voltage = None - psu_curr = filemap[PSU_CURRENT].format(self.index) - if os.path.exists(os.path.join(self.psu_path, psu_curr)): - self.psu_current = psu_curr - else: self.psu_current = None - psu_power = filemap[PSU_POWER].format(self.index) - if os.path.exists(os.path.join(self.psu_path, psu_power)): - self.psu_power = psu_power - else: self.psu_power = None - psu_presence = "thermal/psu{}_status".format(self.index) - if os.path.exists(os.path.join(self.psu_path, psu_presence)): - self.psu_presence = psu_presence - else: self.psu_presence = None + else: + self.always_presence = False + psu_voltage = filemap[PSU_VOLTAGE].format(self.index) + psu_voltage = os.path.join(self.psu_path, psu_voltage) + self.psu_voltage = psu_voltage + + psu_current = filemap[PSU_CURRENT].format(self.index) + 
psu_current = os.path.join(self.psu_path, psu_current) + self.psu_current = psu_current + + psu_power = filemap[PSU_POWER].format(self.index) + psu_power = os.path.join(self.psu_path, psu_power) + self.psu_power = psu_power + + psu_presence = "thermal/psu{}_status".format(self.index) + psu_presence = os.path.join(self.psu_path, psu_presence) + self.psu_presence = psu_presence + fan = Fan(psu_index, psu_index, True) if fan.get_presence(): self._fan = fan @@ -93,9 +103,8 @@ def _read_generic_file(self, filename, len): try: with open(filename, 'r') as fileobj: result = int(fileobj.read()) - except: - logger.log_info("Fail to read file {}, maybe it doesn't exist".format(filename)) - result = 0 + except Exception as e: + logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result def get_powergood_status(self): @@ -116,11 +125,11 @@ def get_presence(self): Returns: bool: True if PSU is present, False if not """ - if self.psu_presence is not None: - status = self._read_generic_file(os.path.join(self.psu_path, self.psu_presence), 0) - return status == 1 + if self.always_presence: + return self.always_presence else: - return True + status = self._read_generic_file(self.psu_presence, 0) + return status == 1 def get_voltage(self): """ @@ -130,8 +139,8 @@ def get_voltage(self): A float number, the output voltage in volts, e.g. 
12.1 """ - if self.psu_voltage is not None: - voltage = self._read_generic_file(os.path.join(self.psu_path, self.psu_voltage), 0) + if self.psu_voltage is not None and os.path.exists(self.psu_voltage): + voltage = self._read_generic_file(self.psu_voltage, 0) return float(voltage) / 1000 else: return None @@ -143,8 +152,8 @@ def get_current(self): Returns: A float number, the electric current in amperes, e.g 15.4 """ - if self.psu_current is not None: - amperes = self._read_generic_file(os.path.join(self.psu_path, self.psu_current), 0) + if self.psu_current is not None and os.path.exists(self.psu_current): + amperes = self._read_generic_file(self.psu_current, 0) return float(amperes) / 1000 else: return None @@ -156,8 +165,8 @@ def get_power(self): Returns: A float number, the power in watts, e.g. 302.6 """ - if self.psu_power is not None: - power = self._read_generic_file(os.path.join(self.psu_path, self.psu_power), 0) + if self.psu_power is not None and os.path.exists(self.psu_power): + power = self._read_generic_file(self.psu_power, 0) return float(power) / 1000000 else: return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index dec25d36d89b..faabdd808630 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -8,19 +8,18 @@ # ############################################################################# -import os.path - try: from sonic_platform_base.thermal_base import ThermalBase from sonic_daemon_base.daemon_base import Logger from os import listdir from os.path import isfile, join import io + import os.path except ImportError as e: raise ImportError (str(e) + "- required module not found") # Global logger class instance -SYSLOG_IDENTIFIER = "mlnx-thermal" +SYSLOG_IDENTIFIER = "mlnx-thermal-api" logger = Logger(SYSLOG_IDENTIFIER) THERMAL_DEV_CATEGORY_CPU_CORE = "cpu_core" @@ 
-54,7 +53,7 @@ THERMAL_API_GET_HIGH_THRESHOLD:"temp_crit_module{}" } thermal_api_handler_psu = { - THERMAL_API_GET_TEMPERATURE:"psu{}", + THERMAL_API_GET_TEMPERATURE:None, THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_max" } thermal_api_handler_gearbox = { @@ -108,7 +107,14 @@ THERMAL_API_GET_HIGH_THRESHOLD ] -hwsku_dict_thermal = {'ACS-MSN2700': 0, "LS-SN2700":0, 'ACS-MSN2740': 3, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 4, 'ACS-MSN3700': 5, 'ACS-MSN3700C': 6, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0, 'ACS-MSN3800': 7} +# thermal sensor file name convention for SKUs with single thermal sensor for each PSU +THERMAL_PSU_TEMP_FOR_SINGLE_SENSOR = "psu{}" +# thermal sensor file name convention for SKUs with multiple thermal sensors for each PSU +THERMAL_PSU_TEMP_FOR_MULTI_SENSORS = "psu{}_temp" + +hwsku_with_single_thermal_per_sku = ['ACS-MSN2700', 'LS-SN2700','Mellanox-SN2700'] + +hwsku_dict_thermal = {'ACS-MSN2700': 0, 'LS-SN2700':0, 'ACS-MSN2740': 3, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 4, 'ACS-MSN3700': 5, 'ACS-MSN3700C': 6, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0, 'ACS-MSN3800': 7} thermal_profile_list = [ # 2700 { @@ -236,6 +242,13 @@ ] def initialize_thermals(sku, thermal_list, psu_list): + # initialize temperature sensor handlers + if sku in hwsku_with_single_thermal_per_sku: + thermal_api_handler_psu[THERMAL_API_GET_TEMPERATURE] = THERMAL_PSU_TEMP_FOR_SINGLE_SENSOR + else: + thermal_api_handler_psu[THERMAL_API_GET_TEMPERATURE] = THERMAL_PSU_TEMP_FOR_MULTI_SENSORS + + # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] for category in thermal_device_categories_all: @@ -256,7 +269,7 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index]) + thermal = Thermal(category, start + index, True, 
psu_list[index].get_powergood_status, "power off") thermal_list.append(thermal) else: for index in range(count): @@ -264,7 +277,7 @@ def initialize_thermals(sku, thermal_list, psu_list): thermal_list.append(thermal) class Thermal(ThermalBase): - def __init__(self, category, index, has_index, dependency = None): + def __init__(self, category, index, has_index, dependency = None, hint = None): """ index should be a string for category ambient and int for other categories """ @@ -282,6 +295,7 @@ def __init__(self, category, index, has_index, dependency = None): self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE) self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) self.dependency = dependency + self.dependent_hint = hint def get_name(self): """ @@ -300,10 +314,8 @@ def _read_generic_file(self, filename, len): try: with open(filename, 'r') as fileobj: result = fileobj.read() - except: - if self.dependency is None or self.dependency.get_powergood_status(): - logger.log_warning("Fail to read file {}, maybe it doesn't exist".format(filename)) - result = None + except Exception as e: + logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result def _get_file_from_api(self, api_name): @@ -328,6 +340,13 @@ def get_temperature(self): A float number of current temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ + if self.dependency and not self.dependency(): + if self.dependent_hint: + hint = self.dependent_hint + else: + hint = "unknown reason" + logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: return None From d4db5d80e0fae97cf38dfbee9f91b26d69e0a85d Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Sat, 20 Jul 2019 05:36:18 +0300 Subject: [PATCH 3/3] 1. adjust thermal code according to the latest version of hw-management 2. 
check power_good_status rather than whether the file exists ahead of reading voltage, current and power of PSU --- .../mlnx-platform-api/sonic_platform/psu.py | 6 +++--- .../mlnx-platform-api/sonic_platform/thermal.py | 17 ++--------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 21fd6dc38f06..a6f217d82bd3 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -139,7 +139,7 @@ def get_voltage(self): A float number, the output voltage in volts, e.g. 12.1 """ - if self.psu_voltage is not None and os.path.exists(self.psu_voltage): + if self.psu_voltage is not None and self.get_powergood_status(): voltage = self._read_generic_file(self.psu_voltage, 0) return float(voltage) / 1000 else: @@ -152,7 +152,7 @@ def get_current(self): Returns: A float number, the electric current in amperes, e.g 15.4 """ - if self.psu_current is not None and os.path.exists(self.psu_current): + if self.psu_current is not None and self.get_powergood_status(): amperes = self._read_generic_file(self.psu_current, 0) return float(amperes) / 1000 else: @@ -165,7 +165,7 @@ def get_power(self): Returns: A float number, the power in watts, e.g.
302.6 """ - if self.psu_power is not None and os.path.exists(self.psu_power): + if self.psu_power is not None and self.get_powergood_status(): power = self._read_generic_file(self.psu_power, 0) return float(power) / 1000000 else: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index faabdd808630..5195d378a468 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -53,8 +53,8 @@ THERMAL_API_GET_HIGH_THRESHOLD:"temp_crit_module{}" } thermal_api_handler_psu = { - THERMAL_API_GET_TEMPERATURE:None, - THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_max" + THERMAL_API_GET_TEMPERATURE:"psu{}_temp", + THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max" } thermal_api_handler_gearbox = { THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}", @@ -107,13 +107,6 @@ THERMAL_API_GET_HIGH_THRESHOLD ] -# thermal sensor file name convention for SKUs with single thermal sensor for each PSU -THERMAL_PSU_TEMP_FOR_SINGLE_SENSOR = "psu{}" -# thermal sensor file name convention for SKUs with multiple thermal sensors for each PSU -THERMAL_PSU_TEMP_FOR_MULTI_SENSORS = "psu{}_temp" - -hwsku_with_single_thermal_per_sku = ['ACS-MSN2700', 'LS-SN2700','Mellanox-SN2700'] - hwsku_dict_thermal = {'ACS-MSN2700': 0, 'LS-SN2700':0, 'ACS-MSN2740': 3, 'ACS-MSN2100': 1, 'ACS-MSN2410': 2, 'ACS-MSN2010': 4, 'ACS-MSN3700': 5, 'ACS-MSN3700C': 6, 'Mellanox-SN2700': 0, 'Mellanox-SN2700-D48C8': 0, 'ACS-MSN3800': 7} thermal_profile_list = [ # 2700 @@ -242,12 +235,6 @@ ] def initialize_thermals(sku, thermal_list, psu_list): - # initialize temperature sensor handlers - if sku in hwsku_with_single_thermal_per_sku: - thermal_api_handler_psu[THERMAL_API_GET_TEMPERATURE] = THERMAL_PSU_TEMP_FOR_SINGLE_SENSOR - else: - thermal_api_handler_psu[THERMAL_API_GET_TEMPERATURE] = THERMAL_PSU_TEMP_FOR_MULTI_SENSORS - # create thermal objects for all categories of 
sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index]