Skip to content

Commit c17cd19

Browse files
stephenxs authored and yxieca committed
[Mellanox] mlnx-sfpd init flow enhancement (sonic-net#3294)
* fix sfpd initialize issue
* fix review comments
* rephrase the output log
* fix retry counter
* change the retry count to 10, i.e. set the max waiting time to 1024s
* fix mlnx-sfpd init flow with new solution
* [mlnx-sfpd] address comments
  1. wait 5 seconds * 30 times, 150 seconds in total; use a constant wait time for each retry.
  2. use a try/except structure so that errors can be handled in a graceful way
* [mlnx-sfpd] wait 5 seconds after SDK_DAEMON_READY_FILE exists to make sure the SDK is fully up.
* [mlnx-sfpd] simplify initialization by calling deinitialize on initialization failure
1 parent b80d60c commit c17cd19

File tree

1 file changed

+73
-31
lines changed

1 file changed

+73
-31
lines changed

platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd

+73-31
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
3030

3131
SFPD_LIVENESS_EXPIRE_SECS = 30
3232

33+
SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
34+
3335
sfp_value_status_dict = {
3436
SDK_SFP_STATE_IN: STATUS_PLUGIN,
3537
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
@@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
6466
class MlnxSfpd:
6567
''' Listen to plugin/plugout cable events '''
6668

67-
SX_OPEN_RETRIES = 20
69+
SX_OPEN_RETRIES = 30
70+
SX_OPEN_TIMEOUT = 5
6871
SELECT_TIMEOUT = 1
6972

7073
def __init__(self):
@@ -75,7 +78,6 @@ class MlnxSfpd:
7578
# Allocate SDK fd and user channel structures
7679
self.rx_fd_p = new_sx_fd_t_p()
7780
self.user_channel_p = new_sx_user_channel_t_p()
78-
7981
self.state_db = SonicV2Connector(host=REDIS_HOSTIP)
8082

8183
# Register our signal handlers
@@ -98,37 +100,78 @@ class MlnxSfpd:
98100
def initialize(self):
    """Bring up the SDK connection and register for PMPE trap events.

    Steps, in order:
      1. Connect to STATE_DB.
      2. Wait until SDK_DAEMON_READY_FILE exists (polling), then wait one
         more SX_OPEN_TIMEOUT interval.
      3. Open the SDK API handle and the host-interface fd, and bind the
         fd to the user channel.
      4. Poll the SDK until it reports at least one swid.
      5. Register SX_TRAP_ID_PMPE on the user channel.
      6. Set self.running to True.

    Any Exception raised in steps 2-6 is logged, self.deinitialize() is
    called, and the exception is NOT re-raised; self.running is only set
    on full success, and run() loops on self.running.
    NOTE(review): the initial value of self.running and how deinitialize()
    copes with a partially initialized object (e.g. no SDK handle opened
    yet) are not visible here -- confirm.
    """
    self.state_db.connect("STATE_DB")

    try:
        # Wait for the SDK daemon to start by detecting the sdk_ready file
        self._wait_for_sdk_daemon()

        # After SDK daemon started, sx_api_open and sx_api_host_ifc_open is ready for call
        rc, self.handle = sx_api_open(None)
        if rc != SX_STATUS_SUCCESS:
            raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc))

        rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
        if rc != SX_STATUS_SUCCESS:
            raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc))

        self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
        self.user_channel_p.channel.fd = self.rx_fd_p

        # Wait for switch to be created and initialized inside SDK
        self._wait_for_switch_created()

        # After switch was created inside SDK, sx_api_host_ifc_trap_id_register_set is ready to call
        rc = sx_api_host_ifc_trap_id_register_set(self.handle,
                                                  SX_ACCESS_CMD_REGISTER,
                                                  self.swid,
                                                  SX_TRAP_ID_PMPE,
                                                  self.user_channel_p)
        if rc != SX_STATUS_SUCCESS:
            raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc))

        self.running = True
    except Exception as e:
        # Deliberate catch-all at the initialization boundary: log, release
        # whatever was acquired via deinitialize(), and return normally.
        log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e)))
        self.deinitialize()

def _wait_for_sdk_daemon(self):
    """Block until SDK_DAEMON_READY_FILE exists, then wait one more interval.

    Polls every SX_OPEN_TIMEOUT seconds, at most SX_OPEN_RETRIES times.

    Raises:
        RuntimeError: the file still does not exist after all retries.
    """
    retry = 0
    while not os.path.exists(SDK_DAEMON_READY_FILE):
        if retry >= self.SX_OPEN_RETRIES:
            raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
                               .format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES))
        log_info("SDK daemon not started yet, retry {} times".format(retry))
        retry += 1
        time.sleep(self.SX_OPEN_TIMEOUT)

    # to make sure SDK daemon has started
    time.sleep(self.SX_OPEN_TIMEOUT)

def _wait_for_switch_created(self):
    """Poll sx_api_port_swid_list_get until it reports a swid count > 0.

    Polls every SX_OPEN_TIMEOUT seconds, at most SX_OPEN_RETRIES times.
    The temporary uint32 pointer is always released, on success and on
    failure alike.

    Raises:
        RuntimeError: the SDK call returned a non-success rc, or the swid
            count was still 0 after all retries.
    """
    swid_cnt_p = new_uint32_t_p()
    try:
        uint32_t_p_assign(swid_cnt_p, 0)
        for retry in range(self.SX_OPEN_RETRIES):
            rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p)
            if rc != SX_STATUS_SUCCESS:
                raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds".
                                   format(rc, retry, self.SX_OPEN_TIMEOUT * retry))

            swid_cnt = uint32_t_p_value(swid_cnt_p)
            if swid_cnt > 0:
                return

            log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
                     .format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry))
            time.sleep(self.SX_OPEN_TIMEOUT)

        raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..."
                           .format(self.SX_OPEN_RETRIES, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT))
    finally:
        # Single release point for the pointer allocated above.
        delete_uint32_t_p(swid_cnt_p)
132175

133176
def deinitialize(self):
134177
# remove mlnx-sfpd liveness key in DB if not expired yet
@@ -156,7 +199,6 @@ class MlnxSfpd:
156199
log_error("sx_api_close exited with error, rc {}".format(rc))
157200

158201
def run(self):
159-
self.running = True
160202

161203
while self.running:
162204
try:

0 commit comments

Comments
 (0)