@@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2'
30
30
31
31
SFPD_LIVENESS_EXPIRE_SECS = 30
32
32
33
+ SDK_DAEMON_READY_FILE = '/tmp/sdk_ready'
34
+
33
35
sfp_value_status_dict = {
34
36
SDK_SFP_STATE_IN : STATUS_PLUGIN ,
35
37
SDK_SFP_STATE_OUT : STATUS_PLUGOUT ,
@@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False):
64
66
class MlnxSfpd :
65
67
''' Listen to plugin/plugout cable events '''
66
68
67
- SX_OPEN_RETRIES = 20
69
+ SX_OPEN_RETRIES = 30
70
+ SX_OPEN_TIMEOUT = 5
68
71
SELECT_TIMEOUT = 1
69
72
70
73
def __init__ (self ):
@@ -75,7 +78,6 @@ class MlnxSfpd:
75
78
# Allocate SDK fd and user channel structures
76
79
self .rx_fd_p = new_sx_fd_t_p ()
77
80
self .user_channel_p = new_sx_user_channel_t_p ()
78
-
79
81
self .state_db = SonicV2Connector (host = REDIS_HOSTIP )
80
82
81
83
# Register our signal handlers
@@ -98,37 +100,78 @@ class MlnxSfpd:
98
100
def initialize (self ):
99
101
self .state_db .connect ("STATE_DB" )
100
102
101
- # open SDK API handle
102
- # retry at most SX_OPEN_RETRIES times to wait
103
- # until SDK is started during system startup
104
- retry = 1
105
- while True :
106
- rc , self .handle = sx_api_open (None )
107
- if rc == SX_STATUS_SUCCESS :
108
- break
109
-
110
- log_warning ("failed to open SDK API handle... retrying {}" .format (retry ))
103
+ swid_cnt_p = None
111
104
112
- time .sleep (2 ** retry )
113
- retry += 1
114
-
115
- if retry > self .SX_OPEN_RETRIES :
116
- raise RuntimeError ("failed to open SDK API handle after {} retries" .format (retry ))
117
-
118
- rc = sx_api_host_ifc_open (self .handle , self .rx_fd_p )
119
- if rc != SX_STATUS_SUCCESS :
120
- raise RuntimeError ("sx_api_host_ifc_open exited with error, rc {}" .format (rc ))
105
+ try :
106
+ # Wait for the SDK daemon to start by polling for the sdk_ready file
107
+ retry = 0
108
+ while not os .path .exists (SDK_DAEMON_READY_FILE ):
109
+ if retry >= self .SX_OPEN_RETRIES :
110
+ raise RuntimeError ("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..."
111
+ .format (retry , self .SX_OPEN_TIMEOUT * self .SX_OPEN_RETRIES ))
112
+ else :
113
+ log_info ("SDK daemon not started yet, retry {} times" .format (retry ))
114
+ retry = retry + 1
115
+ time .sleep (self .SX_OPEN_TIMEOUT )
121
116
122
- self . user_channel_p . type = SX_USER_CHANNEL_TYPE_FD
123
- self . user_channel_p . channel . fd = self .rx_fd_p
117
+ # Sleep one more interval to make sure the SDK daemon has started
118
+ time . sleep ( self .SX_OPEN_TIMEOUT )
124
119
125
- rc = sx_api_host_ifc_trap_id_register_set (self .handle ,
126
- SX_ACCESS_CMD_REGISTER ,
127
- self .swid ,
128
- SX_TRAP_ID_PMPE ,
129
- self .user_channel_p )
130
- if rc != SX_STATUS_SUCCESS :
131
- raise RuntimeError ("sx_api_host_ifc_trap_id_register_set exited with error, rc {}" .format (c ))
120
+ # Once the SDK daemon has started, sx_api_open and sx_api_host_ifc_open are ready to be called
121
+ rc , self .handle = sx_api_open (None )
122
+ if rc != SX_STATUS_SUCCESS :
123
+ raise RuntimeError ("failed to call sx_api_open with rc {}, exiting..." .format (rc ))
124
+
125
+ rc = sx_api_host_ifc_open (self .handle , self .rx_fd_p )
126
+ if rc != SX_STATUS_SUCCESS :
127
+ raise RuntimeError ("failed to call sx_api_host_ifc_open with rc {}, exiting..." .format (rc ))
128
+
129
+ self .user_channel_p .type = SX_USER_CHANNEL_TYPE_FD
130
+ self .user_channel_p .channel .fd = self .rx_fd_p
131
+
132
+ # Wait for switch to be created and initialized inside SDK
133
+ retry = 0
134
+ swid_cnt_p = new_uint32_t_p ()
135
+ uint32_t_p_assign (swid_cnt_p , 0 )
136
+ swid_cnt = 0
137
+ while True :
138
+ if retry >= self .SX_OPEN_RETRIES :
139
+ raise RuntimeError ("switch not created after {} retries and {} seconds waiting, exiting..."
140
+ .format (retry , self .SX_OPEN_RETRIES * self .SX_OPEN_TIMEOUT ))
141
+ else :
142
+ rc = sx_api_port_swid_list_get (self .handle , None , swid_cnt_p )
143
+ if rc == SX_STATUS_SUCCESS :
144
+ swid_cnt = uint32_t_p_value (swid_cnt_p )
145
+ if swid_cnt > 0 :
146
+ delete_uint32_t_p (swid_cnt_p )
147
+ swid_cnt_p = None
148
+ break
149
+ else :
150
+ log_info ("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds"
151
+ .format (swid_cnt , retry , self .SX_OPEN_TIMEOUT * retry ))
152
+ else :
153
+ raise RuntimeError ("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds" .
154
+ format (rc , retry , self .SX_OPEN_TIMEOUT * retry ))
155
+
156
+ retry = retry + 1
157
+ time .sleep (self .SX_OPEN_TIMEOUT )
158
+
159
+ # Once the switch has been created inside the SDK, sx_api_host_ifc_trap_id_register_set is ready to be called
160
+ rc = sx_api_host_ifc_trap_id_register_set (self .handle ,
161
+ SX_ACCESS_CMD_REGISTER ,
162
+ self .swid ,
163
+ SX_TRAP_ID_PMPE ,
164
+ self .user_channel_p )
165
+
166
+ if rc != SX_STATUS_SUCCESS :
167
+ raise RuntimeError ("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting..." .format (rc ))
168
+
169
+ self .running = True
170
+ except Exception as e :
171
+ log_error ("mlnx-sfpd initialization failed due to {}, exiting..." .format (repr (e )))
172
+ if swid_cnt_p is not None :
173
+ delete_uint32_t_p (swid_cnt_p )
174
+ self .deinitialize ()
132
175
133
176
def deinitialize (self ):
134
177
# remove mlnx-sfpd liveness key in DB if not expired yet
@@ -156,7 +199,6 @@ class MlnxSfpd:
156
199
log_error ("sx_api_close exited with error, rc {}" .format (rc ))
157
200
158
201
def run (self ):
159
- self .running = True
160
202
161
203
while self .running :
162
204
try :
0 commit comments