Skip to content

Commit 8bb9c5a

Browse files
authored
Add retry reading/setting mux status to simulated y-cable driver (#221)
Description: Add retry for reading/setting mux status to the simulated y-cable driver. Motivation and Context: When the DUT is rebooted, xcvrd may call the simulated y-cable driver to get the mux direction before the mgmt interface is up. The simulated y-cable driver needs to send an HTTP request to the mux simulator server to read the mux status, so it depends on the mgmt interface. This can result in the errors below: Oct 16 03:15:44.029933 sonic-dut ERR pmon#xcvrd[34]: y_cable_port 1: GET http://192.168.1.33:8082/mux/vms21-6/0 for physical_port 1 failed with URLError(OSError(113, 'No route to host')) Oct 16 03:15:44.030306 sonic-dut ERR pmon#xcvrd[34]: Error: Could not establish the active side for Y cable port Ethernet0 to perform read_y_cable update state db This can cause other problems and may result in the same interface on both the upper ToR and the lower ToR remaining in the "standby" state. The fix is to add retries to the simulated y-cable driver when reading or setting the mux status. The retry interval is 1 second. The retry timeout is 30 seconds. How Has This Been Tested? * The issue can be reliably reproduced on a 7260 dualtor testbed after running the test_acl::TestAclWithReboot cases. With this fix, the issue can no longer be reproduced. * Tested the config mux mode active command with and without the icmp responder. * Tested updating the mux status by calling the mux simulator API to check that the new status is reflected on the DUTs. Signed-off-by: Xin Wang <[email protected]>
1 parent 2ebd786 commit 8bb9c5a

File tree

1 file changed

+91
-34
lines changed

1 file changed

+91
-34
lines changed

sonic_y_cable/microsoft/y_cable_simulated.py

Lines changed: 91 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import os
1010
import urllib.request
1111
import urllib.error
12+
import time
1213

1314
from sonic_py_common import device_info
1415
from portconfig import get_port_config
@@ -32,6 +33,10 @@ class YCable(YCableBase):
3233
NIC_VOLTAGE = 5.0
3334
LOCAL_VOLTAGE = 5.0
3435

36+
POLL_TIMEOUT = 30
37+
POLL_INTERVAL = 1
38+
URLOPEN_TIMEOUT = 5
39+
3540
def __init__(self, port, logger):
3641
YCableBase.__init__(self, port, logger)
3742
if not os.path.exists(self.MUX_SIMULATOR_CONFIG_FILE) or not os.path.isfile(self.MUX_SIMULATOR_CONFIG_FILE):
@@ -85,22 +90,45 @@ def _get(self, url=None):
8590
else:
8691
get_url = self._url
8792

88-
try:
93+
start_time = time.time()
94+
attempt = 1
95+
while True:
8996
try:
90-
req = urllib.request.Request(get_url)
91-
with urllib.request.urlopen(req) as resp:
92-
return json.loads(resp.read().decode('utf-8'))
93-
except urllib.error.HTTPError as e:
94-
self.log_error('GET {} for physical_port {} failed with {}, detail: {}'.format(
97+
try:
98+
req = urllib.request.Request(get_url)
99+
with urllib.request.urlopen(req, timeout=self.URLOPEN_TIMEOUT) as resp:
100+
return json.loads(resp.read().decode('utf-8'))
101+
except urllib.error.HTTPError as e:
102+
self.log_error('attempt={}, GET {} for physical_port {} failed with {}, detail: {}'.format(
103+
attempt,
104+
get_url,
105+
self.port,
106+
repr(e),
107+
e.read()))
108+
except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e:
109+
self.log_error('attempt={}, GET {} for physical_port {} failed with {}'.format(
110+
attempt,
111+
get_url,
112+
self.port,
113+
repr(e)))
114+
115+
# Retry in case of exception, to workaround 'no route to host' issue after pmon restart
116+
if (time.time() - start_time) > self.POLL_TIMEOUT:
117+
self.log_error('Retry GET {} for physical port {} timeout after {} seconds, attempted={}'.format(
95118
get_url,
96119
self.port,
97-
repr(e),
98-
e.read()))
99-
except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e:
100-
self.log_error('GET {} for physical_port {} failed with {}'.format(
101-
get_url,
102-
self.port,
103-
repr(e)))
120+
self.POLL_TIMEOUT,
121+
attempt
122+
))
123+
break
124+
else:
125+
self.log_notice('Sleep {} seconds to retry GET {} for physical port {}'.format(
126+
self.POLL_INTERVAL,
127+
get_url,
128+
self.port
129+
))
130+
attempt += 1
131+
time.sleep(self.POLL_INTERVAL)
104132

105133
return None
106134

@@ -118,27 +146,52 @@ def _post(self, url=None, data=None):
118146
else:
119147
post_data = None
120148

121-
try:
149+
start_time = time.time()
150+
attempt = 1
151+
while True:
122152
try:
123-
headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
124-
req = urllib.request.Request(post_url, post_data, headers, method='POST')
125-
with urllib.request.urlopen(req) as resp:
126-
return json.loads(resp.read().decode('utf-8'))
127-
except urllib.error.HTTPError as e:
128-
self.log_error('POST {} with data {} for physical_port {} failed with {}, detail: {}'.format(
129-
post_url,
153+
try:
154+
headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
155+
req = urllib.request.Request(post_url, post_data, headers, method='POST')
156+
with urllib.request.urlopen(req, timeout=self.URLOPEN_TIMEOUT) as resp:
157+
return json.loads(resp.read().decode('utf-8'))
158+
except urllib.error.HTTPError as e:
159+
self.log_error('attempt={}, POST {} with data {} for physical_port {} failed with {}, detail: {}'.format(
160+
attempt,
161+
post_url,
162+
post_data,
163+
self.port,
164+
repr(e),
165+
e.read()
166+
))
167+
except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e:
168+
self.log_error('attempt={}, POST {} with data {} for physical_port {} failed with {}'.format(
169+
attempt,
170+
post_url,
171+
post_data,
172+
self.port,
173+
repr(e)
174+
))
175+
176+
# Retry in case of exception, to workaround 'no route to host' issue after pmon restart
177+
if time.time() - start_time > self.POLL_TIMEOUT:
178+
self.log_error('Retry POST {} with data{} for physical port {} timeout after {} seconds, attempted={}'.format(
179+
get_url,
130180
post_data,
131181
self.port,
132-
repr(e),
133-
e.read()
182+
self.POLL_TIMEOUT,
183+
attempt
134184
))
135-
except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e:
136-
self.log_error('POST {} with data {} for physical_port {} failed with {}'.format(
137-
post_url,
185+
break
186+
else:
187+
self.log_notice('Sleep {} seconds to retry POST {} with data {} for physical port {}'.format(
188+
self.POLL_INTERVAL,
189+
get_url,
138190
post_data,
139-
self.port,
140-
repr(e)
191+
self.port
141192
))
193+
attempt += 1
194+
time.sleep(self.POLL_INTERVAL)
142195

143196
return None
144197

@@ -244,13 +297,17 @@ def get_mux_direction(self):
244297
TARGET_UNKNOWN, if mux direction API fails.
245298
"""
246299
status = self._get_status()
247-
if not status:
300+
301+
if not isinstance(status, dict):
248302
return self.TARGET_UNKNOWN
249303

250-
if status['active_side'] == self.UPPER_TOR:
251-
return self.TARGET_TOR_A
252-
elif status['active_side'] == self.LOWER_TOR:
253-
return self.TARGET_TOR_B
304+
if 'active_side' in status:
305+
if status['active_side'] == self.UPPER_TOR:
306+
return self.TARGET_TOR_A
307+
elif status['active_side'] == self.LOWER_TOR:
308+
return self.TARGET_TOR_B
309+
else:
310+
return self.TARGET_UNKNOWN
254311
else:
255312
return self.TARGET_UNKNOWN
256313

@@ -1243,4 +1300,4 @@ def debug_dump_registers(self, option=None):
12431300
which would help diagnose the cable for proper functioning
12441301
"""
12451302

1246-
return {}
1303+
return {}

0 commit comments

Comments
 (0)