Skip to content

Commit 791b124

Browse files
liuh-80, AharonMalkin
authored and committed
Add UT for orchagent watchdog (sonic-net#8306)
### Description of PR

Add UT for orchagent watchdog.

Summary: The SWSS service will add a watchdog mechanism that generates keepalive messages and raises an alert when swss has an issue. This PR adds a new UT to cover the watchdog mechanism.

### Type of change

- [ ] Bug fix
- [ ] Testbed and Framework(new/improvement)
- [x] Test case(new/improvement)

### Back port request

- [ ] 201911
- [ ] 202012
- [ ] 202205

### Approach

#### What is the motivation for this PR?

Add a new UT to test the watchdog mechanism and protect it from regressions caused by code changes.

#### How did you do it?

Pause the orchagent process with the 'kill -STOP' command and check that the watchdog sends an alert.

#### How did you verify/test it?

Manually tested the new UT. It passes PR validation.

#### Any platform specific information?

No

#### Supported testbed topology if it's a new test case?

Any

### Documentation

<!-- (If it's a new feature, new test case) Did you update documentation/Wiki relevant to your implementation? Link to the wiki page? -->
1 parent 629a443 commit 791b124

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed

.azure-pipelines/pr_test_scripts.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ t0:
6767
- test_interfaces.py
6868
- test_procdockerstatsd.py
6969
- database/test_db_scripts.py
70+
- system_health/test_watchdog.py
7071

7172
t0-2vlans:
7273
- dhcp_relay/test_dhcp_relay.py

tests/system_health/test_watchdog.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import logging
2+
import pytest
3+
import time
4+
from tests.common.helpers.assertions import pytest_assert
5+
6+
# Markers applied to every test in this module: log analysis is disabled
# and the tests are declared for the 'any' topology.
pytestmark = [pytest.mark.disable_loganalyzer, pytest.mark.topology('any')]

# Module-level logger shared by the fixture and the test below.
logger = logging.getLogger(__name__)

# Seconds to sleep between two consecutive syslog polls.
SLEEP_TIME = 10
14+
15+
16+
@pytest.fixture
def pause_orchagent(duthosts, enum_rand_one_per_hwsku_hostname):
    """Suspend orchagent on the DUT under test while the test body runs.

    Setup looks up the orchagent pid(s) with ``pgrep``, sends them SIGSTOP
    and empties /var/log/syslog. Teardown sends SIGCONT to the same pid(s)
    and empties syslog again.

    The DUT is selected with the same ``duthosts`` /
    ``enum_rand_one_per_hwsku_hostname`` pair that the test uses, so the
    process is paused on the host whose syslog is inspected.

    All shell commands run with ``module_ignore_errors=True``: a failing
    command does not abort the fixture.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    # find orchagent pid(s); pgrep prints one pid per line, so collapse the
    # output to a space separated list that fits on a single kill command
    pgrep_output = duthost.shell(
        r"pgrep orchagent",
        module_ignore_errors=True)['stdout']
    pids = ' '.join(pgrep_output.split())
    logger.info('Get orchagent pid: {}'.format(pids))

    # pause orchagent and clear syslog
    if pids:
        duthost.shell(r"sudo kill -STOP {}".format(pids), module_ignore_errors=True)
    else:
        # Nothing to signal: avoid running kill without a pid argument.
        logger.warning('No orchagent process found, nothing to pause')
    duthost.shell(r"sudo truncate -s 0 /var/log/syslog", module_ignore_errors=True)

    yield

    # resume orchagent and clear syslog
    if pids:
        duthost.shell(r"sudo kill -CONT {}".format(pids), module_ignore_errors=True)
    duthost.shell(r"sudo truncate -s 0 /var/log/syslog", module_ignore_errors=True)
33+
34+
35+
def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_orchagent):
    """Verify that the watchdog reports orchagent as stuck while it is paused.

    The ``pause_orchagent`` fixture has already stopped orchagent and emptied
    syslog when this body starts. The test is skipped when the swss container
    has no /etc/supervisor/watchdog_processes file. Otherwise syslog is
    polled every ``SLEEP_TIME`` seconds for an 'is stuck in namespace' line
    that mentions orchagent; the test fails if none shows up within
    ``WATCHDOG_TIMEOUT`` seconds.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    result = duthost.shell(
        r"docker exec -i swss sh -c 'test -f /etc/supervisor/watchdog_processes && echo exist'",
        module_ignore_errors=True)['stdout']
    logger.info('Check watchdog exist: {}'.format(result))
    if result != 'exist':
        pytest.skip("Skip orchagent watchdog test.")

    # wait for the watchdog to emit its alert; the orchagent watchdog timeout
    # is 60 seconds, so poll for twice that long before giving up
    WATCHDOG_TIMEOUT = 120
    max_attempts = WATCHDOG_TIMEOUT // SLEEP_TIME
    for _ in range(max_attempts):
        time.sleep(SLEEP_TIME)
        alert = duthost.shell(
            r"sudo cat /var/log/syslog | grep 'is stuck in namespace'",
            module_ignore_errors=True)['stdout']
        logger.info('Get alert from host: {}'.format(alert))
        if "orchagent" in str(alert):
            return

    # Every poll came back without an orchagent alert.
    pytest_assert(
        False,
        "orchagent watchdog was not triggered after {} seconds".format(WATCHDOG_TIMEOUT))

0 commit comments

Comments
 (0)