Skip to content

Commit 2a8af27

Browse files
jleveque authored and lguohan committed
[201803] [services] Restart SwSS service upon unexpected critical process exit (#2546)
* [service] Restart SwSS Docker container if orchagent exits unexpectedly
* [dhcp_relay] Use STATE_DB to determine whether interfaces are ready
* Supervisor now autorestarts rsyslogd upon unexpected exit
* Add other critical processes to event listener
* Make supervisor-proc-exit-listener script global, have it read from 'critical_processes' file inside container
* Add SwSS to 'WantedBy=' option of services which should be started along with SwSS
1 parent ec694a5 commit 2a8af27

File tree

19 files changed

+127
-60
lines changed

19 files changed

+127
-60
lines changed
Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,40 @@
11
#!/usr/bin/env bash
22

3-
function wait_until_iface_ready
4-
{
5-
IFACE=$1
3+
STATE_DB_IDX="6"
64

7-
echo "Waiting until interface $IFACE is up..."
8-
9-
# Wait for the interface to come up (i.e., 'ip link show' returns 0)
10-
until ip link show dev $IFACE up > /dev/null 2>&1; do
11-
sleep 1
12-
done
5+
PORT_TABLE_PREFIX="PORT_TABLE"
6+
VLAN_TABLE_PREFIX="VLAN_TABLE"
7+
LAG_TABLE_PREFIX="LAG_TABLE"
138

14-
echo "Interface $IFACE is up"
9+
# Block until an interface is reported ready in STATE_DB.
#
# Polls redis once per second until the "state" field of the hash
# "<table prefix>|<interface>" in database STATE_DB_IDX equals "ok".
# There is no timeout: the function only returns once the state is "ok".
#
# Globals:   STATE_DB_IDX (read)
# Arguments: $1 - STATE_DB table prefix (e.g. the value of PORT_TABLE_PREFIX)
#            $2 - interface name
# Outputs:   progress messages on stdout
function wait_until_iface_ready
{
    # Keep working variables local so repeated calls do not leak state
    # into (or clobber variables of) the calling script.
    local table_prefix=$1
    local iface=$2
    local result

    echo "Waiting until interface ${iface} is ready..."

    # Wait for the interface to come up
    # (i.e., interface is present in STATE_DB and state is "ok")
    while true; do
        # redis-cli's stderr is discarded on purpose: a failed query simply
        # leaves the result empty and the lookup is retried after the sleep.
        result=$(redis-cli -n "${STATE_DB_IDX}" HGET "${table_prefix}|${iface}" "state" 2> /dev/null)
        if [[ "${result}" == "ok" ]]; then
            break
        fi

        sleep 1
    done

    echo "Interface ${iface} is ready!"
}
3129

3230

33-
# Wait for all interfaces to come up and have IPv4 addresses assigned
31+
# Wait for all interfaces to be up and ready
3432
{% for (name, prefix) in INTERFACE %}
35-
wait_until_iface_ready {{ name }}
33+
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
3634
{% endfor %}
3735
{% for (name, prefix) in VLAN_INTERFACE %}
38-
wait_until_iface_ready {{ name }}
36+
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
3937
{% endfor %}
4038
{% for (name, prefix) in PORTCHANNEL_INTERFACE %}
41-
wait_until_iface_ready {{ name }}
39+
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
4240
{% endfor %}

dockers/docker-orchagent/Dockerfile.j2

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ COPY ["files/arp_update", "/usr/bin"]
3030
COPY ["enable_counters.py", "/usr/bin"]
3131
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
3232
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
33+
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
34+
COPY ["critical_processes", "/etc/supervisor/"]
3335

3436
## Copy all Jinja2 template files into the templates folder
3537
COPY ["*.j2", "/usr/share/sonic/templates/"]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
orchagent
2+
portsyncd
3+
intfsyncd
4+
neighsyncd
5+
vlanmgrd
6+
intfmgrd
7+
buffermgrd

dockers/docker-orchagent/supervisord.conf

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
33
logfile_backups=2
44
nodaemon=true
55

6+
[eventlistener:supervisor-proc-exit-listener]
7+
command=/usr/bin/supervisor-proc-exit-listener
8+
events=PROCESS_STATE_EXITED
9+
autostart=true
10+
autorestart=unexpected
11+
612
[program:start.sh]
713
command=/usr/bin/start.sh
814
priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
1521
command=/usr/sbin/rsyslogd -n
1622
priority=2
1723
autostart=false
18-
autorestart=false
24+
autorestart=unexpected
1925
stdout_logfile=syslog
2026
stderr_logfile=syslog
2127

files/build_templates/dhcp_relay.service.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
1111
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
1212

1313
[Install]
14-
WantedBy=multi-user.target teamd.service
14+
WantedBy=multi-user.target swss.service teamd.service

files/build_templates/radv.service.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
1111
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
1212

1313
[Install]
14-
WantedBy=multi-user.target
14+
WantedBy=multi-user.target swss.service

files/build_templates/snmp.service.j2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,6 @@ Before=ntp-config.service
88
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
99
ExecStart=/usr/bin/{{docker_container_name}}.sh attach
1010
ExecStop=/usr/bin/{{docker_container_name}}.sh stop
11+
12+
[Install]
13+
WantedBy=multi-user.target swss.service

files/build_templates/swss.service.j2

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ After=opennsl-modules-3.16.0-6-amd64.service
1414
After=nps-modules-3.16.0-6-amd64.service
1515
{% endif %}
1616
Before=ntp-config.service
17+
StartLimitInterval=1200
18+
StartLimitBurst=3
1719

1820
[Service]
1921
User=root
@@ -52,6 +54,8 @@ ExecStopPost=/usr/bin/mst stop
5254
ExecStopPost=/etc/init.d/xpnet.sh stop
5355
ExecStopPost=/etc/init.d/xpnet.sh start
5456
{% endif %}
57+
Restart=always
58+
RestartSec=30
5559

5660
[Install]
5761
WantedBy=multi-user.target
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[Unit]
22
Description=TEAMD container
3-
Requires=updategraph.service
4-
After=updategraph.service
3+
Requires=updategraph.service swss.service
4+
After=updategraph.service swss.service
55
Before=ntp-config.service
66

77
[Service]
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh attach
1111
ExecStop=/usr/bin/{{docker_container_name}}.sh stop
1212

1313
[Install]
14-
WantedBy=multi-user.target
14+
WantedBy=multi-user.target swss.service
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python
2+
3+
import os
4+
import signal
5+
import sys
6+
import syslog
7+
8+
from supervisor import childutils
9+
10+
# Contents of file should be the names of critical processes (as defined in
11+
# supervisor.conf file), one per line
12+
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
13+
14+
def main():
    """Supervisor event listener: terminate supervisor when a critical process dies.

    Reads the names of critical processes (one per line) from
    CRITICAL_PROCESSES_FILE, then loops forever handling events delivered by
    supervisor on stdin. When a PROCESS_STATE_EXITED event reports that one of
    the critical processes exited unexpectedly, a message is written to syslog
    and SIGTERM is sent to the parent process.
    """
    # Read the list of critical processes from a file. Surrounding whitespace
    # is stripped and blank lines are skipped, so trailing spaces, CRLF line
    # endings or empty lines cannot produce entries that never match a
    # process name (or an empty-string entry).
    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
        critical_processes = [name for name in (line.strip() for line in f) if name]

    while True:
        # Transition from ACKNOWLEDGED to READY
        childutils.listener.ready()

        # Each event is a header line followed by 'len' bytes of payload
        line = sys.stdin.readline()
        headers = childutils.get_headers(line)
        payload = sys.stdin.read(int(headers['len']))

        # Transition from READY to ACKNOWLEDGED
        childutils.listener.ok()

        # We only care about PROCESS_STATE_EXITED events
        if headers['eventname'] != 'PROCESS_STATE_EXITED':
            continue

        # NOTE(review): the trailing newline is presumably needed so that
        # eventdata() can split the header line from an (empty) body -- confirm
        # against supervisor.childutils.
        payload_headers, payload_data = childutils.eventdata(payload + '\n')

        expected = int(payload_headers['expected'])
        processname = payload_headers['processname']

        # If a critical process exited unexpectedly, terminate supervisor
        if expected == 0 and processname in critical_processes:
            MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
            msg = MSG_FORMAT_STR.format(processname)
            syslog.syslog(syslog.LOG_INFO, msg)
            # The parent of an event listener is the process that spawned it
            os.kill(os.getppid(), signal.SIGTERM)
43+
44+
if __name__ == "__main__":
45+
main()

platform/broadcom/docker-orchagent-brcm.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616
$(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
1717

1818
$(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
19-
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT)
19+
$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

platform/cavium/docker-orchagent-cavm.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616
$(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
1717

1818
$(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
19-
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT)
19+
$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

platform/centec/docker-orchagent-centec.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616
$(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
1717

1818
$(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
19-
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT)
19+
$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

platform/marvell/docker-orchagent-mrvl.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /host/machine.conf:/host/machine.conf
1515
$(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616

1717
$(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
18-
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT)
18+
$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

platform/mellanox/docker-orchagent-mlnx.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616
$(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
1717

1818
$(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
19-
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT)
19+
$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

platform/nephos/docker-orchagent-nephos.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
1616
$(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
1717

1818
$(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
19-
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT)
19+
$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)

rules/docker-dhcp-relay.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
DOCKER_DHCP_RELAY = docker-dhcp-relay.gz
44
$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay
5-
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT)
5+
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT) $(REDIS_TOOLS)
66
$(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE)
77
SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
88
SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)

rules/scripts.mk

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ $(ARP_UPDATE_SCRIPT)_PATH = files/scripts
55
CONFIGDB_LOAD_SCRIPT = configdb-load.sh
66
$(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts
77

8+
SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
9+
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts
10+
811
SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
9-
$(ARP_UPDATE_SCRIPT)
12+
$(ARP_UPDATE_SCRIPT) \
13+
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
1014

1115

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,41 @@
11
#!/usr/bin/env bash
22

3-
function wait_until_iface_ready
4-
{
5-
IFACE=$1
3+
STATE_DB_IDX="6"
64

7-
echo "Waiting until interface $IFACE is up..."
8-
9-
# Wait for the interface to come up (i.e., 'ip link show' returns 0)
10-
until ip link show dev $IFACE up > /dev/null 2>&1; do
11-
sleep 1
12-
done
5+
PORT_TABLE_PREFIX="PORT_TABLE"
6+
VLAN_TABLE_PREFIX="VLAN_TABLE"
7+
LAG_TABLE_PREFIX="LAG_TABLE"
138

14-
echo "Interface $IFACE is up"
9+
# Block until an interface is reported ready in STATE_DB.
#
# Polls redis once per second until the "state" field of the hash
# "<table prefix>|<interface>" in database STATE_DB_IDX equals "ok".
# There is no timeout: the function only returns once the state is "ok".
#
# Globals:   STATE_DB_IDX (read)
# Arguments: $1 - STATE_DB table prefix (e.g. the value of PORT_TABLE_PREFIX)
#            $2 - interface name
# Outputs:   progress messages on stdout
function wait_until_iface_ready
{
    # Keep working variables local so repeated calls do not leak state
    # into (or clobber variables of) the calling script.
    local table_prefix=$1
    local iface=$2
    local result

    echo "Waiting until interface ${iface} is ready..."

    # Wait for the interface to come up
    # (i.e., interface is present in STATE_DB and state is "ok")
    while true; do
        # redis-cli's stderr is discarded on purpose: a failed query simply
        # leaves the result empty and the lookup is retried after the sleep.
        result=$(redis-cli -n "${STATE_DB_IDX}" HGET "${table_prefix}|${iface}" "state" 2> /dev/null)
        if [[ "${result}" == "ok" ]]; then
            break
        fi

        sleep 1
    done

    echo "Interface ${iface} is ready!"
}
3129

3230

33-
# Wait for all interfaces to come up and have IPv4 addresses assigned
34-
wait_until_iface_ready Vlan1000
35-
wait_until_iface_ready PortChannel04
36-
wait_until_iface_ready PortChannel02
37-
wait_until_iface_ready PortChannel03
38-
wait_until_iface_ready PortChannel03
39-
wait_until_iface_ready PortChannel01
40-
wait_until_iface_ready PortChannel02
41-
wait_until_iface_ready PortChannel04
42-
wait_until_iface_ready PortChannel01
31+
# Wait for all interfaces to be up and ready
32+
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
33+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
34+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
35+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
36+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
37+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
38+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
39+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
40+
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
4341

0 commit comments

Comments
 (0)