Commit f39e5db

[unplanned-warm-reboot] Add crash-monitor to handle process crashes

crash-monitor monitors the critical processes inside the dockers and warm-restarts a docker when one of its processes crashes.

Signed-off-by: Haiyang Zheng <[email protected]>
1 parent f99f7e1 commit f39e5db
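
The restart policy the script applies is: wait RESTART_DELAY_TIME seconds after a crash is reported, then warm-restart the affected docker, allowing at most RESTART_MAX_RETRIES restarts within any RESTART_CHECK_INTERVAL window. A minimal sketch of that rate-limit rule, using the defaults from scripts/crash-monitor (the helper name restart_allowed and the example timestamps are illustrative, not part of the commit):

import time

RESTART_MAX_RETRIES = 3        # default retry limit in scripts/crash-monitor
RESTART_CHECK_INTERVAL = 1200  # default window (seconds) in scripts/crash-monitor

def restart_allowed(restart_timestamps, now=None):
    # Count restarts that fall inside the last RESTART_CHECK_INTERVAL seconds;
    # another auto-restart is allowed only while that count stays below
    # RESTART_MAX_RETRIES (this mirrors CrashMonitor.get_retry_count).
    if now is None:
        now = time.time()
    recent = [t for t in restart_timestamps if now - t < RESTART_CHECK_INTERVAL]
    return len(recent) < RESTART_MAX_RETRIES

# Two restarts 10 minutes ago still leave room for one more; three recent
# restarts exhaust the budget until the oldest one ages out of the window.
now = time.time()
print restart_allowed([now - 600, now - 600], now)             # True
print restart_allowed([now - 600, now - 400, now - 200], now)  # False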

File tree

2 files changed (+361, -0)

scripts/crash-monitor

+360
@@ -0,0 +1,360 @@
#!/usr/bin/python

import os
import re
import sys
import syslog
import commands
import threading
import traceback
import redis
import time
import json
import subprocess
import swsssdk
from optparse import OptionParser
from swsssdk import SysmonDBConnector
from swsssdk import _get_redis_dbid_port
from time import gmtime, strftime

reload(sys)
sys.setdefaultencoding('utf-8')

PID_FILE = "/var/run/crash-monitor"
CRASH_MONITOR_TABLE = "CRASH_MONITOR"
PROC_EVENT_TABLE = "PROC_EVENT"

STATE_WARM_STARTED = 'started'
STATE_WARM_DONE = 'done'

# Delay 0 seconds before trying auto-restart
RESTART_DELAY_TIME = 0
# Limit 3 restarts within 1200 seconds
RESTART_MAX_RETRIES = 3
RESTART_CHECK_INTERVAL = 1200

# Syslog functions
SYSLOG_IDENTIFIER = 'crash-monitor'

def log_info(msg):
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_INFO, msg)
    syslog.closelog()

def log_warning(msg):
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_WARNING, msg)
    syslog.closelog()

def log_error(msg):
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_ERR, msg)
    syslog.closelog()

def log_crit(msg):
    syslog.openlog(SYSLOG_IDENTIFIER)
    syslog.syslog(syslog.LOG_CRIT, msg)
    syslog.closelog()

def kill_last_process():
    # Kill a previously started crash-monitor instance (if any), then record
    # the current PID in the pid file.
    try:
        if os.path.exists(PID_FILE):
            file_r = open(PID_FILE, 'r')
            old_pid = file_r.readline().strip()
            if old_pid:
                log_info("kill old_pid %s " % old_pid)
                os.kill(int(old_pid), 9)
            file_r.close()
    except Exception:
        log_info("kill_last_process Exception: %s" % (traceback.format_exc()))
    current_pid = os.getpid()
    log_info("current_pid: %d" % current_pid)
    file_w = open(PID_FILE, 'w+')
    file_w.write(str(current_pid))
    file_w.close()

def args_parser():
    usage = "\n\t%prog [--stop] -d [delay] -l [limit] -i [interval] -p [processes]"
    parser = OptionParser(usage=usage)

    parser.add_option("-d", "--delay", dest="delay",
                      help="delay time(seconds) before auto-restart.")
    parser.add_option("-l", "--limit", dest="limit",
                      help="limit of restart retries within given interval.")
    parser.add_option("-i", "--interval", dest="interval",
                      help="interval(seconds) to check restart retries limit")
    parser.add_option("-p", "--processes", dest="processes",
                      help="list of processes to monitor for crash handling")
    parser.add_option("-s", "--stop", action="store_true", dest="stop_crash_monitor",
                      help="Stop crash monitor.")

    options, args = parser.parse_args()

    return options, parser


def wait_docker_startup(docker, wait_seconds):
    # Poll 'docker ps' until the container has been up for at least
    # wait_seconds, or give up after max_wait_time polls.
    max_wait_time = 200
    cmd = 'docker ps | grep %s' % docker

    try:
        while max_wait_time:
            output = commands.getoutput(cmd)
            lines = output.splitlines()
            for line in lines:
                items = line.strip().split()
                if len(items) == 10 or len(items) == 11:
                    if items[8] == 'seconds':
                        if int(items[7]) > wait_seconds:
                            return
                    elif items[8] == 'minutes':
                        if int(items[7]) * 60 > wait_seconds:
                            return
                    elif items[8].startswith('hour') or items[9].startswith('hour'):
                        return
                    elif items[8].startswith('days'):
                        return
                    elif items[8].startswith('weeks'):
                        return
                    else:
                        print "unknown str: %s" % items[8]
                        return

            max_wait_time -= 1
            time.sleep(1)

    except Exception:
        print("*ERROR* wait_docker_startup failed: %s" % (traceback.format_exc()))

## Returns:
##   None   - if the return code of the command is non-zero
##   String - the standard output of the command, may be an empty string
def run_command(cmd):
    log_info('runcmd: {0}'.format(cmd))
    rc, out = commands.getstatusoutput(cmd)
    if rc == 0:
        return out
    else:
        log_error('Failed to run: {0}\n{1}'.format(cmd, out))
        return None


class CrashMonitor(object):
    def __init__(self, critical_proc_str="", delay=RESTART_DELAY_TIME, max_retries=RESTART_MAX_RETRIES, check_interval=RESTART_CHECK_INTERVAL):
        self.delay = delay
        self.max_retries = max_retries
        self.check_interval = check_interval
        self.restart_timers = {}
        self.sysmon_db = None
        self.state_db = None
        self.critical_proc = self.expand(critical_proc_str)
        log_info("Monitoring critical_proc {}".format(self.critical_proc))

    def expand(self, critical_proc_str):
        # Turn the comma-separated process list into a Python list
        processes = critical_proc_str.split(',')
        return processes

    def check_process_alive(self, proc):
        try:
            cmd = "ps aux | grep '/usr/.*/%s' | grep -v 'grep'" % proc
            output = commands.getoutput(cmd)
            log_info("checking if process %s is alive." % proc)
            if len(output):
                return True
            else:
                return False
        except Exception:
            log_error("check_process_alive Exception: %s" % (traceback.format_exc()))
            return False

    def wait_container_startup(self, container_name, wait_seconds, warm_app_names, exp_state):
        try:
            count = 0
            for warm_app_name in warm_app_names:
                state = ""
                cmd = "docker exec -i database redis-cli " + _get_redis_dbid_port("STATE_DB") + " hget 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
                # Wait up to wait_seconds for the expected (reconciled) state
                while state != exp_state and count < wait_seconds/2:
                    count += 1
                    time.sleep(2)
                    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
                    state = proc.stdout.read().rstrip()
                log_info("%s reached %s state" % (warm_app_name, state))
                if state != exp_state:
                    log_error("%s failed to reach %s state" % (warm_app_name, exp_state))

        except Exception:
            print("*ERROR* wait_container_startup failed: %s" % (traceback.format_exc()))

    def clean_bgp_eoiu_marker(self):
        self.state_db.delete(self.state_db.STATE_DB, "BGP_STATE_TABLE|IPv4|eoiu")
        self.state_db.delete(self.state_db.STATE_DB, "BGP_STATE_TABLE|IPv6|eoiu")
        syslog.syslog('Cleaned ipv4 and ipv6 eoiu marker flags')

    def clean_app_recon_state(self, warm_app_names):
        # clean app reconciliation state left over from the last warm start, if any
        for warm_app_name in warm_app_names:
            cmd = "docker exec -i database redis-cli " + _get_redis_dbid_port("STATE_DB") + " hdel 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
            run_command(cmd)

    # Set the state_db "<CONTAINER>_STATE_TABLE|crash_warm_restart" entry;
    # state is 'started' or 'done'
    def set_crash_restart_state(self, capital_container_name, state):
        key = capital_container_name + "_STATE_TABLE|crash_warm_restart"
        self.state_db.set(self.state_db.STATE_DB, key, 'state', state)
        # only save the timestamps and retry count for the started state
        if state != 'started':
            return
        info = self.state_db.get_all(self.state_db.STATE_DB, key)
        if 'timestamps' in info:
            timestamps = eval(info['timestamps'])
        else:
            timestamps = []
        timestamps.append(time.time())
        count = len(timestamps)
        self.state_db.set(self.state_db.STATE_DB, key, 'timestamps', str(timestamps))
        self.state_db.set(self.state_db.STATE_DB, key, 'count', count)
        return

    # Get retry count for auto-restart within the configured time frame.
    def get_retry_count(self, docker):
        key = docker + "_STATE_TABLE|crash_warm_restart"
        info = self.state_db.get_all(self.state_db.STATE_DB, key)
        if info and 'timestamps' in info:
            timestamps = eval(info['timestamps'])
        else:
            timestamps = []
        count = 0
        for t in timestamps:
            if (time.time() - t) < self.check_interval:
                count = count + 1
        return count

    def warm_restart(self, container_name):
        if container_name == "bgp":
            # Kill bgpd to restart the bgp graceful restart procedure
            log_info("Restarting bgp container...")
            self.clean_bgp_eoiu_marker()
            run_command("config warm_restart enable %s" % container_name)
            run_command("docker exec -i bgp pkill -9 zebra")
            run_command("docker exec -i bgp pkill -9 bgpd")
            warm_app_names = ["bgp"]
        elif container_name == "teamd":
            log_info("Restarting teamd container...")
            run_command("config warm_restart enable %s" % container_name)
            # Send USR1 signal to all teamd instances to stop them.
            # It will prepare teamd for warm-reboot
            run_command("docker exec -i teamd pkill -USR1 teamd > /dev/null")
            warm_app_names = ["teamsyncd"]
        else:
            return
        self.clean_app_recon_state(warm_app_names)
        run_command("systemctl restart %s" % container_name)

        exp_state = "reconciled"
        self.wait_container_startup(container_name, 180, warm_app_names, exp_state)
        run_command("config warm_restart disable %s" % container_name)
        self.set_crash_restart_state(container_name.upper(), STATE_WARM_DONE)

    def warm_restart_call_back(self, *args):
        for container_name in args:
            self.warm_restart(container_name)
            del self.restart_timers[container_name]
            log_info("Completed warm_restart for %s" % (container_name))

    def process_event_handler(self, key, data):
        # Keys are expected in the form "<table>|<docker>:<process>"
        items = key.split('|')
        if len(items) > 1:
            docker = items[1].split(':')[0]
            proc = items[1].split(':')[1]
            if proc in self.critical_proc:
                log_info("process_event_handler %s, data:%s" % (key, data))
                if not self.check_process_alive(proc):
                    log_warning("Process %s is not running" % (proc))
                    if docker in self.restart_timers:
                        log_warning("Process %s is not running, but auto-restart %s already scheduled" % (proc, docker))
                    else:
                        retry_count = self.get_retry_count(docker.upper())
                        if retry_count >= self.max_retries:
                            log_crit("%s exceeds max_retries %d within %d seconds" % (docker, self.max_retries, self.check_interval))
                            return
                        if self.delay == 0:
                            log_crit("Process %s is not running. auto-restart %s" % (proc, docker))
                        else:
                            log_crit("Process %s is not running. auto-restart %s in %d seconds" % (proc, docker, self.delay))
                        self.set_crash_restart_state(docker.upper(), STATE_WARM_STARTED)
                        t = threading.Timer(self.delay, self.warm_restart_call_back, [docker])
                        self.restart_timers[docker] = t
                        t.start()

    def init_db_connection(self):
        self.sysmon_db = SysmonDBConnector()
        self.sysmon_db.connect(retry_on=True)
        self.state_db = swsssdk.SonicV2Connector(host='127.0.0.1')
        self.state_db.connect(self.state_db.STATE_DB, False)

    def main_loop(self):
        try:
            wait_docker_startup("database", 10)
            self.init_db_connection()
            #self.wait_port_InitDone()
            self.start()

        except redis.ConnectionError:
            log_error("main_loop exit as redisDB connection lost.")
        except Exception:
            log_error("*ERROR* main_loop Exception: %s" % (traceback.format_exc()))

    def start(self):
        # Subscribe to process events published to the PROC_EVENT table and
        # handle each one as it arrives
        self.sysmon_db.subscribe(PROC_EVENT_TABLE, lambda table, key, data: self.process_event_handler(key, data))
        self.sysmon_db.listen()


def main():
    log_info("Script crash-monitor start")
    log_info("========================================")
    option, parser = args_parser()

    if not option.delay:
        option.delay = RESTART_DELAY_TIME

    if not option.limit:
        option.limit = RESTART_MAX_RETRIES

    if not option.interval:
        option.interval = RESTART_CHECK_INTERVAL

    if not option.processes:
        option.processes = ""

    if option.stop_crash_monitor:
        log_info("Stop crash monitor.")
        kill_last_process()
        return 0

    log_info("delay:%s" % (option.delay))
    try:
        kill_last_process()
        CrashMonitor(option.processes, int(option.delay), int(option.limit), int(option.interval)).main_loop()
    except Exception:
        log_error("crash monitor Exception: %s" % (traceback.format_exc()))


if __name__ == "__main__":
    # do the UNIX double-fork magic, see Stevens' "Advanced
    # Programming in the UNIX Environment" for details (ISBN 0201563177)
    try:
        pid = os.fork()
        if pid > 0:
            # exit first parent
            sys.exit(0)
    except OSError, e:
        print >> sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror)
        sys.exit(1)
    # decouple from parent environment
    os.chdir("/")
    os.setsid()
    os.umask(0)
    # do second fork
    try:
        pid = os.fork()
        if pid > 0:
            # exit from second parent
            sys.exit(0)
    except OSError, e:
        print >> sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror)
        sys.exit(1)
    # start the daemon main loop
    main()

setup.py

+1
@@ -107,6 +107,7 @@
     'scripts/watermarkstat',
     'scripts/watermarkcfg',
-    'scripts/sonic-kdump-config'
+    'scripts/sonic-kdump-config',
+    'scripts/crash-monitor',
 ],
 data_files=[
     ('/etc/bash_completion.d', glob.glob('data/etc/bash_completion.d/*')),
