|
| 1 | +#!/usr/bin/python |
| 2 | + |
| 3 | +import os |
| 4 | +import re |
| 5 | +import sys |
| 6 | +import syslog |
| 7 | +import commands |
| 8 | +import threading |
| 9 | +import traceback |
| 10 | +import redis |
| 11 | +import time |
| 12 | +import json |
| 13 | +import subprocess |
| 14 | +import swsssdk |
| 15 | +from optparse import OptionParser |
| 16 | +from swsssdk import SysmonDBConnector |
| 17 | +from swsssdk import _get_redis_dbid_port |
| 18 | +from time import gmtime, strftime |
| 19 | + |
| 20 | +reload(sys) |
| 21 | +sys.setdefaultencoding('utf-8') |
| 22 | + |
| 23 | +PID_FILE = "/var/run/crash-monitor" |
| 24 | +CRASH_MONITOR_TABLE = "CRASH_MONITOR" |
| 25 | +PROC_EVENT_TABLE = "PROC_EVENT" |
| 26 | + |
| 27 | +STATE_WARM_STARTED = 'started' |
| 28 | +STATE_WARM_DONE = 'done' |
| 29 | + |
| 30 | +# Delay 0 seconds before trying auto-restart |
| 31 | +RESTART_DELAY_TIME = 0 |
| 32 | +# Limit to 3 restarts within 1200 seconds |
| 33 | +RESTART_MAX_RETRIES = 3 |
| 34 | +RESTART_CHECK_INTERVAL = 1200 |
| 35 | + |
| 36 | +# Syslog functions |
| 37 | +SYSLOG_IDENTIFIER = 'crash-monitor' |
| 38 | +def log_info(msg): |
| 39 | + syslog.openlog(SYSLOG_IDENTIFIER) |
| 40 | + syslog.syslog(syslog.LOG_INFO, msg) |
| 41 | + syslog.closelog() |
| 42 | + |
| 43 | +def log_warning(msg): |
| 44 | + syslog.openlog(SYSLOG_IDENTIFIER) |
| 45 | + syslog.syslog(syslog.LOG_WARNING, msg) |
| 46 | + syslog.closelog() |
| 47 | + |
| 48 | +def log_error(msg): |
| 49 | + syslog.openlog(SYSLOG_IDENTIFIER) |
| 50 | + syslog.syslog(syslog.LOG_ERR, msg) |
| 51 | + syslog.closelog() |
| 52 | + |
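| | +# Ensure a single running instance: kill any previously recorded PID from PID_FILE, then record our own PID there. |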
| 53 | +def kill_last_process(): |
| 54 | + try: |
| 55 | + if os.path.exists(PID_FILE): |
| 56 | + file_r = open(PID_FILE, 'r') |
| 57 | + old_pid = file_r.readline().strip() |
| 58 | + if old_pid: |
| 59 | + log_info("kill old_pid %s " % old_pid) |
| 60 | + os.kill(int(old_pid), 9) |
| 61 | + file_r.close() |
| 62 | + except Exception: |
| 63 | +        log_error("kill_last_process Exception: %s" % (traceback.format_exc())) |
| 64 | + current_pid = os.getpid() |
| 65 | + log_info("current_pid: %d" % current_pid) |
| 66 | + file_w = open(PID_FILE, 'w+') |
| 67 | + file_w.write(str(current_pid)) |
| 68 | + file_w.close() |
| 69 | + |
| 70 | + |
| 71 | +def args_parser(): |
| 72 | + usage = "\n\t%prog [--stop] -d [delay] -l [limit] -i [interval] -p [processes]" |
| 73 | + parser = OptionParser(usage=usage) |
| 74 | + |
| 75 | + parser.add_option("-d", "--delay", dest="delay", |
| 76 | + help="delay time(seconds) before auto-restart.") |
| 77 | + parser.add_option("-l", "--limit", dest="limit", |
| 78 | + help="limit of restart retries within given interval.") |
| 79 | + parser.add_option("-i", "--interval", dest="interval", |
| 80 | + help="interval(seconds) to check restart retries limit") |
| 81 | + parser.add_option("-p", "--processes", dest="processes", |
| 82 | + help="list of processes to monitor for crash handling") |
| 83 | + parser.add_option("-s", "--stop", action="store_true", dest="stop_crash_monitor", |
| 84 | + help="Stop crash monitor.") |
| 85 | + |
| 86 | + options, args = parser.parse_args() |
| 87 | + |
| 88 | + return options, parser |
| 89 | + |
| 90 | + |
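| | +# Poll 'docker ps' until the given container's uptime, parsed from the STATUS column (e.g. "Up 12 seconds"), |
| | +# exceeds wait_seconds; give up after roughly 200 seconds. |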
| 91 | +def wait_docker_startup(docker, wait_seconds): |
| 92 | + max_wait_time = 200 |
| 93 | + cmd = 'docker ps | grep %s' % docker |
| 94 | + |
| 95 | + try: |
| 96 | + while max_wait_time: |
| 97 | + output = commands.getoutput(cmd) |
| 98 | + lines = output.splitlines() |
| 99 | + for line in lines: |
| 100 | + items = line.strip().split() |
| 101 | + if len(items) == 10 or len(items) == 11: |
| 102 | + if items[8] == 'seconds': |
| 103 | + if int(items[7]) > wait_seconds: |
| 104 | + return |
| 105 | + elif items[8] == 'minutes': |
| 106 | +                        if int(items[7]) * 60 > wait_seconds: |
| 107 | + return |
| 108 | + elif items[8].startswith('hour') or items[9].startswith('hour'): |
| 109 | + return |
| 110 | +                    elif items[8].startswith('day'): |
| 111 | +                        return |
| 112 | +                    elif items[8].startswith('week'): |
| 113 | + return |
| 114 | + else: |
| 115 | +                        log_warning("unknown docker uptime unit: %s" % items[8]) |
| 116 | + return |
| 117 | + |
| 118 | + max_wait_time -= 1 |
| 119 | + time.sleep(1) |
| 120 | + |
| 121 | + except Exception: |
| 122 | +        log_error("wait_docker_startup failed: %s" % (traceback.format_exc())) |
| 123 | + |
| 124 | +## Returns: |
| 125 | +## None - if the return code of the command is non-zero |
| 126 | +## String - the standard output of the command, may be empty string |
| 127 | +def run_command(cmd): |
| 128 | + log_info('runcmd: {0}'.format(cmd)) |
| 129 | + rc, out = commands.getstatusoutput(cmd) |
| 130 | + if rc == 0: |
| 131 | + return out |
| 132 | + else: |
| 133 | +        log_error('Failed to run: {0}\n{1}'.format(cmd, out)) |
| 134 | + return None |
| 135 | + |
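| | +# CrashMonitor subscribes to PROC_EVENT notifications and, when a monitored critical process dies, |
| | +# schedules a warm restart of its container, limited to max_retries restarts per check_interval seconds. |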
| 136 | +class CrashMonitor(object): |
| 137 | + def __init__(self, critical_proc_str="", delay=RESTART_DELAY_TIME, max_retries=RESTART_MAX_RETRIES, check_interval=RESTART_CHECK_INTERVAL): |
| 138 | + self.delay = delay |
| 139 | + self.max_retries = max_retries |
| 140 | + self.check_interval = check_interval |
| 141 | + self.restart_timers = {} |
| 142 | + self.sysmon_db = None |
| 143 | + self.state_db = None |
| 144 | + self.critical_proc = self.expand(critical_proc_str) |
| 145 | + log_info("Monitoring critical_proc {}".format(self.critical_proc)) |
| 146 | + |
| 147 | + def expand(self, critical_proc_str): |
| 148 | + processes = [] |
| 149 | + processes = critical_proc_str.split(',') |
| 150 | + return processes |
| 151 | + |
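| | +    # A process counts as alive if 'ps aux' shows it running from a /usr/... path. |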
| 152 | + def check_process_alive(self, proc): |
| 153 | + try: |
| 154 | + cmd = "ps aux | grep '/usr/.*/%s' | grep -v 'grep'" % proc |
| 155 | + output = commands.getoutput(cmd) |
| 156 | + log_info("checking if process %s is alive." % proc) |
| 157 | + if len(output): |
| 158 | + return True |
| 159 | + else: |
| 160 | + return False |
| 161 | + except Exception: |
| 162 | +            log_error("check_process_alive Exception: %s" % (traceback.format_exc())) |
| 163 | + return False |
| 164 | + |
| 165 | + def wait_container_startup(self, container_name, wait_seconds, warm_app_names, exp_state): |
| 166 | + try: |
| 167 | + count = 0 |
| 168 | + for warm_app_name in warm_app_names: |
| 169 | + state = "" |
| 170 | + cmd = "docker exec -i database redis-cli " + _get_redis_dbid_port("STATE_DB") + " hget 'WARM_RESTART_TABLE|" + warm_app_name + "' state" |
| 171 | + # Wait up to wait_seconds for reconciled state |
| 172 | + while state != exp_state and count < wait_seconds/2: |
| 173 | + count += 1 |
| 174 | + time.sleep(2) |
| 175 | + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) |
| 176 | + state = proc.stdout.read().rstrip() |
| 177 | + log_info("%s reached %s state"%(warm_app_name, state)) |
| 178 | + if state != exp_state: |
| 179 | +                    log_error("%s failed to reach %s state"%(warm_app_name, exp_state)) |
| 180 | + |
| 181 | + except Exception: |
| 182 | +            log_error("wait_container_startup failed: %s" % (traceback.format_exc())) |
| 183 | + |
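| | +    # Remove stale IPv4/IPv6 BGP eoiu marker flags left in STATE_DB by a previous warm start. |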
| 184 | + def clean_bgp_eoiu_marker(self): |
| 185 | + self.state_db.delete(self.state_db.STATE_DB, "BGP_STATE_TABLE|IPv4|eoiu") |
| 186 | + self.state_db.delete(self.state_db.STATE_DB, "BGP_STATE_TABLE|IPv6|eoiu") |
| 187 | + syslog.syslog('Cleaned ipv4 and ipv6 eoiu marker flags') |
| 188 | + |
| 189 | + def clean_app_recon_state(self, warm_app_names): |
| 190 | +        # clean app reconciliation state from the last warm start, if present |
| 191 | + for warm_app_name in warm_app_names: |
| 192 | + cmd = "docker exec -i database redis-cli " + _get_redis_dbid_port("STATE_DB") + " hdel 'WARM_RESTART_TABLE|" + warm_app_name + "' state" |
| 193 | + run_command(cmd) |
| 194 | + |
| 195 | +    # Set the STATE_DB key "<CONTAINER>_STATE_TABLE|crash_warm_restart" (e.g. "BGP_STATE_TABLE|crash_warm_restart") |
| 196 | +    # state is 'started' or 'done' |
| 197 | + def set_crash_restart_state(self, capital_container_name, state): |
| 198 | + key = capital_container_name + "_STATE_TABLE|crash_warm_restart" |
| 199 | + self.state_db.set(self.state_db.STATE_DB, key, 'state', state) |
| 200 | + # only save the timestamps and retry count for started state |
| 201 | + if state != 'started': |
| 202 | + return |
| 203 | + info = self.state_db.get_all(self.state_db.STATE_DB, key) |
| 204 | + if 'timestamps' in info: |
| 205 | + timestamps = eval(info['timestamps']) |
| 206 | + else: |
| 207 | + timestamps = [] |
| 208 | + timestamps.append(time.time()) |
| 209 | + count = len(timestamps) |
| 210 | +        self.state_db.set(self.state_db.STATE_DB, key, 'timestamps', str(timestamps)) |
| 211 | + self.state_db.set(self.state_db.STATE_DB, key, 'count', count) |
| 212 | + return |
| 213 | + |
| 214 | + # Get retry count for auto-restart in a certain time frame. |
| 215 | + def get_retry_count(self, docker): |
| 216 | + key = docker + "_STATE_TABLE|crash_warm_restart" |
| 217 | + info = self.state_db.get_all(self.state_db.STATE_DB, key) |
| 218 | + if info and 'timestamps' in info: |
| 219 | + timestamps = eval(info['timestamps']) |
| 220 | + else: |
| 221 | + timestamps = [] |
| 222 | + count = 0 |
| 223 | + for t in timestamps: |
| 224 | + if ( time.time() - t ) < self.check_interval: |
| 225 | + count = count + 1 |
| 226 | + return count |
| 227 | + |
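| | +    # Warm-restart a container: enable warm_restart, restart the service, wait for its warm apps to |
| | +    # reach the 'reconciled' state, then disable warm_restart and mark the crash restart as 'done'. |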
| 228 | + def warm_restart(self, container_name): |
| 229 | + if container_name == "bgp": |
| 230 | +            # Kill zebra and bgpd so they come back up via the BGP graceful restart procedure |
| 231 | + log_info("Restarting bgp container...") |
| 232 | + self.clean_bgp_eoiu_marker() |
| 233 | + run_command("config warm_restart enable %s" % container_name) |
| 234 | + run_command("docker exec -i bgp pkill -9 zebra") |
| 235 | + run_command("docker exec -i bgp pkill -9 bgpd") |
| 236 | + warm_app_names = ["bgp"] |
| 237 | + elif container_name == "teamd": |
| 238 | + log_info("Restarting teamd container...") |
| 239 | + run_command("config warm_restart enable %s" % container_name) |
| 240 | + # Send USR1 signal to all teamd instances to stop them |
| 241 | + # It will prepare teamd for warm-reboot |
| 242 | + run_command("docker exec -i teamd pkill -USR1 teamd > /dev/null") |
| 243 | + warm_app_names = ["teamsyncd"] |
| 244 | + else: |
| 245 | + return |
| 246 | + self.clean_app_recon_state(warm_app_names) |
| 247 | + run_command("systemctl restart %s" % container_name) |
| 248 | + |
| 249 | + exp_state = "reconciled" |
| 250 | + self.wait_container_startup(container_name, 180, warm_app_names, exp_state) |
| 251 | + run_command("config warm_restart disable %s" % container_name) |
| 252 | + self.set_crash_restart_state(container_name.upper(), STATE_WARM_DONE) |
| 253 | + |
| 254 | + def warm_restart_call_back(self, *args): |
| 255 | + for container_name in args: |
| 256 | + self.warm_restart(container_name) |
| 257 | + del self.restart_timers[container_name] |
| 258 | + log_info("Completed warm_restart for %s" % (container_name)) |
| 259 | + |
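| | +    # PROC_EVENT keys are expected in the form "PROC_EVENT|<container>:<process>". |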
| 260 | + def process_event_handler(self, key, data): |
| 261 | + items = key.split('|') |
| 262 | + if len(items) > 1: |
| 263 | + docker = items[1].split(':')[0] |
| 264 | + proc = items[1].split(':')[1] |
| 265 | + if proc in self.critical_proc: |
| 266 | + log_info("process_event_handler %s, data:%s" % (key, data)) |
| 267 | + if not self.check_process_alive(proc): |
| 268 | +                log_warning("Process %s is not running" % (proc)) |
| 269 | + if docker in self.restart_timers: |
| 270 | +                    log_warning("Process %s is not running, but auto-restart of %s is already scheduled" % (proc, docker)) |
| 271 | + else: |
| 272 | + retry_count = self.get_retry_count(docker.upper()) |
| 273 | + if retry_count >= self.max_retries: |
| 274 | +                        log_error("%s exceeds max_retries %d within %d seconds" % (docker, self.max_retries, self.check_interval)) |
| 275 | + return |
| 276 | + if self.delay == 0: |
| 277 | +                        log_error("Process %s is not running; auto-restarting %s now" % (proc, docker)) |
| 278 | + else: |
| 279 | +                        log_error("Process %s is not running; auto-restarting %s in %s seconds" % (proc, docker, self.delay)) |
| 280 | + self.set_crash_restart_state(docker.upper(), STATE_WARM_STARTED) |
| 281 | + t = threading.Timer(self.delay, self.warm_restart_call_back, [docker]) |
| 282 | + self.restart_timers[docker] = t |
| 283 | + t.start() |
| 284 | + |
| 285 | + def init_db_connection(self): |
| 286 | + self.sysmon_db = SysmonDBConnector() |
| 287 | + self.sysmon_db.connect(retry_on=True) |
| 288 | + self.state_db = swsssdk.SonicV2Connector(host='127.0.0.1') |
| 289 | + self.state_db.connect(self.state_db.STATE_DB, False) |
| 290 | + |
| 291 | + def main_loop(self): |
| 292 | + try: |
| 293 | + wait_docker_startup("database", 10) |
| 294 | + self.init_db_connection() |
| 295 | + #self.wait_port_InitDone() |
| 296 | + self.start() |
| 297 | + |
| 298 | + except redis.ConnectionError: |
| 299 | +        log_error("main_loop exiting: Redis DB connection lost.") |
| 300 | + except Exception: |
| 301 | +        log_error("main_loop Exception: %s" % (traceback.format_exc())) |
| 302 | + |
| 303 | + def start(self): |
| 304 | + self.sysmon_db.subscribe(PROC_EVENT_TABLE, lambda table, key, data: self.process_event_handler(key, data)) |
| 305 | + self.sysmon_db.listen() |
| 306 | + |
| 307 | + |
| 308 | +def main(): |
| 309 | +    log_info("Script crash-monitor.py start") |
| 310 | +    log_info("========================================") |
| 311 | + option, parser = args_parser() |
| 312 | + |
| 313 | + if not option.delay: |
| 314 | + option.delay = RESTART_DELAY_TIME |
| 315 | + |
| 316 | + if not option.limit: |
| 317 | + option.limit = RESTART_MAX_RETRIES |
| 318 | + |
| 319 | + if not option.interval: |
| 320 | + option.interval = RESTART_CHECK_INTERVAL |
| 321 | + |
| 322 | + if option.stop_crash_monitor: |
| 323 | + log_info("Stop crash monitor.") |
| 324 | + kill_last_process() |
| 325 | + return 0 |
| 326 | + |
| 327 | + log_info("delay:%s" % (option.delay)) |
| 328 | + try: |
| 329 | + kill_last_process() |
| 330 | +        CrashMonitor(option.processes or "", int(option.delay), int(option.limit), int(option.interval)).main_loop() |
| 331 | + except Exception: |
| 332 | +        log_error("crash monitor Exception: %s" % (traceback.format_exc())) |
| 333 | + |
| 334 | + |
| 335 | +if __name__ == "__main__": |
| 336 | + # do the UNIX double-fork magic, see Stevens' "Advanced |
| 337 | + # Programming in the UNIX Environment" for details (ISBN 0201563177) |
| 338 | + try: |
| 339 | + pid = os.fork() |
| 340 | + if pid > 0: |
| 341 | + # exit first parent |
| 342 | + sys.exit(0) |
| 343 | + except OSError, e: |
| 344 | + print >> sys.stderr, "fork #1 failed: %d (%s)" % (e.errno, e.strerror) |
| 345 | + sys.exit(1) |
| 346 | + # decouple from parent environment |
| 347 | + os.chdir("/") |
| 348 | + os.setsid() |
| 349 | + os.umask(0) |
| 350 | + # do second fork |
| 351 | + try: |
| 352 | + pid = os.fork() |
| 353 | + if pid > 0: |
| 354 | +            # exit from second parent |
| 355 | + sys.exit(0) |
| 356 | + except OSError, e: |
| 357 | + print >> sys.stderr, "fork #2 failed: %d (%s)" % (e.errno, e.strerror) |
| 358 | + sys.exit(1) |
| 359 | + # start the daemon main loop |
| 360 | + main() |