-
Notifications
You must be signed in to change notification settings - Fork 1.5k
[service] introducing serial port watchdog service #1743
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
39d6728
[serial port watchdog] add service to watch serial port processes
yxieca d40a12f
[rc.local] refactor platform identification code to separate function
yxieca 4b2913b
[rc.local] infrastructure to take action according to installer.conf
yxieca e6ac650
[rc.local] start watchdog on serial port specified by installer.conf
yxieca File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
328 changes: 328 additions & 0 deletions
328
files/image_config/serial-port-watchdog/serial-port-watchdog.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,328 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function, with_statement | ||
|
||
import argparse | ||
import logging | ||
import logging.handlers | ||
import os | ||
import time | ||
import signal | ||
import socket | ||
import sys | ||
|
||
from collections import namedtuple | ||
|
||
PRGNAME = 'serial-port-watchdog' | ||
|
||
DEVFS_PATH = '/dev' | ||
PROCFS_PATH = '/proc' | ||
|
||
# According to procfs(5) | ||
ProcStat = namedtuple( 'ProcStat', [ | ||
'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid', | ||
'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime', | ||
'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue', | ||
'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode', | ||
'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore', | ||
'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor', | ||
'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time', | ||
'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start', | ||
'arg_end', 'env_start', 'env_end', 'exit_code' | ||
] ) | ||
|
||
# According to procfs(5) | ||
ProcIo = namedtuple( 'ProcIo', [ | ||
'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes', | ||
'cancelled_write_bytes' | ||
] ) | ||
|
||
class Process( object ): | ||
def __init__( self, pid, path=PROCFS_PATH ): | ||
self.pid = pid | ||
self.path = os.path.join( path, str( pid ) ) | ||
self.childs = [] | ||
self.parent = None | ||
|
||
self.stat = None | ||
|
||
self.io = None | ||
self.stack = None | ||
self.stackStartTime = None | ||
|
||
def refresh( self ): | ||
with open( os.path.join( self.path, 'stat' ) ) as f: | ||
data = f.read().rstrip().split() | ||
self.stat = ProcStat( *data ) | ||
|
||
def getStat( self, key=None ): | ||
self.refresh() | ||
return self.stat | ||
|
||
def uid( self ): | ||
return '%s/%s' % ( self.pid, self.stat.starttime ) | ||
|
||
def ppid( self ): | ||
return self.stat.ppid | ||
|
||
def name( self ): | ||
with open( os.path.join( self.path, 'comm' ) ) as f: | ||
return f.read().rstrip() | ||
|
||
def getTtyForFd( self, fd ): | ||
path = os.path.join( self.path, 'fd', str( fd ) ) | ||
if not os.path.exists( path ): | ||
return '' | ||
return os.readlink( path ) | ||
|
||
def getStack( self ): | ||
with open( os.path.join( self.path, 'stack' ) ) as f: | ||
return f.read() | ||
|
||
def getIo( self ): | ||
with open( os.path.join( self.path, 'io' ) ) as f: | ||
data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ] | ||
return ProcIo( *data ) | ||
|
||
def isUsingTty( self, tty ): | ||
return self.getTtyForFd( 0 ).endswith( tty ) | ||
|
||
def checkStuck( self, content ): | ||
stack = self.getStack() | ||
|
||
found = False | ||
for match in content: | ||
if match in stack: | ||
found = True | ||
break | ||
|
||
if not found: | ||
self.io = None | ||
self.stack = None | ||
self.stackStartTime = None | ||
return 0 | ||
|
||
io = self.getIo() | ||
|
||
if self.stack != stack or self.io != io: | ||
self.io = io | ||
self.stack = stack | ||
self.stackStartTime = time.time() | ||
return 0 | ||
|
||
return time.time() - self.stackStartTime | ||
|
||
def __repr__( self ): | ||
return '<Process uid=%s>' % self.uid() | ||
|
||
class ProcessMonitor( object ): | ||
def __init__( self, path=PROCFS_PATH ): | ||
self.path = path | ||
self.procs = {} | ||
self.filters = [] | ||
self.checkers = [] | ||
self.whitelist = [] | ||
|
||
def addProcessFilter( self, func, *args ): | ||
self.filters.append( ( func, args ) ) | ||
|
||
def addStuckChecker( self, func, *args ): | ||
self.checkers.append( ( func, args ) ) | ||
|
||
def setWhitelist( self, whitelist ): | ||
self.whitelist = whitelist | ||
|
||
def shouldHandleProcess( self, proc ): | ||
matched = False | ||
for func, args in self.filters: | ||
if func( proc, *args ): | ||
matched = True | ||
break | ||
|
||
if not matched: | ||
return False | ||
|
||
name = proc.name() | ||
for item in self.whitelist: | ||
if item in name: | ||
return False | ||
|
||
return True | ||
|
||
def getRunningPids( self ): | ||
pids = [] | ||
for entry in os.listdir( self.path ): | ||
if not entry.isdigit(): | ||
continue | ||
pids.append( int( entry ) ) | ||
return pids | ||
|
||
def killStuckProcess( self, proc, elapsed, kill, timeout ): | ||
if not elapsed: | ||
return | ||
|
||
if elapsed < timeout: | ||
if elapsed > timeout / 2: | ||
logging.info( 'process %d seems stuck, idle for %ds, waiting ' | ||
'some more time', proc.pid, elapsed ) | ||
return | ||
|
||
logging.warning( 'process %d has been stuck for %d seconds, killing...', | ||
proc.pid, elapsed ) | ||
logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack ) | ||
if kill: | ||
# XXX: SIGTERM sleep then if alive SIGKILL ? | ||
os.kill( proc.pid, signal.SIGKILL ) | ||
|
||
def killStuckProcesses( self, kill, timeout ): | ||
for proc in self.procs.values(): | ||
for checker, args in self.checkers: | ||
elapsed = checker( proc, *args ) | ||
self.killStuckProcess( proc, elapsed, kill, timeout ) | ||
|
||
def updatePid( self, pid ): | ||
p = Process( pid ) | ||
|
||
# if the process is already monitored (previously running) | ||
r = self.procs.get( pid, None ) | ||
if r: | ||
p.refresh() | ||
# if the process is still running | ||
if p.uid() == r.uid(): | ||
logging.debug( 'process %d still running', pid ) | ||
return | ||
# or the pid was reused but the process is different | ||
logging.debug( 'pid %d reused for another process', pid ) | ||
del self.procs[ pid ] | ||
|
||
# check if the process is relevant for monitoring | ||
if not self.shouldHandleProcess( p ): | ||
return | ||
|
||
logging.debug( 'watching process %d', pid ) | ||
p.refresh() | ||
self.procs[ pid ] = p | ||
|
||
def updateParenting( self ): | ||
# clear parent and childs for monitored processes | ||
for proc in self.procs.values(): | ||
del proc.childs[:] | ||
proc.parent = None | ||
|
||
# set parent and childs for monitored processes | ||
for proc in self.procs.values(): | ||
ppid = proc.ppid() | ||
parent = self.procs.get( ppid, None ) | ||
if parent: | ||
proc.parent = parent | ||
parent.childs.append( proc ) | ||
|
||
def update( self ): | ||
pids = self.getRunningPids() | ||
|
||
# remove defunct processes | ||
for pid in list(self.procs.keys()): | ||
if pid not in pids: | ||
logging.debug( 'process %d is defunct', pid ) | ||
del self.procs[ pid ] | ||
|
||
# create or update running processes information | ||
for pid in pids: | ||
try: | ||
self.updatePid( pid ) | ||
except: | ||
logging.warning( 'An issue occured whileupdating process %s', | ||
pid ) | ||
raise | ||
|
||
#self.updateParenting() | ||
|
||
def checkRootPermissions(): | ||
if os.geteuid() != 0: | ||
logging.error( 'You must be root to use this feature' ) | ||
sys.exit( 1 ) | ||
|
||
def getHostname(): | ||
try: | ||
return socket.gethostname() | ||
except: | ||
return 'localhost' | ||
|
||
def setupLogging( verbose=False ): | ||
loglevel = logging.DEBUG if verbose else logging.INFO | ||
dateFmt = '%Y-%m-%d %H:%M:%S' | ||
|
||
log = logging.getLogger() | ||
log.setLevel( logging.DEBUG ) | ||
|
||
logOut = logging.StreamHandler( sys.stdout ) | ||
logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) ) | ||
logOut.setLevel( loglevel ) | ||
log.addHandler( logOut ) | ||
|
||
logSys = logging.handlers.SysLogHandler() | ||
# format to rfc5424 format | ||
fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME ) | ||
logSys.setFormatter( logging.Formatter( fmt ) ) | ||
logSys.setLevel( logging.WARNING ) | ||
log.addHandler( logSys ) | ||
try: | ||
# the connection to the syslog socket happens with the first message | ||
log.info( 'Attaching to syslog' ) | ||
except: | ||
log.warning( 'Failed open syslog' ) | ||
|
||
def listParser( value ): | ||
if not value.strip(): | ||
return [] | ||
return value.split( ',' ) | ||
|
||
def ttyParser( dev, path=DEVFS_PATH ): | ||
if not dev.startswith( DEVFS_PATH ): | ||
dev = os.path.join( DEVFS_PATH, dev ) | ||
if not os.path.exists( dev ): | ||
raise argparse.ArgumentTypeError( '%s is not a device' % dev ) | ||
return dev | ||
|
||
def parseArgs( args ): | ||
parser = argparse.ArgumentParser() | ||
|
||
parser.add_argument( '-d', '--dry-run', action='store_true', | ||
help='only print processes that would be killed' ) | ||
parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser, | ||
help='functions to look for in the stack trace' ) | ||
parser.add_argument( '-i', '--interval', default=60, type=float, | ||
help='interval at which to check the procfs' ) | ||
parser.add_argument( '-k', '--timeout', default=3600, type=float, | ||
help='timeout for which a process gets killed' ) | ||
parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser, | ||
help='tty to check for stuck process' ) | ||
parser.add_argument( '-v', '--verbose', action='store_true', | ||
help='print all debug messages' ) | ||
parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser, | ||
help='whitelist programs that should never be killed' ) | ||
|
||
return parser.parse_args( args ) | ||
|
||
def main( args ): | ||
args = parseArgs( args ) | ||
|
||
setupLogging( args.verbose ) | ||
checkRootPermissions() | ||
|
||
m = ProcessMonitor() | ||
m.addProcessFilter( Process.isUsingTty, args.tty ) | ||
m.addStuckChecker( Process.checkStuck, args.funcs ) | ||
m.setWhitelist( args.whitelist ) | ||
|
||
while True: | ||
logging.debug( 'updating processes' ) | ||
m.update() | ||
m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout ) | ||
time.sleep( args.interval ) | ||
|
||
return 0 | ||
|
||
if __name__ == '__main__': | ||
sys.exit( main( sys.argv[ 1: ] ) ) | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deadlock between rc.local and serial-port-watchdog?