Skip to content

[service] introducing serial port watchdog service #1743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/

# Copy serial-port-watchdog configuration scripts
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/

# Copy updategraph script and service file
sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service
Expand Down
44 changes: 40 additions & 4 deletions files/image_config/platform/rc.local
Original file line number Diff line number Diff line change
Expand Up @@ -183,15 +183,51 @@ for x in "$@"; do
done
}

eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")

if [ -f /host/image-$sonic_version/platform/firsttime ]; then

setup_platform()
{
if [ -n "$aboot_platform" ]; then
platform=$aboot_platform
elif [ -n "$onie_platform" ]; then
platform=$onie_platform
else
platform=''
fi
}

# Setup default values in this function before reading installer.conf
# installer.conf could override the value set in this function.
setup_platform_defaults()
{
# Default serial port: ttyS0
CONSOLE_DEV=0
}

load_platform_installer_config()
{
INSTALLER_CFG=/usr/share/sonic/device/$platform/installer.conf
if [ -f $INSTALLER_CFG ]; then
. $INSTALLER_CFG
fi
}

program_serial_port()
{
sed -i "s|ttyS.|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service
systemctl daemon-reload
systemctl restart serial-port-watchdog.service
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

serial-port-watchdog.service [](start = 22, length = 28)

Deadlock between rc.local and serial-port-watchdog?

}

eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")

setup_platform
setup_platform_defaults
load_platform_installer_config

program_serial_port

if [ -f /host/image-$sonic_version/platform/firsttime ]; then

if [ -z "$platform" ]; then
echo "Unknown sonic platform"
firsttime_exit
fi
Expand Down
328 changes: 328 additions & 0 deletions files/image_config/serial-port-watchdog/serial-port-watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/usr/bin/env python

from __future__ import print_function, with_statement

import argparse
import logging
import logging.handlers
import os
import time
import signal
import socket
import sys

from collections import namedtuple

PRGNAME = 'serial-port-watchdog'

DEVFS_PATH = '/dev'
PROCFS_PATH = '/proc'

# According to procfs(5)
ProcStat = namedtuple( 'ProcStat', [
'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid',
'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime',
'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue',
'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode',
'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore',
'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor',
'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time',
'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start',
'arg_end', 'env_start', 'env_end', 'exit_code'
] )

# According to procfs(5)
ProcIo = namedtuple( 'ProcIo', [
'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes',
'cancelled_write_bytes'
] )

class Process( object ):
def __init__( self, pid, path=PROCFS_PATH ):
self.pid = pid
self.path = os.path.join( path, str( pid ) )
self.childs = []
self.parent = None

self.stat = None

self.io = None
self.stack = None
self.stackStartTime = None

def refresh( self ):
with open( os.path.join( self.path, 'stat' ) ) as f:
data = f.read().rstrip().split()
self.stat = ProcStat( *data )

def getStat( self, key=None ):
self.refresh()
return self.stat

def uid( self ):
return '%s/%s' % ( self.pid, self.stat.starttime )

def ppid( self ):
return self.stat.ppid

def name( self ):
with open( os.path.join( self.path, 'comm' ) ) as f:
return f.read().rstrip()

def getTtyForFd( self, fd ):
path = os.path.join( self.path, 'fd', str( fd ) )
if not os.path.exists( path ):
return ''
return os.readlink( path )

def getStack( self ):
with open( os.path.join( self.path, 'stack' ) ) as f:
return f.read()

def getIo( self ):
with open( os.path.join( self.path, 'io' ) ) as f:
data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ]
return ProcIo( *data )

def isUsingTty( self, tty ):
return self.getTtyForFd( 0 ).endswith( tty )

def checkStuck( self, content ):
stack = self.getStack()

found = False
for match in content:
if match in stack:
found = True
break

if not found:
self.io = None
self.stack = None
self.stackStartTime = None
return 0

io = self.getIo()

if self.stack != stack or self.io != io:
self.io = io
self.stack = stack
self.stackStartTime = time.time()
return 0

return time.time() - self.stackStartTime

def __repr__( self ):
return '<Process uid=%s>' % self.uid()

class ProcessMonitor( object ):
def __init__( self, path=PROCFS_PATH ):
self.path = path
self.procs = {}
self.filters = []
self.checkers = []
self.whitelist = []

def addProcessFilter( self, func, *args ):
self.filters.append( ( func, args ) )

def addStuckChecker( self, func, *args ):
self.checkers.append( ( func, args ) )

def setWhitelist( self, whitelist ):
self.whitelist = whitelist

def shouldHandleProcess( self, proc ):
matched = False
for func, args in self.filters:
if func( proc, *args ):
matched = True
break

if not matched:
return False

name = proc.name()
for item in self.whitelist:
if item in name:
return False

return True

def getRunningPids( self ):
pids = []
for entry in os.listdir( self.path ):
if not entry.isdigit():
continue
pids.append( int( entry ) )
return pids

def killStuckProcess( self, proc, elapsed, kill, timeout ):
if not elapsed:
return

if elapsed < timeout:
if elapsed > timeout / 2:
logging.info( 'process %d seems stuck, idle for %ds, waiting '
'some more time', proc.pid, elapsed )
return

logging.warning( 'process %d has been stuck for %d seconds, killing...',
proc.pid, elapsed )
logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack )
if kill:
# XXX: SIGTERM sleep then if alive SIGKILL ?
os.kill( proc.pid, signal.SIGKILL )

def killStuckProcesses( self, kill, timeout ):
for proc in self.procs.values():
for checker, args in self.checkers:
elapsed = checker( proc, *args )
self.killStuckProcess( proc, elapsed, kill, timeout )

def updatePid( self, pid ):
p = Process( pid )

# if the process is already monitored (previously running)
r = self.procs.get( pid, None )
if r:
p.refresh()
# if the process is still running
if p.uid() == r.uid():
logging.debug( 'process %d still running', pid )
return
# or the pid was reused but the process is different
logging.debug( 'pid %d reused for another process', pid )
del self.procs[ pid ]

# check if the process is relevant for monitoring
if not self.shouldHandleProcess( p ):
return

logging.debug( 'watching process %d', pid )
p.refresh()
self.procs[ pid ] = p

def updateParenting( self ):
# clear parent and childs for monitored processes
for proc in self.procs.values():
del proc.childs[:]
proc.parent = None

# set parent and childs for monitored processes
for proc in self.procs.values():
ppid = proc.ppid()
parent = self.procs.get( ppid, None )
if parent:
proc.parent = parent
parent.childs.append( proc )

def update( self ):
pids = self.getRunningPids()

# remove defunct processes
for pid in list(self.procs.keys()):
if pid not in pids:
logging.debug( 'process %d is defunct', pid )
del self.procs[ pid ]

# create or update running processes information
for pid in pids:
try:
self.updatePid( pid )
except:
logging.warning( 'An issue occured whileupdating process %s',
pid )
raise

#self.updateParenting()

def checkRootPermissions():
if os.geteuid() != 0:
logging.error( 'You must be root to use this feature' )
sys.exit( 1 )

def getHostname():
try:
return socket.gethostname()
except:
return 'localhost'

def setupLogging( verbose=False ):
loglevel = logging.DEBUG if verbose else logging.INFO
dateFmt = '%Y-%m-%d %H:%M:%S'

log = logging.getLogger()
log.setLevel( logging.DEBUG )

logOut = logging.StreamHandler( sys.stdout )
logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) )
logOut.setLevel( loglevel )
log.addHandler( logOut )

logSys = logging.handlers.SysLogHandler()
# format to rfc5424 format
fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME )
logSys.setFormatter( logging.Formatter( fmt ) )
logSys.setLevel( logging.WARNING )
log.addHandler( logSys )
try:
# the connection to the syslog socket happens with the first message
log.info( 'Attaching to syslog' )
except:
log.warning( 'Failed open syslog' )

def listParser( value ):
if not value.strip():
return []
return value.split( ',' )

def ttyParser( dev, path=DEVFS_PATH ):
if not dev.startswith( DEVFS_PATH ):
dev = os.path.join( DEVFS_PATH, dev )
if not os.path.exists( dev ):
raise argparse.ArgumentTypeError( '%s is not a device' % dev )
return dev

def parseArgs( args ):
parser = argparse.ArgumentParser()

parser.add_argument( '-d', '--dry-run', action='store_true',
help='only print processes that would be killed' )
parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser,
help='functions to look for in the stack trace' )
parser.add_argument( '-i', '--interval', default=60, type=float,
help='interval at which to check the procfs' )
parser.add_argument( '-k', '--timeout', default=3600, type=float,
help='timeout for which a process gets killed' )
parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser,
help='tty to check for stuck process' )
parser.add_argument( '-v', '--verbose', action='store_true',
help='print all debug messages' )
parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser,
help='whitelist programs that should never be killed' )

return parser.parse_args( args )

def main( args ):
args = parseArgs( args )

setupLogging( args.verbose )
checkRootPermissions()

m = ProcessMonitor()
m.addProcessFilter( Process.isUsingTty, args.tty )
m.addStuckChecker( Process.checkStuck, args.funcs )
m.setWhitelist( args.whitelist )

while True:
logging.debug( 'updating processes' )
m.update()
m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout )
time.sleep( args.interval )

return 0

if __name__ == '__main__':
sys.exit( main( sys.argv[ 1: ] ) )

Loading