Skip to content

Commit 2aa1de5

Browse files
yxiecalguohan
authored andcommitted
[service] introducing serial port watchdog service (#1743)
* [rc.local] refactor platform identification code to separate function Signed-off-by: Ying Xie <[email protected]> * [rc.local] infrastructure to take action according to installer.conf * [serial port watchdog] add service to watch serial port processes Monitor serial port processes. Kill ones stuck for too long. Signed-off-by: Ying Xie <[email protected]> * [rc.local] start watchdog on serial port specified by installer.conf Signed-off-by: Ying Xie <[email protected]>
1 parent 989000c commit 2aa1de5

File tree

4 files changed

+385
-4
lines changed

4 files changed

+385
-4
lines changed

files/build_templates/sonic_debian_extension.j2

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,11 @@ sudo cp $IMAGE_CONFIGS/hostname/hostname-config.service $FILESYSTEM_ROOT/etc/sy
154154
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable hostname-config.service
155155
sudo cp $IMAGE_CONFIGS/hostname/hostname-config.sh $FILESYSTEM_ROOT/usr/bin/
156156

157+
# Copy serial-port-watchdog configuration scripts
158+
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.service $FILESYSTEM_ROOT/etc/systemd/system/
159+
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable serial-port-watchdog.service
160+
sudo cp $IMAGE_CONFIGS/serial-port-watchdog/serial-port-watchdog.py $FILESYSTEM_ROOT/usr/bin/
161+
157162
# Copy updategraph script and service file
158163
sudo cp $IMAGE_CONFIGS/updategraph/updategraph.service $FILESYSTEM_ROOT/etc/systemd/system/
159164
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable updategraph.service

files/image_config/platform/rc.local

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,15 +183,51 @@ for x in "$@"; do
183183
done
184184
}
185185

186-
eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")
187-
188-
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
189-
186+
setup_platform()
187+
{
190188
if [ -n "$aboot_platform" ]; then
191189
platform=$aboot_platform
192190
elif [ -n "$onie_platform" ]; then
193191
platform=$onie_platform
194192
else
193+
platform=''
194+
fi
195+
}
196+
197+
# Setup default values in this function before reading installer.conf
198+
# installer.conf could override the value set in this function.
199+
setup_platform_defaults()
200+
{
201+
# Default serial port: ttyS0
202+
CONSOLE_DEV=0
203+
}
204+
205+
load_platform_installer_config()
206+
{
207+
INSTALLER_CFG=/usr/share/sonic/device/$platform/installer.conf
208+
if [ -f $INSTALLER_CFG ]; then
209+
. $INSTALLER_CFG
210+
fi
211+
}
212+
213+
program_serial_port()
214+
{
215+
sed -i "s|ttyS.|ttyS$CONSOLE_DEV|g" /etc/systemd/system/serial-port-watchdog.service
216+
systemctl daemon-reload
217+
systemctl restart serial-port-watchdog.service
218+
}
219+
220+
eval sonic_version=$(cat /etc/sonic/sonic_version.yml | grep build_version | cut -f2 -d" ")
221+
222+
setup_platform
223+
setup_platform_defaults
224+
load_platform_installer_config
225+
226+
program_serial_port
227+
228+
if [ -f /host/image-$sonic_version/platform/firsttime ]; then
229+
230+
if [ -z "$platform" ]; then
195231
echo "Unknown sonic platform"
196232
firsttime_exit
197233
fi
Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#!/usr/bin/env python
2+
3+
from __future__ import print_function, with_statement
4+
5+
import argparse
6+
import logging
7+
import logging.handlers
8+
import os
9+
import time
10+
import signal
11+
import socket
12+
import sys
13+
14+
from collections import namedtuple
15+
16+
PRGNAME = 'serial-port-watchdog'
17+
18+
DEVFS_PATH = '/dev'
19+
PROCFS_PATH = '/proc'
20+
21+
# According to procfs(5)
22+
ProcStat = namedtuple( 'ProcStat', [
23+
'pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid',
24+
'flags', 'minflt', 'cminflt', 'majflt', 'cmajflt', 'utime', 'stime',
25+
'cutime', 'cstime', 'priority', 'nice', 'num_threads', 'itrealvalue',
26+
'starttime', 'vsize', 'rss', 'rsslim', 'startcode', 'endcode',
27+
'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore',
28+
'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor',
29+
'rt_priority', 'policy', 'delayacct_blkio_ticks', 'guest_time',
30+
'cguest_time', 'start_data', 'end_data', 'start_brk', 'arg_start',
31+
'arg_end', 'env_start', 'env_end', 'exit_code'
32+
] )
33+
34+
# According to procfs(5)
35+
ProcIo = namedtuple( 'ProcIo', [
36+
'rchar', 'wchar', 'syscr', 'syscw', 'read_bytes', 'write_bytes',
37+
'cancelled_write_bytes'
38+
] )
39+
40+
class Process( object ):
41+
def __init__( self, pid, path=PROCFS_PATH ):
42+
self.pid = pid
43+
self.path = os.path.join( path, str( pid ) )
44+
self.childs = []
45+
self.parent = None
46+
47+
self.stat = None
48+
49+
self.io = None
50+
self.stack = None
51+
self.stackStartTime = None
52+
53+
def refresh( self ):
54+
with open( os.path.join( self.path, 'stat' ) ) as f:
55+
data = f.read().rstrip().split()
56+
self.stat = ProcStat( *data )
57+
58+
def getStat( self, key=None ):
59+
self.refresh()
60+
return self.stat
61+
62+
def uid( self ):
63+
return '%s/%s' % ( self.pid, self.stat.starttime )
64+
65+
def ppid( self ):
66+
return self.stat.ppid
67+
68+
def name( self ):
69+
with open( os.path.join( self.path, 'comm' ) ) as f:
70+
return f.read().rstrip()
71+
72+
def getTtyForFd( self, fd ):
73+
path = os.path.join( self.path, 'fd', str( fd ) )
74+
if not os.path.exists( path ):
75+
return ''
76+
return os.readlink( path )
77+
78+
def getStack( self ):
79+
with open( os.path.join( self.path, 'stack' ) ) as f:
80+
return f.read()
81+
82+
def getIo( self ):
83+
with open( os.path.join( self.path, 'io' ) ) as f:
84+
data = [ int( l.split( ': ' )[ 1 ] ) for l in f.readlines() ]
85+
return ProcIo( *data )
86+
87+
def isUsingTty( self, tty ):
88+
return self.getTtyForFd( 0 ).endswith( tty )
89+
90+
def checkStuck( self, content ):
91+
stack = self.getStack()
92+
93+
found = False
94+
for match in content:
95+
if match in stack:
96+
found = True
97+
break
98+
99+
if not found:
100+
self.io = None
101+
self.stack = None
102+
self.stackStartTime = None
103+
return 0
104+
105+
io = self.getIo()
106+
107+
if self.stack != stack or self.io != io:
108+
self.io = io
109+
self.stack = stack
110+
self.stackStartTime = time.time()
111+
return 0
112+
113+
return time.time() - self.stackStartTime
114+
115+
def __repr__( self ):
116+
return '<Process uid=%s>' % self.uid()
117+
118+
class ProcessMonitor( object ):
119+
def __init__( self, path=PROCFS_PATH ):
120+
self.path = path
121+
self.procs = {}
122+
self.filters = []
123+
self.checkers = []
124+
self.whitelist = []
125+
126+
def addProcessFilter( self, func, *args ):
127+
self.filters.append( ( func, args ) )
128+
129+
def addStuckChecker( self, func, *args ):
130+
self.checkers.append( ( func, args ) )
131+
132+
def setWhitelist( self, whitelist ):
133+
self.whitelist = whitelist
134+
135+
def shouldHandleProcess( self, proc ):
136+
matched = False
137+
for func, args in self.filters:
138+
if func( proc, *args ):
139+
matched = True
140+
break
141+
142+
if not matched:
143+
return False
144+
145+
name = proc.name()
146+
for item in self.whitelist:
147+
if item in name:
148+
return False
149+
150+
return True
151+
152+
def getRunningPids( self ):
153+
pids = []
154+
for entry in os.listdir( self.path ):
155+
if not entry.isdigit():
156+
continue
157+
pids.append( int( entry ) )
158+
return pids
159+
160+
def killStuckProcess( self, proc, elapsed, kill, timeout ):
161+
if not elapsed:
162+
return
163+
164+
if elapsed < timeout:
165+
if elapsed > timeout / 2:
166+
logging.info( 'process %d seems stuck, idle for %ds, waiting '
167+
'some more time', proc.pid, elapsed )
168+
return
169+
170+
logging.warning( 'process %d has been stuck for %d seconds, killing...',
171+
proc.pid, elapsed )
172+
logging.info( 'process %d kernel stack\n%s', proc.pid, proc.stack )
173+
if kill:
174+
# XXX: SIGTERM sleep then if alive SIGKILL ?
175+
os.kill( proc.pid, signal.SIGKILL )
176+
177+
def killStuckProcesses( self, kill, timeout ):
178+
for proc in self.procs.values():
179+
for checker, args in self.checkers:
180+
elapsed = checker( proc, *args )
181+
self.killStuckProcess( proc, elapsed, kill, timeout )
182+
183+
def updatePid( self, pid ):
184+
p = Process( pid )
185+
186+
# if the process is already monitored (previously running)
187+
r = self.procs.get( pid, None )
188+
if r:
189+
p.refresh()
190+
# if the process is still running
191+
if p.uid() == r.uid():
192+
logging.debug( 'process %d still running', pid )
193+
return
194+
# or the pid was reused but the process is different
195+
logging.debug( 'pid %d reused for another process', pid )
196+
del self.procs[ pid ]
197+
198+
# check if the process is relevant for monitoring
199+
if not self.shouldHandleProcess( p ):
200+
return
201+
202+
logging.debug( 'watching process %d', pid )
203+
p.refresh()
204+
self.procs[ pid ] = p
205+
206+
def updateParenting( self ):
207+
# clear parent and childs for monitored processes
208+
for proc in self.procs.values():
209+
del proc.childs[:]
210+
proc.parent = None
211+
212+
# set parent and childs for monitored processes
213+
for proc in self.procs.values():
214+
ppid = proc.ppid()
215+
parent = self.procs.get( ppid, None )
216+
if parent:
217+
proc.parent = parent
218+
parent.childs.append( proc )
219+
220+
def update( self ):
221+
pids = self.getRunningPids()
222+
223+
# remove defunct processes
224+
for pid in list(self.procs.keys()):
225+
if pid not in pids:
226+
logging.debug( 'process %d is defunct', pid )
227+
del self.procs[ pid ]
228+
229+
# create or update running processes information
230+
for pid in pids:
231+
try:
232+
self.updatePid( pid )
233+
except:
234+
logging.warning( 'An issue occured whileupdating process %s',
235+
pid )
236+
raise
237+
238+
#self.updateParenting()
239+
240+
def checkRootPermissions():
241+
if os.geteuid() != 0:
242+
logging.error( 'You must be root to use this feature' )
243+
sys.exit( 1 )
244+
245+
def getHostname():
246+
try:
247+
return socket.gethostname()
248+
except:
249+
return 'localhost'
250+
251+
def setupLogging( verbose=False ):
252+
loglevel = logging.DEBUG if verbose else logging.INFO
253+
dateFmt = '%Y-%m-%d %H:%M:%S'
254+
255+
log = logging.getLogger()
256+
log.setLevel( logging.DEBUG )
257+
258+
logOut = logging.StreamHandler( sys.stdout )
259+
logOut.setFormatter( logging.Formatter( '%(levelname)s: %(message)s' ) )
260+
logOut.setLevel( loglevel )
261+
log.addHandler( logOut )
262+
263+
logSys = logging.handlers.SysLogHandler()
264+
# format to rfc5424 format
265+
fmt = '{} {}: %(message)s'.format( getHostname(), PRGNAME )
266+
logSys.setFormatter( logging.Formatter( fmt ) )
267+
logSys.setLevel( logging.WARNING )
268+
log.addHandler( logSys )
269+
try:
270+
# the connection to the syslog socket happens with the first message
271+
log.info( 'Attaching to syslog' )
272+
except:
273+
log.warning( 'Failed open syslog' )
274+
275+
def listParser( value ):
276+
if not value.strip():
277+
return []
278+
return value.split( ',' )
279+
280+
def ttyParser( dev, path=DEVFS_PATH ):
281+
if not dev.startswith( DEVFS_PATH ):
282+
dev = os.path.join( DEVFS_PATH, dev )
283+
if not os.path.exists( dev ):
284+
raise argparse.ArgumentTypeError( '%s is not a device' % dev )
285+
return dev
286+
287+
def parseArgs( args ):
288+
parser = argparse.ArgumentParser()
289+
290+
parser.add_argument( '-d', '--dry-run', action='store_true',
291+
help='only print processes that would be killed' )
292+
parser.add_argument( '-f', '--funcs', default=[ 'tty_' ], type=listParser,
293+
help='functions to look for in the stack trace' )
294+
parser.add_argument( '-i', '--interval', default=60, type=float,
295+
help='interval at which to check the procfs' )
296+
parser.add_argument( '-k', '--timeout', default=3600, type=float,
297+
help='timeout for which a process gets killed' )
298+
parser.add_argument( '-t', '--tty', default='ttyS0', type=ttyParser,
299+
help='tty to check for stuck process' )
300+
parser.add_argument( '-v', '--verbose', action='store_true',
301+
help='print all debug messages' )
302+
parser.add_argument( '-w', '--whitelist', default=[ 'agetty' ], type=listParser,
303+
help='whitelist programs that should never be killed' )
304+
305+
return parser.parse_args( args )
306+
307+
def main( args ):
308+
args = parseArgs( args )
309+
310+
setupLogging( args.verbose )
311+
checkRootPermissions()
312+
313+
m = ProcessMonitor()
314+
m.addProcessFilter( Process.isUsingTty, args.tty )
315+
m.addStuckChecker( Process.checkStuck, args.funcs )
316+
m.setWhitelist( args.whitelist )
317+
318+
while True:
319+
logging.debug( 'updating processes' )
320+
m.update()
321+
m.killStuckProcesses( kill=( not args.dry_run ), timeout=args.timeout )
322+
time.sleep( args.interval )
323+
324+
return 0
325+
326+
if __name__ == '__main__':
327+
sys.exit( main( sys.argv[ 1: ] ) )
328+

0 commit comments

Comments
 (0)