1
+ # stdlib
1
2
import logging
2
3
import os
3
4
import re
4
5
import signal
6
+ import shlex
5
7
import subprocess
6
8
import time
9
+
10
+ # project
7
11
from util import (
12
+ get_hostname ,
8
13
namedtuple ,
9
14
StaticWatchdog ,
10
15
)
11
16
12
- class InvalidCheckOutput (Exception ):
13
- pass
14
-
15
17
class Timeout(Exception):
    """Raised by the SIGALRM handler when a check exceeds its time budget."""
17
19
@@ -22,15 +24,15 @@ class InvalidPath(Exception):
22
24
# Outcome of a single check run: the check's result status, the state of the
# execution itself, the message to report, and when / how long it ran.
CheckResult = namedtuple(
    'CheckResult',
    ['status', 'state', 'message', 'execution_date', 'execution_time'])

# State of the last execution of the check.
# The typename must be a valid identifier (no surrounding whitespace).
ExecutionState = namedtuple(
    'ExecutionState',
    ['OK', 'TIMEOUT', 'EXCEPTION', 'INVALID_OUTPUT'])
S = ExecutionState('ok', 'timeout', 'exception', 'invalid_output')
29
31
30
- # State of check
31
- ResultState = namedtuple ( 'ResultState' ,
32
- [ 'NONE' , 'OK' , ' WARNING' , ' CRITICAL' , ' UNKNOWN' ] )
33
- R = ResultState ( 'init' , 'ok' , 'warning' , 'critical' , 'unknown' )
32
# Check result status
class R():
    """Integer status codes a check result can carry.

    `NONE` is the status used for a check that has not produced a result
    yet; `ALL` lists every status defined here.
    """
    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
    NONE = 4
    ALL = (OK, WARNING, CRITICAL, UNKNOWN, NONE)
34
36
35
37
log = logging .getLogger (__name__ )
36
38
@@ -43,100 +45,78 @@ class BernardCheck(object):
43
45
]))
44
46
45
47
@classmethod
def from_config(cls, name, check_config, defaults, hostname=None):
    """Build one check per parameter set found in `check_config`.

    `check_config` is read for three keys: 'options' (per-check 'timeout'
    and 'period'), 'command' (the raw command line, possibly containing
    $variables) and 'params' (a list of dicts used to fill the variables).
    When 'params' is missing or empty a single check with no parameters is
    created.

    `defaults` must provide 'timeout' and 'period'; they are used whenever
    the per-check option is missing or zero. `hostname` defaults to the
    value returned by get_hostname().

    Returns the list of created checks. The dicts inside `check_config` are
    left untouched: each check works on its own stringified copy.
    """
    # `or {}` also covers an 'options' key that is present but empty.
    options = check_config.get('options') or {}
    raw_command = check_config.get('command')
    params_list = check_config.get('params') or [{}]
    hostname = hostname or get_hostname()

    # Shared by every check created below.
    settings = {
        'timeout': int(options.get('timeout', 0)) or defaults['timeout'],
        'period': int(options.get('period', 0)) or defaults['period'],
    }

    checks = []

    # For every set of params (e.g.: {'port': 8888}) return a single check.
    # We'll template the $variables in the `command` value with the params.
    for param_dict in params_list:
        # Stringify all of the check params into a fresh dict. We expect
        # everything to be strings through the pipeline so we do it early,
        # and copying keeps the caller's configuration unmodified.
        params = dict((key, str(value)) for key, value in param_dict.items())

        command = _subprocess_command(raw_command, params, hostname)
        checks.append(cls(name, command, settings, params))

    return checks
92
74
93
def __init__(self, name, command, config, params):
    """ Initializes a BernardCheck with the given `name` and `command`.

    `command` is the argument list handed to subprocess: the executable
    followed by its arguments as a flat list of strings, e.g.
    ['check_foo', '-h', 'localhost']. Any additional settings (e.g.
    timeout or period) come from the `config` dict, and `params` holds the
    parameters this check was created with.
    """
    # What to run and how.
    self.name = name
    self.command = command
    self.config = config
    self.params = params

    # Counters start at zero for a fresh check.
    self.run_count = 0
    self.event_count = 0

    # Always holds the latest result.
    self.result = None

    description = u"Initialized check %s (%s)" % (self.name, command)
    log.debug(description)
118
92
119
93
def __repr__(self):
    """Represent the check by its name."""
    check_name = self.name
    return check_name
95
+
96
def get_period(self):
    """Return the 'period' entry of this check's config."""
    period = self.config['period']
    return period
121
98
122
99
def _execute_check (self ):
123
100
timeout = self .config .get ('timeout' )
124
101
output = None
125
102
returncode = None
103
+
126
104
# This is going to disable the StaticWatchdog
127
105
signal .signal (signal .SIGALRM , self .timeout_handler )
128
106
signal .alarm (timeout )
129
107
try :
130
108
try :
131
- process = subprocess .Popen (self .command , stdout = subprocess .PIPE , stderr = subprocess .PIPE )
109
+ process = subprocess .Popen (self .command ,
110
+ stdout = subprocess .PIPE ,
111
+ stderr = subprocess .PIPE )
132
112
output = process .communicate ()[0 ].strip ()
133
113
returncode = process .returncode
134
114
if len (output ) > 20 :
135
115
truncated_output = output [0 :17 ] + u'...'
136
116
else :
137
117
truncated_output = output
138
118
log .info (u"Check[%s]: %s => %s (%s)" % (
139
- self .check_name ,
119
+ self .name ,
140
120
u' ' .join (self .command ),
141
121
returncode ,
142
122
truncated_output
@@ -153,50 +133,53 @@ def _execute_check(self):
153
133
def timeout_handler(self, signum, frame):
    """Signal handler: abort the running check by raising Timeout.

    `signum` and `frame` are the standard signal-handler arguments and are
    not used.
    """
    raise Timeout()
155
135
156
def run(self, dogstatsd_client):
    """Execute the check once and record its outcome.

    `dogstatsd_client` is passed on to parse_nagios() when the check output
    is parsed.

    The resulting CheckResult carries:
      * state  -- how the execution went (S.OK, S.TIMEOUT,
                  S.INVALID_OUTPUT or S.EXCEPTION);
      * status -- the check's return code when it is a valid result
                  status, R.UNKNOWN otherwise.

    The result is stored in `self.result` and returned; `self.run_count`
    is incremented on every call.
    """
    # Only real result codes are acceptable exit codes. R.NONE is the
    # "not yet run" placeholder and must not be accepted from a process.
    valid_returncodes = (R.OK, R.WARNING, R.CRITICAL, R.UNKNOWN)

    execution_date = time.time()
    try:
        output, returncode = self._execute_check()

        if output is None:
            # No output at all is reported as a timeout.
            state = S.TIMEOUT
            status = R.UNKNOWN
            message = 'Check %s timed out after %ds' % (self, self.config['timeout'])
        elif returncode not in valid_returncodes:
            state = S.INVALID_OUTPUT
            status = R.UNKNOWN
            message = (u'Failed to parse the output of the check: %s, '
                       'returncode: %d, output: %s'
                       % (self, returncode, output))
            log.warning(message)
        else:
            message = self.parse_nagios(output, dogstatsd_client)
            state = S.OK
            status = returncode
    except OSError:
        status = R.UNKNOWN
        state = S.EXCEPTION
        message = u'Failed to execute the check: %s' % self
        log.warning(message, exc_info=True)

    execution_time = time.time() - execution_date
    self.run_count += 1

    check_result = CheckResult(
        status=status,
        state=state,
        message=message,
        execution_date=execution_date,
        execution_time=execution_time,
    )
    self.result = check_result
    return check_result
190
175
191
- def parse_nagios (self , output , returncode ):
192
- state = returncode
193
-
176
+ def parse_nagios (self , output , dogstatsd_client ):
194
177
output = output .strip ()
195
178
try :
196
179
message , tail = output .split ('|' , 1 )
197
180
except ValueError :
198
181
# No metric, return directly the output as a message
199
- return state , output
182
+ return output
200
183
201
184
message = message .strip ()
202
185
@@ -217,7 +200,7 @@ def parse_nagios(self, output, returncode):
217
200
unit = metric .group ('unit' )
218
201
219
202
dd_metric = self ._metric_name (label )
220
- # self.dogstatsd .increment('bernard.check.metric_points')
203
+ dogstatsd_client .increment ('bernard.check.metric_points' )
221
204
222
205
if unit == '%' :
223
206
value = value / 100.0
@@ -234,38 +217,58 @@ def parse_nagios(self, output, returncode):
234
217
elif unit == 'us' :
235
218
value = value / 1000000.0
236
219
elif unit == 'c' :
237
- # self.dogstatsd .rate(dd_metric, value)
220
+ dogstatsd_client .rate (dd_metric , value )
238
221
log .debug ('Saved rate: %s:%.2f' % (dd_metric , value ))
239
222
continue
240
223
241
- # self.dogstatsd .gauge(dd_metric, value)
224
+ dogstatsd_client .gauge (dd_metric , value )
242
225
log .debug ('Saved metric: %s:%.2f' % (dd_metric , value ))
243
226
244
- return state , message
227
+ return message
245
228
246
229
def _metric_name(self, label):
    """Return the metric name for `label`, namespaced under this check."""
    name_parts = (self.name, label)
    return 'bernard.%s.%s' % name_parts
251
231
252
def get_result(self):
    """Return the latest result, or a placeholder if there is none yet.

    The placeholder has status R.NONE, state S.OK and zeroed timings.
    """
    if not self.result:
        return CheckResult(execution_date=0, state=S.OK, status=R.NONE,
                           message='Not yet run.', execution_time=0)
    return self.result
260
237
261
238
def get_status(self):
    """Summarise the check and its latest result as a plain dict.

    Keys: 'check_name', 'run_count', plus 'status', 'state', 'message' and
    'execution_time' copied from the result returned by get_result().
    """
    latest = self.get_result()

    summary = {
        'check_name': self.name,
        'run_count': self.run_count,
    }
    for field in ('status', 'state', 'message', 'execution_time'):
        summary[field] = getattr(latest, field)
    return summary
249
+
250
+
251
def _subprocess_command(raw_command, params, hostname):
    """ Given a raw command from the Bernard config and a dictionary of check
    parameters, return a flat list that's subprocess-compatible for running
    the command. Every "$name" variable in the command is replaced with the
    matching value from `params`.

    >>> _subprocess_command("/usr/bin/check_pg -p $port", {'port': '5433'}, 'db1')
    ['/usr/bin/check_pg', '-p', '5433']

    `params` values must already be strings. As a side effect, `params`
    gains a 'host' entry set to `hostname` when it has none, so $host is
    always available.

    Raises Exception when the command is empty once variables are replaced.
    """
    # $host is always available as a parameter.
    params.setdefault('host', hostname)

    # Replace variables, longest names first, so that a short name such as
    # $port can never clobber the beginning of a longer one like $portal.
    for name in sorted(params, key=lambda key: (-len(key), key)):
        raw_command = raw_command.replace('$%s' % name, params[name])

    # Split into subprocess format.
    command_split = raw_command.split()
    if not command_split:
        raise Exception('Invalid command in config: %r' % raw_command)

    # The executable is taken verbatim; only the arguments go through shlex
    # so that quoted arguments stay together. shlex.split('') returns [],
    # which covers a command without arguments.
    executable, arguments = command_split[0], command_split[1:]
    return [executable] + shlex.split(' '.join(arguments))
0 commit comments