Skip to content

Commit f2fd9e7

Browse files
committed
Merge pull request #1 from DataDog/new_config
New config
2 parents 34702af + 963c76a commit f2fd9e7

File tree

6 files changed

+301
-336
lines changed

6 files changed

+301
-336
lines changed

bernard.yaml.example

+19-27
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,20 @@
11
## Default configuration
2-
# core:
3-
# schedule:
4-
# timeout: 5 # To check will timeout and exit after {timeout} seconds
5-
# period: 60 # Scheduled once every {period} seconds
6-
# attempts: 3 # The state change is confirmed only after {attempts} attempts (1 for instant change).
7-
# notification: "" # String added in the event body
8-
# notify_startup: none # Which state to notify at startup, can be all, warning, critical or none
9-
# checks:
10-
11-
## Advanced example
12-
# core:
13-
# schedule:
14-
# timeout: 3
15-
# period: 90
16-
# notification: "@all"
17-
# checks:
18-
# - path: /path2/default_checks/
19-
# attempts: 2
20-
# - path: /path1/my_checks/
21-
# timeout: 2
22-
# period: 50
23-
# notification: "@my_email Take a look"
24-
# notify_startup: warning
25-
# args: ["--verbose"]
26-
# - filename: /path3/checks/my_check
27-
# args: ["-w", "10", "-w", "30"]
28-
# name: custom name for this check
2+
#core:
3+
# schedule:
4+
# timeout: 5 # The check will time out and exit after {timeout} seconds
5+
# period: 15 # Scheduled once every {period} seconds
6+
#
7+
#checks:
8+
# check_ntp:
9+
# command: /usr/local/bin/check_ntp_peer -H $host
10+
#
11+
# check_pg:
12+
# command: /usr/local/bin/check_pg.sh -p $port -d $db
13+
# options:
14+
# timeout: 2
15+
# period: 60
16+
# params:
17+
# - port: 5432
18+
# db: db1
19+
# - port: 5433
20+
# db: db2

bernard/check.py

+118-115
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1+
# stdlib
12
import logging
23
import os
34
import re
45
import signal
6+
import shlex
57
import subprocess
68
import time
9+
10+
# project
711
from util import (
12+
get_hostname,
813
namedtuple,
914
StaticWatchdog,
1015
)
1116

12-
class InvalidCheckOutput(Exception):
13-
pass
14-
1517
class Timeout(Exception):
1618
pass
1719

@@ -22,15 +24,15 @@ class InvalidPath(Exception):
2224
CheckResult = namedtuple('CheckResult',
2325
['status', 'state', 'message', 'execution_date', 'execution_time'])
2426

25-
# Status of the execution of the check
26-
ExecutionStatus = namedtuple('ExecutionStatus',
27+
# State of the last execution of the check
28+
ExecutionState = namedtuple('ExecutionState',
2729
['OK', 'TIMEOUT', 'EXCEPTION', 'INVALID_OUTPUT'])
28-
S = ExecutionStatus('ok', 'timeout', 'exception', 'invalid_output')
30+
S = ExecutionState('ok', 'timeout', 'exception', 'invalid_output')
2931

30-
# State of check
31-
ResultState = namedtuple('ResultState',
32-
['NONE', 'OK', 'WARNING', 'CRITICAL', 'UNKNOWN'])
33-
R = ResultState('init', 'ok', 'warning', 'critical', 'unknown')
32+
# Check result status
33+
class R():
34+
OK, WARNING, CRITICAL, UNKNOWN, NONE = (0, 1, 2, 3, 4)
35+
ALL = (OK, WARNING, CRITICAL, UNKNOWN, NONE)
3436

3537
log = logging.getLogger(__name__)
3638

@@ -43,100 +45,78 @@ class BernardCheck(object):
4345
]))
4446

4547
@classmethod
46-
def from_config(cls, check_config, defaults):
47-
check_paths = []
48-
path = check_config.get('path', '')
49-
filename = check_config.get('filename', '')
50-
notification = check_config.get('notification', '')
51-
timeout = int(check_config.get('timeout', 0))
52-
period = int(check_config.get('period', 0))
53-
attempts = int(check_config.get('attempts', 0))
54-
name = check_config.get('name', None)
55-
args = check_config.get('args', [])
56-
notify_startup = check_config.get('notify_startup', None)
57-
if path:
58-
try:
59-
filenames = os.listdir(path)
60-
check_paths = []
61-
for fname in filenames:
62-
# Filter hidden files
63-
if not fname.startswith('.'):
64-
check_path = os.path.join(path, fname)
65-
# Keep only executable files
66-
if os.path.isfile(check_path) and os.access(check_path, os.X_OK):
67-
check_paths.append(check_path)
68-
except OSError, e:
69-
raise InvalidPath(str(e))
70-
if filename:
71-
check_paths.append(filename)
72-
48+
def from_config(cls, name, check_config, defaults, hostname=None):
49+
options = check_config.get('options', {})
50+
timeout = int(options.get('timeout', 0))
51+
period = int(options.get('period', 0))
52+
raw_command = check_config.get('command')
53+
params_list = check_config.get('params') or [{}]
54+
hostname = hostname or get_hostname()
55+
56+
check_config = {
57+
'timeout': timeout or defaults['timeout'],
58+
'period': period or defaults['period'],
59+
}
7360
checks = []
74-
if check_paths:
75-
check_parameter = defaults.copy()
76-
if notification:
77-
check_parameter['notification'] = notification
78-
if timeout:
79-
check_parameter['timeout'] = timeout
80-
if period:
81-
check_parameter['period'] = period
82-
if attempts:
83-
check_parameter['attempts'] = attempts
84-
if notify_startup:
85-
check_parameter['notify_startup'] = notify_startup
86-
if name:
87-
check_parameter['name'] = name
88-
for check_path in check_paths:
89-
checks.append(cls(check=check_path, config=check_parameter,
90-
args=args))
61+
62+
# For every set of params (e.g.: {'port': 8888}) return a single check.
63+
# We'll template the $variables in the `command` value with the params.
64+
for param_dict in params_list:
65+
# Stringify all of the check params. We expect everything to be
66+
# strings through the pipeline so we'll do it early on.
67+
for k, v in param_dict.iteritems():
68+
param_dict[k] = str(v)
69+
70+
command = _subprocess_command(raw_command, param_dict, hostname)
71+
checks.append(cls(name, command, check_config, param_dict))
72+
9173
return checks
9274

93-
def __init__(self, check, config, args=[]):
94-
self.check = check
75+
def __init__(self, name, command, config, params):
76+
""" Initializes a BernardCheck with the given `name` and `command`.
77+
Any additional config (e.g. timeout or period) is given in the
78+
`config` dict. `command` is expected to be in a subprocess-friendly
79+
form, e.g.: ['check_foo', '-h', 'localhost'].
80+
"""
81+
self.name = name
9582
self.config = config
96-
self.args = args
97-
self.command = [self.check] + args
98-
83+
self.command = command
84+
self.params = params
9985
self.run_count = 0
10086
self.event_count = 0
10187

102-
self.container_size = self.config['attempts'] + 1
103-
104-
# Contains the result of #{container_size} last checks
105-
self.result_container = []
88+
# Always holds the latest result.
89+
self.result = None
10690

107-
# Set check_name, remove file extension and "check_" prefix
108-
if 'name' in config:
109-
check_name = config['name']
110-
else:
111-
check_name = self.check.split('/')[-1]
112-
if check_name.startswith('check_'):
113-
check_name = check_name[6:]
114-
check_name = check_name.rsplit('.')[0]
115-
116-
self.check_name = check_name.lower()
117-
log.debug(u"Initialized check %s (%s)" % (self.check_name, ' '.join(self.command)))
91+
log.debug(u"Initialized check %s (%s)" % (self.name, command))
11892

11993
def __repr__(self):
120-
return self.check_name
94+
return self.name
95+
96+
def get_period(self):
97+
return self.config['period']
12198

12299
def _execute_check(self):
123100
timeout = self.config.get('timeout')
124101
output = None
125102
returncode = None
103+
126104
# This is going to disable the StaticWatchdog
127105
signal.signal(signal.SIGALRM, self.timeout_handler)
128106
signal.alarm(timeout)
129107
try:
130108
try:
131-
process = subprocess.Popen(self.command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
109+
process = subprocess.Popen(self.command,
110+
stdout=subprocess.PIPE,
111+
stderr=subprocess.PIPE)
132112
output = process.communicate()[0].strip()
133113
returncode = process.returncode
134114
if len(output) > 20:
135115
truncated_output = output[0:17] + u'...'
136116
else:
137117
truncated_output = output
138118
log.info(u"Check[%s]: %s => %s (%s)" % (
139-
self.check_name,
119+
self.name,
140120
u' '.join(self.command),
141121
returncode,
142122
truncated_output
@@ -153,50 +133,53 @@ def _execute_check(self):
153133
def timeout_handler(self, signum, frame):
154134
raise Timeout()
155135

156-
def run(self):
136+
def run(self, dogstatsd_client):
157137
execution_date = time.time()
158138
try:
159139
output, returncode = self._execute_check()
140+
160141
if output is None:
161-
status = S.TIMEOUT
162-
state = R.UNKNOWN
142+
state = S.TIMEOUT
143+
status = R.UNKNOWN
163144
message = 'Check %s timed out after %ds' % (self, self.config['timeout'])
164145
else:
165-
try:
166-
state, message = self.parse_nagios(output, returncode)
167-
status = S.OK
168-
except InvalidCheckOutput:
169-
status = S.INVALID_OUTPUT
170-
state = R.UNKNOWN
171-
message = u'Failed to parse the output of the check: %s, returncode: %d, output: %s' % (
172-
self, returncode, output)
146+
if returncode not in R.ALL:
147+
state = S.INVALID_OUTPUT
148+
status = R.UNKNOWN
149+
message = u'Failed to parse the output of the check: %s, ' \
150+
'returncode: %d, output: %s' \
151+
% (self, returncode, output)
173152
log.warn(message)
174-
except OSError, exception:
175-
state = R.UNKNOWN
176-
status = S.EXCEPTION
153+
else:
154+
message = self.parse_nagios(output, dogstatsd_client)
155+
state = S.OK
156+
status = returncode
157+
except OSError:
158+
status = R.UNKNOWN
159+
state = S.EXCEPTION
177160
message = u'Failed to execute the check: %s' % self
178161
log.warn(message, exc_info=True)
179162

180163
execution_time = time.time() - execution_date
181164
self.run_count += 1
182165

183-
return CheckResult(
166+
check_result = CheckResult(
184167
status=status,
185168
state=state,
186169
message=message,
187170
execution_date=execution_date,
188-
execution_time=execution_time
171+
execution_time=execution_time,
189172
)
173+
self.result = check_result
174+
return check_result
190175

191-
def parse_nagios(self, output, returncode):
192-
state = returncode
193-
176+
def parse_nagios(self, output, dogstatsd_client):
194177
output = output.strip()
195178
try:
196179
message, tail = output.split('|', 1)
197180
except ValueError:
198181
# No metric, return directly the output as a message
199-
return state, output
182+
return output
200183

201184
message = message.strip()
202185

@@ -217,7 +200,7 @@ def parse_nagios(self, output, returncode):
217200
unit = metric.group('unit')
218201

219202
dd_metric = self._metric_name(label)
220-
# self.dogstatsd.increment('bernard.check.metric_points')
203+
dogstatsd_client.increment('bernard.check.metric_points')
221204

222205
if unit == '%':
223206
value = value / 100.0
@@ -234,38 +217,58 @@ def parse_nagios(self, output, returncode):
234217
elif unit == 'us':
235218
value = value / 1000000.0
236219
elif unit == 'c':
237-
# self.dogstatsd.rate(dd_metric, value)
220+
dogstatsd_client.rate(dd_metric, value)
238221
log.debug('Saved rate: %s:%.2f' % (dd_metric, value))
239222
continue
240223

241-
# self.dogstatsd.gauge(dd_metric, value)
224+
dogstatsd_client.gauge(dd_metric, value)
242225
log.debug('Saved metric: %s:%.2f' % (dd_metric, value))
243226

244-
return state, message
227+
return message
245228

246229
def _metric_name(self, label):
247-
return 'bernard.%s.%s' % (self.check_name, label)
248-
249-
def get_last_result(self):
250-
return self.get_result(0)
230+
return 'bernard.%s.%s' % (self.name, label)
251231

252-
def get_result(self, position=0):
253-
if len(self.result_container) > position:
254-
index = - (position + 1)
255-
return self.result_container[index]
256-
elif position > self.container_size:
257-
raise Exception('Trying to get %dth result while container size is %d' % (position, self.container_size))
258-
else:
259-
return CheckResult(execution_date=0, status=S.OK, state=R.NONE, message='Not runned yet', execution_time=0)
232+
def get_result(self):
233+
if self.result:
234+
return self.result
235+
return CheckResult(execution_date=0, state=S.OK, status=R.NONE,
236+
message='Not yet run.', execution_time=0)
260237

261238
def get_status(self):
262-
result = self.get_last_result()
239+
result = self.get_result()
263240

264241
return {
265-
'check_name': self.check_name,
242+
'check_name': self.name,
266243
'run_count': self.run_count,
267244
'status': result.status,
268245
'state': result.state,
269246
'message': result.message,
270247
'execution_time': result.execution_time,
271248
}
249+
250+
251+
def _subprocess_command(raw_command, params, hostname):
252+
""" Given a raw command from the Bernard config and a dictionary of check
253+
parameters, return a list that's subprocess-compatible for running the
254+
command. We'll replace all command "variables" with a real parameter.
255+
256+
>>> _subprocess_command("/usr/bin/check_pg -p $port", {'port': '5433'}, 'localhost')
257+
['/usr/bin/check_pg', '-p', '5433']
258+
"""
259+
# $host is always available as a parameter.
260+
if 'host' not in params:
261+
params['host'] = hostname
262+
263+
# Replace variables.
264+
for param, val in params.iteritems():
265+
raw_command = raw_command.replace('$%s' % param, val)
266+
267+
# Split into subprocess format.
268+
command_split = raw_command.split()
269+
if len(command_split) == 0:
270+
raise Exception('Invalid command in config: %v' % raw_command)
271+
parsed_command = [command_split[0]]
272+
if len(command_split[1:]):
273+
parsed_command.extend(shlex.split(' '.join(command_split[1:])))
274+
return parsed_command

0 commit comments

Comments
 (0)