1
1
# stdlib
2
+ from datetime import datetime
3
+ import os .path
2
4
import socket
5
+ import ssl
3
6
import time
4
7
from urlparse import urlparse
5
8
9
+ # 3rd party
10
+ from httplib2 import Http , HttpLib2Error
11
+ import tornado
12
+
6
13
# project
7
14
from checks .network_checks import NetworkCheck , Status , EventType
15
+ from config import _is_affirmative
8
16
from util import headers as agent_headers
9
17
10
- # 3rd party
11
- from httplib2 import Http , HttpLib2Error
12
18
13
- class HTTPCheck (NetworkCheck ):
19
def get_ca_certs_path():
    """
    Return the path of the first CA bundle that exists on this host.

    Candidates are probed in order: the bundle under
    /opt/datadog-agent/embedded, a 'ca-certificates.crt' file located next
    to the installed tornado package, then /etc/ssl/certs.
    Returns None when none of them exists.
    """
    tornado_dir = os.path.dirname(tornado.__file__)
    candidates = (
        '/opt/datadog-agent/embedded/ssl/certs/cacert.pem',
        os.path.join(tornado_dir, 'ca-certificates.crt'),
        '/etc/ssl/certs/ca-certificates.crt',
    )

    # Lazy search: stop probing the filesystem at the first hit.
    return next((path for path in candidates if os.path.exists(path)), None)
33
+
14
34
35
+ class HTTPCheck (NetworkCheck ):
15
36
SOURCE_TYPE_NAME = 'system'
16
- SERVICE_CHECK_PREFIX = 'http_check'
37
+ SC_STATUS = 'http_check'
38
+ SC_SSL_CERT = 'http_check.ssl_cert'
39
+
40
def __init__(self, name, init_config, agentConfig, instances):
    """
    Initialise the check and resolve the CA bundle used for certificate
    expiration checks.

    A non-empty 'ca_certs' entry in init_config takes precedence; otherwise
    the bundle is looked up with get_ca_certs_path(), which may return None
    when no known bundle exists.
    """
    # NOTE(review): an empty `init_config:` YAML section may arrive here as
    # None unless the caller normalises it -- guard defensively.
    configured = (init_config or {}).get('ca_certs')

    # Only probe the filesystem when nothing usable was configured; an
    # empty/None 'ca_certs' value is treated as "not set".
    self.ca_certs = configured or get_ca_certs_path()

    NetworkCheck.__init__(self, name, init_config, agentConfig, instances)
17
43
18
44
def _load_conf(self, instance):
    """
    Read the settings of one check instance and return them as a tuple.

    Returns, in order: url, username, password, timeout, include_content,
    headers, response_time, tags, disable_ssl_validation,
    check_certificate_expiration.

    Raises Exception when the instance has no 'url'.
    """
    tags = instance.get('tags', [])
    username = instance.get('username', None)
    password = instance.get('password', None)
    timeout = int(instance.get('timeout', 10))

    # Start from the headers built by agent_headers(), then let the
    # instance-level 'headers' entries override them.
    headers = agent_headers(self.agentConfig)
    headers.update(instance.get('headers', {}))

    url = instance.get('url', None)
    collect_response_time = _is_affirmative(
        instance.get('collect_response_time', True))
    if url is None:
        raise Exception("Bad configuration. You must specify a url")

    # Boolean-like options go through _is_affirmative before use.
    include_content = _is_affirmative(instance.get('include_content', False))
    skip_ssl_validation = _is_affirmative(
        instance.get('disable_ssl_validation', True))
    check_cert_expiry = _is_affirmative(
        instance.get('check_certificate_expiration', True))

    return (url, username, password, timeout, include_content, headers,
            collect_response_time, tags, skip_ssl_validation,
            check_cert_expiry)
34
62
35
63
def _check (self , instance ):
36
- addr , username , password , timeout , include_content , headers , response_time , tags , disable_ssl_validation = self ._load_conf (instance )
64
+ addr , username , password , timeout , include_content , headers , response_time , tags , disable_ssl_validation , ssl_expire = self ._load_conf (instance )
37
65
content = ''
38
66
start = time .time ()
67
+
68
+ service_checks = []
69
+ resp = None
70
+
39
71
try :
40
72
self .log .debug ("Connecting to %s" % addr )
41
73
if disable_ssl_validation and urlparse (addr )[0 ] == "https" :
@@ -48,47 +80,76 @@ def _check(self, instance):
48
80
except socket .timeout , e :
49
81
length = int ((time .time () - start ) * 1000 )
50
82
self .log .info ("%s is DOWN, error: %s. Connection failed after %s ms" % (addr , str (e ), length ))
51
- return Status .DOWN , "%s. Connection failed after %s ms" % (str (e ), length )
83
+ service_checks .append ((
84
+ self .SC_STATUS ,
85
+ Status .DOWN ,
86
+ "%s. Connection failed after %s ms" % (str (e ), length )
87
+ ))
52
88
53
89
except HttpLib2Error , e :
54
90
length = int ((time .time () - start ) * 1000 )
55
91
self .log .info ("%s is DOWN, error: %s. Connection failed after %s ms" % (addr , str (e ), length ))
56
- return Status .DOWN , "%s. Connection failed after %s ms" % (str (e ), length )
92
+ service_checks .append ((
93
+ self .SC_STATUS ,
94
+ Status .DOWN ,
95
+ "%s. Connection failed after %s ms" % (str (e ), length )
96
+ ))
57
97
58
98
except socket .error , e :
59
99
length = int ((time .time () - start ) * 1000 )
60
100
self .log .info ("%s is DOWN, error: %s. Connection failed after %s ms" % (addr , repr (e ), length ))
61
- return Status .DOWN , "Socket error: %s. Connection failed after %s ms" % (repr (e ), length )
101
+ service_checks .append ((
102
+ self .SC_STATUS ,
103
+ Status .DOWN ,
104
+ "Socket error: %s. Connection failed after %s ms" % (repr (e ), length )
105
+ ))
62
106
63
107
except Exception , e :
64
108
length = int ((time .time () - start ) * 1000 )
65
109
self .log .error ("Unhandled exception %s. Connection failed after %s ms" % (str (e ), length ))
66
110
raise
67
111
68
- if response_time :
69
- # Stop the timer as early as possible
70
- running_time = time .time () - start
71
- # Store tags in a temporary list so that we don't modify the global tags data structure
72
- tags_list = []
73
- tags_list .extend (tags )
74
- tags_list .append ('url:%s' % addr )
75
- self .gauge ('network.http.response_time' , running_time , tags = tags_list )
76
-
77
- if int (resp .status ) >= 400 :
78
- self .log .info ("%s is DOWN, error code: %s" % (addr , str (resp .status )))
79
- if not include_content :
80
- content = ''
81
- return Status .DOWN , (resp .status , resp .reason , content or '' )
82
-
83
- self .log .debug ("%s is UP" % addr )
84
- return Status .UP , "UP"
85
-
86
- def _create_status_event (self , status , msg , instance ):
112
+ # Only report this metric if the site is not down
113
+ if response_time and not service_checks :
114
+ # Stop the timer as early as possible
115
+ running_time = time .time () - start
116
+ # Store tags in a temporary list so that we don't modify the global tags data structure
117
+ tags_list = list (tags )
118
+ tags_list .append ('url:%s' % addr )
119
+ self .gauge ('network.http.response_time' , running_time , tags = tags_list )
120
+
121
+ if not service_checks :
122
+ if resp is not None and int (resp .status ) >= 400 :
123
+ self .log .info ("%s is DOWN, error code: %s" % (addr , str (resp .status )))
124
+ if not include_content :
125
+ content = ''
126
+ service_checks .append ((
127
+ self .SC_STATUS , Status .DOWN , (resp .status , resp .reason , content or '' )
128
+ ))
129
+ else :
130
+ self .log .debug ("%s is UP" % addr )
131
+ service_checks .append ((
132
+ self .SC_STATUS , Status .UP , "UP"
133
+ ))
134
+
135
+ if ssl_expire and urlparse (addr )[0 ] == "https" :
136
+ status , msg = self .check_cert_expiration (instance )
137
+ service_checks .append ((
138
+ self .SC_SSL_CERT , status , msg
139
+ ))
140
+
141
+ return service_checks
142
+
143
+ # FIXME: 5.3 drop this function
144
+ def _create_status_event (self , sc_name , status , msg , instance ):
145
+ # Create only this deprecated event for old check
146
+ if sc_name != self .SC_STATUS :
147
+ return
87
148
# Get the instance settings
88
149
url = instance .get ('url' , None )
89
150
name = instance .get ('name' , None )
90
- nb_failures = self .statuses [name ].count (Status .DOWN )
91
- nb_tries = len (self .statuses [name ])
151
+ nb_failures = self .statuses [name ][ sc_name ] .count (Status .DOWN )
152
+ nb_tries = len (self .statuses [name ][ sc_name ] )
92
153
tags = instance .get ('tags' , [])
93
154
tags_list = []
94
155
tags_list .extend (tags )
@@ -153,11 +214,13 @@ def _create_status_event(self, status, msg, instance):
153
214
"tags" : tags_list
154
215
}
155
216
156
- def report_as_service_check (self , name , status , instance , msg = None ):
157
- service_check_name = self .normalize (name , self .SERVICE_CHECK_PREFIX )
217
+ def report_as_service_check (self , sc_name , status , instance , msg = None ):
218
+ instance_name = instance ['name' ]
219
+ service_check_name = self .normalize (instance_name , sc_name )
158
220
url = instance .get ('url' , None )
221
+ sc_tags = ['url:%s' % url ]
159
222
160
- if status == Status . DOWN :
223
+ if sc_name == self . SC_STATUS :
161
224
# format the HTTP response body into the event
162
225
if isinstance (msg , tuple ):
163
226
code , reason , content = msg
@@ -168,12 +231,40 @@ def report_as_service_check(self, name, status, instance, msg=None):
168
231
169
232
msg = "%d %s\n \n %s" % (code , reason , content )
170
233
msg = msg .rstrip ()
171
- else :
172
- msg = None
173
234
174
235
self .service_check (service_check_name ,
175
236
NetworkCheck .STATUS_TO_SERVICE_CHECK [status ],
176
- tags = [ 'url:%s' % url ] ,
237
+ tags = sc_tags ,
177
238
message = msg
178
239
)
179
240
241
def check_cert_expiration(self, instance):
    """
    Open a TLS connection to the instance's URL and report how long the
    server certificate remains valid.

    Instance keys read: 'url', 'days_warning' (default 14) and 'timeout'
    (seconds, default 10).

    Returns a (status, message) tuple:
      - Status.DOWN when the connection, handshake or certificate parsing
        fails, or when the certificate has already expired;
      - Status.WARNING when fewer than 'days_warning' days are left;
      - Status.UP otherwise.
    """
    warning_days = int(instance.get('days_warning', 14))
    timeout = int(instance.get('timeout', 10))
    url = instance.get('url')

    o = urlparse(url)
    # Use hostname, not netloc: netloc keeps any "user:pass@" prefix and
    # ":port" suffix, which makes connect() fail for URLs with a port.
    host = o.hostname
    port = o.port or 443

    sock = None
    ssl_sock = None
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # Without a timeout a stalled peer would block the check forever.
        sock.settimeout(timeout)
        sock.connect((host, port))
        # NOTE(review): this validates the chain against ca_certs but does
        # not verify that the certificate matches the hostname.
        ssl_sock = ssl.wrap_socket(sock, cert_reqs=ssl.CERT_REQUIRED,
                                   ca_certs=self.ca_certs)
        cert = ssl_sock.getpeercert()
        exp_date = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y %Z")

    except Exception as e:
        return Status.DOWN, "%s" % (str(e))

    finally:
        # Always release the connection, whether or not the handshake worked.
        for s in (ssl_sock, sock):
            if s is not None:
                s.close()

    days_left = exp_date - datetime.utcnow()

    if days_left.days < 0:
        # days is negative here; report the magnitude.
        return Status.DOWN, "Expired by {0} days".format(-days_left.days)

    elif days_left.days < warning_days:
        return Status.WARNING, "This cert is almost expired, only {0} days left".format(days_left.days)

    else:
        return Status.UP, "Days left: {0}".format(days_left.days)
0 commit comments