AWSTemplateFormatVersion: '2010-09-09'
# Deploys one EC2 instance that runs Prometheus in agent mode, discovers
# AWS ParallelCluster nodes through EC2 service discovery and forwards the
# scraped samples to the remote-write URL given in ManagedPrometheusUrl.
Description: >
  Prometheus Agent Collector

# ###################
# Stack Metadata
#
# Controls how the parameters are grouped and labelled in the
# CloudFormation console. Every name listed here must match a key in the
# Parameters section.
Metadata:
  AWS::CloudFormation::Interface:
    ParameterGroups:
      - Label:
          default: "Network Configuration"
        Parameters:
          - VpcId
          - SubnetId
      - Label:
          default: "Amazon Prometheus"
        Parameters:
          - ManagedPrometheusUrl
      - Label:
          default: "AWS ParallelCluster"
        Parameters:
          - PCClusterNAME
      - Label:
          default: "EC2 Instance Configuration"
        Parameters:
          - InstanceType
          - LatestUbuntuAmiId
          - EBSBootSize
    ParameterLabels:
      SubnetId:
        default: Subnet Id
      ManagedPrometheusUrl:
        default: Amazon Prometheus Remote Write URL
      LatestUbuntuAmiId:
        default: Latest Ubuntu 22.04 AMI Id
      PCClusterNAME:
        default: AWS ParallelCluster Name

# ###################
# Parameters
Parameters:
  InstanceType:
    Description: EC2 instance type
    Type: String
    Default: t3a.micro
    AllowedValues:
      - t3a.micro
      - t3a.small
      - t3a.medium
      - t3a.large
      - t2.small

  # Resolved at deploy time from the public SSM parameter published by
  # Canonical, so the stack always picks up the current 22.04 image.
  LatestUbuntuAmiId:
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
    Default: '/aws/service/canonical/ubuntu/server/22.04/stable/current/amd64/hvm/ebs-gp2/ami-id'
    Description: 'Ubuntu 22.04 AMI Id'

  EBSBootSize:
    Type: Number
    Default: 20
    Description: 'Size in GiB of EBS root volume'

  SubnetId:
    Type: AWS::EC2::Subnet::Id
    Description: 'Public or Private Subnet Id with internet access'

  VpcId:
    Type: AWS::EC2::VPC::Id
    Description: 'VPC in which the security groups are created; must contain SubnetId'

  ManagedPrometheusUrl:
    Type: String
    Description: 'Amazon Prometheus remote write URL that the agent sends samples to'

  # Used as the value of the parallelcluster:cluster-name tag filter in the
  # Prometheus EC2 service-discovery configuration.
  PCClusterNAME:
    Type: String
    Description: "Cluster name. For example: training-cluster"

# ###########################
# Prometheus Resources
Resources:
  # Inline policy attached to the agent role. ec2:DescribeInstances is what
  # the ec2_sd_configs scrape jobs in the instance UserData rely on to
  # enumerate targets.
  PrometheusAgentInstancePolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyName: "PrometheusAgentInstancePolicy"
      PolicyDocument:
        # Quoted so YAML loaders keep it a string instead of a date.
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Action:
              - ec2:DescribeInstances
            Resource:
              - '*'
      Roles:
        - !Ref PrometheusAgentInstanceRole

  # Role assumed by the EC2 instance. The managed policies grant SSM
  # management access and remote-write access to Amazon Prometheus.
  PrometheusAgentInstanceRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - ec2.amazonaws.com
            Action:
              - sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
        - arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess

  PrometheusAgentInstanceProfile:
    Type: AWS::IAM::InstanceProfile
    Properties:
      Roles:
        - !Ref PrometheusAgentInstanceRole

  # ################
  # Security groups

  # Egress-only group: all protocols and ports to any IPv4 destination.
  InternetSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: 'Allow communication to Internet'
      GroupName: 'prometheus-agent-outbound'
      SecurityGroupEgress:
        - CidrIp: 0.0.0.0/0
          Description: 'Communication to the internet'
          FromPort: -1
          IpProtocol: -1
          ToPort: -1
      Tags:
        - Key: 'Name'
          Value: 'prometheus-agent'
      VpcId: !Ref VpcId

  # Shared group for agent <-> cluster traffic. It carries no rules of its
  # own; the self-referencing ingress rule is declared separately below.
  PrometheusAgentSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: 'Allow communication to/from Cluster'
      GroupName: 'prometheus-agent-to-cluster'
      Tags:
        - Key: 'Name'
          Value: 'prometheus-agent-to-cluster'
      VpcId: !Ref VpcId

  # Allows all traffic between members of PrometheusAgentSecurityGroup
  # (source and destination are the same group).
  PrometheusAgentSecurityGroupIngress:
    Type: AWS::EC2::SecurityGroupIngress
    Properties:
      Description: 'Allow communication to Prometheus'
      IpProtocol: -1
      FromPort: -1
      ToPort: -1
      GroupId: !Ref PrometheusAgentSecurityGroup
      SourceSecurityGroupId: !Ref PrometheusAgentSecurityGroup

  # ###############
  # Prometheus Agent Instance
  PrometheusAgentInstance:
    Type: 'AWS::EC2::Instance'
    Metadata:
      # Applied by the cfn-init call at the end of UserData. The single
      # config set installs cfn-hup as a systemd service so that later
      # changes to this metadata are re-applied on stack update.
      AWS::CloudFormation::Init:
        configSets:
          full_install:
            - install_and_enable_cfn_hup
        install_and_enable_cfn_hup:
          files:
            /etc/cfn/cfn-hup.conf:
              content: !Sub |
                [main]
                stack=${AWS::StackId}
                region=${AWS::Region}
              mode: "000400"
              owner: root
              group: root
            /etc/cfn/hooks.d/cfn-auto-reloader.conf:
              # path must name this resource and the action must use a
              # config set that exists above, otherwise the hook never
              # fires or cfn-init fails when it does. The binary path
              # matches the /usr/local/bin location used by the unit file
              # and by UserData.
              content: !Sub |
                [cfn-auto-reloader-hook]
                triggers=post.update
                path=Resources.PrometheusAgentInstance.Metadata.AWS::CloudFormation::Init
                action=/usr/local/bin/cfn-init -v --stack ${AWS::StackName} --resource PrometheusAgentInstance --configsets full_install --region ${AWS::Region}
                runas=root
              mode: "000400"
              owner: root
              group: root
            /lib/systemd/system/cfn-hup.service:
              content: |
                [Unit]
                Description=cfn-hup daemon
                [Service]
                Type=simple
                ExecStart=/usr/local/bin/cfn-hup
                Restart=always
                [Install]
                WantedBy=multi-user.target
          commands:
            01enable_cfn_hup:
              command: systemctl enable cfn-hup.service
            02start_cfn_hup:
              command: systemctl start cfn-hup.service
    Properties:
      BlockDeviceMappings:
        - DeviceName: '/dev/sda1'
          Ebs:
            # The root volume is kept when the instance is terminated.
            DeleteOnTermination: false
            Encrypted: true
            Iops: 3000
            VolumeSize: !Ref EBSBootSize
            VolumeType: 'gp3'
      IamInstanceProfile: !Ref PrometheusAgentInstanceProfile
      ImageId: !Ref LatestUbuntuAmiId
      InstanceType: !Ref InstanceType
      SecurityGroupIds:
        - !Ref InternetSecurityGroup
        - !Ref PrometheusAgentSecurityGroup
      SubnetId: !Ref SubnetId
      Tags:
        - Key: Name
          Value: 'Prometheus Agent'
      # Fn::Sub replaces every ${Name} in the script before it reaches the
      # instance: AWS_REGION and PARALLELCLUSTER_NAME come from the mapping
      # at the end of the list, ManagedPrometheusUrl and the AWS::*
      # pseudo parameters are resolved directly. Shell variables are
      # written without braces ($LATEST_VERSION) so Fn::Sub leaves them
      # alone.
      UserData:
        Fn::Base64: !Sub
          - |
            MIME-Version: 1.0
            Content-Type: multipart/mixed; boundary="==MYBOUNDARY=="

            --==MYBOUNDARY==
            Content-Type: text/x-shellscript; charset="us-ascii"

            #!/bin/bash

            export DEBIAN_FRONTEND=noninteractive

            # The CloudFormation helper scripts (cfn-init, cfn-signal, cfn-hup)
            # are not preinstalled on Ubuntu AMIs; install them so the calls
            # at the end of this script can run.
            if [ ! -x /usr/local/bin/cfn-init ]; then
              apt-get update -y
              apt-get install -y python3-pip
              pip3 install https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-py3-latest.tar.gz
            fi

            echo "Using pinned Prometheus version"
            LATEST_VERSION=3.2.1

            echo "Prometheus version: $LATEST_VERSION"

            # Construct the download URL with the correct version format
            DOWNLOAD_URL="https://github.com/prometheus/prometheus/releases/download/v$LATEST_VERSION/prometheus-$LATEST_VERSION.linux-amd64.tar.gz"

            # Download the pinned Prometheus release tarball
            echo "Downloading Prometheus version $LATEST_VERSION from $DOWNLOAD_URL ..."
            wget --progress=dot:giga "$DOWNLOAD_URL"

            # Extract Prometheus
            echo "Extracting Prometheus"
            tar xvfz prometheus-$LATEST_VERSION.linux-amd64.tar.gz

            # Move to Prometheus directory
            cd prometheus-$LATEST_VERSION.linux-amd64

            # Move binaries to /usr/bin/
            echo "Moving Prometheus binaries to /usr/bin/"
            sudo mv prometheus /usr/bin/
            sudo mv promtool /usr/bin/

            # Create Prometheus config directory
            echo "Creating Prometheus config directory"
            sudo mkdir -p /etc/prometheus

            # Move prometheus.yml to config directory
            echo "Moving prometheus.yml to /etc/prometheus/"
            sudo mv prometheus.yml /etc/prometheus/prometheus.yml


            # Replace placeholders in the configuration template
            echo "Replacing placeholders in the Prometheus configuration template"
            sudo tee /etc/prometheus/prometheus.yml > /dev/null <<EOF
            global:
              scrape_interval: 15s
              evaluation_interval: 15s
              scrape_timeout: 15s

            scrape_configs:
              - job_name: 'slurm_exporter'
                scrape_interval: 5s
                ec2_sd_configs:
                  - port: 8080
                    region: ${AWS_REGION}
                    refresh_interval: 10s
                    filters:
                      - name: instance-state-name
                        values:
                          - running
                      - name: tag:Name
                        values:
                          - HeadNode
                      - name: tag:parallelcluster:cluster-name
                        values:
                          - ${PARALLELCLUSTER_NAME}


              - job_name: 'dcgm_exporter'
                scrape_interval: 5s
                ec2_sd_configs:
                  - port: 9400
                    region: ${AWS_REGION}
                    refresh_interval: 10s
                    filters:
                      - name: instance-state-name
                        values:
                          - running
                      - name: tag:Name
                        values:
                          - Compute
                      - name: tag:parallelcluster:cluster-name
                        values:
                          - ${PARALLELCLUSTER_NAME}
                      - name: instance-type
                        values:
                          - p4d.24xlarge
                          - p4de.24xlarge
                          - p5.48xlarge
                          - p5e.48xlarge
                          - p5en.48xlarge

                relabel_configs:
                  - source_labels: [__meta_ec2_tag_Name]
                    target_label: instance_name
                  - source_labels: [__meta_ec2_tag_Application]
                    target_label: instance_grafana
                  - source_labels: [__meta_ec2_instance_id]
                    target_label: instance_id
                  - source_labels: [__meta_ec2_availability_zone]
                    target_label: instance_az
                  - source_labels: [__meta_ec2_instance_state]
                    target_label: instance_state
                  - source_labels: [__meta_ec2_instance_type]
                    target_label: instance_type
                  - source_labels: [__meta_ec2_vpc_id]
                    target_label: instance_vpc

            remote_write:
              - url: ${ManagedPrometheusUrl}
                queue_config:
                  max_samples_per_send: 1000
                  max_shards: 200
                  capacity: 2500
                sigv4:
                  region: ${AWS::Region}
            EOF

            # Create Prometheus systemd service file
            echo "Creating Prometheus systemd service file"
            sudo tee /etc/systemd/system/prometheus.service > /dev/null <<EOF
            [Unit]
            Description=Prometheus Exporter

            [Service]
            Environment=PATH=/opt/slurm/bin:\$PATH
            ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --agent --storage.agent.path="/opt/prometheus/data-agent"
            Restart=on-failure
            RestartSec=15
            Type=simple

            [Install]
            WantedBy=multi-user.target
            EOF

            # Reload systemd and enable Prometheus service
            echo "Reloading systemd and enabling Prometheus service"
            sudo systemctl daemon-reload
            sudo systemctl enable --now prometheus

            echo "Prometheus setup completed successfully"


            # Apply the AWS::CloudFormation::Init metadata, then report the
            # cfn-init exit status back to CloudFormation
            /usr/local/bin/cfn-init -v --stack ${AWS::StackName} \
              --resource PrometheusAgentInstance \
              --configsets full_install \
              --region ${AWS::Region}

            /usr/local/bin/cfn-signal \
              -e $? \
              --stack ${AWS::StackName} \
              --region ${AWS::Region} \
              --resource PrometheusAgentInstance

            --==MYBOUNDARY==--
          - AWS_REGION: !Ref "AWS::Region"
            PARALLELCLUSTER_NAME: !Ref PCClusterNAME

    # Disabled: with no CreationPolicy the stack does not wait for the
    # cfn-signal sent at the end of UserData.
    # CreationPolicy:
    #   ResourceSignal:
    #     Timeout: PT10M

# ############
# Outputs
# ############
Outputs:
  PrometheusAgentInstanceId:
    Description: Instance Id of the Prometheus agent EC2 instance
    Value: !GetAtt PrometheusAgentInstance.InstanceId