Skip to content

Commit 08f60d6

Browse files
committed
Add CloudFormation for prometheus agent collector
1 parent 4039347 commit 08f60d6

File tree

1 file changed

+398
-0
lines changed

1 file changed

+398
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,398 @@
1+
# Template format version and stack description.
AWSTemplateFormatVersion: "2010-09-09"
Description: >
  Prometheus Agent Collector
4+
5+
6+
7+
8+
####################
9+
## Stack Metadata
10+
Metadata:
  # Groups, orders and labels the stack parameters for display.
  AWS::CloudFormation::Interface:
    ParameterGroups:
      - Label: { default: "Network Configuration" }
        Parameters: [VpcId, SubnetId]
      - Label: { default: "Amazon Prometheus" }
        Parameters: [ManagedPrometheusUrl]
      - Label: { default: "AWS ParallelCluster" }
        Parameters: [PCClusterNAME]
      - Label: { default: "EC2 Instance Configuration" }
        Parameters: [InstanceType, LatestUbuntuAmiId, EBSBootSize]
    # Friendly names for the parameters whose logical IDs are not self-explanatory.
    ParameterLabels:
      SubnetId: { default: "Subnet Id" }
      ManagedPrometheusUrl: { default: "Amazon Prometheus Remote Write URL" }
      LatestUbuntuAmiId: { default: "Latest Ubuntu 22.04 AMI Id" }
      PCClusterNAME: { default: "AWS ParallelCluster Name" }
41+
42+
43+
####################
44+
## Parameters
45+
Parameters:
  # Instance size for the Prometheus agent host.
  InstanceType:
    Description: EC2 instance type
    Type: String
    Default: t3a.micro
    AllowedValues:
      - t3a.micro
      - t3a.small
      - t3a.medium
      - t3a.large
      - t2.small

  # Resolved from the public SSM parameter at deploy time, so the stack
  # picks up the current Ubuntu 22.04 image without a template change.
  LatestUbuntuAmiId:
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
    Default: '/aws/service/canonical/ubuntu/server/22.04/stable/current/amd64/hvm/ebs-gp2/ami-id'
    Description: 'Ubuntu 22.04 AMI Id'

  # Root volume size. Bounded so that zero/negative or oversized values are
  # rejected at parameter validation instead of failing during instance launch.
  EBSBootSize:
    Type: Number
    Default: 20
    MinValue: 1
    MaxValue: 16384
    Description: 'Size in GiB of EBS root volume'

  SubnetId:
    Type: AWS::EC2::Subnet::Id
    Description: 'Public or Private Subnet Id with internet access'

  VpcId:
    Type: AWS::EC2::VPC::Id
    Description: 'VPC in which the Prometheus agent security groups are created'

  # Written verbatim into the remote_write section of prometheus.yml by the
  # instance UserData; an empty or non-HTTPS value would yield a broken config.
  ManagedPrometheusUrl:
    Type: String
    Description: 'Amazon Managed Service for Prometheus remote write URL'
    AllowedPattern: '^https://.+'
    ConstraintDescription: 'must be a non-empty https:// URL'

  # Used as the value of the parallelcluster:cluster-name tag filter in the
  # Prometheus EC2 service discovery configuration.
  PCClusterNAME:
    Type: String
    MinLength: 1
    Description: "Cluster name. For example: training-cluster"
80+
81+
############################
82+
## Prometheus Resources
83+
Resources:
84+
PrometheusAgentInstancePolicy:
  # Inline policy attached to the agent role: allows ec2:DescribeInstances on
  # all resources (the instance's Prometheus config uses ec2_sd_configs).
  Type: AWS::IAM::Policy
  Properties:
    PolicyName: PrometheusAgentInstancePolicy
    Roles:
      - Ref: PrometheusAgentInstanceRole
    PolicyDocument:
      Version: 2012-10-17
      Statement:
        - Effect: Allow
          Action: ['ec2:DescribeInstances']
          Resource: ['*']
98+
99+
PrometheusAgentInstanceRole:
  # Role assumed by the agent EC2 instance. It carries the AWS managed policies
  # for SSM management and for remote write to Amazon Managed Prometheus.
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: 2012-10-17
      Statement:
        - Effect: Allow
          Principal:
            Service:
              - ec2.amazonaws.com
          Action:
            - sts:AssumeRole
    ManagedPolicyArns:
      # Use the partition pseudo parameter instead of a hard-coded "aws" so the
      # ARNs also resolve in other partitions (e.g. aws-us-gov, aws-cn).
      - !Sub 'arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore'
      - !Sub 'arn:${AWS::Partition}:iam::aws:policy/AmazonPrometheusRemoteWriteAccess'
114+
115+
PrometheusAgentInstanceProfile:
  # Instance profile wrapping the agent role so it can be attached to EC2.
  Type: AWS::IAM::InstanceProfile
  Properties:
    Roles:
      - Ref: PrometheusAgentInstanceRole
120+
121+
#################
122+
# Security groups
123+
InternetSecurityGroup:
  # Egress-only group: all protocols and ports to any IPv4 destination.
  Type: AWS::EC2::SecurityGroup
  Properties:
    VpcId:
      Ref: VpcId
    GroupName: prometheus-agent-outbound
    GroupDescription: Allow communication to Internet
    SecurityGroupEgress:
      - Description: Communication to the internet
        CidrIp: 0.0.0.0/0
        IpProtocol: -1
        FromPort: -1
        ToPort: -1
    Tags:
      - { Key: Name, Value: prometheus-agent }
138+
139+
PrometheusAgentSecurityGroup:
  # Group shared between the agent and the cluster; its ingress rule is
  # declared separately as PrometheusAgentSecurityGroupIngress.
  Type: AWS::EC2::SecurityGroup
  Properties:
    VpcId:
      Ref: VpcId
    GroupName: prometheus-agent-to-cluster
    GroupDescription: Allow communication to/from Cluster
    Tags:
      - { Key: Name, Value: prometheus-agent-to-cluster }
148+
149+
PrometheusAgentSecurityGroupIngress:
  # Self-referencing rule: members of PrometheusAgentSecurityGroup may reach
  # each other on every protocol and port.
  Type: AWS::EC2::SecurityGroupIngress
  Properties:
    GroupId:
      Ref: PrometheusAgentSecurityGroup
    SourceSecurityGroupId:
      Ref: PrometheusAgentSecurityGroup
    Description: Allow communication to Prometheus
    IpProtocol: -1
    FromPort: -1
    ToPort: -1
158+
159+
################
160+
# Prometheus Agent Instance
161+
PrometheusAgentInstance:
  # EC2 host running Prometheus in agent mode. UserData installs Prometheus,
  # writes its config (EC2 service discovery + remote write with SigV4), then
  # runs cfn-init with the full_install configset below to set up cfn-hup.
  Type: 'AWS::EC2::Instance'
  Metadata:
    AWS::CloudFormation::Init:
      configSets:
        full_install:
          - install_and_enable_cfn_hup
      install_and_enable_cfn_hup:
        files:
          /etc/cfn/cfn-hup.conf:
            content: !Sub |
              [main]
              stack=${AWS::StackId}
              region=${AWS::Region}
            mode: "000400"
            owner: root
            group: root
          # The hook must watch THIS resource's metadata and re-run the
          # configset that actually exists (full_install), using the same
          # cfn-init location that UserData and the cfn-hup unit use.
          /etc/cfn/hooks.d/cfn-auto-reloader.conf:
            content: !Sub |
              [cfn-auto-reloader-hook]
              triggers=post.update
              path=Resources.PrometheusAgentInstance.Metadata.AWS::CloudFormation::Init
              action=/usr/local/bin/cfn-init -v --stack ${AWS::StackName} --resource PrometheusAgentInstance --configsets full_install --region ${AWS::Region}
              runas=root
            mode: "000400"
            owner: root
            group: root
          /lib/systemd/system/cfn-hup.service:
            content: |
              [Unit]
              Description=cfn-hup daemon
              [Service]
              Type=simple
              ExecStart=/usr/local/bin/cfn-hup
              Restart=always
              [Install]
              WantedBy=multi-user.target
        commands:
          01enable_cfn_hup:
            command: systemctl enable cfn-hup.service
          02start_cfn_hup:
            command: systemctl start cfn-hup.service
  Properties:
    BlockDeviceMappings:
      - DeviceName: '/dev/sda1'
        Ebs:
          # The volume is kept when the instance is terminated.
          DeleteOnTermination: false
          Encrypted: true
          Iops: 3000
          VolumeSize: !Ref EBSBootSize
          VolumeType: 'gp3'
    IamInstanceProfile: !Ref PrometheusAgentInstanceProfile
    ImageId: !Ref LatestUbuntuAmiId
    InstanceType: !Ref InstanceType
    SecurityGroupIds:
      - !Ref InternetSecurityGroup
      - !Ref PrometheusAgentSecurityGroup
    SubnetId: !Ref SubnetId
    Tags:
      - Key: Name
        Value: 'Prometheus Agent'
    UserData:
      # Fn::Sub resolves the braced placeholders (AWS_REGION, PARALLELCLUSTER_NAME,
      # ManagedPrometheusUrl and the AWS:: pseudo parameters); un-braced shell
      # variables such as $LATEST_VERSION are left for bash.
      Fn::Base64: !Sub
        - |
          MIME-Version: 1.0
          Content-Type: multipart/mixed; boundary="==MYBOUNDARY=="

          --==MYBOUNDARY==
          Content-Type: text/x-shellscript; charset="us-ascii"

          #!/bin/bash

          # "noninteractive" is the debconf frontend name (no hyphen).
          export DEBIAN_FRONTEND='noninteractive'


          echo "Using pinned Prometheus version"
          LATEST_VERSION=3.2.1

          echo "Prometheus version: $LATEST_VERSION"

          # Construct the download URL with the correct version format
          DOWNLOAD_URL="https://github.com/prometheus/prometheus/releases/download/v$LATEST_VERSION/prometheus-$LATEST_VERSION.linux-amd64.tar.gz"

          # Download the pinned Prometheus release tarball
          echo "Downloading Prometheus version $LATEST_VERSION from $DOWNLOAD_URL ..."
          wget --progress=dot:giga "$DOWNLOAD_URL"

          # Extract Prometheus
          echo "Extracting Prometheus"
          tar xvfz prometheus-$LATEST_VERSION.linux-amd64.tar.gz

          # Move to Prometheus directory
          cd prometheus-$LATEST_VERSION.linux-amd64

          # Move binaries to /usr/bin/
          echo "Moving Prometheus binaries to /usr/bin/"
          sudo mv prometheus /usr/bin/
          sudo mv promtool /usr/bin/

          # Create Prometheus config directory
          echo "Creating Prometheus config directory"
          sudo mkdir -p /etc/prometheus

          # Move prometheus.yml to config directory
          echo "Moving prometheus.yml to /etc/prometheus/"
          sudo mv prometheus.yml /etc/prometheus/prometheus.yml


          # Overwrite the stock prometheus.yml with the agent configuration
          echo "Replacing placeholders in the Prometheus configuration template"
          sudo tee /etc/prometheus/prometheus.yml > /dev/null <<EOF
          global:
            scrape_interval: 15s
            evaluation_interval: 15s
            scrape_timeout: 15s

          scrape_configs:
            - job_name: 'slurm_exporter'
              scrape_interval: 5s
              ec2_sd_configs:
                - port: 8080
                  region: ${AWS_REGION}
                  refresh_interval: 10s
                  filters:
                    - name: instance-state-name
                      values:
                        - running
                    - name: tag:Name
                      values:
                        - HeadNode
                    - name: tag:parallelcluster:cluster-name
                      values:
                        - ${PARALLELCLUSTER_NAME}


            - job_name: 'dcgm_exporter'
              scrape_interval: 5s
              ec2_sd_configs:
                - port: 9400
                  region: ${AWS_REGION}
                  refresh_interval: 10s
                  filters:
                    - name: instance-state-name
                      values:
                        - running
                    - name: tag:Name
                      values:
                        - Compute
                    - name: tag:parallelcluster:cluster-name
                      values:
                        - ${PARALLELCLUSTER_NAME}
                    - name: instance-type
                      values:
                        - p4d.24xlarge
                        - p4de.24xlarge
                        - p5.48xlarge
                        - p5e.48xlarge
                        - p5en.48xlarge

              relabel_configs:
                - source_labels: [__meta_ec2_tag_Name]
                  target_label: instance_name
                - source_labels: [__meta_ec2_tag_Application]
                  target_label: instance_grafana
                - source_labels: [__meta_ec2_instance_id]
                  target_label: instance_id
                - source_labels: [__meta_ec2_availability_zone]
                  target_label: instance_az
                - source_labels: [__meta_ec2_instance_state]
                  target_label: instance_state
                - source_labels: [__meta_ec2_instance_type]
                  target_label: instance_type
                - source_labels: [__meta_ec2_vpc_id]
                  target_label: instance_vpc

          remote_write:
            - url: ${ManagedPrometheusUrl}
              queue_config:
                max_samples_per_send: 1000
                max_shards: 200
                capacity: 2500
              sigv4:
                region: ${AWS::Region}
          EOF

          # Create Prometheus systemd service file
          echo "Creating Prometheus systemd service file"
          sudo tee /etc/systemd/system/prometheus.service > /dev/null <<EOF
          [Unit]
          Description=Prometheus Exporter

          [Service]
          Environment=PATH=/opt/slurm/bin:\$PATH
          ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --agent --storage.agent.path="/opt/prometheus/data-agent"
          Restart=on-failure
          RestartSec=15
          Type=simple

          [Install]
          WantedBy=multi-user.target
          EOF

          # Reload systemd and enable Prometheus service
          echo "Reloading systemd and enabling Prometheus service"
          sudo systemctl daemon-reload
          sudo systemctl enable --now prometheus

          echo "Prometheus setup completed successfully"


          # The CloudFormation helper scripts are not preinstalled on stock
          # Ubuntu AMIs; install them if missing so that cfn-init, cfn-signal
          # and the cfn-hup service (all under /usr/local/bin) actually exist.
          if [ ! -x /usr/local/bin/cfn-init ]; then
            echo "Installing CloudFormation helper scripts"
            apt-get update -y
            apt-get install -y python3-pip
            pip3 install https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-py3-latest.tar.gz
          fi

          # Apply the instance metadata (cfn-hup setup), then report the
          # cfn-init exit status to CloudFormation
          /usr/local/bin/cfn-init -v --stack ${AWS::StackName} \
            --resource PrometheusAgentInstance \
            --configsets full_install \
            --region ${AWS::Region}

          /usr/local/bin/cfn-signal \
            -e $? \
            --stack ${AWS::StackName} \
            --region ${AWS::Region} \
            --resource PrometheusAgentInstance

          --==MYBOUNDARY==--
        - AWS_REGION: !Ref "AWS::Region"
          PARALLELCLUSTER_NAME: !Ref PCClusterNAME
388+
389+
# CreationPolicy:
390+
# ResourceSignal:
391+
# Timeout: PT10M
392+
393+
#############
394+
## Outputs ##
395+
#############
396+
Outputs:
  # InstanceId attribute of the Prometheus agent EC2 instance.
  PrometheusAgentInstanceId:
    Value:
      Fn::GetAtt: [PrometheusAgentInstance, InstanceId]

0 commit comments

Comments
 (0)