Add automated Grafana dashboard deployment #607

@@ -0,0 +1,49 @@
# Machine Learning Infrastructure Monitoring for Slurm-based clusters <!-- omit from toc -->

This solution provides a "1-click" observability stack to monitor your Slurm-based machine learning infrastructure. It automatically:
- Creates an Amazon Managed Grafana workspace
- Creates an Amazon Managed Service for Prometheus workspace
- Sets up a Prometheus agent collector
- Creates the Prometheus data source and dashboards in Grafana


## Prerequisites

Install the AWS Serverless Application Model Command Line Interface (AWS SAM CLI), version **>=1.135.0**, by following the [instructions](<https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html>)


## Architecture
TBD

## Deploy
You will begin by installing the Python dependencies needed by the Lambda function into its code directory.
In your shell:

```bash
cd dashboards
pip install -r requirements.txt -t .
cd ..
```

You are now ready to deploy the serverless application. Run the following in your shell:

```bash
OBS_DASHBOARD_NAME="ml-obs-dashboard"
sam build -t managed-cluster-observability-pc.yaml
sam deploy -t managed-cluster-observability-pc.yaml \
    --stack-name ${OBS_DASHBOARD_NAME} \
    --guided \
    --capabilities CAPABILITY_IAM CAPABILITY_AUTO_EXPAND \
    --parameter-overrides \
    ParameterKey=PCClusterName,ParameterValue=<CLUSTER_NAME> \
    ParameterKey=SubnetId,ParameterValue=<SUBNET_ID> \
    ParameterKey=VpcId,ParameterValue=<VPC_ID>
```


## Clean up
To delete the SAM application deployment, run the following in your terminal:

```bash
sam delete
```
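
For reference, the deployed Lambda points every panel and template variable of each imported community dashboard at the newly created Prometheus data source before uploading it to Grafana. That transformation can be sketched on a toy dashboard dict (the sample data and `uid` value below are hypothetical):

```python
import json

def point_dashboard_at_datasource(dashboard: dict, datasource_uid: str) -> dict:
    """Rewrite every panel and template variable to use the given Prometheus data source."""
    for panel in dashboard.get("panels", []):
        panel["datasource"] = {"type": "prometheus", "uid": datasource_uid}
    for variable in dashboard.get("templating", {}).get("list", []):
        variable["datasource"] = {"type": "prometheus", "uid": datasource_uid}
    # Grafana's dashboard-import API expects the definition wrapped like this.
    return {"dashboard": dashboard, "overwrite": True}

# Toy dashboard with one panel and one template variable (hypothetical sample).
dash = {
    "title": "GPU Utilization",
    "panels": [{"title": "GPU Util", "datasource": None}],
    "templating": {"list": [{"name": "node", "datasource": None}]},
}

payload = point_dashboard_at_datasource(dash, "amp-uid-123")
print(json.dumps(payload["dashboard"]["panels"][0]["datasource"]))
# → {"type": "prometheus", "uid": "amp-uid-123"}
```

Setting `"overwrite": True` means re-running the stack simply replaces any dashboard of the same uid rather than failing.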
@@ -0,0 +1,73 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: "Setup to monitor SageMaker HyperPod clusters on AWS. Amazon Managed Prometheus and Amazon Managed Grafana workspaces with associated IAM roles are deployed in the AWS account. Prometheus and exporter services are set up on cluster nodes. Author: Matt Nightingale - nghtm@"

Resources:
  AmazonGrafanaWorkspaceIAMRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: 2012-10-17
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - grafana.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AmazonGrafanaCloudWatchAccess
      RoleName: !Sub ${AWS::StackName}-Grafana-Role

  AmazonGrafanaPrometheusPolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyName: AmazonGrafana_Prometheus_policy
      PolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Action:
              - aps:ListWorkspaces
              - aps:DescribeWorkspace
              - aps:QueryMetrics
              - aps:GetLabels
              - aps:GetSeries
              - aps:GetMetricMetadata
            Resource: "*"
      Roles: [!Ref AmazonGrafanaWorkspaceIAMRole]

  AmazonGrafanaWorkspace:
    Type: 'AWS::Grafana::Workspace'
    Properties:
      AccountAccessType: CURRENT_ACCOUNT
      Name: !Sub ${AWS::StackName}-Dashboard
      Description: Amazon Grafana Workspace to monitor SageMaker Cluster
      AuthenticationProviders:
        - AWS_SSO
      PermissionType: SERVICE_MANAGED
      RoleArn: !GetAtt AmazonGrafanaWorkspaceIAMRole.Arn
      DataSources: ["CLOUDWATCH", "PROMETHEUS"]
      OrganizationRoleName: "ADMIN"

  APSWorkspace:
    Type: AWS::APS::Workspace
    Properties:
      Alias: !Sub ${AWS::StackName}-Hyperpod-WorkSpace
      Tags:
        - Key: Name
          Value: SageMaker Hyperpod PrometheusMetrics

Outputs:
  Region:
    Value: !Ref "AWS::Region"
  AMPRemoteWriteURL:
    Value: !Join ["", [!GetAtt APSWorkspace.PrometheusEndpoint, "api/v1/remote_write"]]
  AMPEndPointUrl:
    Value: !GetAtt APSWorkspace.PrometheusEndpoint
  GrafanWorkspaceURL:
    Value: !Join ["", ["https://", !GetAtt AmazonGrafanaWorkspace.Endpoint]]
  GrafanWorkspaceId:
    Value: !GetAtt AmazonGrafanaWorkspace.Id
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Custom-resource Lambda: create a Prometheus data source and import dashboards into Amazon Managed Grafana."""

import json
import os
import urllib.request
from typing import Dict

import boto3
import cfnresponse
from grafana_client import GrafanaApi, TokenAuth
from grafana_client.knowledge import datasource_factory
from grafana_client.model import DatasourceModel
from grafanalib._gen import DashboardEncoder

# Community dashboards imported into the workspace (latest published revision).
PROM_DASHBOARDS_URL = [
    'https://grafana.com/api/dashboards/12239/revisions/latest/download',
    'https://grafana.com/api/dashboards/1860/revisions/latest/download',
    'https://grafana.com/api/dashboards/20579/revisions/latest/download'
]


def create_prometheus_datasource(grafana, url, aws_region):
    """Create a SigV4-authenticated Prometheus data source and run a health check against it."""
    jsonData = {
        'sigV4Auth': True,
        'sigV4AuthType': 'ec2_iam_role',
        'sigV4Region': aws_region,
        'httpMethod': 'GET'
    }

    datasource = DatasourceModel(name="Prometheus",
                                 type="prometheus",
                                 url=url,
                                 access="proxy",
                                 jsonData=jsonData)
    datasource = datasource_factory(datasource)
    datasource = datasource.asdict()
    datasource = grafana.datasource.create_datasource(datasource)["datasource"]
    grafana.datasource.health(datasource['uid'])
    return datasource


def encode_dashboard(entity) -> Dict:
    """
    Encode grafanalib `Dashboard` entity to dictionary.

    TODO: Optimize without going through JSON marshalling.
    """
    return json.loads(json.dumps(entity, sort_keys=True, cls=DashboardEncoder))


def mk_dash(datasource_uid, url):
    """Download a dashboard definition and point every panel and template variable at the data source."""
    with urllib.request.urlopen(url) as response:
        dashboard = json.load(response)

    for panel in dashboard['panels']:
        panel["datasource"] = {"type": "prometheus", "uid": datasource_uid}

    for variable in dashboard['templating']['list']:
        variable["datasource"] = {"type": "prometheus", "uid": datasource_uid}

    return {"dashboard": dashboard, "overwrite": True}


def lambda_handler(event, context):
    # Only act on stack creation; acknowledge updates and deletes immediately.
    if event['RequestType'] != 'Create':
        cfnresponse.send(event, context, cfnresponse.SUCCESS, {'Data': 0},
                         "CustomResourcePhysicalID")
        return {'statusCode': 200, 'body': json.dumps('Update or Delete')}

    aws_region = os.environ['REGION']
    grafana_key_name = "CreateDashboards"
    grafana_url = os.environ['GRAFANA_WORKSPACE_URL']
    workspace_id = os.environ['GRAFANA_WORKSPACE_ID']
    prometheus_url = os.environ['PROMETHEUS_URL']

    client = boto3.client('grafana')
    try:
        # Short-lived admin API key, used only for this invocation.
        response = client.create_workspace_api_key(keyName=grafana_key_name,
                                                   keyRole='ADMIN',
                                                   secondsToLive=60,
                                                   workspaceId=workspace_id)
    except Exception as e:
        print(e)
        cfnresponse.send(event, context, cfnresponse.FAILED, {'Data': 123},
                         "CustomResourcePhysicalID")
        return {'statusCode': 500, 'body': json.dumps('Failed to create API key')}

    try:
        grafana = GrafanaApi.from_url(
            url=grafana_url,
            credential=TokenAuth(token=response['key']),
        )

        prometheus_datasource = create_prometheus_datasource(
            grafana, prometheus_url, aws_region)

        for dashboard_url in PROM_DASHBOARDS_URL:
            dashboard_payload = mk_dash(prometheus_datasource['uid'], dashboard_url)
            grafana.dashboard.update_dashboard(dashboard_payload)

        cfnresponse.send(event, context, cfnresponse.SUCCESS, {'Data': 123},
                         "CustomResourcePhysicalID")

    except Exception as e:
        print(e)
        cfnresponse.send(event, context, cfnresponse.FAILED, {'Data': 123},
                         "CustomResourcePhysicalID")

    client.delete_workspace_api_key(keyName=grafana_key_name,
                                    workspaceId=workspace_id)

    return {'statusCode': 200, 'body': json.dumps('Dashboards created')}
@@ -0,0 +1,3 @@
certifi
grafana-client==4.3.2
grafanalib==0.7.1
@@ -0,0 +1,79 @@
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
Grafana Dashboards deployment
Author: [email protected]

Parameters:
PCClusterName:
Type: String
SubnetId:
Type: AWS::EC2::Subnet::Id
VpcId:
Type: AWS::EC2::VPC::Id

Resources:
GrafanaPrometheus:
Type: AWS::Serverless::Application
DeletionPolicy: Delete
UpdateReplacePolicy: Delete
Properties:
Location: cluster-observability.yaml

GrafanaLambdaRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Principal:
Service: lambda.amazonaws.com
Action: "sts:AssumeRole"
Policies:
- PolicyName: GrafanaLambda
PolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Action:
- 'grafana:CreateWorkspaceApiKey'
- 'grafana:DeleteWorkspaceApiKey'
Resource: "*"

DashboardCreationLambda:
Type: AWS::Serverless::Function
Properties:
CodeUri: dashboards
Environment:
Variables:
REGION: !Ref AWS::Region
PROMETHEUS_URL: !GetAtt GrafanaPrometheus.Outputs.AMPEndPointUrl
GRAFANA_WORKSPACE_ID: !GetAtt GrafanaPrometheus.Outputs.GrafanWorkspaceId
GRAFANA_WORKSPACE_URL: !GetAtt GrafanaPrometheus.Outputs.GrafanWorkspaceURL
Handler: create_ml_dashboards.lambda_handler
Runtime: python3.13
Role: !GetAtt GrafanaLambdaRole.Arn
Timeout: 10
MemorySize: 128
Tags:
Application: MLDashboards

PrometheusCollector:
Type: AWS::Serverless::Application
DeletionPolicy: Delete
UpdateReplacePolicy: Delete
Properties:
Location: prometheus-agent-collector.yaml
Parameters:
PCClusterNAME: !Ref PCClusterName
ManagedPrometheusUrl: !GetAtt GrafanaPrometheus.Outputs.AMPRemoteWriteURL
SubnetId: !Ref SubnetId
VpcId: !Ref VpcId

LambdaTrigger:
Type: "Custom::LambdaTrigger"
Properties:
ServiceToken:
!GetAtt DashboardCreationLambda.Arn
ServiceTimeout: 300