import datetime
|
|
import json
|
|
import logging
|
|
import os
|
|
|
|
import boto3
|
|
from aws_lambda_powertools import Tracer
|
|
|
|
from common.ddb_service.client import DynamoDbUtilsService
|
|
from delete_endpoints import get_endpoint_in_sagemaker
|
|
from libs.data_types import Endpoint
|
|
from libs.utils import get_endpoint_by_name
|
|
|
|
# --- Environment configuration -------------------------------------------
aws_region = os.environ.get('AWS_REGION')
esd_version = os.environ.get("ESD_VERSION")
sagemaker_endpoint_table = os.environ.get('ENDPOINT_TABLE_NAME')

# Module logger. This was previously configured twice (first with an INFO
# fallback, then with ERROR); the second call won, so keep that single,
# final configuration: LOG_LEVEL env var, defaulting to ERROR.
logger = logging.getLogger(__name__)
logger.setLevel(os.environ.get('LOG_LEVEL') or logging.ERROR)

tracer = Tracer()

# --- AWS clients / services ----------------------------------------------
cloudwatch = boto3.client('cloudwatch')
sagemaker = boto3.client('sagemaker')
ddb_service = DynamoDbUtilsService(logger=logger)

# CloudWatch metric period (seconds) used by every dashboard widget.
period = 300
|
|
|
|
|
|
@tracer.capture_lambda_handler
def handler(event, context):
    """Refresh the workflow dashboard and per-endpoint dashboards.

    Two invocation shapes are handled:

    * An EventBridge SageMaker endpoint state-change event (detected by
      ``event['detail']['EndpointStatus']``): rebuild that endpoint's
      dashboard once it reaches 'InService', then prune stale dashboards.
    * Any other trigger (e.g. a scheduled run): rebuild the dashboard of
      every endpoint recorded in the endpoint table that is neither still
      'Creating' nor already deleted from SageMaker.
    """
    logger.info(json.dumps(event))

    # Always rebuild the aggregate workflow dashboard first.
    gen_workflow_ds()

    # --- Event-driven path: a single endpoint changed state ---------------
    if 'detail' in event and 'EndpointStatus' in event['detail']:
        detail = event['detail']
        endpoint_name = detail['EndpointName']
        if detail['EndpointStatus'] == 'InService':
            create_ds(get_endpoint_by_name(endpoint_name))
        clean_ds()
        return {}

    # --- Scan path: rebuild dashboards for all recorded endpoints ---------
    records = ddb_service.scan(sagemaker_endpoint_table)
    logger.info(f"Endpoints: {records}")

    for record in records:
        name = record['endpoint_name']['S']
        endpoint = get_endpoint_by_name(name)

        # Still provisioning — its metrics aren't meaningful yet.
        if endpoint.endpoint_status == 'Creating':
            continue

        # Gone from SageMaker — nothing to chart.
        if get_endpoint_in_sagemaker(name) is None:
            continue

        create_ds(endpoint)

    clean_ds()
    return {}
|
|
|
|
|
|
def gen_workflow_ds():
    """Build (or rebuild) the 'ESD-Workflow' CloudWatch dashboard.

    Discovers every workflow that has published an 'InferenceTotal' metric
    in the 'ESD' namespace and renders one single-value summary widget per
    workflow. Skips the put_dashboard call entirely when no workflow
    metrics exist yet.
    """
    # Only the dimension *name* is supplied, so list_metrics returns one
    # entry per distinct 'Workflow' dimension value seen so far.
    dimensions = [{'Name': 'Workflow'}]
    metrics = cloudwatch.list_metrics(Namespace='ESD', MetricName='InferenceTotal', Dimensions=dimensions)['Metrics']

    workflow_names = sorted(m['Dimensions'][0]['Value'] for m in metrics)

    logger.info(f"Workflow Names: {workflow_names}")

    # NOTE: the original also kept a `y` counter here that was incremented
    # but never used (every widget is pinned at y=0); it has been removed.
    widgets = []
    for workflow in workflow_names:
        widgets.append({
            "height": 4,
            "width": 24,
            "y": 0,
            "x": 0,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", "InferenceTotal", "Workflow", workflow, {"region": aws_region}],
                    [".", "InferenceSucceed", ".", ".", {"region": aws_region}],
                    [".", "InferenceLatency", ".", ".", {"stat": "Minimum", "region": aws_region}],
                    ["...", {"stat": "Average", "region": aws_region}],
                    ["...", {"stat": "p99", "region": aws_region}],
                    ["...", {"region": aws_region}]
                ],
                "sparkline": True,
                "view": "singleValue",
                "region": aws_region,
                "stat": "Maximum",
                "period": 300,
                "title": f"{workflow}"
            }
        })

    if len(widgets) > 0:
        cloudwatch.put_dashboard(DashboardName='ESD-Workflow', DashboardBody=json.dumps({"widgets": widgets}))
|
|
|
|
|
|
def clean_ds():
    """Delete endpoint dashboards whose endpoint record no longer exists.

    Only dashboards whose names match the known endpoint-name prefixes are
    considered. A dashboard is removed when looking up its endpoint record
    raises (i.e. the endpoint has been deleted).
    """
    prefixes = ('comfy-async-', 'comfy-real-time-', 'sd-async-', 'sd-real-time-')

    # list_dashboards is paginated; walk every page rather than only the
    # first response (the original inspected a single list_dashboards call).
    paginator = cloudwatch.get_paginator('list_dashboards')
    for page in paginator.paginate():
        for dashboard in page.get('DashboardEntries', []):
            ep_name = dashboard['DashboardName']
            if not ep_name.startswith(prefixes):
                continue
            try:
                get_endpoint_by_name(ep_name)
            except Exception as e:
                # Endpoint record is gone — drop its orphaned dashboard.
                # (Was `print`; switched to the module logger for consistency.)
                logger.info(f"Error: {e}")
                logger.info(f"Deleting {ep_name}")
                cloudwatch.delete_dashboards(DashboardNames=[ep_name])
|
|
|
|
|
|
def ds_body(ep: Endpoint, custom_metrics):
    """Render the complete CloudWatch dashboard body (JSON string) for *ep*.

    Combines fixed endpoint-level widgets (queue/inference latency,
    inference counts, CPU/GPU/memory/disk utilization and an error-log
    query) with the per-GPU widgets produced by resolve_gpu_ds().

    :param ep: endpoint record; endpoint_name is read here and
        instance_type is read indirectly via resolve_gpu_nums().
    :param custom_metrics: list_metrics() results used by resolve_gpu_ds()
        to discover per-instance/per-GPU metric series.
    :return: JSON-encoded dashboard body suitable for put_dashboard().
    """
    ep_name = ep.endpoint_name
    # Stamped into the header widget so viewers can see dashboard freshness.
    last_build_time = datetime.datetime.now().isoformat()
    dashboard_body = {
        "widgets": [
            # Header: dashboard title and last-build timestamp.
            {
                "type": "text",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 2,
                "properties": {
                    "markdown": f"## ESD - {ep_name} \n Last Build Time: {last_build_time}"
                }
            },
            # Queue latency: Min / Average / p99 / default(Maximum) stats of the same series.
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "QueueLatency", "Endpoint", ep_name, {"stat": "Minimum", "region": aws_region}],
                        ["...", {"stat": "Average", "region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "region": aws_region,
                    "period": period,
                    "stat": "Maximum",
                    "title": "QueueLatency"
                }
            },
            # Inference request/success totals.
            {
                "height": 4,
                "width": 6,
                "y": 1,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceEndpointReceived", "Endpoint", ep_name, {"region": aws_region}],
                        [".", "InferenceSucceed", ".", ".", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Inference Results",
                    "period": period,
                    "stat": "Sum"
                }
            },
            # Inference latency: Min / default(Average) / p99 / Max.
            {
                "height": 4,
                "width": 18,
                "y": 1,
                "x": 8,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceLatency", "Endpoint", ep_name, {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"stat": "Maximum", "region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Average",
                    "period": period,
                    "title": "InferenceLatency"
                }
            },
            # Total inference count for the endpoint.
            {
                "height": 5,
                "width": 6,
                "y": 0,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceTotal", "Endpoint", ep_name, {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Endpoint Inference",
                    "stat": "Sum",
                    "period": period
                }
            },
            # SageMaker-published memory utilization gauge.
            # NOTE(review): assumes the production variant is named "prod" — confirm
            # against the endpoint-config creation code.
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 13,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "MemoryUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "region": aws_region,
                    "title": "MemoryUtilization",
                    "period": period,
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            # GPU memory utilization gauge; axis scaled by GPU count since
            # SageMaker reports up to 100% per GPU.
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 6,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "gauge",
                    "stacked": True,
                    "region": aws_region,
                    "title": "GPUMemoryUtilization",
                    "period": period,
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            # GPU compute utilization gauge (same GPU-count axis scaling).
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "region": aws_region,
                    "title": "GPUUtilization",
                    "period": period,
                    "stacked": False,
                    "stat": "Average"
                }
            },
            # CPU utilization.
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 12,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": 100
                        }
                    },
                    "region": aws_region,
                    "title": "CPUUtilization",
                    "period": period,
                    "stat": "Average"
                }
            },
            # Disk utilization.
            {
                "height": 5,
                "width": 4,
                "y": 0,
                "x": 20,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "DiskUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "stacked": True,
                    "region": aws_region,
                    "title": "DiskUtilization",
                    "period": period,
                    "stat": "Maximum"
                }
            },
            # Logs Insights query surfacing recent error lines from the
            # endpoint's CloudWatch log group.
            {
                "type": "log",
                "x": 0,
                "y": 20,
                "width": 24,
                "height": 8,
                "properties": {
                    "query": f"SOURCE '/aws/sagemaker/Endpoints/{ep_name}' "
                             f"| fields @timestamp, @logStream, @message\r\n"
                             f"| filter @message like /error/\r\n"
                             f"| sort @timestamp desc\r\n| limit 500",
                    "region": aws_region,
                    "stacked": False,
                    "view": "table",
                    "title": "Endpoint Error Log"
                }
            }
        ]
    }

    # Append the dynamically discovered per-instance / per-GPU widgets.
    gpus_ds = resolve_gpu_ds(ep, custom_metrics)
    for gpu_ds in gpus_ds:
        dashboard_body['widgets'].append(gpu_ds)

    return json.dumps(dashboard_body)
|
|
|
|
|
|
def resolve_gpu_nums(ep: Endpoint):
    """Return the GPU count of the endpoint's instance type.

    Unknown instance types fall back to a single GPU so gauge axes still
    render sensibly.
    """
    gpu_counts = {
        "ml.p4d.24xlarge": 8,
        "ml.g4dn.12xlarge": 4,
        "ml.g5.12xlarge": 4,
        "ml.g5.24xlarge": 4,
        "ml.g5.48xlarge": 8,
    }
    return gpu_counts.get(ep.instance_type, 1)
|
|
|
|
|
|
def resolve_gpu_ds(ep: Endpoint, custom_metrics):
    """Build per-instance / per-GPU dashboard widgets for *ep*.

    Scans *custom_metrics* for 'GPUUtilization' series carrying a
    three-dimension (Endpoint, Instance, InstanceGPU) key for this
    endpoint, then emits four widgets per GPU — InferenceTotal (Sum),
    GPUUtilization (Average and Maximum) and GPUMemoryUtilization
    (Maximum) — grouped under a per-instance header plus disk summary.

    :param ep: endpoint record; only endpoint_name is read.
    :param custom_metrics: output of cloudwatch.list_metrics(...)['Metrics'].
    :return: list of widget dicts (possibly empty).
    """
    ep_name = ep.endpoint_name

    widgets = []  # was named `list`, which shadowed the builtin
    ids = []

    cur_instance_id = None
    for metric in custom_metrics:
        if metric['MetricName'] != 'GPUUtilization':
            continue
        if len(metric['Dimensions']) != 3:
            continue
        for dm in metric['Dimensions']:
            if dm['Name'] == 'Endpoint' and dm['Value'] == ep_name:
                # NOTE(review): assumes dimension order is
                # [Endpoint, Instance, InstanceGPU] — confirm against the
                # metric publisher.
                instance_id = metric['Dimensions'][1]['Value']
                gpu_id = metric['Dimensions'][2]['Value']

                # One widget descriptor per (stat, metric) pair, in the
                # exact display order used below.
                for stat, metric_name in (("Sum", "InferenceTotal"),
                                          ("Average", "GPUUtilization"),
                                          ("Maximum", "GPUUtilization"),
                                          ("Maximum", "GPUMemoryUtilization")):
                    ids.append({
                        "instance_id": instance_id,
                        "gpu_id": gpu_id,
                        "view": "singleValue",
                        "stat": stat,
                        "metric": metric_name})

    def custom_sort(obj):
        # Highest-numbered instance first; GPUs in lexical id order.
        return (-int(obj['instance_id']), obj['gpu_id'])

    ids = sorted(ids, key=custom_sort)

    # Layout cursor: per-GPU widgets are 6 units wide, 4 per row.
    x = 0
    y = 30

    colors = ["#9467bd", "#ff7f0e", "#2ca02c", "#8c564b", "#e377c2", "#7f7f7f", "#1f77b4"]
    color_index = 0

    for item in ids:
        # Emit the instance header block the first time each instance appears.
        if cur_instance_id != item['instance_id']:
            cur_instance_id = item['instance_id']
            # Transparent spacer line separating instances.
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y,
                "width": 24,
                "height": 1,
                "properties": {
                    "background": "transparent",
                    "markdown": f""
                }
            })
            # Instance title plus a legend of the per-GPU metrics.
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y + 1,
                "width": 24,
                "height": 3,
                "properties": {
                    "markdown": f"# Endpoint Instance - {item['instance_id']} \n"
                                f"- InferenceTotal: Inference Job Count (Comfy Only)\n"
                                f"- GPUUtilization: The percentage of GPU units that are used on a GPU.\n"
                                f"- GPUMemoryUtilization: The percentage of GPU memory that are used on a GPU."
                }
            })
            # Per-instance disk summary.
            widgets.append({
                "type": "metric",
                "x": 0,
                "y": y + 2,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "DiskTotal", "Endpoint", ep_name, "Instance", item['instance_id'], ],
                        [".", "DiskUsed", ".", ".", ".", "."],
                        [".", "DiskFree", ".", ".", ".", "."],
                        [".", "DiskPercentage", ".", ".", ".", "."]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Maximum",
                    "period": period,
                    "title": "Disk"
                }
            })
            y = y + 3

        # One single-value widget for this (GPU, metric, stat) combination.
        widgets.append({
            "height": 4,
            "width": 6,
            "y": y,
            "x": x,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", item['metric'], "Endpoint", ep_name, "Instance", item['instance_id'], "InstanceGPU", item['gpu_id'],
                     {"region": aws_region, "label": f"{item['metric']} - {item['stat']}", "color": colors[color_index]}]
                ],
                "sparkline": True,
                "view": item['view'],
                "yAxis": {
                    "left": {
                        "min": 1,
                        "max": 100
                    }
                },
                "stacked": True,
                "region": aws_region,
                "stat": item['stat'],
                "period": period,
                "title": f"{item['gpu_id']}",
            }
        })

        x = x + 6
        if x >= 24:
            # Row full: advance the color (wrapping) and start a new row.
            color_index = color_index + 1
            if color_index >= len(colors):
                color_index = 0
            x = 0
            y = y + 1

    logger.info("Metrics List:")
    logger.info(json.dumps(widgets))

    return widgets
|
|
|
|
|
|
def get_dashboard(dashboard_name):
    """Fetch an existing dashboard's body, or None when it does not exist."""
    try:
        return cloudwatch.get_dashboard(DashboardName=dashboard_name)['DashboardBody']
    except cloudwatch.exceptions.ResourceNotFound:
        return None
|
|
|
|
|
|
def create_ds(ep: Endpoint):
    """Create or update the CloudWatch dashboard for a single endpoint.

    The dashboard is named after the endpoint and fully rebuilt from the
    endpoint's custom ESD metrics on every call.
    """
    ep_name = ep.endpoint_name
    dimensions = [
        {
            'Name': 'Endpoint',
            'Value': ep_name
        }
    ]

    # Collect every custom metric series published for this endpoint.
    # (Replaces three copy-pasted list_metrics calls; order preserved.)
    custom_metrics = []
    for metric_name in ('GPUMemoryUtilization', 'GPUUtilization', 'InferenceTotal'):
        custom_metrics += cloudwatch.list_metrics(
            Namespace='ESD', MetricName=metric_name, Dimensions=dimensions)['Metrics']

    logger.info("Custom Metrics: ")
    logger.info(json.dumps(custom_metrics))

    # put_dashboard both creates and updates; the lookup exists only so we
    # can log which of the two happened.
    existing_dashboard = get_dashboard(ep_name)

    cloudwatch.put_dashboard(DashboardName=ep_name, DashboardBody=ds_body(ep, custom_metrics))

    if existing_dashboard:
        logger.info(f"Dashboard '{ep_name}' updated successfully.")
    else:
        logger.info(f"Dashboard '{ep_name}' created successfully.")
|