"""Lambda that builds and maintains CloudWatch dashboards for ESD endpoints.

Triggered either by a SageMaker endpoint state-change event (rebuild the
dashboard for that one endpoint) or on a schedule (rebuild dashboards for
every endpoint recorded in the DynamoDB endpoint table), and prunes
dashboards whose endpoint no longer exists.
"""
import datetime
import json
import logging
import os

import boto3
from aws_lambda_powertools import Tracer

from common.ddb_service.client import DynamoDbUtilsService
from delete_endpoints import get_endpoint_in_sagemaker
from libs.data_types import Endpoint
from libs.utils import get_endpoint_by_name

aws_region = os.environ.get('AWS_REGION')
esd_version = os.environ.get("ESD_VERSION")
sagemaker_endpoint_table = os.environ.get('ENDPOINT_TABLE_NAME')

# NOTE(review): the original configured this logger twice (first with an INFO
# fallback, then with ERROR); the second assignment was the effective one, so
# the duplicate is removed and the ERROR fallback kept.
logger = logging.getLogger(__name__)
logger.setLevel(os.environ.get('LOG_LEVEL') or logging.ERROR)

tracer = Tracer()

cloudwatch = boto3.client('cloudwatch')
sagemaker = boto3.client('sagemaker')
ddb_service = DynamoDbUtilsService(logger=logger)

# Metric aggregation period (seconds) shared by all dashboard widgets.
period = 300


@tracer.capture_lambda_handler
def handler(event, context):
    """Entry point: (re)build endpoint dashboards, then prune stale ones.

    Args:
        event: Either a SageMaker "Endpoint State Change" EventBridge event
            (has event['detail']['EndpointStatus']) or a scheduled event.
        context: Lambda context (unused).

    Returns:
        An empty dict.
    """
    logger.info(json.dumps(event))

    gen_workflow_ds()

    if 'detail' in event and 'EndpointStatus' in event['detail']:
        # Event-driven path: rebuild the dashboard only when the endpoint
        # has just become InService.
        endpoint_name = event['detail']['EndpointName']
        endpoint_status = event['detail']['EndpointStatus']
        if endpoint_status == 'InService':
            ep = get_endpoint_by_name(endpoint_name)
            create_ds(ep)
        clean_ds()
        return {}

    # Scheduled path: walk every endpoint recorded in DynamoDB.
    eps = ddb_service.scan(sagemaker_endpoint_table)
    logger.info(f"Endpoints: {eps}")
    for row in eps:
        ep_name = row['endpoint_name']['S']
        ep = get_endpoint_by_name(ep_name)
        if ep.endpoint_status == 'Creating':
            continue
        # Skip rows whose endpoint no longer exists in SageMaker.
        if get_endpoint_in_sagemaker(ep_name) is None:
            continue
        create_ds(ep)

    clean_ds()
    return {}


def gen_workflow_ds():
    """Build the 'ESD-Workflow' dashboard: one single-value row per workflow.

    Workflow names are discovered from the dimensions of the existing
    ESD/InferenceTotal custom metrics.
    """
    dimensions = [{'Name': 'Workflow'}]
    metrics = cloudwatch.list_metrics(Namespace='ESD',
                                      MetricName='InferenceTotal',
                                      Dimensions=dimensions)['Metrics']
    workflow_name = []
    for m in metrics:
        workflow = m['Dimensions'][0]['Value']
        workflow_name.append(workflow)
    workflow_name.sort()

    logger.info(f"Workflow Names: {workflow_name}")

    # All widgets use y=0; CloudWatch lays out same-position widgets in order.
    widgets = []
    for workflow in workflow_name:
        widgets.append({
            "height": 4,
            "width": 24,
            "y": 0,
            "x": 0,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", "InferenceTotal", "Workflow", workflow, {"region": aws_region}],
                    [".", "InferenceSucceed", ".", ".", {"region": aws_region}],
                    [".", "InferenceLatency", ".", ".", {"stat": "Minimum", "region": aws_region}],
                    ["...", {"stat": "Average", "region": aws_region}],
                    ["...", {"stat": "p99", "region": aws_region}],
                    ["...", {"region": aws_region}]
                ],
                "sparkline": True,
                "view": "singleValue",
                "region": aws_region,
                "stat": "Maximum",
                "period": period,
                "title": f"{workflow}"
            }
        })

    if len(widgets) > 0:
        cloudwatch.put_dashboard(DashboardName='ESD-Workflow',
                                 DashboardBody=json.dumps({"widgets": widgets}))


def clean_ds():
    """Delete dashboards whose endpoint no longer exists.

    Best-effort: a dashboard is deleted only when get_endpoint_by_name
    raises for its name (i.e. the endpoint record is gone).
    """
    ds = cloudwatch.list_dashboards()
    if 'DashboardEntries' not in ds:
        return
    prefix = ('comfy-async-', 'comfy-real-time-', 'sd-async-', 'sd-real-time-')
    filtered_dashboards = [dashboard['DashboardName']
                           for dashboard in ds['DashboardEntries']
                           if dashboard['DashboardName'].startswith(prefix)]
    for ep_name in filtered_dashboards:
        try:
            get_endpoint_by_name(ep_name)
        except Exception as e:
            # Endpoint lookup failed -> assume it was removed; drop its dashboard.
            logger.info(f"Error: {e}")
            logger.info(f"Deleting {ep_name}")
            cloudwatch.delete_dashboards(DashboardNames=[ep_name])


def ds_body(ep: Endpoint, custom_metrics):
    """Build the dashboard body JSON for a single endpoint.

    Args:
        ep: The endpoint whose dashboard is being built.
        custom_metrics: Pre-listed ESD custom metrics used to enumerate
            per-instance/per-GPU widgets.

    Returns:
        The dashboard body as a JSON string.
    """
    ep_name = ep.endpoint_name
    last_build_time = datetime.datetime.now().isoformat()

    dashboard_body = {
        "widgets": [
            {
                "type": "text",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 2,
                "properties": {
                    "markdown": f"## ESD - {ep_name} \n Last Build Time: {last_build_time}"
                }
            },
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "QueueLatency", "Endpoint", ep_name, {"stat": "Minimum", "region": aws_region}],
                        ["...", {"stat": "Average", "region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "region": aws_region,
                    "period": period,
                    "stat": "Maximum",
                    "title": "QueueLatency"
                }
            },
            {
                "height": 4,
                "width": 6,
                "y": 1,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceEndpointReceived", "Endpoint", ep_name, {"region": aws_region}],
                        [".", "InferenceSucceed", ".", ".", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Inference Results",
                    "period": period,
                    "stat": "Sum"
                }
            },
            {
                "height": 4,
                "width": 18,
                "y": 1,
                "x": 8,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceLatency", "Endpoint", ep_name, {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"stat": "Maximum", "region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Average",
                    "period": period,
                    "title": "InferenceLatency"
                }
            },
            {
                "height": 5,
                "width": 6,
                "y": 0,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceTotal", "Endpoint", ep_name, {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Endpoint Inference",
                    "stat": "Sum",
                    "period": period
                }
            },
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 13,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "MemoryUtilization", "EndpointName", ep_name,
                         "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "region": aws_region,
                    "title": "MemoryUtilization",
                    "period": period,
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 6,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", ep_name,
                         "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "gauge",
                    "stacked": True,
                    "region": aws_region,
                    "title": "GPUMemoryUtilization",
                    "period": period,
                    # Gauge max scales with the number of GPUs on the instance.
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", ep_name,
                         "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "region": aws_region,
                    "title": "GPUUtilization",
                    "period": period,
                    "stacked": False,
                    "stat": "Average"
                }
            },
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 12,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", ep_name,
                         "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": 100
                        }
                    },
                    "region": aws_region,
                    "title": "CPUUtilization",
                    "period": period,
                    "stat": "Average"
                }
            },
            {
                "height": 5,
                "width": 4,
                "y": 0,
                "x": 20,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "DiskUtilization", "EndpointName", ep_name,
                         "VariantName", "prod", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "stacked": True,
                    "region": aws_region,
                    "title": "DiskUtilization",
                    "period": period,
                    "stat": "Maximum"
                }
            },
            {
                "type": "log",
                "x": 0,
                "y": 20,
                "width": 24,
                "height": 8,
                "properties": {
                    "query": f"SOURCE '/aws/sagemaker/Endpoints/{ep_name}' "
                             f"| fields @timestamp, @logStream, @message\r\n"
                             f"| filter @message like /error/\r\n"
                             f"| sort @timestamp desc\r\n| limit 500",
                    "region": aws_region,
                    "stacked": False,
                    "view": "table",
                    "title": "Endpoint Error Log"
                }
            }
        ]
    }

    # Append the dynamically generated per-instance / per-GPU widgets.
    dashboard_body['widgets'].extend(resolve_gpu_ds(ep, custom_metrics))

    return json.dumps(dashboard_body)


def resolve_gpu_nums(ep: Endpoint):
    """Return the GPU count for the endpoint's instance type (default 1)."""
    maps = {
        "ml.p4d.24xlarge": 8,
        "ml.g4dn.12xlarge": 4,
        "ml.g5.12xlarge": 4,
        "ml.g5.24xlarge": 4,
        "ml.g5.48xlarge": 8,
    }
    return maps.get(ep.instance_type, 1)


def resolve_gpu_ds(ep: Endpoint, custom_metrics):
    """Build per-instance / per-GPU widgets from the ESD custom metrics.

    Scans GPUUtilization metrics that carry (Endpoint, Instance, InstanceGPU)
    dimensions, groups the widgets by instance, and lays them out 4 per row.

    Args:
        ep: The endpoint being charted.
        custom_metrics: Output of cloudwatch.list_metrics for the ESD namespace.

    Returns:
        A list of dashboard widget dicts.
    """
    ep_name = ep.endpoint_name
    widgets = []  # renamed from `list`, which shadowed the builtin
    ids = []
    cur_instance_id = None

    for metric in custom_metrics:
        if metric['MetricName'] == 'GPUUtilization':
            if len(metric['Dimensions']) == 3:
                for dm in metric['Dimensions']:
                    if dm['Name'] == 'Endpoint' and dm['Value'] == ep_name:
                        instance_id = metric['Dimensions'][1]['Value']
                        gpu_id = metric['Dimensions'][2]['Value']
                        # One widget spec per (metric, stat) combination.
                        ids.append({
                            "instance_id": instance_id,
                            "gpu_id": gpu_id,
                            "view": "singleValue",
                            "stat": "Sum",
                            "metric": "InferenceTotal"})
                        ids.append({
                            "instance_id": instance_id,
                            "gpu_id": gpu_id,
                            "view": "singleValue",
                            "stat": "Average",
                            "metric": "GPUUtilization"})
                        ids.append({
                            "instance_id": instance_id,
                            "gpu_id": gpu_id,
                            "view": "singleValue",
                            "stat": "Maximum",
                            "metric": "GPUUtilization"})
                        ids.append({
                            "instance_id": instance_id,
                            "gpu_id": gpu_id,
                            "view": "singleValue",
                            "stat": "Maximum",
                            "metric": "GPUMemoryUtilization"})

    def custom_sort(obj):
        # NOTE(review): assumes instance_id is a numeric string — TODO confirm.
        return (-int(obj['instance_id']), obj['gpu_id'])

    ids = sorted(ids, key=custom_sort)

    x = 0
    y = 30
    colors = ["#9467bd", "#ff7f0e", "#2ca02c", "#8c564b", "#e377c2", "#7f7f7f", "#1f77b4"]
    color_index = 0

    for item in ids:
        if cur_instance_id != item['instance_id']:
            # New instance: emit a separator, a heading and a disk summary row.
            cur_instance_id = item['instance_id']
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y,
                "width": 24,
                "height": 1,
                "properties": {
                    "background": "transparent",
                    "markdown": f""
                }
            })
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y + 1,
                "width": 24,
                "height": 3,
                "properties": {
                    "markdown": f"# Endpoint Instance - {item['instance_id']} \n"
                                f"- InferenceTotal: Inference Job Count (Comfy Only)\n"
                                f"- GPUUtilization: The percentage of GPU units that are used on a GPU.\n"
                                f"- GPUMemoryUtilization: The percentage of GPU memory that are used on a GPU."
                }
            })
            widgets.append({
                "type": "metric",
                "x": 0,
                "y": y + 2,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "DiskTotal", "Endpoint", ep_name, "Instance", item['instance_id'], ],
                        [".", "DiskUsed", ".", ".", ".", "."],
                        [".", "DiskFree", ".", ".", ".", "."],
                        [".", "DiskPercentage", ".", ".", ".", "."]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Maximum",
                    "period": period,
                    "title": "Disk"
                }
            })
            y = y + 3

        widgets.append({
            "height": 4,
            "width": 6,
            "y": y,
            "x": x,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", item['metric'], "Endpoint", ep_name,
                     "Instance", item['instance_id'],
                     "InstanceGPU", item['gpu_id'],
                     {"region": aws_region,
                      "label": f"{item['metric']} - {item['stat']}",
                      "color": colors[color_index]}]
                ],
                "sparkline": True,
                "view": item['view'],
                "yAxis": {
                    "left": {
                        "min": 1,
                        "max": 100
                    }
                },
                "stacked": True,
                "region": aws_region,
                "stat": item['stat'],
                "period": period,
                "title": f"{item['gpu_id']}",
            }
        })

        # Advance 4-per-row layout; rotate the color per completed row.
        x = x + 6
        if x >= 24:
            color_index = color_index + 1
            if color_index >= len(colors):
                color_index = 0
            x = 0
            y = y + 1

    logger.info(f"Metrics List:")
    logger.info(json.dumps(widgets))
    return widgets


def get_dashboard(dashboard_name):
    """Return the dashboard body string, or None if it doesn't exist."""
    try:
        response = cloudwatch.get_dashboard(DashboardName=dashboard_name)
        return response['DashboardBody']
    except cloudwatch.exceptions.ResourceNotFound:
        return None


def create_ds(ep: Endpoint):
    """Create or update the CloudWatch dashboard for one endpoint.

    Lists the endpoint's ESD custom metrics (to discover per-GPU widgets),
    renders the dashboard body and puts it under the endpoint's name.
    """
    ep_name = ep.endpoint_name
    dimensions = [
        {
            'Name': 'Endpoint',
            'Value': ep_name
        }
    ]
    metrics1 = cloudwatch.list_metrics(Namespace='ESD',
                                       MetricName='GPUMemoryUtilization',
                                       Dimensions=dimensions)['Metrics']
    metrics2 = cloudwatch.list_metrics(Namespace='ESD',
                                       MetricName='GPUUtilization',
                                       Dimensions=dimensions)['Metrics']
    metrics3 = cloudwatch.list_metrics(Namespace='ESD',
                                       MetricName='InferenceTotal',
                                       Dimensions=dimensions)['Metrics']
    custom_metrics = metrics1 + metrics2 + metrics3

    logger.info(f"Custom Metrics: ")
    logger.info(json.dumps(custom_metrics))

    # Checked before put only to log "created" vs "updated" accurately.
    existing_dashboard = get_dashboard(ep_name)

    cloudwatch.put_dashboard(DashboardName=ep_name,
                             DashboardBody=ds_body(ep, custom_metrics))

    if existing_dashboard:
        logger.info(f"Dashboard '{ep_name}' updated successfully.")
    else:
        logger.info(f"Dashboard '{ep_name}' created successfully.")