import datetime
|
|
import json
|
|
import logging
|
|
import os
|
|
|
|
import boto3
|
|
from aws_lambda_powertools import Tracer
|
|
|
|
from common.ddb_service.client import DynamoDbUtilsService
|
|
from delete_endpoints import get_endpoint_in_sagemaker
|
|
from libs.data_types import Endpoint
|
|
from libs.utils import get_endpoint_by_name
|
|
|
|
# --- Environment configuration -------------------------------------------
aws_region = os.environ.get('AWS_REGION')
esd_version = os.environ.get("ESD_VERSION")
sagemaker_endpoint_table = os.environ.get('ENDPOINT_TABLE_NAME')

# Module logger. This was previously configured twice (first with an INFO
# fallback, then with ERROR); the second call won, so keep that single,
# final configuration: LOG_LEVEL env var, defaulting to ERROR.
logger = logging.getLogger(__name__)
logger.setLevel(os.environ.get('LOG_LEVEL') or logging.ERROR)

tracer = Tracer()

# --- AWS clients / services ----------------------------------------------
cloudwatch = boto3.client('cloudwatch')
sagemaker = boto3.client('sagemaker')
ddb_service = DynamoDbUtilsService(logger=logger)

# CloudWatch metric period (seconds) used by every dashboard widget.
period = 300
|
|
|
|
|
|
@tracer.capture_lambda_handler
def handler(event, context):
    """Refresh the workflow dashboard and per-endpoint dashboards.

    Two invocation shapes are handled:

    * An EventBridge SageMaker endpoint state-change event (detected by
      ``event['detail']['EndpointStatus']``): rebuild that endpoint's
      dashboard once it reaches 'InService', then prune stale dashboards.
    * Any other trigger (e.g. a scheduled run): rebuild the dashboard of
      every endpoint recorded in the endpoint table that is neither still
      'Creating' nor already deleted from SageMaker.
    """
    logger.info(json.dumps(event))

    # Always rebuild the aggregate workflow dashboard first.
    gen_workflow_ds()

    # --- Event-driven path: a single endpoint changed state ---------------
    if 'detail' in event and 'EndpointStatus' in event['detail']:
        detail = event['detail']
        endpoint_name = detail['EndpointName']
        if detail['EndpointStatus'] == 'InService':
            create_ds(get_endpoint_by_name(endpoint_name))
        clean_ds()
        return {}

    # --- Scan path: rebuild dashboards for all recorded endpoints ---------
    records = ddb_service.scan(sagemaker_endpoint_table)
    logger.info(f"Endpoints: {records}")

    for record in records:
        name = record['endpoint_name']['S']
        endpoint = get_endpoint_by_name(name)

        # Still provisioning — its metrics aren't meaningful yet.
        if endpoint.endpoint_status == 'Creating':
            continue

        # Gone from SageMaker — nothing to chart.
        if get_endpoint_in_sagemaker(name) is None:
            continue

        create_ds(endpoint)

    clean_ds()
    return {}
|
|
|
|
|
|
def gen_workflow_ds():
    """Build (or rebuild) the 'ESD-Workflow' CloudWatch dashboard.

    Discovers every workflow that has published an 'InferenceTotal' metric
    in the 'ESD' namespace and renders one single-value summary widget per
    workflow. Skips the put_dashboard call entirely when no workflow
    metrics exist yet.
    """
    # Only the dimension *name* is supplied, so list_metrics returns one
    # entry per distinct 'Workflow' dimension value seen so far.
    dimensions = [{'Name': 'Workflow'}]
    metrics = cloudwatch.list_metrics(Namespace='ESD', MetricName='InferenceTotal', Dimensions=dimensions)['Metrics']

    workflow_names = sorted(m['Dimensions'][0]['Value'] for m in metrics)

    logger.info(f"Workflow Names: {workflow_names}")

    # NOTE: the original also kept a `y` counter here that was incremented
    # but never used (every widget is pinned at y=0); it has been removed.
    widgets = []
    for workflow in workflow_names:
        widgets.append({
            "height": 4,
            "width": 24,
            "y": 0,
            "x": 0,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", "InferenceTotal", "Workflow", workflow, {"region": aws_region}],
                    [".", "InferenceSucceed", ".", ".", {"region": aws_region}],
                    [".", "InferenceLatency", ".", ".", {"stat": "Minimum", "region": aws_region}],
                    ["...", {"stat": "Average", "region": aws_region}],
                    ["...", {"stat": "p99", "region": aws_region}],
                    ["...", {"region": aws_region}]
                ],
                "sparkline": True,
                "view": "singleValue",
                "region": aws_region,
                "stat": "Maximum",
                "period": 300,
                "title": f"{workflow}"
            }
        })

    if len(widgets) > 0:
        cloudwatch.put_dashboard(DashboardName='ESD-Workflow', DashboardBody=json.dumps({"widgets": widgets}))
|
|
|
|
|
|
def clean_ds():
    """Delete endpoint dashboards whose endpoint record no longer exists.

    Only dashboards whose names match the known endpoint-name prefixes are
    considered. A dashboard is removed when looking up its endpoint record
    raises (i.e. the endpoint has been deleted).
    """
    prefixes = ('comfy-async-', 'comfy-real-time-', 'sd-async-', 'sd-real-time-')

    # list_dashboards is paginated; walk every page rather than only the
    # first response (the original inspected a single list_dashboards call).
    paginator = cloudwatch.get_paginator('list_dashboards')
    for page in paginator.paginate():
        for dashboard in page.get('DashboardEntries', []):
            ep_name = dashboard['DashboardName']
            if not ep_name.startswith(prefixes):
                continue
            try:
                get_endpoint_by_name(ep_name)
            except Exception as e:
                # Endpoint record is gone — drop its orphaned dashboard.
                # (Was `print`; switched to the module logger for consistency.)
                logger.info(f"Error: {e}")
                logger.info(f"Deleting {ep_name}")
                cloudwatch.delete_dashboards(DashboardNames=[ep_name])
|
|
|
|
|
|
def ds_body(ep: Endpoint, custom_metrics):
    """Render the complete CloudWatch dashboard body (JSON string) for *ep*.

    Combines fixed endpoint-level widgets (queue/inference latency,
    inference counts, CPU/GPU/memory/disk utilization and an error-log
    query) with the per-GPU widgets produced by resolve_gpu_ds().

    :param ep: endpoint record; endpoint_name is read here and
        instance_type is read indirectly via resolve_gpu_nums().
    :param custom_metrics: list_metrics() results used by resolve_gpu_ds()
        to discover per-instance/per-GPU metric series.
    :return: JSON-encoded dashboard body suitable for put_dashboard().
    """
    ep_name = ep.endpoint_name
    # Stamped into the header widget so viewers can see dashboard freshness.
    last_build_time = datetime.datetime.now().isoformat()
    dashboard_body = {
        "widgets": [
            # Header: dashboard title and last-build timestamp.
            {
                "type": "text",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 2,
                "properties": {
                    "markdown": f"## ESD - {ep_name} \n Last Build Time: {last_build_time}"
                }
            },
            # Queue latency: Min / Average / p99 / default(Maximum) stats of the same series.
            {
                "type": "metric",
                "x": 0,
                "y": 0,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "QueueLatency", "Endpoint", ep_name, {"stat": "Minimum", "region": aws_region}],
                        ["...", {"stat": "Average", "region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "region": aws_region,
                    "period": period,
                    "stat": "Maximum",
                    "title": "QueueLatency"
                }
            },
            # Inference request/success totals.
            {
                "height": 4,
                "width": 6,
                "y": 1,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceEndpointReceived", "Endpoint", ep_name, {"region": aws_region}],
                        [".", "InferenceSucceed", ".", ".", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Inference Results",
                    "period": period,
                    "stat": "Sum"
                }
            },
            # Inference latency: Min / default(Average) / p99 / Max.
            {
                "height": 4,
                "width": 18,
                "y": 1,
                "x": 8,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceLatency", "Endpoint", ep_name, {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"stat": "p99", "region": aws_region}],
                        ["...", {"stat": "Maximum", "region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Average",
                    "period": period,
                    "title": "InferenceLatency"
                }
            },
            # Total inference count for the endpoint.
            {
                "height": 5,
                "width": 6,
                "y": 0,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["ESD", "InferenceTotal", "Endpoint", ep_name, {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "title": "Endpoint Inference",
                    "stat": "Sum",
                    "period": period
                }
            },
            # SageMaker-published memory utilization gauge.
            # NOTE(review): assumes the production variant is named "prod" — confirm
            # against the endpoint-config creation code.
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 13,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "MemoryUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "region": aws_region,
                    "title": "MemoryUtilization",
                    "period": period,
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            # GPU memory utilization gauge; axis scaled by GPU count since
            # SageMaker reports up to 100% per GPU.
            {
                "height": 5,
                "width": 7,
                "y": 0,
                "x": 6,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUMemoryUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}]
                    ],
                    "view": "gauge",
                    "stacked": True,
                    "region": aws_region,
                    "title": "GPUMemoryUtilization",
                    "period": period,
                    "yAxis": {
                        "left": {
                            "min": 1,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "stat": "Maximum"
                }
            },
            # GPU compute utilization gauge (same GPU-count axis scaling).
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 0,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "GPUUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "gauge",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": resolve_gpu_nums(ep) * 100
                        }
                    },
                    "region": aws_region,
                    "title": "GPUUtilization",
                    "period": period,
                    "stacked": False,
                    "stat": "Average"
                }
            },
            # CPU utilization.
            {
                "height": 5,
                "width": 12,
                "y": 5,
                "x": 12,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "CPUUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region, "stat": "Minimum"}],
                        ["...", {"region": aws_region}],
                        ["...", {"region": aws_region, "stat": "Maximum"}]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "yAxis": {
                        "left": {
                            "min": 0,
                            "max": 100
                        }
                    },
                    "region": aws_region,
                    "title": "CPUUtilization",
                    "period": period,
                    "stat": "Average"
                }
            },
            # Disk utilization.
            {
                "height": 5,
                "width": 4,
                "y": 0,
                "x": 20,
                "type": "metric",
                "properties": {
                    "metrics": [
                        ["/aws/sagemaker/Endpoints", "DiskUtilization", "EndpointName", ep_name, "VariantName", "prod", {"region": aws_region}]
                    ],
                    "view": "singleValue",
                    "stacked": True,
                    "region": aws_region,
                    "title": "DiskUtilization",
                    "period": period,
                    "stat": "Maximum"
                }
            },
            # Logs Insights query surfacing recent error lines from the
            # endpoint's CloudWatch log group.
            {
                "type": "log",
                "x": 0,
                "y": 20,
                "width": 24,
                "height": 8,
                "properties": {
                    "query": f"SOURCE '/aws/sagemaker/Endpoints/{ep_name}' "
                             f"| fields @timestamp, @logStream, @message\r\n"
                             f"| filter @message like /error/\r\n"
                             f"| sort @timestamp desc\r\n| limit 500",
                    "region": aws_region,
                    "stacked": False,
                    "view": "table",
                    "title": "Endpoint Error Log"
                }
            }
        ]
    }

    # Append the dynamically discovered per-instance / per-GPU widgets.
    gpus_ds = resolve_gpu_ds(ep, custom_metrics)
    for gpu_ds in gpus_ds:
        dashboard_body['widgets'].append(gpu_ds)

    return json.dumps(dashboard_body)
|
|
|
|
|
|
def resolve_gpu_nums(ep: Endpoint):
    """Return the GPU count of the endpoint's instance type.

    Unknown instance types fall back to a single GPU so gauge axes still
    render sensibly.
    """
    gpu_counts = {
        "ml.p4d.24xlarge": 8,
        "ml.g4dn.12xlarge": 4,
        "ml.g5.12xlarge": 4,
        "ml.g5.24xlarge": 4,
        "ml.g5.48xlarge": 8,
    }
    return gpu_counts.get(ep.instance_type, 1)
|
|
|
|
|
|
def resolve_gpu_ds(ep: Endpoint, custom_metrics):
    """Build per-instance / per-GPU dashboard widgets for *ep*.

    Scans *custom_metrics* for 'GPUUtilization' series carrying a
    three-dimension (Endpoint, Instance, InstanceGPU) key for this
    endpoint, then emits four widgets per GPU — InferenceTotal (Sum),
    GPUUtilization (Average and Maximum) and GPUMemoryUtilization
    (Maximum) — grouped under a per-instance header plus disk summary.

    :param ep: endpoint record; only endpoint_name is read.
    :param custom_metrics: output of cloudwatch.list_metrics(...)['Metrics'].
    :return: list of widget dicts (possibly empty).
    """
    ep_name = ep.endpoint_name

    widgets = []  # was named `list`, which shadowed the builtin
    ids = []

    cur_instance_id = None
    for metric in custom_metrics:
        if metric['MetricName'] != 'GPUUtilization':
            continue
        if len(metric['Dimensions']) != 3:
            continue
        for dm in metric['Dimensions']:
            if dm['Name'] == 'Endpoint' and dm['Value'] == ep_name:
                # NOTE(review): assumes dimension order is
                # [Endpoint, Instance, InstanceGPU] — confirm against the
                # metric publisher.
                instance_id = metric['Dimensions'][1]['Value']
                gpu_id = metric['Dimensions'][2]['Value']

                # One widget descriptor per (stat, metric) pair, in the
                # exact display order used below.
                for stat, metric_name in (("Sum", "InferenceTotal"),
                                          ("Average", "GPUUtilization"),
                                          ("Maximum", "GPUUtilization"),
                                          ("Maximum", "GPUMemoryUtilization")):
                    ids.append({
                        "instance_id": instance_id,
                        "gpu_id": gpu_id,
                        "view": "singleValue",
                        "stat": stat,
                        "metric": metric_name})

    def custom_sort(obj):
        # Highest-numbered instance first; GPUs in lexical id order.
        return (-int(obj['instance_id']), obj['gpu_id'])

    ids = sorted(ids, key=custom_sort)

    # Layout cursor: per-GPU widgets are 6 units wide, 4 per row.
    x = 0
    y = 30

    colors = ["#9467bd", "#ff7f0e", "#2ca02c", "#8c564b", "#e377c2", "#7f7f7f", "#1f77b4"]
    color_index = 0

    for item in ids:
        # Emit the instance header block the first time each instance appears.
        if cur_instance_id != item['instance_id']:
            cur_instance_id = item['instance_id']
            # Transparent spacer line separating instances.
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y,
                "width": 24,
                "height": 1,
                "properties": {
                    "background": "transparent",
                    "markdown": f""
                }
            })
            # Instance title plus a legend of the per-GPU metrics.
            widgets.append({
                "type": "text",
                "x": 0,
                "y": y + 1,
                "width": 24,
                "height": 3,
                "properties": {
                    "markdown": f"# Endpoint Instance - {item['instance_id']} \n"
                                f"- InferenceTotal: Inference Job Count (Comfy Only)\n"
                                f"- GPUUtilization: The percentage of GPU units that are used on a GPU.\n"
                                f"- GPUMemoryUtilization: The percentage of GPU memory that are used on a GPU."
                }
            })
            # Per-instance disk summary.
            widgets.append({
                "type": "metric",
                "x": 0,
                "y": y + 2,
                "width": 24,
                "height": 4,
                "properties": {
                    "metrics": [
                        ["ESD", "DiskTotal", "Endpoint", ep_name, "Instance", item['instance_id'], ],
                        [".", "DiskUsed", ".", ".", ".", "."],
                        [".", "DiskFree", ".", ".", ".", "."],
                        [".", "DiskPercentage", ".", ".", ".", "."]
                    ],
                    "sparkline": True,
                    "view": "singleValue",
                    "region": aws_region,
                    "stat": "Maximum",
                    "period": period,
                    "title": "Disk"
                }
            })
            y = y + 3

        # One single-value widget for this (GPU, metric, stat) combination.
        widgets.append({
            "height": 4,
            "width": 6,
            "y": y,
            "x": x,
            "type": "metric",
            "properties": {
                "metrics": [
                    ["ESD", item['metric'], "Endpoint", ep_name, "Instance", item['instance_id'], "InstanceGPU", item['gpu_id'],
                     {"region": aws_region, "label": f"{item['metric']} - {item['stat']}", "color": colors[color_index]}]
                ],
                "sparkline": True,
                "view": item['view'],
                "yAxis": {
                    "left": {
                        "min": 1,
                        "max": 100
                    }
                },
                "stacked": True,
                "region": aws_region,
                "stat": item['stat'],
                "period": period,
                "title": f"{item['gpu_id']}",
            }
        })

        x = x + 6
        if x >= 24:
            # Row full: advance the color (wrapping) and start a new row.
            color_index = color_index + 1
            if color_index >= len(colors):
                color_index = 0
            x = 0
            y = y + 1

    logger.info("Metrics List:")
    logger.info(json.dumps(widgets))

    return widgets
|
|
|
|
|
|
def get_dashboard(dashboard_name):
    """Fetch an existing dashboard's body, or None when it does not exist."""
    try:
        return cloudwatch.get_dashboard(DashboardName=dashboard_name)['DashboardBody']
    except cloudwatch.exceptions.ResourceNotFound:
        return None
|
|
|
|
|
|
def create_ds(ep: Endpoint):
    """Create or update the CloudWatch dashboard for a single endpoint.

    The dashboard is named after the endpoint and fully rebuilt from the
    endpoint's custom ESD metrics on every call.
    """
    ep_name = ep.endpoint_name
    dimensions = [
        {
            'Name': 'Endpoint',
            'Value': ep_name
        }
    ]

    # Collect every custom metric series published for this endpoint.
    # (Replaces three copy-pasted list_metrics calls; order preserved.)
    custom_metrics = []
    for metric_name in ('GPUMemoryUtilization', 'GPUUtilization', 'InferenceTotal'):
        custom_metrics += cloudwatch.list_metrics(
            Namespace='ESD', MetricName=metric_name, Dimensions=dimensions)['Metrics']

    logger.info("Custom Metrics: ")
    logger.info(json.dumps(custom_metrics))

    # put_dashboard both creates and updates; the lookup exists only so we
    # can log which of the two happened.
    existing_dashboard = get_dashboard(ep_name)

    cloudwatch.put_dashboard(DashboardName=ep_name, DashboardBody=ds_body(ep, custom_metrics))

    if existing_dashboard:
        logger.info(f"Dashboard '{ep_name}' updated successfully.")
    else:
        logger.info(f"Dashboard '{ep_name}' created successfully.")
|