diff --git a/.gitignore b/.gitignore index aefea1f1..7ada5763 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ test/**/.env .DS_Store /container/ /.env +supervisord.* diff --git a/build_scripts/comfy/comfy_proxy.py b/build_scripts/comfy/comfy_proxy.py index 0c7b4e49..37cf49c5 100755 --- a/build_scripts/comfy/comfy_proxy.py +++ b/build_scripts/comfy/comfy_proxy.py @@ -851,9 +851,14 @@ if is_on_ec2: try: json_data = await request.json() if 'name' not in json_data or not json_data['name']: - raise ValueError("name is required") + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": False, "message": f"name is required"})) workflow_name = json_data['name'] + if workflow_name == 'default': + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": False, "message": f"{workflow_name} is not allowed"})) + payload_json = '' if 'payload_json' in json_data: @@ -865,20 +870,23 @@ if is_on_ec2: start_time = time.time() - s5cmd_syn_model_command = (f's5cmd sync ' + s5cmd_sync_command = (f's5cmd sync ' f'--delete=true ' f'--exclude="*comfy.tar" ' f'--exclude="*.log" ' f'--exclude="*__pycache__*" ' - f'--exclude="*.cache*" ' f'--exclude="*/ComfyUI/input/*" ' f'--exclude="*/ComfyUI/output/*" ' f'"/home/ubuntu/*" ' f'"s3://{bucket_name}/comfy/workflows/{workflow_name}/"') - logger.info(f"sync workflows files start {s5cmd_syn_model_command}") - os.system(s5cmd_syn_model_command) - os.system(f'echo "lock" > lock && s5cmd sync lock s3://{bucket_name}/comfy/workflows/{workflow_name}/lock') - action_unlock('release') + + s5cmd_lock_command = (f'echo "lock" > lock && ' + f's5cmd sync lock s3://{bucket_name}/comfy/workflows/{workflow_name}/lock') + + logger.info(f"sync workflows files start {s5cmd_sync_command}") + + subprocess.check_output(s5cmd_sync_command, shell=True) + subprocess.check_output(s5cmd_lock_command, shell=True) end_time = time.time() cost_time = end_time - start_time @@ -891,12 +899,54 @@ if is_on_ec2: response = get_response.json() logger.info(f"release workflow response is {response}") + action_unlock('release') + return web.Response(status=200, content_type='application/json', body=json.dumps({"result": True, "message": "success", "cost_time": cost_time})) except Exception as e: action_unlock('release') + logger.info(e) return web.Response(status=500, content_type='application/json', - body=json.dumps({"result": False, "message": e})) + body=json.dumps({"result": False, "message": 'Release workflow failed'})) + + + @server.PromptServer.instance.routes.put("/workflows") + async def switch_workflow(request): + if is_action_lock('release'): + return web.Response(status=200, content_type='application/json', + body=json.dumps( + {"result": False, "message": "release is not allowed during release workflow"})) + + try: + json_data = await request.json() + if 'name' not in json_data or not json_data['name']: + raise ValueError("name is required") + + workflow_name = json_data['name'] + + if workflow_name == os.getenv('WORKFLOW_NAME'): + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": False, "message": "workflow is already in use"})) + + if workflow_name == 'default' and not is_master_process: + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": False, "message": "slave can not use default workflow"})) + + if workflow_name != 'default': + if not check_file_exists(f"comfy/workflows/{workflow_name}/lock"): + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": False, "message": f"{workflow_name} not exists"})) + + name_file = os.getenv('WORKFLOW_NAME_FILE') + subprocess.check_output(f"echo {workflow_name} > {name_file}", shell=True) + subprocess.run(["pkill", "-f", "python3"]) + + return web.Response(status=200, content_type='application/json', + body=json.dumps({"result": True, "message": "Please wait to restart"})) + except Exception as e: + logger.info(e) + return web.Response(status=500, content_type='application/json', + body=json.dumps({"result": False, "message": 'Switch workflow failed'})) def check_file_exists(key): diff --git a/build_scripts/inference/serve b/build_scripts/inference/serve index 7b5033ba..a2d41950 100755 Binary files a/build_scripts/inference/serve and b/build_scripts/inference/serve differ diff --git a/build_scripts/inference/start.sh b/build_scripts/inference/start.sh index c4c55a78..5a7fa684 100644 --- a/build_scripts/inference/start.sh +++ b/build_scripts/inference/start.sh @@ -320,65 +320,66 @@ comfy_launch_from_public_s3(){ comfy_launch } -# -------------------- startup -------------------- - -ec2_start_process(){ +if [ -n "$ON_EC2" ]; then set -euxo pipefail - echo "---------------------------------------------------------------------------------" - export LD_LIBRARY_PATH=$LD_PRELOAD - set_conda - pip install supervisor - chown -R root:root "/home/ubuntu/ComfyUI" - chmod -R +x venv + WORKFLOW_NAME=$(cat "$WORKFLOW_NAME_FILE") + export WORKFLOW_DIR="/container/workflows/$WORKFLOW_NAME" - SUPERVISOR_CONF="[supervisord] -nodaemon=true -directory=/home/ubuntu/ComfyUI -autostart=true -autorestart=true + if [ ! -d "$WORKFLOW_DIR/ComfyUI/venv" ]; then + mkdir -p "$WORKFLOW_DIR" -[inet_http_server] -port = 127.0.0.1:9001 - -[rpcinterface:supervisor] -supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface - -[supervisorctl] -logfile=/dev/stdout - -" - - echo "$SUPERVISOR_CONF" > /etc/supervisord.conf - - init_port=8187 - for i in $(seq 1 "$PROCESS_NUMBER"); do - init_port=$((init_port + 1)) - - MASTER_PROCESS=false - if [ "$init_port" -eq "8188" ]; then - MASTER_PROCESS=true + if [ "$WORKFLOW_NAME" = "default" ]; then + if [ ! -f "/container/$WORKFLOW_NAME.tar" ]; then + start_at=$(date +%s) + s5cmd cp "s3://aws-gcr-solutions-$AWS_REGION/stable-diffusion-aws-extension-github-mainline/$ESD_VERSION/$SERVICE_TYPE.tar" "/container/$WORKFLOW_NAME.tar" + end_at=$(date +%s) + export DOWNLOAD_FILE_SECONDS=$((end_at-start_at)) fi - PROGRAM_NAME="comfy_$init_port" + start_at=$(date +%s) + tar --overwrite -xf "/container/$WORKFLOW_NAME.tar" -C "$WORKFLOW_DIR" + end_at=$(date +%s) + export DECOMPRESS_SECONDS=$((end_at-start_at)) - # shellcheck disable=SC2129 - echo "[program:$PROGRAM_NAME]" >> /etc/supervisord.conf - echo "command=/home/ubuntu/ComfyUI/venv/bin/python3 main.py --listen 0.0.0.0 --port $init_port --cuda-malloc --output-directory /home/ubuntu/ComfyUI/output/$init_port --temp-directory /home/ubuntu/ComfyUI/temp/$init_port" >> /etc/supervisord.conf - echo "startretries=3" >> /etc/supervisord.conf - echo "stdout_logfile=/home/ubuntu/ComfyUI/$PROGRAM_NAME.log" >> /etc/supervisord.conf - echo "stderr_logfile=/home/ubuntu/ComfyUI/$PROGRAM_NAME.log" >> /etc/supervisord.conf - echo "environment=MASTER_PROCESS=$MASTER_PROCESS,PROGRAM_NAME=$PROGRAM_NAME" >> /etc/supervisord.conf - echo "" >> /etc/supervisord.conf - done + cd "$WORKFLOW_DIR/ComfyUI" + mkdir -p models/vae/ + wget --quiet -O models/vae/vae-ft-mse-840000-ema-pruned.safetensors "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors" + mkdir -p models/checkpoints/ + wget --quiet -O models/checkpoints/majicmixRealistic_v7.safetensors "https://huggingface.co/GreenGrape/231209/resolve/045ebfc504c47ba8ccc424f1869c65a223d1f5cc/majicmixRealistic_v7.safetensors" + mkdir -p models/animatediff_models/ + wget --quiet -O models/animatediff_models/mm_sd_v15_v2.ckpt "https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt" + wget --quiet -O models/checkpoints/v1-5-pruned-emaonly.ckpt "https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt" + else + start_at=$(date +%s) + s5cmd --log=error sync "s3://$COMFY_BUCKET_NAME/comfy/workflows/$WORKFLOW_NAME/*" "$WORKFLOW_DIR/" + end_at=$(date +%s) + export DOWNLOAD_FILE_SECONDS=$((end_at-start_at)) + echo "download file: $DOWNLOAD_FILE_SECONDS seconds" + cd "$WORKFLOW_DIR/ComfyUI" + fi + fi - echo "---------------------------------------------------------------------------------" - cat /etc/supervisord.conf - echo "---------------------------------------------------------------------------------" + rm -rf /home/ubuntu/ComfyUI - supervisord -c /etc/supervisord.conf | grep -v 'uncaptured python exception' + ln -s "$WORKFLOW_DIR/ComfyUI" /home/ubuntu/ComfyUI + + cd /home/ubuntu/ComfyUI || exit 1 + + # if /comfy_proxy.py exists + if [ -f "/comfy_proxy.py" ]; then + cp /comfy_proxy.py /home/ubuntu/ComfyUI/custom_nodes/ + fi + + rm -rf web/extensions/ComfyLiterals + chmod -R +x venv + source venv/bin/activate + + chmod -R 777 /home/ubuntu/ComfyUI + + venv/bin/python3 main.py --listen 0.0.0.0 --port 8188 --cuda-malloc exit 1 -} +fi if [ -n "$WORKFLOW_NAME" ]; then cd /home/ubuntu || exit 1 @@ -410,114 +411,12 @@ if [ -n "$WORKFLOW_NAME" ]; then chmod -R +x venv source venv/bin/activate - # on EC2 - if [ -n "$ON_EC2" ]; then - ec2_start_process - exit 1 - fi - # on SageMaker python /metrics.py & python3 serve.py exit 1 fi -if [ -n "$ON_EC2" ]; then - set -euxo pipefail - - if [ "$SERVICE_TYPE" == "sd" ]; then - cd /home/ubuntu || exit 1 - - if [ -d "/home/ubuntu/stable-diffusion-webui/venv" ]; then - cd /home/ubuntu/stable-diffusion-webui || exit 1 - chmod -R +x venv - source venv/bin/activate - chmod -R 777 /home/ubuntu - python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model - exit 1 - fi - - echo "downloading comfy file $CACHE_PUBLIC_SD ..." - start_at=$(date +%s) - s5cmd cp "s3://$CACHE_PUBLIC_SD" /home/ubuntu/ - end_at=$(date +%s) - export DOWNLOAD_FILE_SECONDS=$((end_at-start_at)) - echo "download file: $DOWNLOAD_FILE_SECONDS seconds" - - echo "decompressing sd file..." - start_at=$(date +%s) - tar --overwrite -xf "$SERVICE_TYPE.tar" -C /home/ubuntu/ - end_at=$(date +%s) - export DECOMPRESS_SECONDS=$((end_at-start_at)) - echo "decompress file: $DECOMPRESS_SECONDS seconds" - - cd /home/ubuntu/stable-diffusion-webui/extensions || exit 1 - git clone https://github.com/zixaphir/Stable-Diffusion-Webui-Civitai-Helper.git - cd ../ - - export AWS_REGION=us-east-1 - wget https://raw.githubusercontent.com/awslabs/stable-diffusion-aws-extension/dev/workshop/sd_models.txt - s5cmd run sd_models.txt - - chmod -R +x venv - source venv/bin/activate - - chmod -R 777 /home/ubuntu/stable-diffusion-webui - python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model - else - cd /home/ubuntu || exit 1 - - if [ -d "/home/ubuntu/ComfyUI/venv" ]; then - cd /home/ubuntu/ComfyUI || exit 1 - rm -rf web/extensions/ComfyLiterals - chmod -R +x venv - source venv/bin/activate - ec2_start_process - exit 1 - fi - - echo "downloading comfy file $CACHE_PUBLIC_COMFY ..." - start_at=$(date +%s) - s5cmd cp "s3://$CACHE_PUBLIC_COMFY" /home/ubuntu/ - end_at=$(date +%s) - export DOWNLOAD_FILE_SECONDS=$((end_at-start_at)) - echo "download file: $DOWNLOAD_FILE_SECONDS seconds" - - echo "decompressing comfy file..." - start_at=$(date +%s) - tar --overwrite -xf "$SERVICE_TYPE.tar" -C /home/ubuntu/ - end_at=$(date +%s) - export DECOMPRESS_SECONDS=$((end_at-start_at)) - echo "decompress file: $DECOMPRESS_SECONDS seconds" - - cd /home/ubuntu/ComfyUI || exit 1 - rm -rf web/extensions/ComfyLiterals - chmod -R +x venv - source venv/bin/activate - - pip install dynamicprompts - pip install ultralytics - - mkdir -p models/vae/ - wget --quiet -O models/vae/vae-ft-mse-840000-ema-pruned.safetensors "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors" - - mkdir -p models/checkpoints/ - wget --quiet -O models/checkpoints/majicmixRealistic_v7.safetensors "https://huggingface.co/GreenGrape/231209/resolve/045ebfc504c47ba8ccc424f1869c65a223d1f5cc/majicmixRealistic_v7.safetensors" - - mkdir -p models/animatediff_models/ - wget --quiet -O models/animatediff_models/mm_sd_v15_v2.ckpt "https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt" - - wget --quiet -O models/checkpoints/v1-5-pruned-emaonly.ckpt "https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt" - - chmod -R 777 /home/ubuntu/ComfyUI - - ec2_start_process - fi - - exit 1 -fi - - if [ -f "/initiated_lock" ]; then echo "already initiated, start service directly..." if [ "$SERVICE_TYPE" == "sd" ]; then diff --git a/docker_start.sh b/docker_start.sh index fe858c7b..45df7174 100755 --- a/docker_start.sh +++ b/docker_start.sh @@ -6,13 +6,12 @@ if [ -f "/etc/environment" ]; then source /etc/environment fi -SERVICE_TYPE="comfy" - +export SERVICE_TYPE="comfy" export CONTAINER_NAME="esd_container" export ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text) export AWS_REGION=$(aws configure get region) -image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:latest" +export image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:latest" docker stop "$CONTAINER_NAME" || true docker rm "$CONTAINER_NAME" || true @@ -36,7 +35,7 @@ docker build -f Dockerfile \ image_hash=$(docker inspect "$image" | jq -r ".[0].Id") image_hash=${image_hash:7} -release_image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:$image_hash" +export release_image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:$image_hash" docker tag "$image" "$release_image" aws ecr get-login-password --region "$AWS_REGION" | docker login --username AWS --password-stdin "$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com" @@ -58,29 +57,112 @@ fi total_memory=$(cat /proc/meminfo | grep 'MemTotal' | awk '{print $2}') total_memory_mb=$((total_memory / 1024)) echo "total_memory_mb: $total_memory_mb" -limit_memory_mb=$((total_memory_mb - 2048)) +export limit_memory_mb=$((total_memory_mb - 2048)) echo "limit_memory_mb: $limit_memory_mb" -# -v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \ -docker run -v ~/.aws:/root/.aws \ - -v "$local_volume":/home/ubuntu \ - -v ./build_scripts/inference/start.sh:/start.sh \ - -v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \ - --gpus all \ - -e "IMAGE_HASH=$release_image" \ - -e "ESD_VERSION=$ESD_VERSION" \ - -e "SERVICE_TYPE=$SERVICE_TYPE" \ - -e "ON_EC2=true" \ - -e "S3_BUCKET_NAME=$COMFY_BUCKET_NAME" \ - -e "AWS_REGION=$AWS_REGION" \ - -e "AWS_DEFAULT_REGION=$AWS_REGION" \ - -e "COMFY_API_URL=$COMFY_API_URL" \ - -e "COMFY_API_TOKEN=$COMFY_API_TOKEN" \ - -e "COMFY_ENDPOINT=$COMFY_ENDPOINT" \ - -e "COMFY_BUCKET_NAME=$COMFY_BUCKET_NAME" \ - -e "PROCESS_NUMBER=$PROCESS_NUMBER" \ - -e "WORKFLOW_NAME=$WORKFLOW_NAME" \ - --name "$CONTAINER_NAME" \ - -p 8188-8288:8188-8288 \ - --memory "${limit_memory_mb}mb" \ - "$image" +generate_process(){ + init_port=$1 + export PROGRAM_NAME="comfy_$init_port" + comfy_workflow_file="./container/$PROGRAM_NAME" + + WORKFLOW_NAME_TMP="" + + if [ -f "$comfy_workflow_file" ]; then + WORKFLOW_NAME_TMP=$(cat "$comfy_workflow_file") + fi + + if [ -z "$WORKFLOW_NAME_TMP" ]; then + WORKFLOW_NAME_TMP="$WORKFLOW_NAME" + fi + + if [ -z "$WORKFLOW_NAME_TMP" ]; then + WORKFLOW_NAME_TMP="default" + fi + + echo "$WORKFLOW_NAME_TMP" > "$comfy_workflow_file" + + export MASTER_PROCESS=false + if [ "$init_port" -eq "8188" ]; then + export MASTER_PROCESS=true + fi + + CONTAINER_PATH=$(realpath ./container) + START_SH=$(realpath ./build_scripts/inference/start.sh) + COMFY_PROXY=$(realpath ./build_scripts/comfy/comfy_proxy.py) + AWS_PATH=$(realpath ~/.aws) + START_HANDLER="#!/bin/bash +set -euxo pipefail +docker stop $PROGRAM_NAME || true +docker rm $PROGRAM_NAME || true +docker run -v $AWS_PATH:/root/.aws \\ + -v $CONTAINER_PATH:/container \\ + -v $START_SH:/start.sh \\ + -v $COMFY_PROXY:/comfy_proxy.py \\ + --gpus all \\ + -e IMAGE_HASH=$release_image \\ + -e SERVICE_TYPE=$SERVICE_TYPE \\ + -e ON_EC2=true \\ + -e S3_BUCKET_NAME=$COMFY_BUCKET_NAME \\ + -e AWS_REGION=$AWS_REGION \\ + -e AWS_DEFAULT_REGION=$AWS_REGION \\ + -e COMFY_API_URL=$COMFY_API_URL \\ + -e COMFY_API_TOKEN=$COMFY_API_TOKEN \\ + -e ESD_VERSION=$ESD_VERSION \\ + -e COMFY_ENDPOINT=$COMFY_ENDPOINT \\ + -e COMFY_BUCKET_NAME=$COMFY_BUCKET_NAME \\ + -e MASTER_PROCESS=$MASTER_PROCESS \\ + -e PROGRAM_NAME=$PROGRAM_NAME \\ + -e WORKFLOW_NAME_FILE=/container/$PROGRAM_NAME \\ + --name $PROGRAM_NAME \\ + -p $init_port:8188 \\ + --memory ${limit_memory_mb}mb \\ + $image +" + + echo "$START_HANDLER" > "./container/$PROGRAM_NAME.sh" + chmod +x "./container/$PROGRAM_NAME.sh" + + # shellcheck disable=SC2129 + echo "[program:$PROGRAM_NAME]" >> /tmp/supervisord.conf + echo "command=./container/$PROGRAM_NAME.sh" >> /tmp/supervisord.conf + echo "startretries=1" >> /tmp/supervisord.conf + echo "stdout_logfile=/dev/stdout" >> /tmp/supervisord.conf + echo "stderr_logfile=/dev/stderr" >> /tmp/supervisord.conf + echo "" >> /tmp/supervisord.conf +} + + +echo "---------------------------------------------------------------------------------" + +SUPERVISOR_CONF="[supervisord] +nodaemon=true +autostart=true +autorestart=true + +[inet_http_server] +port = 127.0.0.1:9001 + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +logfile=/dev/stdout +" + +echo "$SUPERVISOR_CONF" > /tmp/supervisord.conf + +init_port=8187 + +for i in $(seq 1 "$PROCESS_NUMBER"); do + init_port=$((init_port + 1)) + generate_process $init_port +done + +echo "---------------------------------------------------------------------------------" +cat /tmp/supervisord.conf +echo "---------------------------------------------------------------------------------" + +supervisorctl -c /tmp/supervisord.conf shutdown || true +sudo systemctl restart supervisor.service +supervisord -c /tmp/supervisord.conf | grep -v 'uncaptured python exception' +exit 1