improved workflow lock

pull/800/head
Jingyi 2024-05-24 17:40:29 +08:00
parent bd40efaf19
commit 0629d964c3
5 changed files with 219 additions and 187 deletions

1
.gitignore vendored
View File

@ -50,3 +50,4 @@ test/**/.env
.DS_Store
/container/
/.env
supervisord.*

View File

@ -851,9 +851,14 @@ if is_on_ec2:
try:
json_data = await request.json()
if 'name' not in json_data or not json_data['name']:
raise ValueError("name is required")
return web.Response(status=200, content_type='application/json',
body=json.dumps({"result": False, "message": f"name is required"}))
workflow_name = json_data['name']
if workflow_name == 'default':
return web.Response(status=200, content_type='application/json',
body=json.dumps({"result": False, "message": f"{workflow_name} is not allowed"}))
payload_json = ''
if 'payload_json' in json_data:
@ -865,20 +870,23 @@ if is_on_ec2:
start_time = time.time()
s5cmd_syn_model_command = (f's5cmd sync '
s5cmd_sync_command = (f's5cmd sync '
f'--delete=true '
f'--exclude="*comfy.tar" '
f'--exclude="*.log" '
f'--exclude="*__pycache__*" '
f'--exclude="*.cache*" '
f'--exclude="*/ComfyUI/input/*" '
f'--exclude="*/ComfyUI/output/*" '
f'"/home/ubuntu/*" '
f'"s3://{bucket_name}/comfy/workflows/{workflow_name}/"')
logger.info(f"sync workflows files start {s5cmd_syn_model_command}")
os.system(s5cmd_syn_model_command)
os.system(f'echo "lock" > lock && s5cmd sync lock s3://{bucket_name}/comfy/workflows/{workflow_name}/lock')
action_unlock('release')
s5cmd_lock_command = (f'echo "lock" > lock && '
f's5cmd sync lock s3://{bucket_name}/comfy/workflows/{workflow_name}/lock')
logger.info(f"sync workflows files start {s5cmd_sync_command}")
subprocess.check_output(s5cmd_sync_command, shell=True)
subprocess.check_output(s5cmd_lock_command, shell=True)
end_time = time.time()
cost_time = end_time - start_time
@ -891,12 +899,54 @@ if is_on_ec2:
response = get_response.json()
logger.info(f"release workflow response is {response}")
action_unlock('release')
return web.Response(status=200, content_type='application/json',
body=json.dumps({"result": True, "message": "success", "cost_time": cost_time}))
except Exception as e:
action_unlock('release')
logger.info(e)
return web.Response(status=500, content_type='application/json',
body=json.dumps({"result": False, "message": e}))
body=json.dumps({"result": False, "message": 'Release workflow failed'}))
@server.PromptServer.instance.routes.put("/workflows")
async def switch_workflow(request):
    """Switch this host to another released workflow.

    Persists the requested workflow name into WORKFLOW_NAME_FILE and kills
    the python3 processes so the supervisor restarts them with the new
    workflow. Always answers JSON: status 200 with result true/false for
    expected outcomes, status 500 on unexpected errors.
    """
    # Switching is forbidden while a release is in progress on this host.
    if is_action_lock('release'):
        return web.Response(status=200, content_type='application/json',
                            body=json.dumps(
                                {"result": False, "message": "release is not allowed during release workflow"}))
    try:
        json_data = await request.json()
        if 'name' not in json_data or not json_data['name']:
            raise ValueError("name is required")
        workflow_name = json_data['name']
        if workflow_name == os.getenv('WORKFLOW_NAME'):
            return web.Response(status=200, content_type='application/json',
                                body=json.dumps({"result": False, "message": "workflow is already in use"}))
        if workflow_name == 'default' and not is_master_process:
            return web.Response(status=200, content_type='application/json',
                                body=json.dumps({"result": False, "message": "slave can not use default workflow"}))
        if workflow_name != 'default':
            # A successfully released workflow is marked by a "lock" object in S3.
            if not check_file_exists(f"comfy/workflows/{workflow_name}/lock"):
                return web.Response(status=200, content_type='application/json',
                                    body=json.dumps({"result": False, "message": f"{workflow_name} not exists"}))
        name_file = os.getenv('WORKFLOW_NAME_FILE')
        # Write the name directly instead of `echo {name} > file` through a
        # shell: workflow_name comes from the request body, so interpolating
        # it into a shell=True command string is a shell-injection hole.
        with open(name_file, 'w') as f:
            f.write(f"{workflow_name}\n")
        # Kill the comfy processes; the supervisor restarts them and they
        # pick up the new name from WORKFLOW_NAME_FILE.
        subprocess.run(["pkill", "-f", "python3"])
        return web.Response(status=200, content_type='application/json',
                            body=json.dumps({"result": True, "message": "Please wait to restart"}))
    except Exception as e:
        logger.info(e)
        return web.Response(status=500, content_type='application/json',
                            body=json.dumps({"result": False, "message": 'Switch workflow failed'}))
def check_file_exists(key):

Binary file not shown.

View File

@ -320,65 +320,66 @@ comfy_launch_from_public_s3(){
comfy_launch
}
# -------------------- startup --------------------
ec2_start_process(){
if [ -n "$ON_EC2" ]; then
set -euxo pipefail
echo "---------------------------------------------------------------------------------"
export LD_LIBRARY_PATH=$LD_PRELOAD
set_conda
pip install supervisor
chown -R root:root "/home/ubuntu/ComfyUI"
chmod -R +x venv
WORKFLOW_NAME=$(cat "$WORKFLOW_NAME_FILE")
export WORKFLOW_DIR="/container/workflows/$WORKFLOW_NAME"
SUPERVISOR_CONF="[supervisord]
nodaemon=true
directory=/home/ubuntu/ComfyUI
autostart=true
autorestart=true
if [ ! -d "$WORKFLOW_DIR/ComfyUI/venv" ]; then
mkdir -p "$WORKFLOW_DIR"
[inet_http_server]
port = 127.0.0.1:9001
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
logfile=/dev/stdout
"
echo "$SUPERVISOR_CONF" > /etc/supervisord.conf
init_port=8187
for i in $(seq 1 "$PROCESS_NUMBER"); do
init_port=$((init_port + 1))
MASTER_PROCESS=false
if [ "$init_port" -eq "8188" ]; then
MASTER_PROCESS=true
if [ "$WORKFLOW_NAME" = "default" ]; then
if [ ! -f "/container/$WORKFLOW_NAME.tar" ]; then
start_at=$(date +%s)
s5cmd cp "s3://aws-gcr-solutions-$AWS_REGION/stable-diffusion-aws-extension-github-mainline/$ESD_VERSION/$SERVICE_TYPE.tar" "/container/$WORKFLOW_NAME.tar"
end_at=$(date +%s)
export DOWNLOAD_FILE_SECONDS=$((end_at-start_at))
fi
PROGRAM_NAME="comfy_$init_port"
start_at=$(date +%s)
tar --overwrite -xf "/container/$WORKFLOW_NAME.tar" -C "$WORKFLOW_DIR"
end_at=$(date +%s)
export DECOMPRESS_SECONDS=$((end_at-start_at))
# shellcheck disable=SC2129
echo "[program:$PROGRAM_NAME]" >> /etc/supervisord.conf
echo "command=/home/ubuntu/ComfyUI/venv/bin/python3 main.py --listen 0.0.0.0 --port $init_port --cuda-malloc --output-directory /home/ubuntu/ComfyUI/output/$init_port --temp-directory /home/ubuntu/ComfyUI/temp/$init_port" >> /etc/supervisord.conf
echo "startretries=3" >> /etc/supervisord.conf
echo "stdout_logfile=/home/ubuntu/ComfyUI/$PROGRAM_NAME.log" >> /etc/supervisord.conf
echo "stderr_logfile=/home/ubuntu/ComfyUI/$PROGRAM_NAME.log" >> /etc/supervisord.conf
echo "environment=MASTER_PROCESS=$MASTER_PROCESS,PROGRAM_NAME=$PROGRAM_NAME" >> /etc/supervisord.conf
echo "" >> /etc/supervisord.conf
done
cd "$WORKFLOW_DIR/ComfyUI"
mkdir -p models/vae/
wget --quiet -O models/vae/vae-ft-mse-840000-ema-pruned.safetensors "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors"
mkdir -p models/checkpoints/
wget --quiet -O models/checkpoints/majicmixRealistic_v7.safetensors "https://huggingface.co/GreenGrape/231209/resolve/045ebfc504c47ba8ccc424f1869c65a223d1f5cc/majicmixRealistic_v7.safetensors"
mkdir -p models/animatediff_models/
wget --quiet -O models/animatediff_models/mm_sd_v15_v2.ckpt "https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt"
wget --quiet -O models/checkpoints/v1-5-pruned-emaonly.ckpt "https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt"
else
start_at=$(date +%s)
s5cmd --log=error sync "s3://$COMFY_BUCKET_NAME/comfy/workflows/$WORKFLOW_NAME/*" "$WORKFLOW_DIR/"
end_at=$(date +%s)
export DOWNLOAD_FILE_SECONDS=$((end_at-start_at))
echo "download file: $DOWNLOAD_FILE_SECONDS seconds"
cd "$WORKFLOW_DIR/ComfyUI"
fi
fi
echo "---------------------------------------------------------------------------------"
cat /etc/supervisord.conf
echo "---------------------------------------------------------------------------------"
rm -rf /home/ubuntu/ComfyUI
supervisord -c /etc/supervisord.conf | grep -v 'uncaptured python exception'
ln -s "$WORKFLOW_DIR/ComfyUI" /home/ubuntu/ComfyUI
cd /home/ubuntu/ComfyUI || exit 1
# if /comfy_proxy.py exists
if [ -f "/comfy_proxy.py" ]; then
cp /comfy_proxy.py /home/ubuntu/ComfyUI/custom_nodes/
fi
rm -rf web/extensions/ComfyLiterals
chmod -R +x venv
source venv/bin/activate
chmod -R 777 /home/ubuntu/ComfyUI
venv/bin/python3 main.py --listen 0.0.0.0 --port 8188 --cuda-malloc
exit 1
}
fi
if [ -n "$WORKFLOW_NAME" ]; then
cd /home/ubuntu || exit 1
@ -410,114 +411,12 @@ if [ -n "$WORKFLOW_NAME" ]; then
chmod -R +x venv
source venv/bin/activate
# on EC2
if [ -n "$ON_EC2" ]; then
ec2_start_process
exit 1
fi
# on SageMaker
python /metrics.py &
python3 serve.py
exit 1
fi
if [ -n "$ON_EC2" ]; then
set -euxo pipefail
if [ "$SERVICE_TYPE" == "sd" ]; then
cd /home/ubuntu || exit 1
if [ -d "/home/ubuntu/stable-diffusion-webui/venv" ]; then
cd /home/ubuntu/stable-diffusion-webui || exit 1
chmod -R +x venv
source venv/bin/activate
chmod -R 777 /home/ubuntu
python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model
exit 1
fi
echo "downloading comfy file $CACHE_PUBLIC_SD ..."
start_at=$(date +%s)
s5cmd cp "s3://$CACHE_PUBLIC_SD" /home/ubuntu/
end_at=$(date +%s)
export DOWNLOAD_FILE_SECONDS=$((end_at-start_at))
echo "download file: $DOWNLOAD_FILE_SECONDS seconds"
echo "decompressing sd file..."
start_at=$(date +%s)
tar --overwrite -xf "$SERVICE_TYPE.tar" -C /home/ubuntu/
end_at=$(date +%s)
export DECOMPRESS_SECONDS=$((end_at-start_at))
echo "decompress file: $DECOMPRESS_SECONDS seconds"
cd /home/ubuntu/stable-diffusion-webui/extensions || exit 1
git clone https://github.com/zixaphir/Stable-Diffusion-Webui-Civitai-Helper.git
cd ../
export AWS_REGION=us-east-1
wget https://raw.githubusercontent.com/awslabs/stable-diffusion-aws-extension/dev/workshop/sd_models.txt
s5cmd run sd_models.txt
chmod -R +x venv
source venv/bin/activate
chmod -R 777 /home/ubuntu/stable-diffusion-webui
python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model
else
cd /home/ubuntu || exit 1
if [ -d "/home/ubuntu/ComfyUI/venv" ]; then
cd /home/ubuntu/ComfyUI || exit 1
rm -rf web/extensions/ComfyLiterals
chmod -R +x venv
source venv/bin/activate
ec2_start_process
exit 1
fi
echo "downloading comfy file $CACHE_PUBLIC_COMFY ..."
start_at=$(date +%s)
s5cmd cp "s3://$CACHE_PUBLIC_COMFY" /home/ubuntu/
end_at=$(date +%s)
export DOWNLOAD_FILE_SECONDS=$((end_at-start_at))
echo "download file: $DOWNLOAD_FILE_SECONDS seconds"
echo "decompressing comfy file..."
start_at=$(date +%s)
tar --overwrite -xf "$SERVICE_TYPE.tar" -C /home/ubuntu/
end_at=$(date +%s)
export DECOMPRESS_SECONDS=$((end_at-start_at))
echo "decompress file: $DECOMPRESS_SECONDS seconds"
cd /home/ubuntu/ComfyUI || exit 1
rm -rf web/extensions/ComfyLiterals
chmod -R +x venv
source venv/bin/activate
pip install dynamicprompts
pip install ultralytics
mkdir -p models/vae/
wget --quiet -O models/vae/vae-ft-mse-840000-ema-pruned.safetensors "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors"
mkdir -p models/checkpoints/
wget --quiet -O models/checkpoints/majicmixRealistic_v7.safetensors "https://huggingface.co/GreenGrape/231209/resolve/045ebfc504c47ba8ccc424f1869c65a223d1f5cc/majicmixRealistic_v7.safetensors"
mkdir -p models/animatediff_models/
wget --quiet -O models/animatediff_models/mm_sd_v15_v2.ckpt "https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt"
wget --quiet -O models/checkpoints/v1-5-pruned-emaonly.ckpt "https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt"
chmod -R 777 /home/ubuntu/ComfyUI
ec2_start_process
fi
exit 1
fi
if [ -f "/initiated_lock" ]; then
echo "already initiated, start service directly..."
if [ "$SERVICE_TYPE" == "sd" ]; then

View File

@ -6,13 +6,12 @@ if [ -f "/etc/environment" ]; then
source /etc/environment
fi
SERVICE_TYPE="comfy"
export SERVICE_TYPE="comfy"
export CONTAINER_NAME="esd_container"
export ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
export AWS_REGION=$(aws configure get region)
image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:latest"
export image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:latest"
docker stop "$CONTAINER_NAME" || true
docker rm "$CONTAINER_NAME" || true
@ -36,7 +35,7 @@ docker build -f Dockerfile \
image_hash=$(docker inspect "$image" | jq -r ".[0].Id")
image_hash=${image_hash:7}
release_image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:$image_hash"
export release_image="$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$CONTAINER_NAME:$image_hash"
docker tag "$image" "$release_image"
aws ecr get-login-password --region "$AWS_REGION" | docker login --username AWS --password-stdin "$ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
@ -58,29 +57,112 @@ fi
total_memory=$(cat /proc/meminfo | grep 'MemTotal' | awk '{print $2}')
total_memory_mb=$((total_memory / 1024))
echo "total_memory_mb: $total_memory_mb"
limit_memory_mb=$((total_memory_mb - 2048))
export limit_memory_mb=$((total_memory_mb - 2048))
echo "limit_memory_mb: $limit_memory_mb"
# -v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \
docker run -v ~/.aws:/root/.aws \
-v "$local_volume":/home/ubuntu \
-v ./build_scripts/inference/start.sh:/start.sh \
-v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \
--gpus all \
-e "IMAGE_HASH=$release_image" \
-e "ESD_VERSION=$ESD_VERSION" \
-e "SERVICE_TYPE=$SERVICE_TYPE" \
-e "ON_EC2=true" \
-e "S3_BUCKET_NAME=$COMFY_BUCKET_NAME" \
-e "AWS_REGION=$AWS_REGION" \
-e "AWS_DEFAULT_REGION=$AWS_REGION" \
-e "COMFY_API_URL=$COMFY_API_URL" \
-e "COMFY_API_TOKEN=$COMFY_API_TOKEN" \
-e "COMFY_ENDPOINT=$COMFY_ENDPOINT" \
-e "COMFY_BUCKET_NAME=$COMFY_BUCKET_NAME" \
-e "PROCESS_NUMBER=$PROCESS_NUMBER" \
-e "WORKFLOW_NAME=$WORKFLOW_NAME" \
--name "$CONTAINER_NAME" \
-p 8188-8288:8188-8288 \
--memory "${limit_memory_mb}mb" \
"$image"
generate_process(){
  # Emit one container start script plus its supervisord [program] section
  # for a comfy process bound to host port $1 (container port 8188).
  init_port=$1
  export PROGRAM_NAME="comfy_$init_port"

  # Per-process workflow name file: reuse a previously stored name, else
  # fall back to $WORKFLOW_NAME, else "default".
  comfy_workflow_file="./container/$PROGRAM_NAME"
  WORKFLOW_NAME_TMP=""
  if [ -f "$comfy_workflow_file" ]; then
    WORKFLOW_NAME_TMP=$(cat "$comfy_workflow_file")
  fi
  if [ -z "$WORKFLOW_NAME_TMP" ]; then
    WORKFLOW_NAME_TMP="$WORKFLOW_NAME"
  fi
  if [ -z "$WORKFLOW_NAME_TMP" ]; then
    WORKFLOW_NAME_TMP="default"
  fi
  echo "$WORKFLOW_NAME_TMP" > "$comfy_workflow_file"

  # The process on port 8188 is the master.
  export MASTER_PROCESS=false
  if [ "$init_port" -eq "8188" ]; then
    export MASTER_PROCESS=true
  fi

  # Resolve absolute paths: the generated script and supervisord run with
  # an unpredictable working directory, so relative mounts would break.
  CONTAINER_PATH=$(realpath ./container)
  START_SH=$(realpath ./build_scripts/inference/start.sh)
  COMFY_PROXY=$(realpath ./build_scripts/comfy/comfy_proxy.py)
  AWS_PATH=$(realpath ~/.aws)
  START_HANDLER="#!/bin/bash
set -euxo pipefail
docker stop $PROGRAM_NAME || true
docker rm $PROGRAM_NAME || true
docker run -v $AWS_PATH:/root/.aws \\
-v $CONTAINER_PATH:/container \\
-v $START_SH:/start.sh \\
-v $COMFY_PROXY:/comfy_proxy.py \\
--gpus all \\
-e IMAGE_HASH=$release_image \\
-e SERVICE_TYPE=$SERVICE_TYPE \\
-e ON_EC2=true \\
-e S3_BUCKET_NAME=$COMFY_BUCKET_NAME \\
-e AWS_REGION=$AWS_REGION \\
-e AWS_DEFAULT_REGION=$AWS_REGION \\
-e COMFY_API_URL=$COMFY_API_URL \\
-e COMFY_API_TOKEN=$COMFY_API_TOKEN \\
-e ESD_VERSION=$ESD_VERSION \\
-e COMFY_ENDPOINT=$COMFY_ENDPOINT \\
-e COMFY_BUCKET_NAME=$COMFY_BUCKET_NAME \\
-e MASTER_PROCESS=$MASTER_PROCESS \\
-e PROGRAM_NAME=$PROGRAM_NAME \\
-e WORKFLOW_NAME_FILE=/container/$PROGRAM_NAME \\
--name $PROGRAM_NAME \\
-p $init_port:8188 \\
--memory ${limit_memory_mb}mb \\
$image
"
  echo "$START_HANDLER" > "./container/$PROGRAM_NAME.sh"
  chmod +x "./container/$PROGRAM_NAME.sh"

  # Append the [program] section. Use the absolute script path: supervisord
  # would resolve a relative "command=./container/..." against its own cwd,
  # which is not guaranteed to be the project root.
  # shellcheck disable=SC2129
  echo "[program:$PROGRAM_NAME]" >> /tmp/supervisord.conf
  echo "command=$CONTAINER_PATH/$PROGRAM_NAME.sh" >> /tmp/supervisord.conf
  echo "startretries=1" >> /tmp/supervisord.conf
  echo "stdout_logfile=/dev/stdout" >> /tmp/supervisord.conf
  echo "stderr_logfile=/dev/stderr" >> /tmp/supervisord.conf
  echo "" >> /tmp/supervisord.conf
}
echo "---------------------------------------------------------------------------------"
# Base supervisord config; one [program] section per comfy process is
# appended by generate_process below.
SUPERVISOR_CONF="[supervisord]
nodaemon=true
autostart=true
autorestart=true
[inet_http_server]
port = 127.0.0.1:9001
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
logfile=/dev/stdout
"
echo "$SUPERVISOR_CONF" > /tmp/supervisord.conf
# Allocate sequential host ports starting at 8188 (8188 is the master
# process), one per requested comfy process.
init_port=8187
for i in $(seq 1 "$PROCESS_NUMBER"); do
init_port=$((init_port + 1))
generate_process $init_port
done
echo "---------------------------------------------------------------------------------"
cat /tmp/supervisord.conf
echo "---------------------------------------------------------------------------------"
# Stop any previous supervisord instance (best-effort) before starting a
# fresh one with the regenerated config; the grep drops a known noisy
# supervisor log line.
supervisorctl -c /tmp/supervisord.conf shutdown || true
sudo systemctl restart supervisor.service
supervisord -c /tmp/supervisord.conf | grep -v 'uncaptured python exception'
exit 1