improved wf delete

2024-05-23 13:31:49 +08:00 · 2024-05-23 13:31:49 +08:00 · 3946cd0411
parent 5eb8e4ba58
commit 3946cd0411
5 changed files with 83 additions and 21 deletions
--- a/4
+++ b/4
@@ -5,7 +5,9 @@ FROM 366590864501.dkr.ecr.$AWS_REGION.amazonaws.com/esd-inference:$ESD_VERSION
 # TODO BYOC
 #RUN apt-get update -y && \
 #    apt-get install ffmpeg -y && \
-#    rm -rf /var/lib/apt/lists/* \
+#    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /home/ubuntu/ComfyUI

 COPY build_scripts/inference/start.sh /
 RUN chmod +x /start.sh
--- a/build_scripts/comfy/comfy_proxy.py
+++ b/build_scripts/comfy/comfy_proxy.py
@@ -87,6 +87,7 @@ if is_on_ec2:
    max_wait_time = os.environ.get('MAX_WAIT_TIME', 86400)
    msg_max_wait_time = os.environ.get('MSG_MAX_WAIT_TIME', 86400)
    is_master_process = os.getenv('MASTER_PROCESS') == 'true'
+    program_name = os.getenv('PROGRAM_NAME')
    no_need_sync_files = ['.autosave', '.cache', '.autosave1', '~', '.swp']

    need_resend_msg_result = []
@@ -714,7 +715,11 @@ if is_on_ec2:
    async def restart(self):
        logger.info(f"start to reboot {self}")
        try:
-            subprocess.run(["sudo", "reboot"])
+            from xmlrpc.client import ServerProxy
+            server = ServerProxy('http://localhost:9001/RPC2')
+            server.supervisor.restart()
+            # server.supervisor.shutdown()
+            return web.Response(status=200, content_type='application/json', body=json.dumps({"result": True}))
        except Exception as e:
            logger.info(f"error reboot  {e}")
            pass
--- a/build_scripts/inference/start.sh
+++ b/build_scripts/inference/start.sh
@@ -328,31 +328,56 @@ ec2_start_process(){
  export LD_LIBRARY_PATH=$LD_PRELOAD
  set_conda

+  pip install supervisor
+  chown -R root:root "/home/ubuntu/ComfyUI"
+  chmod -R +x venv
+
+  SUPERVISOR_CONF="[supervisord]
+nodaemon=true
+directory=/home/ubuntu/ComfyUI
+autostart=true
+autorestart=true
+
+[inet_http_server]
+port = 127.0.0.1:9001
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
+
+[supervisorctl]
+logfile=/dev/stdout
+
+"
+
+  echo "$SUPERVISOR_CONF" > /etc/supervisord.conf
+
  init_port=8187
  for i in $(seq 1 "$PROCESS_NUMBER"); do
      init_port=$((init_port + 1))

+      MASTER_PROCESS=false
      if [ "$init_port" -eq "8188" ]; then
          MASTER_PROCESS=true
-      else
-          MASTER_PROCESS=false
      fi

-      if [ "$i" -eq "$PROCESS_NUMBER" ]; then
-          export MASTER_PROCESS=$MASTER_PROCESS && python3 main.py --listen 0.0.0.0 \
-                                                        --port "$init_port" \
-                                                        --cuda-malloc \
-                                                        --output-directory "/home/ubuntu/ComfyUI/output/$init_port" \
-                                                        --temp-directory "/home/ubuntu/ComfyUI/temp/$init_port"
-          exit 1
-      fi
+      PROGRAM_NAME="comfy_$init_port"

-      export MASTER_PROCESS=$MASTER_PROCESS && nohup python3 main.py --listen 0.0.0.0 \
-                                            --port "$init_port" \
-                                            --cuda-malloc \
-                                            --output-directory "/home/ubuntu/ComfyUI/output/$init_port" \
-                                            --temp-directory "/home/ubuntu/ComfyUI/temp/$init_port" &
+      # shellcheck disable=SC2129
+      echo "[program:$PROGRAM_NAME]" >> /etc/supervisord.conf
+      echo "command=/home/ubuntu/ComfyUI/venv/bin/python3 main.py --listen 0.0.0.0 --port $init_port --cuda-malloc --output-directory /home/ubuntu/ComfyUI/output/$init_port --temp-directory /home/ubuntu/ComfyUI/temp/$init_port" >> /etc/supervisord.conf
+      echo "startretries=3" >> /etc/supervisord.conf
+      echo "stdout_logfile=/dev/stdout" >> /etc/supervisord.conf
+      echo "stderr_logfile=/dev/stderr" >> /etc/supervisord.conf
+      echo "environment=MASTER_PROCESS=$MASTER_PROCESS,PROGRAM_NAME=$PROGRAM_NAME" >> /etc/supervisord.conf
+      echo "" >> /etc/supervisord.conf
  done
+
+  echo "---------------------------------------------------------------------------------"
+  cat /etc/supervisord.conf
+  echo "---------------------------------------------------------------------------------"
+
+  supervisord -c /etc/supervisord.conf | grep -v 'uncaptured python exception'
+  exit 1
 }

 if [ -n "$WORKFLOW_NAME" ]; then
@@ -407,7 +432,7 @@ if [ -n "$ON_EC2" ]; then
        cd /home/ubuntu/stable-diffusion-webui || exit 1
        chmod -R +x venv
        source venv/bin/activate
-        chmod -R 777 /home/ubuntu/stable-diffusion-webui
+        chmod -R 777 /home/ubuntu
        python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model
        exit 1
    fi
--- a/docker_start.sh
+++ b/docker_start.sh
@@ -55,6 +55,12 @@ else
   export WORKFLOW_NAME=""
 fi

+total_memory=$(cat /proc/meminfo | grep 'MemTotal' | awk '{print $2}')
+total_memory_mb=$((total_memory / 1024))
+echo "total_memory_mb: $total_memory_mb"
+limit_memory_mb=$((total_memory_mb - 2048))
+echo "limit_memory_mb: $limit_memory_mb"
+
 #  -v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \
 docker run -v ~/.aws:/root/.aws \
           -v "$local_volume":/home/ubuntu \
@@ -76,4 +82,5 @@ docker run -v ~/.aws:/root/.aws \
           -e "WORKFLOW_NAME=$WORKFLOW_NAME" \
           --name "$CONTAINER_NAME" \
           -p 8188-8288:8188-8288 \
+           --memory "${limit_memory_mb}mb" \
           "$image"
--- a/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py
+++ b/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py
@@ -60,7 +60,7 @@ class TestComfySnapshotEpCreateE2E:
            "endpoint_name": f'snapshot-{config.endpoint_name}',
            "service_type": "comfy",
            "endpoint_type": "Async",
-            "instance_type": 'ml.g5.8xlarge',
+            "instance_type": 'ml.g5.4xlarge',
            "workflow_name": 'workflow1',
            "initial_instance_count": 1,
            "autoscaling_enabled": False,
@@ -73,9 +73,32 @@ class TestComfySnapshotEpCreateE2E:

        resp = self.api.create_endpoint(headers=headers, data=data)
        assert 'data' in resp.json(), resp.dumps()
-        assert resp.json()["data"]["endpoint_status"] == "Creating", resp.dumps()

-    def test_3_list_endpoints_status(self):
+    def test_3_create_comfy_snapshot_endpoint_rt(self):
+        headers = {
+            "x-api-key": config.api_key,
+            "username": config.username
+        }
+
+        data = {
+            "endpoint_name": f'snapshot-{config.endpoint_name}',
+            "service_type": "comfy",
+            "endpoint_type": "Real-time",
+            "instance_type": 'ml.g5.8xlarge',
+            "workflow_name": 'workflow1',
+            "initial_instance_count": 1,
+            "autoscaling_enabled": False,
+            "assign_to_roles": [config.role_comfy_real_time],
+            "creator": config.username
+        }
+
+        if config.custom_docker_image_uri:
+            data["custom_docker_image_uri"] = config.custom_docker_image_uri
+
+        resp = self.api.create_endpoint(headers=headers, data=data)
+        assert 'data' in resp.json(), resp.dumps()
+
+    def test_4_list_endpoints_status(self):
        headers = {
            "x-api-key": config.api_key,
            "username": config.username