diff --git a/Dockerfile b/Dockerfile index 797e7be1..93b1b11f 100755 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,9 @@ FROM 366590864501.dkr.ecr.$AWS_REGION.amazonaws.com/esd-inference:$ESD_VERSION # TODO BYOC #RUN apt-get update -y && \ # apt-get install ffmpeg -y && \ -# rm -rf /var/lib/apt/lists/* \ +# rm -rf /var/lib/apt/lists/* + +WORKDIR /home/ubuntu/ComfyUI COPY build_scripts/inference/start.sh / RUN chmod +x /start.sh diff --git a/build_scripts/comfy/comfy_proxy.py b/build_scripts/comfy/comfy_proxy.py index cc62f92e..cd5a1041 100755 --- a/build_scripts/comfy/comfy_proxy.py +++ b/build_scripts/comfy/comfy_proxy.py @@ -87,6 +87,7 @@ if is_on_ec2: max_wait_time = os.environ.get('MAX_WAIT_TIME', 86400) msg_max_wait_time = os.environ.get('MSG_MAX_WAIT_TIME', 86400) is_master_process = os.getenv('MASTER_PROCESS') == 'true' + program_name = os.getenv('PROGRAM_NAME') no_need_sync_files = ['.autosave', '.cache', '.autosave1', '~', '.swp'] need_resend_msg_result = [] @@ -714,7 +715,11 @@ if is_on_ec2: async def restart(self): logger.info(f"start to reboot {self}") try: - subprocess.run(["sudo", "reboot"]) + from xmlrpc.client import ServerProxy + server = ServerProxy('http://localhost:9001/RPC2') + server.supervisor.restart() + # server.supervisor.shutdown() + return web.Response(status=200, content_type='application/json', body=json.dumps({"result": True})) except Exception as e: logger.info(f"error reboot {e}") pass diff --git a/build_scripts/inference/start.sh b/build_scripts/inference/start.sh index e7154bd2..0d3fbda1 100644 --- a/build_scripts/inference/start.sh +++ b/build_scripts/inference/start.sh @@ -328,31 +328,56 @@ ec2_start_process(){ export LD_LIBRARY_PATH=$LD_PRELOAD set_conda + pip install supervisor + chown -R root:root "/home/ubuntu/ComfyUI" + chmod -R +x venv + + SUPERVISOR_CONF="[supervisord] +nodaemon=true +directory=/home/ubuntu/ComfyUI +autostart=true +autorestart=true + +[inet_http_server] +port = 127.0.0.1:9001 + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +logfile=/dev/stdout + +" + + echo "$SUPERVISOR_CONF" > /etc/supervisord.conf + init_port=8187 for i in $(seq 1 "$PROCESS_NUMBER"); do init_port=$((init_port + 1)) + MASTER_PROCESS=false if [ "$init_port" -eq "8188" ]; then MASTER_PROCESS=true - else - MASTER_PROCESS=false fi - if [ "$i" -eq "$PROCESS_NUMBER" ]; then - export MASTER_PROCESS=$MASTER_PROCESS && python3 main.py --listen 0.0.0.0 \ - --port "$init_port" \ - --cuda-malloc \ - --output-directory "/home/ubuntu/ComfyUI/output/$init_port" \ - --temp-directory "/home/ubuntu/ComfyUI/temp/$init_port" - exit 1 - fi + PROGRAM_NAME="comfy_$init_port" - export MASTER_PROCESS=$MASTER_PROCESS && nohup python3 main.py --listen 0.0.0.0 \ - --port "$init_port" \ - --cuda-malloc \ - --output-directory "/home/ubuntu/ComfyUI/output/$init_port" \ - --temp-directory "/home/ubuntu/ComfyUI/temp/$init_port" & + # shellcheck disable=SC2129 + echo "[program:$PROGRAM_NAME]" >> /etc/supervisord.conf + echo "command=/home/ubuntu/ComfyUI/venv/bin/python3 main.py --listen 0.0.0.0 --port $init_port --cuda-malloc --output-directory /home/ubuntu/ComfyUI/output/$init_port --temp-directory /home/ubuntu/ComfyUI/temp/$init_port" >> /etc/supervisord.conf + echo "startretries=3" >> /etc/supervisord.conf + echo "stdout_logfile=/dev/stdout" >> /etc/supervisord.conf + echo "stderr_logfile=/dev/stderr" >> /etc/supervisord.conf + echo "environment=MASTER_PROCESS=$MASTER_PROCESS,PROGRAM_NAME=$PROGRAM_NAME" >> /etc/supervisord.conf + echo "" >> /etc/supervisord.conf done + + echo "---------------------------------------------------------------------------------" + cat /etc/supervisord.conf + echo "---------------------------------------------------------------------------------" + + supervisord -c /etc/supervisord.conf | grep -v 'uncaptured python exception' + exit 1 } if [ -n "$WORKFLOW_NAME" ]; then @@ -407,7 +432,7 @@ if [ -n "$ON_EC2" ]; then cd /home/ubuntu/stable-diffusion-webui || exit 1 chmod -R +x venv source venv/bin/activate - chmod -R 777 /home/ubuntu/stable-diffusion-webui + chmod -R 777 /home/ubuntu python3 launch.py --enable-insecure-extension-access --skip-torch-cuda-test --no-half --listen --no-download-sd-model exit 1 fi diff --git a/docker_start.sh b/docker_start.sh index 41d64e96..fe858c7b 100755 --- a/docker_start.sh +++ b/docker_start.sh @@ -55,6 +55,12 @@ else export WORKFLOW_NAME="" fi +total_memory=$(cat /proc/meminfo | grep 'MemTotal' | awk '{print $2}') +total_memory_mb=$((total_memory / 1024)) +echo "total_memory_mb: $total_memory_mb" +limit_memory_mb=$((total_memory_mb - 2048)) +echo "limit_memory_mb: $limit_memory_mb" + # -v ./build_scripts/comfy/comfy_proxy.py:/home/ubuntu/ComfyUI/custom_nodes/comfy_proxy.py \ docker run -v ~/.aws:/root/.aws \ -v "$local_volume":/home/ubuntu \ @@ -76,4 +82,5 @@ docker run -v ~/.aws:/root/.aws \ -e "WORKFLOW_NAME=$WORKFLOW_NAME" \ --name "$CONTAINER_NAME" \ -p 8188-8288:8188-8288 \ + --memory "${limit_memory_mb}mb" \ "$image" diff --git a/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py b/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py index 4c050244..6b43d91a 100644 --- a/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py +++ b/test/test_10_local_only/test_09_comfy_snapshot_endpoint_create.py @@ -60,7 +60,7 @@ class TestComfySnapshotEpCreateE2E: "endpoint_name": f'snapshot-{config.endpoint_name}', "service_type": "comfy", "endpoint_type": "Async", - "instance_type": 'ml.g5.8xlarge', + "instance_type": 'ml.g5.4xlarge', "workflow_name": 'workflow1', "initial_instance_count": 1, "autoscaling_enabled": False, @@ -73,9 +73,32 @@ class TestComfySnapshotEpCreateE2E: resp = self.api.create_endpoint(headers=headers, data=data) assert 'data' in resp.json(), resp.dumps() - assert resp.json()["data"]["endpoint_status"] == "Creating", resp.dumps() - def test_3_list_endpoints_status(self): + def test_3_create_comfy_snapshot_endpoint_rt(self): + headers = { + "x-api-key": config.api_key, + "username": config.username + } + + data = { + "endpoint_name": f'snapshot-{config.endpoint_name}', + "service_type": "comfy", + "endpoint_type": "Real-time", + "instance_type": 'ml.g5.8xlarge', + "workflow_name": 'workflow1', + "initial_instance_count": 1, + "autoscaling_enabled": False, + "assign_to_roles": [config.role_comfy_real_time], + "creator": config.username + } + + if config.custom_docker_image_uri: + data["custom_docker_image_uri"] = config.custom_docker_image_uri + + resp = self.api.create_endpoint(headers=headers, data=data) + assert 'data' in resp.json(), resp.dumps() + + def test_4_list_endpoints_status(self): headers = { "x-api-key": config.api_key, "username": config.username