From a7dc6e1148035caf8a00a0eb413241d5ecfd4f6d Mon Sep 17 00:00:00 2001 From: Jingyi Date: Sat, 23 Mar 2024 18:32:28 +0800 Subject: [PATCH] improved docker files sync --- aws_extension/sagemaker_ui_tab.py | 6 +- .../Dockerfile.inference.from_scratch | 14 +- build_scripts/build_and_push_local.sh | 15 +- build_scripts/build_inference_local.sh | 3 +- build_scripts/inference/serve.sh | 245 +++++++++++------- .../lambda/endpoints/create_endpoint.py | 16 +- 6 files changed, 178 insertions(+), 121 deletions(-) diff --git a/aws_extension/sagemaker_ui_tab.py b/aws_extension/sagemaker_ui_tab.py index 6f2a0001..07c03d28 100644 --- a/aws_extension/sagemaker_ui_tab.py +++ b/aws_extension/sagemaker_ui_tab.py @@ -896,10 +896,10 @@ def ep_create_tab(): custom_extensions = gr.Textbox( value="", lines=5, - placeholder="https://github.com/awslabs/stable-diffusion-aws-extension.git", + placeholder="https://github.com/awslabs/stable-diffusion-aws-extension.git#main#a096556799b7b0686e19ec94c0dbf2ca74d8ffbc", label=f"Custom Extension URLs (Optional) - Please separate with line breaks", visible=False, - info="If you fill in this field, the endpoint will be deployed with the environment check, it's slow." + info="The endpoint will set an environment variable named EXTENSIONS, default image will be install automatically." ) custom_docker_image_uri = gr.Textbox( @@ -908,7 +908,7 @@ def ep_create_tab(): placeholder="123456789.dkr.ecr.us-east-1.amazonaws.com/repo/image:latest", label=f"Custom Docker Image URI (Optional)", visible=False, - info="If you fill in this field, the endpoint will be deployed with the value of this field." + info="The endpoint will be deployed with your custom docker image." 
) ep_deploy_btn = gr.Button(value="Deploy Endpoint", variant='primary', diff --git a/build_scripts/Dockerfile.inference.from_scratch b/build_scripts/Dockerfile.inference.from_scratch index 51baa816..d58810f3 100644 --- a/build_scripts/Dockerfile.inference.from_scratch +++ b/build_scripts/Dockerfile.inference.from_scratch @@ -7,22 +7,20 @@ RUN tar xzvf s5cmd_2.2.2_Linux-64bit.tar.gz FROM public.ecr.aws/ubuntu/ubuntu:22.04_stable -COPY --from=builder /s5cmd /usr/local/bin/s5cmd +COPY --from=builder /s5cmd /usr/local/bin/ SHELL ["/bin/bash", "-c"] RUN adduser --disabled-password --gecos '' ubuntu RUN apt-get update -y && \ - apt-get install --no-install-recommends -y git libgl1 libtcmalloc-minimal4 libglib2.0-0 python3.10 python3.10-venv net-tools bc nginx && \ + apt-get install --no-install-recommends -y git libgl1 libtcmalloc-minimal4 libglib2.0-0 python3.10 python3.10-venv net-tools bc && \ # may not needed in endpoint apt-get install --no-install-recommends -y pkg-config file curl protobuf-compiler mesa-utils && \ # for env install - apt-get install --no-install-recommends -y lsof tar python3-dev libcairo2-dev libprotobuf-dev build-essential cmake wget && \ - rm -rf /var/lib/apt/lists/* && \ - rm -rf /var/www/html/* - -COPY nginx_proxy.conf /etc/nginx/sites-available/default + apt-get install --no-install-recommends -y tar python3-dev libcairo2-dev libprotobuf-dev build-essential cmake wget && \ + # clean up + rm -rf /var/lib/apt/lists/* WORKDIR /home/ubuntu/ @@ -30,8 +28,6 @@ ENV ON_DOCKER true ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 -RUN chown -R ubuntu:ubuntu /home/ubuntu - COPY inference/serve.sh / RUN chmod +x /serve.sh diff --git a/build_scripts/build_and_push_local.sh b/build_scripts/build_and_push_local.sh index ea9b3518..6b47acde 100755 --- a/build_scripts/build_and_push_local.sh +++ b/build_scripts/build_and_push_local.sh @@ -56,8 +56,6 @@ then fi fi -#aws ecr get-login-password --region us-east-1 | docker login --username AWS 
--password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com -#aws ecr get-login-password --region us-west-2 | docker login -u AWS --password-stdin 292282985366.dkr.ecr.us-west-2.amazonaws.com aws ecr get-login-password --region ${region} | docker login -u AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com cp ${dockerfile} . @@ -65,8 +63,15 @@ cp ${dockerfile} . # Build the docker image locally with the image name and then push it to ECR # with the full name. -docker build -t ${image_name}:${tag} -f ${dockerfile} . -docker tag ${image_name}:${tag} ${fullname} +docker build -t ${fullname} -f ${dockerfile} . + +# if docker build failed, exit +if [ $? -ne 0 ] then + echo "docker build failed" + exit 255 +fi docker push ${fullname} -echo $fullname + +docker images $fullname \ No newline at end of file diff --git a/build_scripts/build_inference_local.sh b/build_scripts/build_inference_local.sh index e0eef9ac..e7b1a5dd 100755 --- a/build_scripts/build_inference_local.sh +++ b/build_scripts/build_inference_local.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash -./build_and_push_local.sh Dockerfile.inference.from_scratch esd-inference dev "1.5.0-dev" +tag="1.5.0-dev" +./build_and_push_local.sh Dockerfile.inference.from_scratch esd-inference dev $tag \ No newline at end of file diff --git a/build_scripts/inference/serve.sh b/build_scripts/inference/serve.sh index 292dd605..66af5d43 100755 --- a/build_scripts/inference/serve.sh +++ b/build_scripts/inference/serve.sh @@ -1,43 +1,36 @@ #!/bin/bash -nginx & - export ESD_CODE_BRANCH=main -export INSTALL_SCRIPT=https://raw.githubusercontent.com/awslabs/stable-diffusion-aws-extension/main/install.sh +export ESD_VERSION='1.5.0' +export WEBUI_PORT=8080 if [[ $ECR_IMAGE_TAG == *"dev"* ]]; then export ESD_CODE_BRANCH=dev - export INSTALL_SCRIPT=https://raw.githubusercontent.com/awslabs/stable-diffusion-aws-extension/dev/install.sh trap 'echo "error_lock" > /error_lock; exit 1' ERR if [ -f "/error_lock" ]; then + # nginx -c 
/etc/nginx/nginx_error.conf & echo "start failed, please check the log" sleep 30 exit 1 fi fi -echo "*********************************************************************************" -cat $0 +cores=$(lscpu | grep "^Core(s) per socket:" | awk '{print $4}') +sockets=$(lscpu | grep "^Socket(s):" | awk '{print $2}') +cup_core_nums=$((cores * sockets)) echo "---------------------------------------------------------------------------------" +echo "whoami: $(whoami)" +echo "cup_core_nums: $cup_core_nums" echo "Current shell: $SHELL" echo "Running in $(bash --version)" -echo "---------------------------------------------------------------------------------" -nvidia-smi +export INSTALL_SCRIPT=https://raw.githubusercontent.com/awslabs/stable-diffusion-aws-extension/$ESD_CODE_BRANCH/install.sh echo "---------------------------------------------------------------------------------" printenv -export ESD_VERSION='1.5.0' - echo "---------------------------------------------------------------------------------" -echo "INSTANCE_TYPE: $INSTANCE_TYPE" -echo "ECR_IMAGE_TAG: $ECR_IMAGE_TAG" -echo "IMAGE_URL: $IMAGE_URL" -echo "ENDPOINT_NAME: $ENDPOINT_NAME" -echo "ENDPOINT_ID: $ENDPOINT_ID" -echo "ESD_VERSION: $ESD_VERSION" echo "CREATED_AT: $CREATED_AT" created_time_seconds=$(date -d "$CREATED_AT" +%s) current_time=$(date "+%Y-%m-%dT%H:%M:%S.%6N") @@ -47,15 +40,15 @@ echo "NOW_AT: $current_time" echo "Init from Create: $init_seconds seconds" echo "---------------------------------------------------------------------------------" -export S3_LOCATION="esd-$ESD_VERSION-$INSTANCE_TYPE" +export S3_LOCATION="esd-$INSTANCE_TYPE-$ESD_VERSION" if [ -n "$EXTENSIONS" ]; then - export S3_LOCATION="$ENDPOINT_NAME" + export S3_LOCATION="$ENDPOINT_NAME-$ESD_VERSION" fi tar_file="webui.tar" -echo "Check s3://$BUCKET_NAME/$S3_LOCATION files..." +echo "Checking s3://$BUCKET_NAME/$S3_LOCATION files..." 
output=$(s5cmd ls "s3://$BUCKET_NAME/") if echo "$output" | grep -q "$S3_LOCATION"; then @@ -90,32 +83,63 @@ if echo "$output" | grep -q "$S3_LOCATION"; then echo "---------------------------------------------------------------------------------" echo "accelerate launch..." - accelerate launch --num_cpu_threads_per_process=6 launch.py --api --listen --port 7860 --xformers --no-half-vae --no-download-sd-model --no-hashing --nowebui --skip-torch-cuda-test --skip-load-model-at-start --disable-safe-unpickle --skip-prepare-environment --skip-python-version-check --skip-install --skip-version-check + accelerate launch --num_cpu_threads_per_process=$cup_core_nums launch.py --enable-insecure-extension-access --api --api-log --log-startup --listen --port $WEBUI_PORT --xformers --no-half-vae --no-download-sd-model --no-hashing --nowebui --skip-torch-cuda-test --skip-load-model-at-start --disable-safe-unpickle --skip-prepare-environment --skip-python-version-check --skip-install --skip-version-check fi +echo "Not found files in S3, just install the environment..." + cd /home/ubuntu curl -sSL "$INSTALL_SCRIPT" | bash; -echo "---------------------------------------------------------------------------------" -echo "Set conda" -export AWS_REGION="us-west-2" -s5cmd --log=error cp "s3://aws-gcr-solutions-us-west-2/extension-for-stable-diffusion-on-aws/1.5.0-g5/conda/libcufft.so.10" /home/ubuntu/conda/lib/ -s5cmd --log=error cp "s3://aws-gcr-solutions-us-west-2/extension-for-stable-diffusion-on-aws/1.5.0-g5/conda/libcurand.so.10" /home/ubuntu/conda/lib/ -export LD_LIBRARY_PATH=/home/ubuntu/conda/lib:$LD_LIBRARY_PATH -export AWS_REGION=$AWS_DEFAULT_REGION -echo "---------------------------------------------------------------------------------" +# if $EXTENSIONS is not empty, it will be executed +if [ -n "$EXTENSIONS" ]; then + echo "---------------------------------------------------------------------------------" + echo "install extensions..." 
+ cd /home/ubuntu/stable-diffusion-webui/extensions/ || exit 1 -cd stable-diffusion-webui + read -ra array <<< "$(echo "$EXTENSIONS" | tr "," " ")" + + for git_repo in "${array[@]}"; do + IFS='#' read -r -a repo <<< "$git_repo" + + git_repo=${repo[0]} + repo_name=$(basename -s .git "$git_repo") + repo_branch=${repo[1]} + commit_sha=${repo[2]} + + echo "rm -rf $repo_name for install $git_repo" + rm -rf $repo_name + + start_at=$(date +%s) + + echo "git clone $git_repo" + git clone "$git_repo" + + cd $repo_name || exit 1 + + echo "git checkout $repo_branch" + git checkout "$repo_branch" + + echo "git reset --hard $commit_sha" + git reset --hard "$commit_sha" + cd .. + + end_at=$(date +%s) + cost=$((end_at-start_at)) + echo "git clone $git_repo: $cost seconds" + done +fi + +echo "---------------------------------------------------------------------------------" +echo "creating venv and install packages..." + +cd /home/ubuntu/stable-diffusion-webui python3 -m venv venv -# chmod +x /home/ubuntu/stable-diffusion-webui/venv/bin/* - source venv/bin/activate python -m pip install --upgrade pip python -m pip install accelerate -python -m pip install markdown - python -m pip install onnxruntime-gpu python -m pip install insightface==0.7.3 @@ -123,81 +147,106 @@ export TORCH_INDEX_URL="https://download.pytorch.org/whl/cu118" export TORCH_COMMAND="pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url $TORCH_INDEX_URL" export XFORMERS_PACKAGE="xformers==0.0.20" -# if $EXTENSIONS is not empty, it will be executed -if [ -n "$EXTENSIONS" ]; then - echo "---------------------------------------------------------------------------------" - cd /home/ubuntu/stable-diffusion-webui/extensions/ || exit 1 +remove_unused(){ + echo "rm $1" + rm -rf "$1" +} - read -ra array <<< "$(echo "$EXTENSIONS" | tr "," " ")" +remove_unused_list(){ + echo "---------------------------------------------------------------------------------" + echo "deleteing big unused files..." 
+ remove_unused /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/docs + remove_unused /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/infrastructure + remove_unused /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/middleware_api + remove_unused /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/test + remove_unused /home/ubuntu/stable-diffusion-webui/repositories/BLIP/BLIP.gif + remove_unused /home/ubuntu/stable-diffusion-webui/repositories/generative-models/assets/ + remove_unused /home/ubuntu/stable-diffusion-webui/repositories/stable-diffusion-stability-ai/assets/ - for git_repo in "${array[@]}"; do - start_at=$(date +%s) - echo "git clone $git_repo" - git clone "$git_repo" - end_at=$(date +%s) - cost=$((end_at-start_at)) - echo "git clone $git_repo: $cost seconds" - done -fi + echo "deleteing git dir..." + find /home/ubuntu/stable-diffusion-webui -type d \( -name '.git' -o -name '.github' \) | while read dir; do + remove_unused "$dir"; + done -rm -rf /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/docs -rm -rf /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/infrastructure -rm -rf /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/middleware_api -rm -rf /home/ubuntu/stable-diffusion-webui/extensions/stable-diffusion-aws-extension/test -rm -rf /home/ubuntu/stable-diffusion-webui/repositories/BLIP/BLIP.gif -rm -rf /home/ubuntu/stable-diffusion-webui/repositories/generative-models/assets/ -rm -rf /home/ubuntu/stable-diffusion-webui/repositories/stable-diffusion-stability-ai/assets/ + echo "deleteing unused files..." + find /home/ubuntu/stable-diffusion-webui -type f \( -name '.gitignore' -o -name 'README.md' -o -name 'CHANGELOG.md' \) | while read file; do + remove_unused "$file"; + done -echo "delete git..." 
-find "/home/ubuntu/stable-diffusion-webui" -type d -name '.git' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type d -name '.github' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name '.gitignore' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name 'README.md' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name 'CHANGELOG.md' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name 'CODE_OF_CONDUCT.md' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name 'LICENSE.md' -exec rm -rf {} + -find "/home/ubuntu/stable-diffusion-webui" -type f -name 'NOTICE.md' -exec rm -rf {} + + find /home/ubuntu/stable-diffusion-webui -type f \( -name 'CODE_OF_CONDUCT.md' -o -name 'LICENSE.md' -o -name 'NOTICE.md' \) | while read file; do + remove_unused "$file"; + done -check_ready() { - while true; do - PID=$(lsof -i :8080 | awk 'NR!=1 {print $2}' | head -1) + find /home/ubuntu/stable-diffusion-webui -type f \( -name 'CODEOWNERS' -o -name 'LICENSE.txt' -o -name 'LICENSE' \) | while read file; do + remove_unused "$file"; + done - if [ -n "$PID" ]; then - echo "Port 8080 is in use by PID: $PID. tar files and upload to S3" - - echo "collection big files..." - upload_files=$(mktemp) - big_files=$(find "/home/ubuntu/stable-diffusion-webui" -type f -size +2520k) - for file in $big_files; do - key=$(echo "$file" | cut -d'/' -f4-) - echo "sync $file s3://$BUCKET_NAME/$S3_LOCATION/$key" >> "$upload_files" - done - - echo "tar files..." 
- filelist=$(mktemp) - # shellcheck disable=SC2164 - cd /home/ubuntu/stable-diffusion-webui - find "./" \( -type f -o -type l \) -size -2530k > "$filelist" - tar -cf $tar_file -T "$filelist" - - echo "sync $tar_file s3://$BUCKET_NAME/$S3_LOCATION/" >> "$upload_files" - echo "sync /home/ubuntu/conda/* s3://$BUCKET_NAME/$S3_LOCATION/conda/" >> "$upload_files" - echo "sync /home/ubuntu/stable-diffusion-webui/models/insightface/* s3://$BUCKET_NAME/$S3_LOCATION/insightface/" >> "$upload_files" - - echo "upload files..." - s5cmd run "$upload_files" - - break - fi - - echo "Port 8080 is not in use, waiting for 10 seconds..." - sleep 1 + find /home/ubuntu/stable-diffusion-webui -type f \( -name '*.gif' -o -name '*.png' -o -name '*.jpg' \) | while read file; do + remove_unused "$file"; done } -check_ready & +check_ready() { + while true; do + RESPONSE_CODE=$(curl -o /dev/null -s -w "%{http_code}\n" localhost:8080/ping) + if [ "$RESPONSE_CODE" -eq 200 ]; then + echo "Server is ready!" + + start_at=$(date +%s) + + echo "collection big files..." + upload_files=$(mktemp) + big_files=$(find "/home/ubuntu/stable-diffusion-webui" -type f -size +2520k) + for file in $big_files; do + key=$(echo "$file" | cut -d'/' -f4-) + echo "sync $file s3://$BUCKET_NAME/$S3_LOCATION/$key" >> "$upload_files" + done + + echo "tar files..." + filelist=$(mktemp) + # shellcheck disable=SC2164 + cd /home/ubuntu/stable-diffusion-webui + find "./" \( -type f -o -type l \) -size -2530k > "$filelist" + tar -cf $tar_file -T "$filelist" + + echo "sync $tar_file s3://$BUCKET_NAME/$S3_LOCATION/" >> "$upload_files" + echo "sync /home/ubuntu/conda/* s3://$BUCKET_NAME/$S3_LOCATION/conda/" >> "$upload_files" + + # for ReActor + echo "sync /home/ubuntu/stable-diffusion-webui/models/insightface/* s3://$BUCKET_NAME/$S3_LOCATION/insightface/" >> "$upload_files" + + echo "upload files..." 
+ s5cmd run "$upload_files" + end_at=$(date +%s) + cost=$((end_at-start_at)) + echo "sync endpoint files: $cost seconds" + break + fi + + sleep 2 + done +} + +echo "---------------------------------------------------------------------------------" +echo "set conda environment..." +export AWS_REGION="us-west-2" +s5cmd --log=error cp "s3://aws-gcr-solutions-us-west-2/extension-for-stable-diffusion-on-aws/1.5.0-g5/conda/libcufft.so.10" /home/ubuntu/conda/lib/ +s5cmd --log=error cp "s3://aws-gcr-solutions-us-west-2/extension-for-stable-diffusion-on-aws/1.5.0-g5/conda/libcurand.so.10" /home/ubuntu/conda/lib/ +export LD_LIBRARY_PATH=/home/ubuntu/conda/lib:$LD_LIBRARY_PATH +export AWS_REGION=$AWS_DEFAULT_REGION + +echo "---------------------------------------------------------------------------------" +nvidia-smi cd /home/ubuntu/stable-diffusion-webui +echo "---------------------------------------------------------------------------------" +echo "install webui..." +accelerate launch --num_cpu_threads_per_process=$cup_core_nums launch.py --enable-insecure-extension-access --api --api-log --log-startup --listen --port $WEBUI_PORT --xformers --no-half-vae --no-download-sd-model --no-hashing --nowebui --skip-torch-cuda-test --skip-load-model-at-start --disable-safe-unpickle --exit + +remove_unused_list + +check_ready & + echo "---------------------------------------------------------------------------------" echo "accelerate launch..." 
-accelerate launch --num_cpu_threads_per_process=6 launch.py --api --listen --port 7860 --xformers --no-half-vae --no-download-sd-model --no-hashing --nowebui --skip-torch-cuda-test --skip-load-model-at-start --disable-safe-unpickle +accelerate launch --num_cpu_threads_per_process=$cup_core_nums launch.py --enable-insecure-extension-access --api --api-log --log-startup --listen --port $WEBUI_PORT --xformers --no-half-vae --no-download-sd-model --no-hashing --nowebui --skip-torch-cuda-test --skip-load-model-at-start --disable-safe-unpickle --skip-prepare-environment --skip-python-version-check --skip-install --skip-version-check \ No newline at end of file diff --git a/middleware_api/lambda/endpoints/create_endpoint.py b/middleware_api/lambda/endpoints/create_endpoint.py index b8456690..227c1a6b 100644 --- a/middleware_api/lambda/endpoints/create_endpoint.py +++ b/middleware_api/lambda/endpoints/create_endpoint.py @@ -53,6 +53,15 @@ def check_custom_extensions(event: CreateEndpointEvent): extensions_array = re.split('[ ,\n]+', event.custom_extensions) extensions_array = list(set(extensions_array)) extensions_array = list(filter(None, extensions_array)) + + for extension in extensions_array: + pattern = r'^https://github\.com/[^#/]+/[^#/]+\.git#[^#]+#[a-fA-F0-9]{40}$' + if not re.match(pattern, extension): + raise BadRequestException( + message=f"extension format is invalid: {extension}, valid format is like " + f"https://github.com/awslabs/stable-diffusion-aws-extension.git#main#" + f"a096556799b7b0686e19ec94c0dbf2ca74d8ffbc") + # make extensions_array to string again event.custom_extensions = ','.join(extensions_array) @@ -78,9 +87,6 @@ def handler(raw_event, ctx): logger.info(json.dumps(raw_event)) event = CreateEndpointEvent(**json.loads(raw_event['body'])) - if event.custom_extensions and event.custom_docker_image_uri: - raise BadRequestException(message="custom_extensions and custom_docker_image_uri cannot be used together") - permissions_check(raw_event, 
[PERMISSION_ENDPOINT_ALL, PERMISSION_ENDPOINT_CREATE]) if event.endpoint_type not in EndpointType.List.value: @@ -218,8 +224,8 @@ def get_production_variants(model_name, instance_type, initial_instance_count): 'ModelName': model_name, 'InitialInstanceCount': initial_instance_count, 'InstanceType': instance_type, - "ModelDataDownloadTimeoutInSeconds": 1800, # Specify the model download timeout in seconds. - "ContainerStartupHealthCheckTimeoutInSeconds": 600, # Specify the health checkup timeout in seconds + "ModelDataDownloadTimeoutInSeconds": 60 * 30, # Specify the model download timeout in seconds. + "ContainerStartupHealthCheckTimeoutInSeconds": 60 * 20, # Specify the health checkup timeout in seconds } ]