# Source: stable-diffusion-aws-extension/middleware_api/lambda/datasets/dataset_utils.py
# (128 lines, 3.7 KiB, Python)

import argparse
import time
import boto3
# Command-line interface: the S3 folder to index, plus optional region and
# dataset description.
parser = argparse.ArgumentParser(description="Generate metadata according to the content in S3 bucket")
parser.add_argument("--s3_path", required=True, help="S3 path of the folder, eg. s3://s3-bucket-name/dataset/your-folder/")
parser.add_argument("--region", required=False, default="us-east-1", help="AWS region")
parser.add_argument("--desc", required=False, default="Default description created by utils", help="Dataset description")
args = parser.parse_args()
arg_folder_path = args.s3_path
arg_aws_region = args.region
arg_desc = args.desc

# Initialize the AWS clients for S3 and DynamoDB
s3_client = boto3.client("s3", region_name=arg_aws_region)
dynamodb_client = boto3.client("dynamodb", region_name=arg_aws_region)

# DynamoDB tables that receive the dataset-level and per-file records.
info_table_name = "DatasetInfoTable"
item_table_name = "DatasetItemTable"

# "s3://bucket/a/b/" splits into ["s3:", "", "bucket", "a", "b", ""], so the
# bucket is element 2 and the object key starts at element 3. Reject anything
# that does not have this shape instead of failing later with an IndexError
# or silently using the wrong bucket.
parts = arg_folder_path.split("/")
if len(parts) < 3 or parts[0].lower() != "s3:" or parts[1] != "" or not parts[2]:
    parser.error(f"--s3_path must have the form s3://bucket-name/folder/, got {arg_folder_path!r}")
s3_bucket = parts[2]

# Folder key without trailing slashes, e.g. "dataset/your-folder".
folder_key = "/".join(parts[3:]).rstrip("/")

# Always end a non-empty prefix with "/" so that listing "dataset/folder"
# cannot also match sibling keys such as "dataset/folder-old/...".
s3_prefix = f"{folder_key}/" if folder_key else ""

# The dataset is named after the last folder segment; when the path is just
# the bucket root, fall back to the bucket name (as the original code did).
s3_dataset = folder_key.rsplit("/", 1)[-1] if folder_key else s3_bucket

# One timestamp, taken once, shared by every record written in this run.
create_time = str(time.time())
def insert_item(dataset_name: str, file_name: str, create_time: str):
    """Write one file record to the DynamoDB table named by ``item_table_name``.

    The record is keyed by the dataset name and a sort key of the form
    ``<create_time>_<file_name>``. Failures are printed, not raised.

    Args:
        dataset_name (str): dataset name
        file_name (str): S3 file name
        create_time (str): create time in timestamp format
    """

    def as_string(value):
        # DynamoDB low-level attribute encoding for a string value.
        return {"S": value}

    record = {
        "dataset_name": as_string(dataset_name),
        "sort_key": as_string("_".join((create_time, file_name))),
        "data_status": as_string("Enabled"),
        "name": as_string(file_name),
        "params": {"M": {"original_file_name": as_string(file_name)}},
        "type": as_string("image"),
    }

    try:
        result = dynamodb_client.put_item(Item=record, TableName=item_table_name)
        print(f"Inserted {record} into DynamoDB with response: {result}")
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error inserting {record} into DynamoDB: {err}")
def insert_info(s3_dataset: str, desc: str, create_time: str):
    """Write the dataset-level record to the DynamoDB table named by ``info_table_name``.

    Failures are printed, not raised.

    Args:
        s3_dataset (str): dataset name, stored as ``dataset_name``
        desc (str): dataset description, stored under ``params.description``
        create_time (str): create time in timestamp format, stored as the
            numeric ``timestamp`` attribute
    """
    description_attr = {"S": desc}
    info_record = dict(
        dataset_name={"S": s3_dataset},
        dataset_status={"S": "Enabled"},
        params={"M": {"description": description_attr}},
        timestamp={"N": create_time},
    )

    try:
        result = dynamodb_client.put_item(Item=info_record, TableName=info_table_name)
        print(f"Inserted {info_record} into DynamoDB with response: {result}")
    except Exception as err:
        # Best effort: report the failure instead of aborting the script.
        print(f"Error inserting {info_record} into DynamoDB: {err}")
if __name__ == "__main__":
    # List S3 objects in the specified folder. A single list_objects call
    # returns at most 1000 keys, so walk every page; otherwise larger
    # datasets would be silently truncated.
    paginator = s3_client.get_paginator("list_objects")
    found_objects = False

    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
        # A page without matching keys has no "Contents" entry.
        for obj in page.get("Contents", []):
            found_objects = True
            print(obj)
            file_name = obj["Key"].split("/")[-1]
            print(file_name)
            # Keys that end in "/" give an empty file name; skip them.
            if file_name.strip():
                # Insert file information into DynamoDB
                insert_item(s3_dataset, file_name, create_time)

    if not found_objects:
        print(f"No objects found in {s3_prefix}")

    # Write the dataset-level record, whether or not any files were found.
    insert_info(s3_dataset, arg_desc, create_time)