128 lines
3.7 KiB
Python
128 lines
3.7 KiB
Python
import argparse
|
|
import time
|
|
|
|
import boto3
|
|
|
|
|
|
# Command-line interface: the S3 folder is mandatory, region and description
# fall back to defaults.
parser = argparse.ArgumentParser(
    description="Generate metadata according to the content in S3 bucket",
)
parser.add_argument(
    "--s3_path",
    required=True,
    help="S3 path of the folder, eg. s3://s3-bucket-name/dataset/your-folder/",
)
parser.add_argument(
    "--region",
    default="us-east-1",
    help="AWS region",
)
parser.add_argument(
    "--desc",
    default="Default description created by utils",
    help="Dataset description",
)

args = parser.parse_args()
arg_folder_path, arg_aws_region, arg_desc = args.s3_path, args.region, args.desc

# One low-level boto3 client per service, both bound to the requested region.
s3_client, dynamodb_client = (
    boto3.client(service, region_name=arg_aws_region)
    for service in ("s3", "dynamodb")
)

# DynamoDB tables written by this script.
info_table_name, item_table_name = "DatasetInfoTable", "DatasetItemTable"
|
|
|
|
# Break "s3://<bucket>/<prefix...>" into its components.
parts = arg_folder_path.split("/")

# Fail early with a usage message (exit status 2) instead of an IndexError or
# a confusing AWS error later when the path is not of the form s3://bucket/...
if len(parts) < 3 or parts[0].lower() != "s3:" or parts[1] or not parts[2]:
    parser.error(
        f"--s3_path must look like s3://bucket/folder/, got: {arg_folder_path}"
    )

s3_bucket = parts[2]
s3_prefix = "/".join(parts[3:])

# S3 prefixes are plain string matches: without a trailing slash,
# "dataset/foo" would also list "dataset/foobar/...". An empty prefix
# (bucket root) is left untouched.
if s3_prefix and not s3_prefix.endswith("/"):
    s3_prefix += "/"

# Single timestamp string, computed once and reused for every record.
create_time = str(time.time())

# The dataset name is the last path segment, ignoring one trailing slash.
s3_dataset = parts[-2] if arg_folder_path.endswith("/") else parts[-1]
|
|
|
|
|
|
def insert_item(dataset_name: str, file_name: str, create_time: str):
    """Write one file record to the DynamoDB table DatasetItemTable.

    The record's sort key is ``<create_time>_<file_name>``; the file name is
    stored both as ``name`` and as ``params.original_file_name``. Errors
    raised while writing are printed rather than propagated.

    Args:
        dataset_name (str): dataset name
        file_name (str): S3 file name
        create_time (str): create time in timestamp format
    """

    def text(value):
        # Wrap a value as a DynamoDB string ("S") attribute.
        return {"S": value}

    record = dict(
        dataset_name=text(dataset_name),
        sort_key=text("_".join((create_time, file_name))),
        data_status=text("Enabled"),
        name=text(file_name),
        params={"M": {"original_file_name": text(file_name)}},
        type=text("image"),
    )

    try:
        reply = dynamodb_client.put_item(TableName=item_table_name, Item=record)
        print(f"Inserted {record} into DynamoDB with response: {reply}")
    except Exception as err:
        print(f"Error inserting {record} into DynamoDB: {err}")
|
|
|
|
|
|
def insert_info(s3_dataset: str, desc: str, create_time: str):
    """Write the dataset-level record to the DynamoDB table DatasetInfoTable.

    The record carries the dataset name, an "Enabled" status, the description
    under ``params.description`` and the creation time as a numeric ("N")
    attribute. Errors raised while writing are printed rather than propagated.

    Args:
        s3_dataset (str): dataset name
        desc (str): dataset description
        create_time (str): create time in timestamp format
    """

    def text(value):
        # Wrap a value as a DynamoDB string ("S") attribute.
        return {"S": value}

    record = dict(
        dataset_name=text(s3_dataset),
        dataset_status=text("Enabled"),
        params={"M": {"description": text(desc)}},
        timestamp={"N": create_time},
    )

    try:
        reply = dynamodb_client.put_item(TableName=info_table_name, Item=record)
        print(f"Inserted {record} into DynamoDB with response: {reply}")
    except Exception as err:
        print(f"Error inserting {record} into DynamoDB: {err}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Walk the listing page by page: a single list call returns at most
    # 1000 keys, which would silently truncate larger datasets.
    paginator = s3_client.get_paginator("list_objects_v2")
    found_objects = False

    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix):
        # A page without matches has no "Contents" key at all.
        for obj in page.get("Contents", []):
            found_objects = True
            print(obj)
            file_name = obj["Key"].split("/")[-1]
            print(file_name)
            # Keys ending in "/" (folder placeholders) yield an empty name
            # and are skipped.
            if file_name.strip():
                # Insert file information into DynamoDB
                insert_item(s3_dataset, file_name, create_time)

    if not found_objects:
        print(f"No objects found in {s3_prefix}")

    # The dataset-level record is written even when no objects were found.
    insert_info(s3_dataset, arg_desc, create_time)
|