data-vault/scripts/collection/s3_tools.py
2025-02-05 10:21:50 -05:00

118 lines
4.3 KiB
Python

import boto3
import click
from tqdm import tqdm
import logging
from itertools import islice
# Module-level logger named after this module; used by the functions below
# for dry-run notices and per-object error reports.
logger = logging.getLogger(__name__)
def get_delete_markers(s3_client, bucket: str, prefix: str):
    """Yield the current delete markers under ``prefix``, one batch per page.

    Walks the ``list_object_versions`` paginator for ``bucket``/``prefix`` and,
    for every page that carries a ``DeleteMarkers`` entry, yields a list of
    ``{'Key': ..., 'VersionId': ...}`` dicts for the markers flagged
    ``IsLatest``. A yielded list can be empty when a page has delete markers
    but none of them is the latest version.
    """
    version_pages = s3_client.get_paginator('list_object_versions').paginate(
        Bucket=bucket,
        Prefix=prefix,
    )
    for page in tqdm(version_pages, desc="pages"):
        if 'DeleteMarkers' not in page:
            continue
        latest_markers = []
        for marker in page['DeleteMarkers']:
            # Only a marker that is the latest version is collected.
            if marker['IsLatest']:
                latest_markers.append(
                    {'Key': marker['Key'], 'VersionId': marker['VersionId']}
                )
        yield latest_markers
def remove_delete_markers(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Remove all current delete markers for objects with the given prefix.

    Args:
        s3_client: boto3 S3 client used for listing and deleting.
        bucket: Name of the bucket to operate on.
        prefix: Key prefix restricting which delete markers are considered.
        dry_run: When true, only log which markers would be removed and do
            not issue any delete request.

    Per-object failures reported by S3 are logged at ERROR level; they do not
    stop processing of the remaining batches.
    """
    for marker_batch in get_delete_markers(s3_client, bucket, prefix):
        # A page may contain delete markers none of which is the latest
        # version; S3 rejects a DeleteObjects request with an empty object
        # list, so skip those batches.
        if not marker_batch:
            continue

        if dry_run:
            for marker in marker_batch:
                logger.info(
                    "Would remove delete marker: %s (version %s)",
                    marker['Key'],
                    marker['VersionId'],
                )
            continue

        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={
                'Objects': marker_batch,
                # Quiet mode: the response lists only the failures.
                'Quiet': True,
            },
        )
        # Log any errors
        for error in response.get('Errors', []):
            logger.error("Failed to remove marker for %s: %s", error['Key'], error['Message'])
def get_empty_files(s3_client, bucket: str, prefix: str):
    """Yield the zero-size objects under ``prefix``, one batch per page.

    Walks the ``list_objects_v2`` paginator for ``bucket``/``prefix`` and, for
    every page that has a ``Contents`` entry, yields a list of
    ``{'Key': ...}`` dicts for the objects whose ``Size`` is 0. A yielded list
    can be empty when a page holds no zero-size objects.
    """
    listing_pages = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket,
        Prefix=prefix,
    )
    for page in tqdm(listing_pages, desc="pages"):
        if 'Contents' not in page:
            continue
        zero_sized = []
        for entry in page['Contents']:
            if entry['Size'] == 0:
                zero_sized.append({'Key': entry['Key']})
        yield zero_sized
def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Delete all zero-size objects under the given prefix.

    Args:
        s3_client: boto3 S3 client used for listing and deleting.
        bucket: Name of the bucket to operate on.
        prefix: Key prefix restricting which objects are considered.
        dry_run: When true, only log which objects would be deleted and do
            not issue any delete request.

    Per-object failures reported by S3 are logged at ERROR level; they do not
    stop processing of the remaining batches.
    """
    # Use the bar as a context manager so it is closed even when a request
    # raises part-way through.
    with tqdm(desc="deleted") as pbar:
        for empty_batch in get_empty_files(s3_client, bucket, prefix):
            if not empty_batch:
                continue

            if dry_run:
                for obj in empty_batch:
                    logger.info("Would delete empty file: %s", obj['Key'])
                continue

            response = s3_client.delete_objects(
                Bucket=bucket,
                Delete={
                    'Objects': empty_batch,
                    # Quiet mode: the response lists only the failures.
                    'Quiet': True,
                },
            )

            # Log any errors
            errors = response.get('Errors', [])
            for error in errors:
                logger.error("Failed to delete %s: %s", error['Key'], error['Message'])

            # Advance the bar only after the request returned, and only by
            # the number of objects S3 did not report as failed.
            pbar.update(len(empty_batch) - len(errors))
# Root command group; the subcommands below register themselves on it via
# ``@cli.command()``. The docstring is kept verbatim because click shows it
# as the group's help text.
@click.group()
def cli():
    """S3 object management commands."""
@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Remove delete markers from versioned S3 objects, effectively undeleting them."""
    # NOTE: the docstring above is shown by click as the command's help text,
    # so implementation notes live in comments instead.
    logging.basicConfig(level=log_level)

    # Accept both "bucket/prefix" and "s3://bucket/prefix". partition() never
    # raises, so a bare bucket name yields an empty prefix (i.e. the whole
    # bucket) instead of an unpacking ValueError.
    scheme = 's3://'
    location = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    bucket, _, prefix = location.partition('/')
    if not bucket:
        raise click.BadParameter(
            f"no bucket name found in {s3_path!r}; expected BUCKET/PREFIX",
            param_hint='s3_path',
        )

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')
    remove_delete_markers(s3_client, bucket, prefix, dry_run)
@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Delete all zero-size objects under the given prefix."""
    # NOTE: the docstring above is shown by click as the command's help text,
    # so implementation notes live in comments instead.
    logging.basicConfig(level=log_level)

    # Accept both "bucket/prefix" and "s3://bucket/prefix". partition() never
    # raises, so a bare bucket name yields an empty prefix (i.e. the whole
    # bucket) instead of an unpacking ValueError.
    scheme = 's3://'
    location = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    bucket, _, prefix = location.partition('/')
    if not bucket:
        raise click.BadParameter(
            f"no bucket name found in {s3_path!r}; expected BUCKET/PREFIX",
            param_hint='s3_path',
        )

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')
    delete_empty_files(s3_client, bucket, prefix, dry_run)
if __name__ == "__main__":
    # Executed as a script: hand control to the click command group.
    cli()