data-vault/scripts/helpers/bag.py

import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package

logger = logging.getLogger(__name__)

# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
    # Already compressed archives
    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
    # Compressed images
    'jpg', 'jpeg', 'png', 'gif', 'webp',
    # Compressed video/audio
    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
    # Other compressed/binary formats
    'pdf', 'docx', 'xlsx', 'pptx',
}
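# For example, a file named 'photo.JPG' yields the suffix 'jpg', which is in
# this set, so zip_archive() below stores it as-is rather than deflating it again.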


def zip_archive(bag_dir, archive_path):
    """
    Zip up a nabit archive and create metadata.

    Files whose extensions appear in UNCOMPRESSED_EXTENSIONS are stored without
    compression; everything else is deflated. Returns a dict with the bag-info.txt
    contents, the signed metadata, and the offset of each entry within the zip.
    """
    # Create zip archive
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_path in bag_dir.rglob('*'):
            if file_path.is_file():
                arc_path = file_path.relative_to(bag_dir)
                compression = (zipfile.ZIP_STORED
                               if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
                               else zipfile.ZIP_DEFLATED)
                zf.write(file_path, arc_path, compress_type=compression)

    # Create metadata file
    zip_info = []
    with zipfile.ZipFile(archive_path, 'r') as zf:
        for info in zf.filelist:
            header_offset = info.header_offset
            # Read the local file header to calculate where the entry's data starts
            zf.fp.seek(header_offset)
            header = zf.fp.read(zipfile.sizeFileHeader)
            fheader = struct.unpack(zipfile.structFileHeader, header)
            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
            zip_info.append({
                'filename': info.filename,
                'file_size': info.file_size,
                'compress_size': info.compress_size,
                'compress_type': info.compress_type,
                'header_offset': header_offset,
                'data_offset': data_offset,
            })

    # Read the bag-info.txt and signed-metadata.json
    bag_info = (bag_dir / 'bag-info.txt').read_text()
    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())

    return {
        'bag_info': bag_info,
        'signed_metadata': signed_metadata,
        'zip_entries': zip_info,
    }
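
# A minimal usage sketch (paths are hypothetical). The recorded offsets make it
# possible to read a single entry's bytes later, e.g. via an HTTP/S3 range
# request against the uploaded zip, without downloading the whole archive:
#
#   metadata = zip_archive(Path('out/bag'), Path('out/archive.zip'))
#   entry = metadata['zip_entries'][0]
#   start = entry['data_offset']
#   end = start + entry['compress_size'] - 1
#   # e.g. s3.get_object(Bucket=..., Key=..., Range=f"bytes={start}-{end}")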


def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
    """Upload archive and metadata to S3."""
    s3 = boto3.Session(**session_args).client('s3')
    bucket_name, s3_key_prefix = s3_path.split('/', 1)

    # Upload zip file
    s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")

    # Upload metadata file
    s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")


def cleanup_files(collection_path, no_delete=False, s3_path=None):
    """Clean up local files after upload if needed."""
    if not no_delete and s3_path:
        logger.info("- Deleting local zip archive...")
        if os.path.exists(collection_path):
            os.remove(collection_path)
        # Remove the parent directory as well if it is now empty
        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
            os.rmdir(collection_path.parent)


def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")

        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")

        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")

        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

        if s3_path:
            logger.info("Uploading to S3...")
            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

        cleanup_files(collection_path, no_delete, s3_path)
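
# A minimal usage sketch (paths and callback are hypothetical; the callback's
# return value is forwarded as extra keyword arguments to nabit's package(),
# and the exact kwargs it should return depend on nabit's API):
#
#   def create_archive(source_files_dir):
#       # fetch or generate source material into source_files_dir here
#       (source_files_dir / 'data.csv').write_text('a,b\n1,2\n')
#       return {}
#
#   fetch_and_upload(
#       output_path=Path('out'),
#       collection_path=Path('out/collections/example.zip'),
#       metadata_path=Path('out/metadata/example.json'),
#       create_archive_callback=create_archive,
#       s3_path='my-bucket/archives',               # omit to keep files local
#       session_args={'profile_name': 'default'},
#   )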