mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-07-04 13:46:56 -04:00
Refactoring, github pipeline, s3 creation
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions
156 scripts/helpers/bag.py Normal file
@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package

logger = logging.getLogger(__name__)

# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
    # Already compressed archives
    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
    # Compressed images
    'jpg', 'jpeg', 'png', 'gif', 'webp',
    # Compressed video/audio
    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
    # Other compressed/binary formats
    'pdf', 'docx', 'xlsx', 'pptx',
}

def zip_archive(bag_dir, archive_path):
    """Zip up a nabit archive and create metadata."""
    # Create zip archive
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_path in bag_dir.rglob('*'):
            if file_path.is_file():
                arc_path = file_path.relative_to(bag_dir)
                compression = (zipfile.ZIP_STORED
                               if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
                               else zipfile.ZIP_DEFLATED)
                zf.write(file_path, arc_path, compress_type=compression)

    # Create metadata file
    zip_info = []
    with zipfile.ZipFile(archive_path, 'r') as zf:
        for info in zf.filelist:
            header_offset = info.header_offset

            # Read header to calculate data offset
            zf.fp.seek(header_offset)
            header = zf.fp.read(zipfile.sizeFileHeader)
            fheader = struct.unpack(zipfile.structFileHeader, header)
            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length

            zip_info.append({
                'filename': info.filename,
                'file_size': info.file_size,
                'compress_size': info.compress_size,
                'compress_type': info.compress_type,
                'header_offset': header_offset,
                'data_offset': data_offset,
            })

    # Read the bag-info.txt and signed-metadata.json
    bag_info = (bag_dir / 'bag-info.txt').read_text()
    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())

    return {
        'bag_info': bag_info,
        'signed_metadata': signed_metadata,
        'zip_entries': zip_info
    }

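The data_offset recorded for each entry is what makes this metadata useful later: a consumer can pull a single member out of the uploaded zip with an S3 ranged read instead of fetching the whole archive. A minimal sketch of that consumer side, assuming an entry dict shaped like the ones in zip_entries above and an existing boto3 S3 client; the function name and bucket/key arguments are illustrative, not part of this commit:

import zlib

def read_member(s3_client, bucket, key, entry):
    """Fetch one zip member via an S3 Range request, using the recorded offsets."""
    start = entry['data_offset']
    end = start + entry['compress_size'] - 1  # HTTP Range is inclusive
    body = s3_client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')['Body'].read()
    if entry['compress_type'] == zipfile.ZIP_STORED:
        # Stored members are the raw bytes
        return body
    # Deflated members are a raw deflate stream (no zlib header), so use wbits=-15
    return zlib.decompressobj(-15).decompress(body)
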
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
    """Upload archive and metadata to S3."""
    s3 = boto3.Session(**session_args).client('s3')
    bucket_name, s3_key_prefix = s3_path.split('/', 1)

    # Upload zip file
    s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")

    # Upload metadata file
    s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")

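Note that s3_path is the bucket name and key prefix joined by a single slash (the function splits on the first '/'), and session_args is passed straight through to boto3.Session. A hypothetical call, with placeholder bucket, prefix, and paths:

upload_archive(
    output_path=Path('data'),
    collection_path=Path('data/collections/example/archive.zip'),
    metadata_path=Path('data/collections/example/archive.json'),
    s3_path='my-vault-bucket/collections',
    session_args={'profile_name': 'default'},
)
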
def cleanup_files(collection_path, no_delete=False, s3_path=None):
    """Clean up local files after upload if needed."""
    if not no_delete and s3_path:
        logger.info("- Deleting local zip archive...")
        if os.path.exists(collection_path):
            os.remove(collection_path)
        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
            os.rmdir(collection_path.parent)

def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")
        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")
        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")
        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

        if s3_path:
            logger.info("Uploading to S3...")
            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

        cleanup_files(collection_path, no_delete, s3_path)

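Callers supply create_archive_callback, which receives the temporary source_files_dir, writes whatever should be preserved into it, and returns extra keyword arguments forwarded to nabit's package(). A sketch of a caller; the callback name, the file it writes, and the returned kwargs (paths, signed_metadata) are assumptions about nabit's API rather than something this commit shows:

def fetch_example_collection(source_files_dir):
    # Hypothetical callback: write one file into the temp source directory.
    (source_files_dir / 'example.json').write_text('{"hello": "world"}')
    # Returned kwargs are passed through to package(); the key names are assumed.
    return {
        'paths': [source_files_dir],
        'signed_metadata': {'description': 'example dataset'},
    }

fetch_and_upload(
    output_path=Path('data'),
    collection_path=Path('data/collections/example.zip'),
    metadata_path=Path('data/collections/example.metadata.json'),
    create_archive_callback=fetch_example_collection,
    s3_path='my-vault-bucket/collections',   # omit to keep everything local
    session_args={'profile_name': 'default'},
)
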
85 scripts/helpers/onepassword.py Normal file
@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio

logger = logging.getLogger(__name__)

async def get_client():
    op_config = load_config().get("1password", {})
    if not op_config.get('token'):
        raise Exception("1Password token not found in config")

    return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")

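get_client only needs the loaded config to contain a 1password section with a service-account token. Whatever format load_config reads, the structure it returns amounts to the following; the token value is a placeholder:

config = {
    "1password": {
        "token": "ops_...",  # 1Password service account token (placeholder)
    },
}
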
def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    return asyncio.run(save_item_async(vault_name, title, fields, notes, category))

async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    client = await get_client()
    vault_id = None
    vaults = await client.vaults.list_all()
    async for vault in vaults:
        if vault.title == vault_name:
            vault_id = vault.id
            break
    else:
        raise Exception(f"Vault {vault_name} not found")

    field_objs = []
    sections = []
    for field in fields:
        if field.get('concealed', False):
            field['field_type'] = ItemFieldType.CONCEALED
        else:
            field['field_type'] = ItemFieldType.TEXT
        field['id'] = field['title'].lower().replace(' ', '_')
        field_objs.append(ItemField(**field))
        if section_id := field.get('section_id'):
            if section_id not in sections:
                sections.append(section_id)
    sections = [ItemSection(id=section, title=section) for section in sections]

    # Create item parameters with sections
    create_params = ItemCreateParams(
        title=title,
        category=category,
        vault_id=vault_id,
        fields=field_objs,
        sections=sections,
    )

    if notes:
        create_params.notes = notes

    item = await client.items.create(create_params)
    logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
    return item

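For reference, the fields argument is a list of plain dicts: save_item derives each field's id from its title and maps an optional concealed flag to the 1Password field type. A hypothetical call with placeholder vault name and values:

item = save_item(
    vault_name='Data Vault',
    title='example-bucket credentials',
    fields=[
        {'title': 'Access Key ID', 'value': 'AKIA...', 'section_id': 'aws'},
        {'title': 'Secret Access Key', 'value': '...', 'section_id': 'aws', 'concealed': True},
    ],
    notes='Created by the data-vault scripts.',
)
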
def share_item(
    item: Item,
    recipients: list[str] | None = None,
    expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
    one_time_only: bool = False
):
    return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))

async def share_item_async(
    item: Item,
    recipients: list[str] | None,
    expire_after: ItemShareDuration | None,
    one_time_only: bool,
):
    client = await get_client()
    policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
    valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
    share_params = ItemShareParams(
        recipients=valid_recipients,
        expire_after=expire_after,
        one_time_only=one_time_only
    )
    share_link = await client.items.shares.create(item, policy, share_params)
    logger.info(f"Created share link for '{item.title}'")
    return share_link

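And a hypothetical follow-up that shares the item returned by save_item for the default seven days with a named recipient; the email address is a placeholder:

share_link = share_item(
    item,
    recipients=['someone@example.org'],
    expire_after=ItemShareDuration.SEVENDAYS,
    one_time_only=True,
)
print(share_link)
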
@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
             raise e


-def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
+def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
     workers = workers or os.cpu_count() or 4

-    # Configure logging based on whether we're running in parallel or not
-    if log_level is None:
-        log_level = 'INFO' if workers == 1 else 'WARNING'
-    logging.basicConfig(
-        level=log_level,
-        format='[%(process)d] %(message)s'
-    )

     logger.debug(f"Starting processing with {workers} workers")

     if workers > 1: