Refactoring, github pipeline, s3 creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions

156 scripts/helpers/bag.py Normal file

@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package
logger = logging.getLogger(__name__)
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
    # Already compressed archives
    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
    # Compressed images
    'jpg', 'jpeg', 'png', 'gif', 'webp',
    # Compressed video/audio
    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
    # Other compressed/binary formats
    'pdf', 'docx', 'xlsx', 'pptx',
}

def zip_archive(bag_dir, archive_path):
    """Zip up a nabit archive and create metadata."""
    # Create zip archive
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_path in bag_dir.rglob('*'):
            if file_path.is_file():
                arc_path = file_path.relative_to(bag_dir)
                compression = (zipfile.ZIP_STORED
                               if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
                               else zipfile.ZIP_DEFLATED)
                zf.write(file_path, arc_path, compress_type=compression)

    # Create metadata file
    zip_info = []
    with zipfile.ZipFile(archive_path, 'r') as zf:
        for info in zf.filelist:
            header_offset = info.header_offset

            # Read header to calculate data offset
            zf.fp.seek(header_offset)
            header = zf.fp.read(zipfile.sizeFileHeader)
            fheader = struct.unpack(zipfile.structFileHeader, header)
            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length

            zip_info.append({
                'filename': info.filename,
                'file_size': info.file_size,
                'compress_size': info.compress_size,
                'compress_type': info.compress_type,
                'header_offset': header_offset,
                'data_offset': data_offset,
            })

    # Read the bag-info.txt and signed-metadata.json
    bag_info = (bag_dir / 'bag-info.txt').read_text()
    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())

    return {
        'bag_info': bag_info,
        'signed_metadata': signed_metadata,
        'zip_entries': zip_info,
    }

def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
    """Upload archive and metadata to S3."""
    s3 = boto3.Session(**session_args).client('s3')
    bucket_name, s3_key_prefix = s3_path.split('/', 1)

    # Upload zip file
    s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")

    # Upload metadata file
    s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")

def cleanup_files(collection_path, no_delete=False, s3_path=None):
    """Clean up local files after upload if needed."""
    if not no_delete and s3_path:
        logger.info("- Deleting local zip archive...")
        if os.path.exists(collection_path):
            os.remove(collection_path)
        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
            os.rmdir(collection_path.parent)

def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")

        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")
        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")
        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

        if s3_path:
            logger.info("Uploading to S3...")
            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

        cleanup_files(collection_path, no_delete, s3_path)
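
Not part of the commit, but to make the callback contract concrete: a minimal sketch of a caller, assuming the module is importable as scripts.helpers.bag and that nabit's package() accepts a paths keyword for local files (both assumptions; check the real signatures). The callback receives the temporary source_files directory, writes whatever it needs there, and returns extra kwargs for package().

from pathlib import Path

from scripts.helpers.bag import fetch_and_upload  # assumed import path

def create_archive(source_files_dir: Path) -> dict:
    # Write the files to be bagged, then return extra kwargs for nabit's package().
    (source_files_dir / "example.txt").write_text("hello")
    return {"paths": [source_files_dir]}  # assumed kwarg name

output_path = Path("data/out")
output_path.mkdir(parents=True, exist_ok=True)
fetch_and_upload(
    output_path=output_path,
    collection_path=output_path / "collections/example/archive.zip",
    metadata_path=output_path / "metadata/example.json",
    create_archive_callback=create_archive,
    s3_path="my-bucket/prefix",                 # "<bucket>/<key prefix>", as split by upload_archive()
    session_args={"profile_name": "default"},   # forwarded to boto3.Session()
    no_delete=True,                             # keep the local zip after uploading
)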

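Continuing the hypothetical example above (same made-up bucket, prefix, and paths): the offsets zip_archive() records appear to be enough to pull a single member out of the uploaded zip with a ranged S3 read, without downloading the whole archive. A sketch, not code from this commit:

import json
import zipfile
import zlib

import boto3

# Load the metadata file written by fetch_and_upload() (hypothetical path from
# the previous sketch) and pick one recorded zip entry.
with open("data/out/metadata/example.json") as f:
    entry = json.load(f)["zip_entries"][0]

# Fetch just that member's bytes from the uploaded archive.
s3 = boto3.client("s3")
start = entry["data_offset"]
end = start + entry["compress_size"] - 1
body = s3.get_object(
    Bucket="my-bucket",
    Key="prefix/collections/example/archive.zip",
    Range=f"bytes={start}-{end}",
)["Body"].read()

# ZIP_DEFLATED members are a raw deflate stream (hence wbits=-15); ZIP_STORED
# members are already the file's bytes.
data = zlib.decompress(body, -15) if entry["compress_type"] == zipfile.ZIP_DEFLATED else body
print(entry["filename"], len(data))
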

@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio
logger = logging.getLogger(__name__)

async def get_client():
    op_config = load_config().get("1password", {})
    if not op_config.get('token'):
        raise Exception("1Password token not found in config")
    return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")

def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    return asyncio.run(save_item_async(vault_name, title, fields, notes, category))

async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    client = await get_client()

    vault_id = None
    vaults = await client.vaults.list_all()
    async for vault in vaults:
        if vault.title == vault_name:
            vault_id = vault.id
            break
    else:
        raise Exception(f"Vault {vault_name} not found")

    field_objs = []
    sections = []
    for field in fields:
        if field.get('concealed', False):
            field['field_type'] = ItemFieldType.CONCEALED
        else:
            field['field_type'] = ItemFieldType.TEXT
        field['id'] = field['title'].lower().replace(' ', '_')
        field_objs.append(ItemField(**field))
        if section_id := field.get('section_id'):
            if section_id not in sections:
                sections.append(section_id)
    sections = [ItemSection(id=section, title=section) for section in sections]

    # Create item parameters with sections
    create_params = ItemCreateParams(
        title=title,
        category=category,
        vault_id=vault_id,
        fields=field_objs,
        sections=sections,
    )
    if notes:
        create_params.notes = notes

    item = await client.items.create(create_params)
    logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
    return item

def share_item(
    item: Item,
    recipients: list[str] | None = None,
    expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
    one_time_only: bool = False,
):
    return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))

async def share_item_async(
    item: Item,
    recipients: list[str] | None,
    expire_after: ItemShareDuration | None,
    one_time_only: bool,
):
    client = await get_client()
    policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
    valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
    share_params = ItemShareParams(
        recipients=valid_recipients,
        expire_after=expire_after,
        one_time_only=one_time_only,
    )
    share_link = await client.items.shares.create(item, policy, share_params)
    logger.info(f"Created share link for '{item.title}'")
    return share_link

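Not shown in the diff: roughly how these helpers might be driven from a provisioning script. The vault title, field layout, and recipient below are invented, and whether ItemField tolerates the extra 'concealed' key depends on the 1Password SDK version in use.

item = save_item(
    "Data Vault",                                   # assumed vault title
    "example-s3-credentials",
    fields=[
        {"title": "access key id", "value": "AKIA..."},
        {"title": "secret access key", "value": "...", "concealed": True},  # stored as a concealed field
    ],
    notes="Created by the collection pipeline",
)

# Share with one colleague, single use, default seven-day expiry.
share_link = share_item(item, recipients=["colleague@example.org"], one_time_only=True)
print(share_link)
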

@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
        raise e

def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
    workers = workers or os.cpu_count() or 4

    # Configure logging based on whether we're running in parallel or not
    if log_level is None:
        log_level = 'INFO' if workers == 1 else 'WARNING'
    logging.basicConfig(
        level=log_level,
        format='[%(process)d] %(message)s'
    )

    logger.debug(f"Starting processing with {workers} workers")

    if workers > 1:
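
The hunk above removes run_parallel()'s logging setup along with its log_level parameter, so logging configuration presumably moves to the callers. A sketch of what a caller might now look like, guessing from the signature alone that processor is applied to each item of tasks; the import path is invented.

import logging

from scripts.helpers.parallel import run_parallel  # hypothetical module path

def process_record(record_id: int) -> None:
    # Stand-in for a real per-task processor.
    logging.getLogger(__name__).info("processing record %s", record_id)

if __name__ == "__main__":
    # Callers now own logging setup, since the helper no longer calls logging.basicConfig().
    logging.basicConfig(level="INFO", format="[%(process)d] %(message)s")
    run_parallel(process_record, range(100), workers=4, task_count=100)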