mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-07-04 13:46:56 -04:00
Refactoring, github pipeline, s3 creation
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions
156 scripts/helpers/bag.py Normal file
@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package

logger = logging.getLogger(__name__)

# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
    # Already compressed archives
    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
    # Compressed images
    'jpg', 'jpeg', 'png', 'gif', 'webp',
    # Compressed video/audio
    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
    # Other compressed/binary formats
    'pdf', 'docx', 'xlsx', 'pptx',
}

def zip_archive(bag_dir, archive_path):
    """Zip up a nabit archive and create metadata."""
    # Create zip archive
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_path in bag_dir.rglob('*'):
            if file_path.is_file():
                arc_path = file_path.relative_to(bag_dir)
                compression = (zipfile.ZIP_STORED
                               if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
                               else zipfile.ZIP_DEFLATED)
                zf.write(file_path, arc_path, compress_type=compression)

    # Create metadata file
    zip_info = []
    with zipfile.ZipFile(archive_path, 'r') as zf:
        for info in zf.filelist:
            header_offset = info.header_offset

            # Read header to calculate data offset
            zf.fp.seek(header_offset)
            header = zf.fp.read(zipfile.sizeFileHeader)
            fheader = struct.unpack(zipfile.structFileHeader, header)
            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length

            zip_info.append({
                'filename': info.filename,
                'file_size': info.file_size,
                'compress_size': info.compress_size,
                'compress_type': info.compress_type,
                'header_offset': header_offset,
                'data_offset': data_offset,
            })

    # Read the bag-info.txt and signed-metadata.json
    bag_info = (bag_dir / 'bag-info.txt').read_text()
    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())

    return {
        'bag_info': bag_info,
        'signed_metadata': signed_metadata,
        'zip_entries': zip_info
    }

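The data_offset recorded for each entry is what makes this metadata useful later: a consumer can pull a single member out of the uploaded zip with an S3 ranged read instead of fetching the whole archive. A minimal sketch of that consumer side, assuming an entry dict shaped like the ones in zip_entries above and an existing boto3 S3 client; the function name and bucket/key arguments are illustrative, not part of this commit:

import zlib

def read_member(s3_client, bucket, key, entry):
    """Fetch one zip member via an S3 Range request, using the recorded offsets."""
    start = entry['data_offset']
    end = start + entry['compress_size'] - 1  # HTTP Range is inclusive
    body = s3_client.get_object(Bucket=bucket, Key=key, Range=f'bytes={start}-{end}')['Body'].read()
    if entry['compress_type'] == zipfile.ZIP_STORED:
        # Stored members are the raw bytes
        return body
    # Deflated members are a raw deflate stream (no zlib header), so use wbits=-15
    return zlib.decompressobj(-15).decompress(body)
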
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
    """Upload archive and metadata to S3."""
    s3 = boto3.Session(**session_args).client('s3')
    bucket_name, s3_key_prefix = s3_path.split('/', 1)

    # Upload zip file
    s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")

    # Upload metadata file
    s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")

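Note that s3_path is the bucket name and key prefix joined by a single slash (the function splits on the first '/'), and session_args is passed straight through to boto3.Session. A hypothetical call, with placeholder bucket, prefix, and paths:

upload_archive(
    output_path=Path('data'),
    collection_path=Path('data/collections/example/archive.zip'),
    metadata_path=Path('data/collections/example/archive.json'),
    s3_path='my-vault-bucket/collections',
    session_args={'profile_name': 'default'},
)
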
def cleanup_files(collection_path, no_delete=False, s3_path=None):
    """Clean up local files after upload if needed."""
    if not no_delete and s3_path:
        logger.info("- Deleting local zip archive...")
        if os.path.exists(collection_path):
            os.remove(collection_path)
        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
            os.rmdir(collection_path.parent)

def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")
        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")
        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")
        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

        if s3_path:
            logger.info("Uploading to S3...")
            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

        cleanup_files(collection_path, no_delete, s3_path)

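Callers supply create_archive_callback, which receives the temporary source_files_dir, writes whatever should be preserved into it, and returns extra keyword arguments forwarded to nabit's package(). A sketch of a caller; the callback name, the file it writes, and the returned kwargs (paths, signed_metadata) are assumptions about nabit's API rather than something this commit shows:

def fetch_example_collection(source_files_dir):
    # Hypothetical callback: write one file into the temp source directory.
    (source_files_dir / 'example.json').write_text('{"hello": "world"}')
    # Returned kwargs are passed through to package(); the key names are assumed.
    return {
        'paths': [source_files_dir],
        'signed_metadata': {'description': 'example dataset'},
    }

fetch_and_upload(
    output_path=Path('data'),
    collection_path=Path('data/collections/example.zip'),
    metadata_path=Path('data/collections/example.metadata.json'),
    create_archive_callback=fetch_example_collection,
    s3_path='my-vault-bucket/collections',   # omit to keep everything local
    session_args={'profile_name': 'default'},
)
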
85 scripts/helpers/onepassword.py Normal file
@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio

logger = logging.getLogger(__name__)

async def get_client():
    op_config = load_config().get("1password", {})
    if not op_config.get('token'):
        raise Exception("1Password token not found in config")

    return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")

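get_client only needs the loaded config to contain a 1password section with a service-account token. Whatever format load_config reads, the structure it returns amounts to the following; the token value is a placeholder:

config = {
    "1password": {
        "token": "ops_...",  # 1Password service account token (placeholder)
    },
}
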
def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    return asyncio.run(save_item_async(vault_name, title, fields, notes, category))

async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    client = await get_client()
    vault_id = None
    vaults = await client.vaults.list_all()
    async for vault in vaults:
        if vault.title == vault_name:
            vault_id = vault.id
            break
    else:
        raise Exception(f"Vault {vault_name} not found")

    field_objs = []
    sections = []
    for field in fields:
        if field.get('concealed', False):
            field['field_type'] = ItemFieldType.CONCEALED
        else:
            field['field_type'] = ItemFieldType.TEXT
        field['id'] = field['title'].lower().replace(' ', '_')
        field_objs.append(ItemField(**field))
        if section_id := field.get('section_id'):
            if section_id not in sections:
                sections.append(section_id)
    sections = [ItemSection(id=section, title=section) for section in sections]

    # Create item parameters with sections
    create_params = ItemCreateParams(
        title=title,
        category=category,
        vault_id=vault_id,
        fields=field_objs,
        sections=sections,
    )

    if notes:
        create_params.notes = notes

    item = await client.items.create(create_params)
    logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
    return item

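For reference, the fields argument is a list of plain dicts: save_item derives each field's id from its title and maps an optional concealed flag to the 1Password field type. A hypothetical call with placeholder vault name and values:

item = save_item(
    vault_name='Data Vault',
    title='example-bucket credentials',
    fields=[
        {'title': 'Access Key ID', 'value': 'AKIA...', 'section_id': 'aws'},
        {'title': 'Secret Access Key', 'value': '...', 'section_id': 'aws', 'concealed': True},
    ],
    notes='Created by the data-vault scripts.',
)
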
def share_item(
    item: Item,
    recipients: list[str] | None = None,
    expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
    one_time_only: bool = False
):
    return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))

async def share_item_async(
    item: Item,
    recipients: list[str] | None,
    expire_after: ItemShareDuration | None,
    one_time_only: bool,
):
    client = await get_client()
    policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
    valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
    share_params = ItemShareParams(
        recipients=valid_recipients,
        expire_after=expire_after,
        one_time_only=one_time_only
    )
    share_link = await client.items.shares.create(item, policy, share_params)
    logger.info(f"Created share link for '{item.title}'")
    return share_link

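And a hypothetical follow-up that shares the item returned by save_item for the default seven days with a named recipient; the email address is a placeholder:

share_link = share_item(
    item,
    recipients=['someone@example.org'],
    expire_after=ItemShareDuration.SEVENDAYS,
    one_time_only=True,
)
print(share_link)
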
@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
             raise e


-def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
+def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
     workers = workers or os.cpu_count() or 4

-    # Configure logging based on whether we're running in parallel or not
-    if log_level is None:
-        log_level = 'INFO' if workers == 1 else 'WARNING'
-    logging.basicConfig(
-        level=log_level,
-        format='[%(process)d] %(message)s'
-    )

     logger.debug(f"Starting processing with {workers} workers")

     if workers > 1: