Refactoring, GitHub pipeline, S3 bucket creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions

pyproject.toml

@@ -18,6 +1,7 @@ dependencies = [
"cloudflare>=4.0.0",
"deepdiff>=8.2.0",
"orjson>=3.10.15",
"onepassword-sdk>=0.1.7",
]
[build-system]
@@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }
[tool.hatch.build.targets.wheel]
packages = ["scripts"]
[project.scripts]
vault = "scripts.cli:cli"


@@ -1,2 +0,0 @@
def hello() -> str:
return "Hello from data-mirror!"

scripts/cli.py Normal file

@@ -0,0 +1,118 @@
import click
import importlib
from pathlib import Path
import logging
import os
import sys
"""
Top-level CLI that registers commands using the following logic:
- Find all subdirectories WITH __init__.py
- Fetch a 'cli' click.group() from __init__.py if it exists, or create one
- For all python files in the subdirectory:
- If the file defines a 'cli' click.group(), add it to the subdirectory's group
- If the file defines a 'main' click.command(), add it to the subdirectory's group
- If any commands are added to the subdirectory's group, add the group to the main CLI
"""
def register_commands(cli: click.Group) -> None:
"""Find all command groups in the scripts directory."""
scripts_dir = Path(__file__).parent
# for each subdirectory, try to import subdir.__init__
for subdir in scripts_dir.glob('**'):
if not subdir.is_dir():
continue
# get group from __init__.py or create a new one
subdir_import = f"scripts.{subdir.name}"
try:
init_module = importlib.import_module(subdir_import+'.__init__')
except ImportError:
continue
if hasattr(init_module, 'cli'):
group = init_module.cli
else:
@click.group()
def group():
pass
# add commands from the subdirectory
for item in subdir.glob('*.py'):
if item.name == '__init__.py':
continue
file_import = f"{subdir_import}.{item.stem}"
module = importlib.import_module(file_import)
if type(getattr(module, 'cli', None)) == click.Group:
group.add_command(module.cli, name=item.stem)
elif type(getattr(module, 'main', None)) == click.Command:
group.add_command(module.main, name=item.stem)
# add the group to the main cli
if group.commands:
cli.add_command(group, name=subdir.name)
@click.group()
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='WARNING',
help='Logging level.')
def cli(log_level):
"""Main CLI entry point for the vault tool."""
logging.basicConfig(level=log_level)
@cli.command()
@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']),
help='Shell to generate completion script for. Defaults to auto-detect.')
def completion(shell):
"""Install tab completion for the CLI.
Examples:
# Auto-detect shell and print instructions
$ vault completion
# Generate completion script for bash
$ vault completion --shell=bash > ~/.vault-complete.bash
# Generate completion script for zsh
$ vault completion --shell=zsh > ~/.vault-complete.zsh
# Generate completion script for fish
$ vault completion --shell=fish > ~/.config/fish/completions/vault.fish
"""
# Auto-detect shell if not specified
if not shell:
shell = os.environ.get('SHELL', '')
if 'bash' in shell:
shell = 'bash'
elif 'zsh' in shell:
shell = 'zsh'
elif 'fish' in shell:
shell = 'fish'
else:
click.echo("Could not auto-detect shell. Please specify with --shell option.")
return
# Get the script name (executable name)
script_name = os.path.basename(sys.argv[0])
env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE"
if shell == 'bash':
click.echo(f"# Save the completion script:")
click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash")
click.echo("\n# Then add this line to your ~/.bashrc:")
click.echo(f". ~/.{script_name}-complete.bash")
elif shell == 'zsh':
click.echo(f"# Save the completion script:")
click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh")
click.echo("\n# Then add this line to your ~/.zshrc:")
click.echo(f". ~/.{script_name}-complete.zsh")
elif shell == 'fish':
click.echo(f"# Save the completion script to the fish completions directory:")
click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish")
register_commands(cli)
if __name__ == "__main__":
cli()

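As an illustration of the registration convention described at the top of scripts/cli.py, a command file it would pick up could look like the sketch below. The scripts/example/ package, its report.py module, and the --limit option are hypothetical and not part of this commit; any scripts/<subdir>/<file>.py defining a click `main` command is exposed as `vault <subdir> <file>` in the same way, provided the subdirectory has an __init__.py.

# scripts/example/report.py (hypothetical)
import click

@click.command()
@click.option('--limit', type=int, default=10, help='Rows to include.')
def main(limit):
    """Picked up by register_commands() and exposed as `vault example report`."""
    click.echo(f"reporting on {limit} rows")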

@@ -1,2 +0,0 @@
def hello() -> str:
return "Hello from data-mirror!"


@@ -54,15 +54,9 @@ def cli():
help='Key prefixes to restrict access to. Can be specified multiple times.')
@click.option('--objects', '-o', multiple=True,
help='Specific object keys to restrict access to. Can be specified multiple times.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
objects: tuple[str, ...], log_level: str):
"""Generate temporary Cloudflare R2 access credentials."""
# Setup logging
logging.basicConfig(level=log_level)
# Load config
config = load_config().get("temp_tokens", {})


@@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None:
@click.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
@click.option('--limit', '-n', type=int, default=None,
help='Maximum number of rows to display. Default: all rows.')
def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
def main(db_path: Path, output_path: Path, limit: int | None):
"""Render the Dataset table to an HTML file."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger.info(f"Connecting to database at {db_path}")
db.init(db_path)


@@ -2,14 +2,12 @@ import boto3
import click
from tqdm import tqdm
import logging
from itertools import islice
import json
import gzip
from io import BytesIO
import tempfile
import os
from scripts.helpers.misc import json_default
import zipfile
from scripts.helpers.onepassword import save_item, share_item
logger = logging.getLogger(__name__)
@@ -122,11 +120,8 @@ def cli():
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
def undelete(s3_path: str, profile: str = None, dry_run: bool = False):
"""Remove delete markers from versioned S3 objects, effectively undeleting them."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
session = boto3.Session(profile_name=profile)
@@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False):
"""Delete all zero-size objects under the given prefix."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
session = boto3.Session(profile_name=profile)
@@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--output', '-o', help='Output path for index file', default=None)
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
def write_index(s3_path: str, profile: str = None, output: str | None = None):
"""Write a JSONL index of all files under the given prefix."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
if output is None:
@@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo
write_file_listing(s3_client, bucket, prefix, output)
@cli.command()
@click.argument('bucket_name')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--region', '-r', help='AWS region', default='us-east-1')
@click.option('--tag', '-t', help='Tag the bucket with a name', default=None)
def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None):
"""Create a new S3 bucket with versioning enabled by default."""
session = boto3.Session(profile_name=profile)
s3_client = session.client('s3')
# Ensure bucket exists
try:
if region == 'us-east-1':
s3_client.create_bucket(Bucket=bucket_name)
else:
s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={'LocationConstraint': region},
)
except s3_client.exceptions.BucketAlreadyExists:
logger.warning(f"Bucket {bucket_name} already exists. Updating settings.")
# Configure bucket
s3_client.put_bucket_versioning(
Bucket=bucket_name,
VersioningConfiguration={'Status': 'Enabled'}
)
logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled")
@cli.command()
@click.argument('bucket_name')
@click.argument('username')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy')
@click.option('--op-vault', help='1Password vault to store credentials in', default='Private')
@click.option('--op-share', help='Share the credentials with the given email', default=None)
def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None):
"""Generate temporary S3 credentials with read/write/list access for a specific bucket."""
session = boto3.Session(profile_name=profile)
iam_client = session.client('iam')
# Define inline policy for bucket access
bucket_policy = {
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject",
"s3:ListBucket"
],
"Resource": [
f"arn:aws:s3:::{bucket_name}",
f"arn:aws:s3:::{bucket_name}/*"
]
}]
}
# Create the IAM user with permissions boundary
try:
iam_client.create_user(
UserName=username,
PermissionsBoundary=permissions_boundary
)
logger.info(f"Created IAM user: {username}")
except iam_client.exceptions.EntityAlreadyExistsException:
logger.warning(f"User {username} already exists")
# Attach inline policy directly to user
iam_client.put_user_policy(
UserName=username,
PolicyName=f"{bucket_name}-access",
PolicyDocument=json.dumps(bucket_policy)
)
logger.info(f"Attached bucket access policy to user {username}")
# Create access key for the user
response = iam_client.create_access_key(UserName=username)
credentials = response['AccessKey']
# Output the credentials
click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}")
click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}")
# Save credentials to 1Password if requested
if op_vault:
item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [
{
'title': 'Access Key ID',
'value': credentials['AccessKeyId'],
'section_id': 's3_details'
},
{
'title': 'Secret Access Key',
'value': credentials['SecretAccessKey'],
'section_id': 's3_details'
},
{
'title': 'S3 Bucket',
'value': bucket_name,
'section_id': 's3_details'
},
])
if op_share:
share_link = share_item(item, [op_share])
click.echo(f"To share credentials with {op_share}, use the following link: {share_link}")
if __name__ == '__main__':
cli()

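One way to sanity-check the keys printed by create_user is to open a boto3 session with them and list the granted bucket. This is an illustrative sketch rather than part of the commit; the environment variable names match the command's output, the bucket name is a placeholder, and freshly created access keys can take a few seconds to become usable.

# Hypothetical smoke test for credentials produced by the create_user command above
import os
import boto3

session = boto3.Session(
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)
s3 = session.client('s3')
# Listing the granted bucket should succeed; other buckets should raise AccessDenied.
response = s3.list_objects_v2(Bucket='example-bucket', MaxKeys=1)
print(response.get('KeyCount', 0))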

@@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
help='Directory to write uncompressed files')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
def main(json_url: str, zip_url: str, output: Path = None):
"""Verify dataset from JSON and ZIP URLs"""
# Set up logging
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
verify_dataset(json_url, zip_url, output)
if __name__ == '__main__':


@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
@click.option('--compare-by', '-c',
default='id',
help='Field to compare by.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
"""Compare records between CSV and JSONL files."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
old_data = load_jsonl_data(old_path, compare_by=compare_by)
new_data = load_jsonl_data(new_path, compare_by=compare_by)


@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
from pathlib import Path
# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
for line in f:
data = json.loads(line)
org = json.loads(data.get('organization', '{}'))
identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
titles_by_org[org['title']].append(data["title"])
if __name__ == "__main__":
# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
print(f"{date}: {count}")
# sort each list of titles by org
for org, titles in titles_by_org.items():
titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
# print urls
for path in Path('data/').glob('glass*'):
print(path)
with open(path, 'r') as f:
# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
for line in f:
data = json.loads(line)
print("* " + data['name'])
resources = data.get('resources', [])
if type(resources) == str:
resources = json.loads(resources)
for resource in resources:
print(' * ' + resource['url'])
org = json.loads(data.get('organization', '{}'))
identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
titles_by_org[org['title']].append(data["title"])
# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
print(f"{date}: {count}")
# sort each list of titles by org
for org, titles in titles_by_org.items():
titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
# print urls
for path in Path('data/').glob('glass*'):
print(path)
with open(path, 'r') as f:
for line in f:
data = json.loads(line)
print("* " + data['name'])
resources = data.get('resources', [])
if type(resources) == str:
resources = json.loads(resources)
for resource in resources:
print(' * ' + resource['url'])


@@ -10,31 +10,17 @@ import os
from urllib.parse import urlparse
import re
from scripts.helpers.parallel import run_parallel
import zipfile
import struct
import boto3
import logging
from scripts.data_gov.models import db, Dataset
from playhouse.shortcuts import model_to_dict
from tqdm import tqdm
from datetime import datetime
from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload
logger = logging.getLogger(__name__)
## download data.gov datasets, create nabit archives, and upload to S3
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
# Already compressed archives
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
# Compressed images
'jpg', 'jpeg', 'png', 'gif', 'webp',
# Compressed video/audio
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
# Other compressed/binary formats
'pdf', 'docx', 'xlsx', 'pptx',
}
stats_counter = {}
def is_valid_url(url):
@@ -55,95 +41,6 @@ def extract_urls(data, urls = None):
extract_urls(item, urls)
return urls
def create_archive(bag_dir, dataset: Dataset, signatures):
data_dict = model_to_dict(dataset)
for key, value in data_dict.items():
if isinstance(value, datetime):
data_dict[key] = value.isoformat()
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
collect = [
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
]
logger.info(f" - Downloading {len(collect)} files")
# sort fields from dataset
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
# Create the archive
package(
output_path=bag_dir,
collect=collect,
collect_errors='ignore',
signed_metadata={
'id': str(uuid.uuid4()),
'url': data_gov_url,
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
'data_gov_metadata': data_gov_metadata,
'crawler_metadata': crawler_metadata,
},
signatures=signatures,
)
def zip_archive(bag_dir, archive_path):
# Create zip archive
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for file_path in bag_dir.rglob('*'):
if file_path.is_file():
arc_path = file_path.relative_to(bag_dir)
compression = (zipfile.ZIP_STORED
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
else zipfile.ZIP_DEFLATED)
zf.write(file_path, arc_path, compress_type=compression)
# Create metadata file
zip_info = []
with zipfile.ZipFile(archive_path, 'r') as zf:
for info in zf.filelist:
header_offset = info.header_offset
# Read header to calculate data offset
zf.fp.seek(header_offset)
header = zf.fp.read(zipfile.sizeFileHeader)
fheader = struct.unpack(zipfile.structFileHeader, header)
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
zip_info.append({
'filename': info.filename,
'file_size': info.file_size,
'compress_size': info.compress_size,
'compress_type': info.compress_type,
'header_offset': header_offset,
'data_offset': data_offset,
})
# Read the bag-info.txt and signed-metadata.json
bag_info = (bag_dir / 'bag-info.txt').read_text()
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
return {
'bag_info': bag_info,
'signed_metadata': signed_metadata,
'zip_entries': zip_info
}
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
s3 = boto3.Session(**session_args).client('s3')
bucket_name, s3_path = s3_path.split('/', 1)
# Upload zip file
s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
# Upload metadata file
s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
def run_pipeline(
dataset: Dataset,
output_path: Path,
@@ -162,36 +59,43 @@ def run_pipeline(
# set this here so it makes it into the metadata
dataset.crawler_downloaded_date = datetime.now()
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
logger.info("- Creating archive...")
# set up paths
temp_dir = Path(temp_dir)
bag_dir = temp_dir / 'bag'
archive_path = temp_dir / 'archive.zip'
def create_archive(temp_dir):
data_dict = model_to_dict(dataset)
for key, value in data_dict.items():
if isinstance(value, datetime):
data_dict[key] = value.isoformat()
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
collect = [
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
]
logger.info(f" - Downloading {len(collect)} files")
# download data with nabit
create_archive(bag_dir, dataset, signatures)
# sort fields from dataset
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
logger.info("- Zipping archive...")
# zip up data and create metadata
output_metadata = zip_archive(bag_dir, archive_path)
return {
'collect': collect,
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': data_gov_url,
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
'data_gov_metadata': data_gov_metadata,
'crawler_metadata': crawler_metadata,
},
}
logger.info("- Moving files to final location...")
# Move files to final location
collection_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
os.rename(str(archive_path), collection_path)
metadata_path.write_text(json.dumps(output_metadata) + '\n')
if s3_path:
logger.info("Uploading to S3...")
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
if not no_delete:
logger.info("- Deleting zip archive...")
os.remove(collection_path)
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
os.rmdir(collection_path.parent)
# Use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("- Setting crawler_downloaded_date...")
db.connect()
@@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int =
@click.option('--signatures', help='JSON string of signature configuration.')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many collections', type=int)
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False):
if dataset_name:
workers = 1


@@ -151,17 +151,8 @@ def cli():
help='Number of results to fetch per page.')
@click.option('--start-date', '-s', type=str, default=None,
help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='WARNING',
help='Logging level.')
def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
def fetch(output_path: Path, rows_per_page: int, start_date: str):
"""Fetch package data from data.gov API and save to database."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
save_packages_to_database(output_path, rows_per_page, start_date)
@cli.command()


@@ -22,19 +22,10 @@ def cli():
@click.argument('output_path', type=click.Path(path_type=Path))
@click.option('--rows-per-page', '-r', type=int, default=1000,
help='Number of results to fetch per page.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
help='Start date for fetching packages in YYYY-MM-DD format.')
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
"""Fetch all package data from data.gov API and save to gzipped JSONL file."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
if output_path.is_dir():
current_date = datetime.now().strftime('%Y%m%d')
output_path = output_path / f'data_{current_date}.jsonl.gz'
@@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str
@cli.command()
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def compare(file1: Path, file2: Path, log_level: str):
"""Compare two gzipped JSONL files by indexing on the 'name' key."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
# Check for pickle file
pickle_path = file_path.with_suffix('.pickle')


@@ -60,7 +60,7 @@ class Dataset(BaseModel):
# fields starting with crawler_ are added by our crawler
crawler_identified_date = DateTimeField(null=True, default=datetime.now)
crawler_downloaded_date = DateTimeField(null=True)
crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)
crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True)
class DatasetHistory(Dataset):


@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
import uuid
import requests
from datetime import datetime
from scripts.helpers.misc import load_config
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload
logger = logging.getLogger(__name__)
stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
output_file.write(f"{repo_link}\n")
return exists
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
def run_pipeline(
org_name,
repo_name,
collection_path,
include,
token,
metadata_path=None,
output_path=None,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
save_raw=False,
raw_dir=None,
check_exists=False,
output_deleted=None,
):
"""Process a single repository."""
# existing checking mode
if check_exists:
return check_repo_exists(org_name, repo_name, token, output_path)
logger.info(f"Processing repository: {org_name}/{repo_name}")
Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
return check_repo_exists(org_name, repo_name, token, output_deleted)
# raw saving mode
if save_raw:
raw_path = raw_dir / org_name / repo_name
Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
logger.info("Processing complete")
return
def create_archive_callback(temp_dir):
if raw_dir:
raw_path = raw_dir / org_name / repo_name
if raw_path.exists():
out_dir = raw_path
else:
Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
out_dir = temp_dir
return {
'collect': [
PathCollectionTask(path=out_dir)
],
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': f'https://github.com/{org_name}/{repo_name}',
'description': f'Archive of GitHub repository {org_name}/{repo_name}',
'github_metadata': {
'org': org_name,
'repo': repo_name,
'archived_date': datetime.now().isoformat()
},
},
}
# Archive mode - use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive_callback,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("Processing complete")
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
check_exists: bool = False):
def get_tasks(
csv_path: Path,
output_path: Path,
skip_rows: int = 0,
skip_existing: bool = False,
stop_after: int = None,
include: str = None,
archive_mode: bool = False,
signatures: list = None,
session_args: dict = None,
s3_path: str = None,
no_delete: bool = False,
save_raw: bool = False,
raw_dir: Path = None,
check_exists: bool = False,
output_deleted: Path = None,
):
"""Get repositories from CSV that haven't been processed yet."""
# Initialize progress bars
if not check_exists:
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
continue
org_name, repo_name = row['html_url'].split('/')[-2:]
collection_path = output_path / 'collections' / collection / org_name / repo_name
if skip_existing:
if collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
if skip_existing and collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
# use tokens round robin
token = tokens[processed % len(tokens)]
yield org_name, repo_name, collection_path, include, token, check_exists, output_path
yield (
org_name,
repo_name,
collection_path,
include,
token,
output_deleted,
archive_mode,
metadata_path,
output_path,
signatures,
session_args,
s3_path,
save_raw,
raw_dir,
check_exists,
no_delete,
)
processed += 1
if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
for counter in stats_counter.values():
counter.close()
@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
help='Output path.')
@click.option('--collection', '-c', type=str, default='github_raw',
help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
help='Path to the CSV file.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
@click.option('--signatures', help='JSON string of signature configuration.')
# upload settings
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
# raw saving
# useful if doing multiple runs with the same csv and different --include values
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
log_level=None, stop_after=None, skip_existing=False, check_exists=False):
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
def main(profile, workers, **kwargs):
session_args = {}
if profile:
session_args['profile_name'] = profile
if signatures := kwargs.get('signatures'):
signatures = json.loads(signatures)
for signature in signatures:
if signature['action'] == 'sign':
if is_encrypted_key(signature['params']['key']):
signature['params']['password'] = click.prompt(
f"Enter password for {signature['params']['key']}: ",
hide_input=True
)
elif signature['action'] == 'timestamp':
if known_tsa := signature.pop('known_tsa', None):
signature['params'] = KNOWN_TSAS[known_tsa]
kwargs['signatures'] = signatures
run_parallel(
run_pipeline,
get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
get_tasks(**kwargs),
workers,
log_level=log_level
)
if __name__ == "__main__":

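For reference, the --signatures option handled in main() above expects a JSON-encoded list of actions. The shape below is inferred from that handling; the key path and TSA name are placeholders.

# Hypothetical value for --signatures
import json

signatures = [
    {'action': 'sign', 'params': {'key': 'path/to/signing-key.pem'}},
    {'action': 'timestamp', 'known_tsa': 'example-tsa'},  # resolved via nabit's KNOWN_TSAS
]
print(json.dumps(signatures))  # pass this string as --signatures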
scripts/helpers/bag.py Normal file

@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package
logger = logging.getLogger(__name__)
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
# Already compressed archives
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
# Compressed images
'jpg', 'jpeg', 'png', 'gif', 'webp',
# Compressed video/audio
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
# Other compressed/binary formats
'pdf', 'docx', 'xlsx', 'pptx',
}
def zip_archive(bag_dir, archive_path):
"""Zip up a nabit archive and create metadata."""
# Create zip archive
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for file_path in bag_dir.rglob('*'):
if file_path.is_file():
arc_path = file_path.relative_to(bag_dir)
compression = (zipfile.ZIP_STORED
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
else zipfile.ZIP_DEFLATED)
zf.write(file_path, arc_path, compress_type=compression)
# Create metadata file
zip_info = []
with zipfile.ZipFile(archive_path, 'r') as zf:
for info in zf.filelist:
header_offset = info.header_offset
# Read header to calculate data offset
zf.fp.seek(header_offset)
header = zf.fp.read(zipfile.sizeFileHeader)
fheader = struct.unpack(zipfile.structFileHeader, header)
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
zip_info.append({
'filename': info.filename,
'file_size': info.file_size,
'compress_size': info.compress_size,
'compress_type': info.compress_type,
'header_offset': header_offset,
'data_offset': data_offset,
})
# Read the bag-info.txt and signed-metadata.json
bag_info = (bag_dir / 'bag-info.txt').read_text()
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
return {
'bag_info': bag_info,
'signed_metadata': signed_metadata,
'zip_entries': zip_info
}
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
"""Upload archive and metadata to S3."""
s3 = boto3.Session(**session_args).client('s3')
bucket_name, s3_key_prefix = s3_path.split('/', 1)
# Upload zip file
s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
# Upload metadata file
s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
def cleanup_files(collection_path, no_delete=False, s3_path=None):
"""Clean up local files after upload if needed."""
if not no_delete and s3_path:
logger.info("- Deleting local zip archive...")
if os.path.exists(collection_path):
os.remove(collection_path)
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
os.rmdir(collection_path.parent)
def fetch_and_upload(
output_path,
collection_path,
metadata_path,
create_archive_callback,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
):
"""
Common pipeline for creating and processing archives.
Args:
output_path: Base output directory
collection_path: Path where the final zip will be stored
metadata_path: Path where the metadata will be stored
create_archive_callback: Function that will create the archive
signatures: Signature configuration for nabit
session_args: AWS session arguments
s3_path: S3 path for uploads
no_delete: Whether to preserve local files
"""
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
logger.info("- Creating archive...")
# set up paths
temp_dir = Path(temp_dir)
bag_dir = temp_dir / 'bag'
archive_path = temp_dir / 'archive.zip'
source_files_dir = temp_dir / 'source_files'
source_files_dir.mkdir(parents=True, exist_ok=True)
# Call the callback to create the archive
package_kwargs = create_archive_callback(source_files_dir)
# create bag
package(
output_path=bag_dir,
collect_errors='ignore',
signatures=signatures,
**package_kwargs,
)
logger.info("- Zipping archive...")
# zip up data and create metadata
output_metadata = zip_archive(bag_dir, archive_path)
logger.info("- Moving files to final location...")
# Move files to final location
collection_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(archive_path), collection_path)
with open(metadata_path, 'w') as f:
json.dump(output_metadata, f)
f.write('\n')
if s3_path:
logger.info("Uploading to S3...")
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
cleanup_files(collection_path, no_delete, s3_path)

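fetch_and_upload() only requires a callback that stages source files in the temporary directory it provides and returns the keyword arguments forwarded to nabit's package(). A minimal hypothetical callback, mirroring the ones defined in the data.gov and GitHub pipelines in this commit, might look like this (the staged file, URL, and description are placeholders):

import uuid
from pathlib import Path
from nabit.lib.backends.path import PathCollectionTask

def create_archive_callback(source_files_dir: Path) -> dict:
    # Stage whatever should be archived in the directory fetch_and_upload provides.
    (source_files_dir / 'example.txt').write_text('example payload')
    return {
        'collect': [PathCollectionTask(path=source_files_dir)],
        'signed_metadata': {
            'id': str(uuid.uuid4()),
            'url': 'https://example.org/source',
            'description': 'Example archive created via fetch_and_upload',
        },
    }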
scripts/helpers/onepassword.py Normal file

@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio
logger = logging.getLogger(__name__)
async def get_client():
op_config = load_config().get("1password", {})
if not op_config.get('token'):
raise Exception("1Password token not found in config")
return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")
def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
return asyncio.run(save_item_async(vault_name, title, fields, notes, category))
async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
client = await get_client()
vault_id = None
vaults = await client.vaults.list_all()
async for vault in vaults:
if vault.title == vault_name:
vault_id = vault.id
break
else:
raise Exception(f"Vault {vault_name} not found")
field_objs = []
sections = []
for field in fields:
if field.get('concealed', False):
field['field_type'] = ItemFieldType.CONCEALED
else:
field['field_type'] = ItemFieldType.TEXT
field['id'] = field['title'].lower().replace(' ', '_')
field_objs.append(ItemField(**field))
if section_id := field.get('section_id'):
if section_id not in sections:
sections.append(section_id)
sections = [ItemSection(id=section, title=section) for section in sections]
# Create item parameters with sections
create_params = ItemCreateParams(
title=title,
category=category,
vault_id=vault_id,
fields=field_objs,
sections=sections,
)
if notes:
create_params.notes = notes
item = await client.items.create(create_params)
logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
return item
def share_item(
item: Item,
recipients: list[str] | None = None,
expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
one_time_only: bool = False
):
return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))
async def share_item_async(
item: Item,
recipients: list[str] | None,
expire_after: ItemShareDuration | None,
one_time_only: bool,
):
client = await get_client()
policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
share_params = ItemShareParams(
recipients=valid_recipients,
expire_after=expire_after,
one_time_only=one_time_only
)
share_link = await client.items.shares.create(item, policy, share_params)
logger.info(f"Created share link for '{item.title}'")
return share_link

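As a usage sketch mirroring the create_user command earlier in this commit: save_item() takes plain dicts describing the fields and returns the created 1Password item, which can then be passed to share_item(). The vault name, field values, and email below are placeholders.

from scripts.helpers.onepassword import save_item, share_item

item = save_item('Private', 'Example S3 Credentials', [
    {'title': 'Access Key ID', 'value': 'AKIA...', 'section_id': 's3_details'},
    {'title': 'Secret Access Key', 'value': 'example-secret', 'section_id': 's3_details'},
])
share_link = share_item(item, ['someone@example.org'])
print(share_link)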

@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
raise e
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
workers = workers or os.cpu_count() or 4
# Configure logging based on whether we're running in parallel or not
if log_level is None:
log_level = 'INFO' if workers == 1 else 'WARNING'
logging.basicConfig(
level=log_level,
format='[%(process)d] %(message)s'
)
logger.debug(f"Starting processing with {workers} workers")
if workers > 1:

uv.lock generated

@@ -169,6 +169,7 @@ dependencies = [
{ name = "httpx" },
{ name = "jsondiff" },
{ name = "nabit" },
{ name = "onepassword-sdk" },
{ name = "orjson" },
{ name = "peewee" },
{ name = "publicsuffixlist" },
@@ -192,6 +193,7 @@ requires-dist = [
{ name = "httpx", specifier = ">=0.27.2" },
{ name = "jsondiff", specifier = ">=2.2.1" },
{ name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
{ name = "onepassword-sdk", specifier = ">=0.1.7" },
{ name = "orjson", specifier = ">=3.10.15" },
{ name = "peewee", specifier = ">=3.17.8" },
{ name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
@@ -466,6 +468,27 @@ dependencies = [
{ name = "warcio" },
]
[[package]]
name = "onepassword-sdk"
version = "0.1.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
]
sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 },
{ url = "https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 },
{ url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 },
{ url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 },
{ url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 },
{ url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 },
]
[[package]]
name = "orderly-set"
version = "5.3.0"