mirror of https://github.com/harvard-lil/data-vault.git
synced 2025-07-04 13:46:56 -04:00

initial commit
commit 404c3627f7
26 changed files with 2534 additions and 0 deletions
scripts/collection/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@

def hello() -> str:
    return "Hello from data-mirror!"
scripts/collection/cloudflare_tools.py (new file, 100 lines)
@@ -0,0 +1,100 @@

import logging
from pathlib import Path
import click
from cloudflare import Cloudflare
import os
from scripts.helpers.config import load_config

logger = logging.getLogger(__name__)


def generate_temp_key(account_id: str, bucket: str, parent_access_key_id: str, token: str,
                      permission: str = "object-read-write", ttl_seconds: int = 3600,
                      prefixes: list[str] | None = None, objects: list[str] | None = None):
    """Generate a temporary R2 access key using the Cloudflare API.

    Args:
        account_id: Cloudflare account ID
        bucket: R2 bucket name
        parent_access_key_id: Parent access key ID
        token: Cloudflare API token
        permission: Permission level ('object-read-write' or 'object-read')
        ttl_seconds: Time-to-live in seconds
        prefixes: Optional list of key prefixes to restrict access to
        objects: Optional list of specific object keys to restrict access to
    """
    params = {
        "account_id": account_id,
        "bucket": bucket,
        "parent_access_key_id": parent_access_key_id,
        "permission": permission,
        "ttl_seconds": ttl_seconds,
    }

    if prefixes:
        params["prefixes"] = prefixes
    if objects:
        params["objects"] = objects

    return Cloudflare(api_token=token).r2.temporary_credentials.create(**params)


@click.group()
def cli():
    """Cloudflare R2 utility commands."""
    pass


@cli.command()
@click.option('--bucket', '-b', type=str, required=True,
              help='R2 bucket name.')
@click.option('--permission', '-p', type=click.Choice(['object-read-write', 'object-read']),
              default='object-read-write',
              help='Permission level for the temporary key.')
@click.option('--ttl', '-t', type=int, default=1,
              help='Time-to-live in hours for the temporary key.')
@click.option('--prefixes', '-x', multiple=True,
              help='Key prefixes to restrict access to. Can be specified multiple times.')
@click.option('--objects', '-o', multiple=True,
              help='Specific object keys to restrict access to. Can be specified multiple times.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
                 objects: tuple[str, ...], log_level: str):
    """Generate temporary Cloudflare R2 access credentials."""
    # Setup logging
    logging.basicConfig(level=log_level)

    # Load config
    config = load_config().get("temp_tokens", {})

    if not config or any(key not in config for key in ['parent_access_key_id', 'account_id', 'token']):
        raise click.ClickException("Config file must have 'temp_tokens' dict with 'parent_access_key_id', 'account_id', and 'token' keys")

    # Generate temporary key
    temp_cred = generate_temp_key(
        account_id=config['account_id'],
        bucket=bucket,
        parent_access_key_id=config['parent_access_key_id'],
        token=config['token'],
        permission=permission,
        ttl_seconds=ttl * 3600,
        prefixes=list(prefixes) if prefixes else None,
        objects=list(objects) if objects else None
    )

    # Output AWS config format
    click.echo("\n# Add this to ~/.aws/config:")
    click.echo("[profile r2-temp]")
    click.echo(f"aws_access_key_id = {temp_cred.access_key_id}")
    click.echo(f"aws_secret_access_key = {temp_cred.secret_access_key}")
    click.echo(f"aws_session_token = {temp_cred.session_token}")
    click.echo("region = auto")
    click.echo(f"endpoint_url = https://{config['account_id']}.r2.cloudflarestorage.com")

    # Output sample command using first prefix if available
    click.echo("\n# Sample upload command:")
    sample_path = objects[0] if objects else f"{prefixes[0].strip('/')}/" if prefixes else ""
    click.echo(f"aws s3 cp local-file.txt s3://{bucket}/{sample_path} --profile r2-temp")


if __name__ == "__main__":
    cli()
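For illustration, a minimal sketch of calling generate_temp_key directly rather than through the CLI. The account ID, key IDs, token, bucket name, and prefix below are placeholders, not values from this repository; only the function signature and the credential fields (access_key_id, secret_access_key, session_token) come from the file above.

# Hypothetical direct use of generate_temp_key; all credential values are placeholders.
from scripts.collection.cloudflare_tools import generate_temp_key

temp_tokens = {
    "account_id": "0123456789abcdef0123456789abcdef",  # placeholder Cloudflare account ID
    "parent_access_key_id": "parent-access-key-id",    # placeholder parent R2 key
    "token": "cloudflare-api-token",                    # placeholder API token
}

cred = generate_temp_key(
    account_id=temp_tokens["account_id"],
    bucket="example-bucket",                 # hypothetical bucket name
    parent_access_key_id=temp_tokens["parent_access_key_id"],
    token=temp_tokens["token"],
    permission="object-read",                # read-only scope
    ttl_seconds=15 * 60,                     # 15-minute credential
    prefixes=["collections/data_gov/"],      # hypothetical key prefix restriction
)

# The returned credential exposes the same fields the generate_key command prints.
print(cred.access_key_id, cred.secret_access_key, cred.session_token)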
scripts/collection/render.py (new file, 109 lines)
@@ -0,0 +1,109 @@

import click
from pathlib import Path
from scripts.data_gov.models import db, Dataset
import logging
from tqdm import tqdm

logger = logging.getLogger(__name__)


# Header template with styles
HEADER_TEMPLATE = '''<!DOCTYPE html>
<html>
<head>
    <title>Data.gov Dataset Mirror</title>
    <link rel="stylesheet" href="style.css">
</head>
<body>
    <h1>Data.gov Dataset Mirror</h1>
'''

TABLE_START = '''    <table>
        <thead>
            <tr>
                <th>Name</th>
                <th>Organization</th>
                <th>Description</th>
            </tr>
        </thead>
        <tbody>
'''

ROW_TEMPLATE = '''            <tr>
                <td>{name}</td>
                <td>{org}</td>
                <td>{title}</td>
            </tr>
'''

TABLE_END = '''        </tbody>
    </table>
</body>
</html>
'''

def render_html(datasets_query, output_path: Path) -> None:
    """Render the datasets to an HTML file, streaming content."""
    with open(output_path / 'index.html', 'w', encoding='utf-8') as f:
        # Write header
        f.write(HEADER_TEMPLATE)

        # Write table start
        f.write(TABLE_START)

        # Stream each dataset row, flushing in batches of 1000
        rows = []
        for dataset in tqdm(datasets_query.iterator(), desc="Rendering datasets"):
            org_title = dataset.organization.get('title') if dataset.organization else 'N/A'
            row = ROW_TEMPLATE.format(
                name=dataset.name or '',
                org=org_title,
                title=dataset.title,
            )
            rows.append(row)
            if len(rows) >= 1000:
                f.write('\n'.join(rows))
                rows = []

        if rows:
            f.write('\n'.join(rows))

        # Write table end
        f.write(TABLE_END)

@click.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
@click.option('--limit', '-n', type=int, default=None,
              help='Maximum number of rows to display. Default: all rows.')
def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
    """Render the Dataset table to an HTML file."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    logger.info(f"Connecting to database at {db_path}")
    db.init(db_path)
    db.connect()

    try:
        logger.info("Starting HTML generation...")
        datasets_query = Dataset.select().order_by(Dataset.id)
        if limit:
            datasets_query = datasets_query.limit(limit)
            logger.info(f"Limited to {limit} rows")

        logger.info(f"Rendering HTML to {output_path}")
        render_html(datasets_query, output_path)
        logger.info("Done!")

    finally:
        db.close()

if __name__ == "__main__":
    main()
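A quick way to exercise this command without a shell is Click's test runner. This is a hedged sketch: the database and output paths are the script's own defaults and are assumed to exist; the limit value is arbitrary.

# Hypothetical in-process invocation of the render command for a small smoke test.
from pathlib import Path
from click.testing import CliRunner
from scripts.collection.render import main

Path("data/processed/web").mkdir(parents=True, exist_ok=True)  # ensure output dir exists

runner = CliRunner()
result = runner.invoke(main, ["data/data.db", "data/processed/web", "--limit", "100"])
print(result.exit_code, result.output)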
scripts/collection/s3_tools.py (new file, 118 lines)
@@ -0,0 +1,118 @@

import boto3
import click
from tqdm import tqdm
import logging
from itertools import islice

logger = logging.getLogger(__name__)

def get_delete_markers(s3_client, bucket: str, prefix: str):
    """Get all delete markers for objects with the given prefix."""
    paginator = s3_client.get_paginator('list_object_versions')
    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
        if 'DeleteMarkers' in page:
            yield [
                {
                    'Key': marker['Key'],
                    'VersionId': marker['VersionId']
                }
                for marker in page['DeleteMarkers']
                if marker['IsLatest']
            ]

def remove_delete_markers(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Remove all delete markers for objects with the given prefix."""
    for marker_batch in get_delete_markers(s3_client, bucket, prefix):
        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={
                'Objects': marker_batch,
                'Quiet': True
            }
        )

        # Log any errors
        if 'Errors' in response:
            for error in response['Errors']:
                logger.error(f"Failed to remove marker for {error['Key']}: {error['Message']}")

def get_empty_files(s3_client, bucket: str, prefix: str):
    """Get all objects with size zero under the given prefix."""
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
        if 'Contents' in page:
            yield [
                {'Key': obj['Key']}
                for obj in page['Contents']
                if obj['Size'] == 0
            ]

def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Delete all zero-size objects under the given prefix."""
    pbar = tqdm(desc="deleted")
    for empty_batch in get_empty_files(s3_client, bucket, prefix):
        if not empty_batch:
            continue

        if dry_run:
            for obj in empty_batch:
                logger.info(f"Would delete empty file: {obj['Key']}")
            continue

        pbar.update(len(empty_batch))

        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={
                'Objects': empty_batch,
                'Quiet': True
            }
        )

        # Log any errors
        if 'Errors' in response:
            for error in response['Errors']:
                logger.error(f"Failed to delete {error['Key']}: {error['Message']}")

    pbar.close()

@click.group()
def cli():
    """S3 object management commands."""
    pass

@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Remove delete markers from versioned S3 objects, effectively undeleting them."""
    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    remove_delete_markers(s3_client, bucket, prefix, dry_run)

@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Delete all zero-size objects under the given prefix."""
    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    delete_empty_files(s3_client, bucket, prefix, dry_run)

if __name__ == '__main__':
    cli()
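The helpers above can also be driven directly from Python. A minimal sketch, assuming an AWS profile named "sc-direct" is configured locally (it is the script's default) and that "example-bucket" and "collections/" are placeholder names:

# Hypothetical direct use of the S3 helpers; bucket and prefix are placeholders.
import boto3
from scripts.collection.s3_tools import delete_empty_files, remove_delete_markers

s3_client = boto3.Session(profile_name="sc-direct").client("s3")

# Dry run: log zero-byte objects under the prefix without deleting anything.
delete_empty_files(s3_client, bucket="example-bucket", prefix="collections/", dry_run=True)

# Remove the latest delete markers so previously "deleted" versions become visible again.
remove_delete_markers(s3_client, bucket="example-bucket", prefix="collections/")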
scripts/collection/sync.py (new file, 31 lines)
@@ -0,0 +1,31 @@

import boto3
import click
import json
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

@click.command()
@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
              default='collections/collections.json',
              help='Path to collections configuration file.')
def main(collections_file: Path):
    # Load collections config
    collections = json.loads(collections_file.read_text())
    collections_dir = collections_file.parent

    for collection in collections:
        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
        collection_path = collections_dir / collection['directory']
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)

        for file_path in collection_path.rglob('*'):
            if file_path.is_file():
                relative_path = file_path.relative_to(collection_path)
                s3_key = f"{s3_prefix}/{relative_path}"
                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
                s3.upload_file(str(file_path), bucket_name, s3_key)

if __name__ == '__main__':
    main()
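The collections file itself is not part of this commit; the sketch below only infers its shape from the keys sync.py reads ('aws_profile', 'directory', 's3_path'). The profile, directory, and bucket names are placeholders.

# Hedged example of a collections/collections.json that would satisfy sync.py.
import json
from pathlib import Path

collections = [
    {
        "aws_profile": "sc-direct",                   # boto3 profile used for the upload (placeholder)
        "directory": "data_gov",                      # subdirectory next to collections.json (placeholder)
        "s3_path": "example-bucket/collections/data_gov",  # "<bucket>/<prefix>", split on the first "/"
    }
]

Path("collections").mkdir(exist_ok=True)
Path("collections/collections.json").write_text(json.dumps(collections, indent=2))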
scripts/collection/verify_upload.py (new file, 91 lines)
@@ -0,0 +1,91 @@

from pathlib import Path
import json
import zipfile
import tempfile
import requests
import click
import logging
from nabit.bin.utils import cli_validate

logger = logging.getLogger(__name__)

def download_file(url: str, target_path: Path):
    """Download a file from URL to target path"""
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with target_path.open('wb') as f:
        for chunk in response.iter_content(chunk_size=2**20):
            f.write(chunk)

def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
    """
    Verify a dataset by downloading and checking its JSON metadata and ZIP contents.
    If output_dir is provided, write the uncompressed contents there.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Download files
        logger.info(f"Downloading metadata from {json_url}...")
        json_path = tmpdir / "metadata.json"
        download_file(json_url, json_path)

        logger.info(f"Downloading archive from {zip_url}...")
        zip_path = tmpdir / "data.zip"
        download_file(zip_url, zip_path)

        # Load metadata
        metadata = json.loads(json_path.read_text())

        # Create output directory
        if not output_dir:
            output_dir = tmpdir / "output"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Verify file contents
        logger.info("Verifying file contents...")
        with zip_path.open('rb') as f:
            for entry in metadata['zip_entries']:
                logger.info(f"Checking {entry['filename']}...")
                f.seek(entry['data_offset'])
                zip_data = f.read(entry['compress_size'])

                if entry['compress_type'] == zipfile.ZIP_STORED:
                    uncompressed = zip_data
                else:
                    decompressor = zipfile._get_decompressor(entry['compress_type'])
                    uncompressed = decompressor.decompress(zip_data)

                # write the file
                output_file = output_dir / entry['filename']
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_bytes(uncompressed)

        logger.info("All files extracted successfully")

        # verify dataset with nabit
        cli_validate(output_dir)

        # Return metadata for potential further use
        return metadata

@click.command()
@click.argument('json_url', type=str)
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Directory to write uncompressed files')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
    """Verify dataset from JSON and ZIP URLs"""
    # Set up logging
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    verify_dataset(json_url, zip_url, output)

if __name__ == '__main__':
    main()
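The metadata JSON that verify_dataset consumes is not included in this commit. A hedged illustration of its apparent shape, based only on the keys the loop above reads; every value is invented:

# Hypothetical metadata document matching what verify_dataset expects.
import zipfile

example_metadata = {
    "zip_entries": [
        {
            "filename": "data/files/example.csv",   # path inside the archive (placeholder)
            "data_offset": 1234,                     # byte offset of this entry's compressed data in the zip
            "compress_size": 5678,                   # length of the compressed data in bytes
            "compress_type": zipfile.ZIP_DEFLATED,   # or zipfile.ZIP_STORED for uncompressed entries
        }
    ]
}

# Example call with placeholder URLs:
# verify_dataset("https://example.org/metadata.json", "https://example.org/data.zip")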