mirror of https://github.com/harvard-lil/data-vault.git
synced 2025-07-04 13:46:56 -04:00

initial commit
commit 404c3627f7
26 changed files with 2534 additions and 0 deletions
scripts/collection/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@

def hello() -> str:
    return "Hello from data-mirror!"
scripts/collection/cloudflare_tools.py (new file, 100 lines)
@@ -0,0 +1,100 @@

import logging
from pathlib import Path
import click
from cloudflare import Cloudflare
import os
from scripts.helpers.config import load_config

logger = logging.getLogger(__name__)


def generate_temp_key(account_id: str, bucket: str, parent_access_key_id: str, token: str,
                      permission: str = "object-read-write", ttl_seconds: int = 3600,
                      prefixes: list[str] | None = None, objects: list[str] | None = None):
    """Generate a temporary R2 access key using the Cloudflare API.

    Args:
        account_id: Cloudflare account ID
        bucket: R2 bucket name
        parent_access_key_id: Parent access key ID
        token: Cloudflare API token
        permission: Permission level ('object-read-write' or 'object-read')
        ttl_seconds: Time-to-live in seconds
        prefixes: Optional list of key prefixes to restrict access to
        objects: Optional list of specific object keys to restrict access to
    """
    params = {
        "account_id": account_id,
        "bucket": bucket,
        "parent_access_key_id": parent_access_key_id,
        "permission": permission,
        "ttl_seconds": ttl_seconds,
    }

    if prefixes:
        params["prefixes"] = prefixes
    if objects:
        params["objects"] = objects

    return Cloudflare(api_token=token).r2.temporary_credentials.create(**params)


@click.group()
def cli():
    """Cloudflare R2 utility commands."""
    pass


@cli.command()
@click.option('--bucket', '-b', type=str, required=True,
              help='R2 bucket name.')
@click.option('--permission', '-p', type=click.Choice(['object-read-write', 'object-read']),
              default='object-read-write',
              help='Permission level for the temporary key.')
@click.option('--ttl', '-t', type=int, default=1,
              help='Time-to-live in hours for the temporary key.')
@click.option('--prefixes', '-x', multiple=True,
              help='Key prefixes to restrict access to. Can be specified multiple times.')
@click.option('--objects', '-o', multiple=True,
              help='Specific object keys to restrict access to. Can be specified multiple times.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
                 objects: tuple[str, ...], log_level: str):
    """Generate temporary Cloudflare R2 access credentials."""
    # Setup logging
    logging.basicConfig(level=log_level)

    # Load config
    config = load_config().get("temp_tokens", {})

    if not config or any(key not in config for key in ['parent_access_key_id', 'account_id', 'token']):
        raise click.ClickException("Config file must have 'temp_tokens' dict with 'parent_access_key_id', 'account_id', and 'token' keys")

    # Generate temporary key
    temp_cred = generate_temp_key(
        account_id=config['account_id'],
        bucket=bucket,
        parent_access_key_id=config['parent_access_key_id'],
        token=config['token'],
        permission=permission,
        ttl_seconds=ttl * 3600,
        prefixes=list(prefixes) if prefixes else None,
        objects=list(objects) if objects else None
    )

    # Output AWS config format
    click.echo("\n# Add this to ~/.aws/config:")
    click.echo("[profile r2-temp]")
    click.echo(f"aws_access_key_id = {temp_cred.access_key_id}")
    click.echo(f"aws_secret_access_key = {temp_cred.secret_access_key}")
    click.echo(f"aws_session_token = {temp_cred.session_token}")
    click.echo("region = auto")
    click.echo(f"endpoint_url = https://{config['account_id']}.r2.cloudflarestorage.com")

    # Output sample command using first prefix if available
    click.echo("\n# Sample upload command:")
    sample_path = objects[0] if objects else f"{prefixes[0].strip('/')}/" if prefixes else ""
    click.echo(f"aws s3 cp local-file.txt s3://{bucket}/{sample_path} --profile r2-temp")


if __name__ == "__main__":
    cli()
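For illustration, a minimal sketch of calling generate_temp_key directly rather than through the CLI. The account ID, key IDs, token, bucket name, and prefix below are placeholders, not values from this repository; only the function signature and the credential fields (access_key_id, secret_access_key, session_token) come from the file above.

# Hypothetical direct use of generate_temp_key; all credential values are placeholders.
from scripts.collection.cloudflare_tools import generate_temp_key

temp_tokens = {
    "account_id": "0123456789abcdef0123456789abcdef",  # placeholder Cloudflare account ID
    "parent_access_key_id": "parent-access-key-id",    # placeholder parent R2 key
    "token": "cloudflare-api-token",                    # placeholder API token
}

cred = generate_temp_key(
    account_id=temp_tokens["account_id"],
    bucket="example-bucket",                 # hypothetical bucket name
    parent_access_key_id=temp_tokens["parent_access_key_id"],
    token=temp_tokens["token"],
    permission="object-read",                # read-only scope
    ttl_seconds=15 * 60,                     # 15-minute credential
    prefixes=["collections/data_gov/"],      # hypothetical key prefix restriction
)

# The returned credential exposes the same fields the generate_key command prints.
print(cred.access_key_id, cred.secret_access_key, cred.session_token)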
scripts/collection/render.py (new file, 109 lines)
@@ -0,0 +1,109 @@

import click
from pathlib import Path
from scripts.data_gov.models import db, Dataset
import logging
from tqdm import tqdm

logger = logging.getLogger(__name__)


# Header template with styles
HEADER_TEMPLATE = '''<!DOCTYPE html>
<html>
<head>
    <title>Data.gov Dataset Mirror</title>
    <link rel="stylesheet" href="style.css">
</head>
<body>
    <h1>Data.gov Dataset Mirror</h1>
'''

TABLE_START = '''    <table>
        <thead>
            <tr>
                <th>Name</th>
                <th>Organization</th>
                <th>Description</th>
            </tr>
        </thead>
        <tbody>
'''

ROW_TEMPLATE = '''            <tr>
                <td>{name}</td>
                <td>{org}</td>
                <td>{title}</td>
            </tr>
'''

TABLE_END = '''        </tbody>
    </table>
</body>
</html>
'''

def render_html(datasets_query, output_path: Path) -> None:
    """Render the datasets to an HTML file, streaming content."""
    with open(output_path / 'index.html', 'w', encoding='utf-8') as f:
        # Write header
        f.write(HEADER_TEMPLATE)

        # Write table start
        f.write(TABLE_START)

        # Stream each dataset row, flushing in batches of 1000
        rows = []
        for dataset in tqdm(datasets_query.iterator(), desc="Rendering datasets"):
            org_title = dataset.organization.get('title') if dataset.organization else 'N/A'
            row = ROW_TEMPLATE.format(
                name=dataset.name or '',
                org=org_title,
                title=dataset.title,
            )
            rows.append(row)
            if len(rows) >= 1000:
                f.write('\n'.join(rows))
                rows = []

        if rows:
            f.write('\n'.join(rows))

        # Write table end
        f.write(TABLE_END)

@click.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
@click.option('--limit', '-n', type=int, default=None,
              help='Maximum number of rows to display. Default: all rows.')
def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
    """Render the Dataset table to an HTML file."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    logger.info(f"Connecting to database at {db_path}")
    db.init(db_path)
    db.connect()

    try:
        logger.info("Starting HTML generation...")
        datasets_query = Dataset.select().order_by(Dataset.id)
        if limit:
            datasets_query = datasets_query.limit(limit)
            logger.info(f"Limited to {limit} rows")

        logger.info(f"Rendering HTML to {output_path}")
        render_html(datasets_query, output_path)
        logger.info("Done!")

    finally:
        db.close()

if __name__ == "__main__":
    main()
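A quick way to exercise this command without a shell is Click's test runner. This is a hedged sketch: the database and output paths are the script's own defaults and are assumed to exist; the limit value is arbitrary.

# Hypothetical in-process invocation of the render command for a small smoke test.
from pathlib import Path
from click.testing import CliRunner
from scripts.collection.render import main

Path("data/processed/web").mkdir(parents=True, exist_ok=True)  # ensure output dir exists

runner = CliRunner()
result = runner.invoke(main, ["data/data.db", "data/processed/web", "--limit", "100"])
print(result.exit_code, result.output)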
scripts/collection/s3_tools.py (new file, 118 lines)
@@ -0,0 +1,118 @@

import boto3
import click
from tqdm import tqdm
import logging
from itertools import islice

logger = logging.getLogger(__name__)

def get_delete_markers(s3_client, bucket: str, prefix: str):
    """Get all delete markers for objects with the given prefix."""
    paginator = s3_client.get_paginator('list_object_versions')
    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
        if 'DeleteMarkers' in page:
            yield [
                {
                    'Key': marker['Key'],
                    'VersionId': marker['VersionId']
                }
                for marker in page['DeleteMarkers']
                if marker['IsLatest']
            ]

def remove_delete_markers(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Remove all delete markers for objects with the given prefix."""
    for marker_batch in get_delete_markers(s3_client, bucket, prefix):
        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={
                'Objects': marker_batch,
                'Quiet': True
            }
        )

        # Log any errors
        if 'Errors' in response:
            for error in response['Errors']:
                logger.error(f"Failed to remove marker for {error['Key']}: {error['Message']}")

def get_empty_files(s3_client, bucket: str, prefix: str):
    """Get all objects with size zero under the given prefix."""
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
        if 'Contents' in page:
            yield [
                {'Key': obj['Key']}
                for obj in page['Contents']
                if obj['Size'] == 0
            ]

def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = False):
    """Delete all zero-size objects under the given prefix."""
    pbar = tqdm(desc="deleted")
    for empty_batch in get_empty_files(s3_client, bucket, prefix):
        if not empty_batch:
            continue

        if dry_run:
            for obj in empty_batch:
                logger.info(f"Would delete empty file: {obj['Key']}")
            continue

        pbar.update(len(empty_batch))

        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={
                'Objects': empty_batch,
                'Quiet': True
            }
        )

        # Log any errors
        if 'Errors' in response:
            for error in response['Errors']:
                logger.error(f"Failed to delete {error['Key']}: {error['Message']}")

    pbar.close()

@click.group()
def cli():
    """S3 object management commands."""
    pass

@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Remove delete markers from versioned S3 objects, effectively undeleting them."""
    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    remove_delete_markers(s3_client, bucket, prefix, dry_run)

@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
    """Delete all zero-size objects under the given prefix."""
    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)

    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    delete_empty_files(s3_client, bucket, prefix, dry_run)

if __name__ == '__main__':
    cli()
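The helpers above can also be driven directly from Python. A minimal sketch, assuming an AWS profile named "sc-direct" is configured locally (it is the script's default) and that "example-bucket" and "collections/" are placeholder names:

# Hypothetical direct use of the S3 helpers; bucket and prefix are placeholders.
import boto3
from scripts.collection.s3_tools import delete_empty_files, remove_delete_markers

s3_client = boto3.Session(profile_name="sc-direct").client("s3")

# Dry run: log zero-byte objects under the prefix without deleting anything.
delete_empty_files(s3_client, bucket="example-bucket", prefix="collections/", dry_run=True)

# Remove the latest delete markers so previously "deleted" versions become visible again.
remove_delete_markers(s3_client, bucket="example-bucket", prefix="collections/")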
scripts/collection/sync.py (new file, 31 lines)
@@ -0,0 +1,31 @@

import boto3
import click
import json
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

@click.command()
@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
              default='collections/collections.json',
              help='Path to collections configuration file.')
def main(collections_file: Path):
    # Load collections config
    collections = json.loads(collections_file.read_text())
    collections_dir = collections_file.parent

    for collection in collections:
        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
        collection_path = collections_dir / collection['directory']
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)

        for file_path in collection_path.rglob('*'):
            if file_path.is_file():
                relative_path = file_path.relative_to(collection_path)
                s3_key = f"{s3_prefix}/{relative_path}"
                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
                s3.upload_file(str(file_path), bucket_name, s3_key)

if __name__ == '__main__':
    main()
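The collections file itself is not part of this commit; the sketch below only infers its shape from the keys sync.py reads ('aws_profile', 'directory', 's3_path'). The profile, directory, and bucket names are placeholders.

# Hedged example of a collections/collections.json that would satisfy sync.py.
import json
from pathlib import Path

collections = [
    {
        "aws_profile": "sc-direct",                   # boto3 profile used for the upload (placeholder)
        "directory": "data_gov",                      # subdirectory next to collections.json (placeholder)
        "s3_path": "example-bucket/collections/data_gov",  # "<bucket>/<prefix>", split on the first "/"
    }
]

Path("collections").mkdir(exist_ok=True)
Path("collections/collections.json").write_text(json.dumps(collections, indent=2))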
scripts/collection/verify_upload.py (new file, 91 lines)
@@ -0,0 +1,91 @@

from pathlib import Path
import json
import zipfile
import tempfile
import requests
import click
import logging
from nabit.bin.utils import cli_validate

logger = logging.getLogger(__name__)

def download_file(url: str, target_path: Path):
    """Download a file from URL to target path"""
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with target_path.open('wb') as f:
        for chunk in response.iter_content(chunk_size=2**20):
            f.write(chunk)

def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
    """
    Verify a dataset by downloading and checking its JSON metadata and ZIP contents.
    If output_dir is provided, write the uncompressed contents there.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Download files
        logger.info(f"Downloading metadata from {json_url}...")
        json_path = tmpdir / "metadata.json"
        download_file(json_url, json_path)

        logger.info(f"Downloading archive from {zip_url}...")
        zip_path = tmpdir / "data.zip"
        download_file(zip_url, zip_path)

        # Load metadata
        metadata = json.loads(json_path.read_text())

        # Create output directory
        if not output_dir:
            output_dir = tmpdir / "output"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Verify file contents
        logger.info("Verifying file contents...")
        with zip_path.open('rb') as f:
            for entry in metadata['zip_entries']:
                logger.info(f"Checking {entry['filename']}...")
                f.seek(entry['data_offset'])
                zip_data = f.read(entry['compress_size'])

                if entry['compress_type'] == zipfile.ZIP_STORED:
                    uncompressed = zip_data
                else:
                    decompressor = zipfile._get_decompressor(entry['compress_type'])
                    uncompressed = decompressor.decompress(zip_data)

                # write the file
                output_file = output_dir / entry['filename']
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_bytes(uncompressed)

        logger.info("All files extracted successfully")

        # verify dataset with nabit
        cli_validate(output_dir)

        # Return metadata for potential further use
        return metadata

@click.command()
@click.argument('json_url', type=str)
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Directory to write uncompressed files')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
    """Verify dataset from JSON and ZIP URLs"""
    # Set up logging
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    verify_dataset(json_url, zip_url, output)

if __name__ == '__main__':
    main()
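The metadata JSON that verify_dataset consumes is not included in this commit. A hedged illustration of its apparent shape, based only on the keys the loop above reads; every value is invented:

# Hypothetical metadata document matching what verify_dataset expects.
import zipfile

example_metadata = {
    "zip_entries": [
        {
            "filename": "data/files/example.csv",   # path inside the archive (placeholder)
            "data_offset": 1234,                     # byte offset of this entry's compressed data in the zip
            "compress_size": 5678,                   # length of the compressed data in bytes
            "compress_type": zipfile.ZIP_DEFLATED,   # or zipfile.ZIP_STORED for uncompressed entries
        }
    ]
}

# Example call with placeholder URLs:
# verify_dataset("https://example.org/metadata.json", "https://example.org/data.zip")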