Refactoring, GitHub pipeline, S3 bucket creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions

pyproject.toml

@@ -18,6 +1,7 @@ dependencies = [
"cloudflare>=4.0.0",
"deepdiff>=8.2.0",
"orjson>=3.10.15",
"onepassword-sdk>=0.1.7",
]
[build-system]
@@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }
[tool.hatch.build.targets.wheel]
packages = ["scripts"]
[project.scripts]
vault = "scripts.cli:cli"


@@ -1,2 +0,0 @@
def hello() -> str:
return "Hello from data-mirror!"

scripts/cli.py Normal file

@@ -0,0 +1,118 @@
import click
import importlib
from pathlib import Path
import logging
import os
import sys
"""
Top-level CLI that registers commands using the following logic:
- Find all subdirectories WITH __init__.py
- Fetch a 'cli' click.group() from __init__.py if it exists, or create one
- For all python files in the subdirectory:
- If the file defines a 'cli' click.group(), add it to the subdirectory's group
- If the file defines a 'main' click.command(), add it to the subdirectory's group
- If any commands are added to the subdirectory's group, add the group to the main CLI
"""
def register_commands(cli: click.Group) -> None:
"""Find all command groups in the scripts directory."""
scripts_dir = Path(__file__).parent
# for each subdirectory, try to import subdir.__init__
for subdir in scripts_dir.glob('**'):
if not subdir.is_dir():
continue
# get group from __init__.py or create a new one
subdir_import = f"scripts.{subdir.name}"
try:
init_module = importlib.import_module(subdir_import+'.__init__')
except ImportError:
continue
if hasattr(init_module, 'cli'):
group = init_module.cli
else:
@click.group()
def group():
pass
# add commands from the subdirectory
for item in subdir.glob('*.py'):
if item.name == '__init__.py':
continue
file_import = f"{subdir_import}.{item.stem}"
module = importlib.import_module(file_import)
if type(getattr(module, 'cli', None)) == click.Group:
group.add_command(module.cli, name=item.stem)
elif type(getattr(module, 'main', None)) == click.Command:
group.add_command(module.main, name=item.stem)
# add the group to the main cli
if group.commands:
cli.add_command(group, name=subdir.name)
@click.group()
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='WARNING',
help='Logging level.')
def cli(log_level):
"""Main CLI entry point for the vault tool."""
logging.basicConfig(level=log_level)
@cli.command()
@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']),
help='Shell to generate completion script for. Defaults to auto-detect.')
def completion(shell):
"""Install tab completion for the CLI.
Examples:
# Auto-detect shell and print instructions
$ vault completion
# Generate completion script for bash
$ vault completion --shell=bash > ~/.vault-complete.bash
# Generate completion script for zsh
$ vault completion --shell=zsh > ~/.vault-complete.zsh
# Generate completion script for fish
$ vault completion --shell=fish > ~/.config/fish/completions/vault.fish
"""
# Auto-detect shell if not specified
if not shell:
shell = os.environ.get('SHELL', '')
if 'bash' in shell:
shell = 'bash'
elif 'zsh' in shell:
shell = 'zsh'
elif 'fish' in shell:
shell = 'fish'
else:
click.echo("Could not auto-detect shell. Please specify with --shell option.")
return
# Get the script name (executable name)
script_name = os.path.basename(sys.argv[0])
env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE"
if shell == 'bash':
click.echo(f"# Save the completion script:")
click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash")
click.echo("\n# Then add this line to your ~/.bashrc:")
click.echo(f". ~/.{script_name}-complete.bash")
elif shell == 'zsh':
click.echo(f"# Save the completion script:")
click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh")
click.echo("\n# Then add this line to your ~/.zshrc:")
click.echo(f". ~/.{script_name}-complete.zsh")
elif shell == 'fish':
click.echo(f"# Save the completion script to the fish completions directory:")
click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish")
register_commands(cli)
if __name__ == "__main__":
cli()

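As an illustration of the registration convention described at the top of scripts/cli.py, a command file it would pick up could look like the sketch below. The scripts/example/ package, its report.py module, and the --limit option are hypothetical and not part of this commit; any scripts/<subdir>/<file>.py defining a click `main` command is exposed as `vault <subdir> <file>` in the same way, provided the subdirectory has an __init__.py.

# scripts/example/report.py (hypothetical)
import click

@click.command()
@click.option('--limit', type=int, default=10, help='Rows to include.')
def main(limit):
    """Picked up by register_commands() and exposed as `vault example report`."""
    click.echo(f"reporting on {limit} rows")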

@@ -1,2 +0,0 @@
def hello() -> str:
return "Hello from data-mirror!"


@@ -54,15 +54,9 @@ def cli():
help='Key prefixes to restrict access to. Can be specified multiple times.')
@click.option('--objects', '-o', multiple=True,
help='Specific object keys to restrict access to. Can be specified multiple times.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
objects: tuple[str, ...], log_level: str):
"""Generate temporary Cloudflare R2 access credentials."""
# Setup logging
logging.basicConfig(level=log_level)
# Load config
config = load_config().get("temp_tokens", {})


@@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None:
@click.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
@click.option('--limit', '-n', type=int, default=None,
help='Maximum number of rows to display. Default: all rows.')
def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
def main(db_path: Path, output_path: Path, limit: int | None):
"""Render the Dataset table to an HTML file."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger.info(f"Connecting to database at {db_path}")
db.init(db_path)


@@ -2,14 +2,12 @@ import boto3
import click
from tqdm import tqdm
import logging
from itertools import islice
import json
import gzip
from io import BytesIO
import tempfile
import os
from scripts.helpers.misc import json_default
import zipfile
from scripts.helpers.onepassword import save_item, share_item
logger = logging.getLogger(__name__)
@@ -122,11 +120,8 @@ def cli():
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
def undelete(s3_path: str, profile: str = None, dry_run: bool = False):
"""Remove delete markers from versioned S3 objects, effectively undeleting them."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
session = boto3.Session(profile_name=profile)
@@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False):
"""Delete all zero-size objects under the given prefix."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
session = boto3.Session(profile_name=profile)
@@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--output', '-o', help='Output path for index file', default=None)
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO', help='Set logging level')
def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
def write_index(s3_path: str, profile: str = None, output: str | None = None):
"""Write a JSONL index of all files under the given prefix."""
logging.basicConfig(level=log_level)
bucket, prefix = s3_path.split('/', 1)
if output is None:
@@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo
write_file_listing(s3_client, bucket, prefix, output)
@cli.command()
@click.argument('bucket_name')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--region', '-r', help='AWS region', default='us-east-1')
@click.option('--tag', '-t', help='Tag the bucket with a name', default=None)
def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None):
"""Create a new S3 bucket with versioning enabled by default."""
session = boto3.Session(profile_name=profile)
s3_client = session.client('s3')
# Ensure bucket exists
try:
if region == 'us-east-1':
s3_client.create_bucket(Bucket=bucket_name)
else:
s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={'LocationConstraint': region},
)
except s3_client.exceptions.BucketAlreadyExists:
logger.warning(f"Bucket {bucket_name} already exists. Updating settings.")
# Configure bucket
s3_client.put_bucket_versioning(
Bucket=bucket_name,
VersioningConfiguration={'Status': 'Enabled'}
)
logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled")
@cli.command()
@click.argument('bucket_name')
@click.argument('username')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy')
@click.option('--op-vault', help='1Password vault to store credentials in', default='Private')
@click.option('--op-share', help='Share the credentials with the given email', default=None)
def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None):
"""Generate temporary S3 credentials with read/write/list access for a specific bucket."""
session = boto3.Session(profile_name=profile)
iam_client = session.client('iam')
# Define inline policy for bucket access
bucket_policy = {
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject",
"s3:ListBucket"
],
"Resource": [
f"arn:aws:s3:::{bucket_name}",
f"arn:aws:s3:::{bucket_name}/*"
]
}]
}
# Create the IAM user with permissions boundary
try:
iam_client.create_user(
UserName=username,
PermissionsBoundary=permissions_boundary
)
logger.info(f"Created IAM user: {username}")
except iam_client.exceptions.EntityAlreadyExistsException:
logger.warning(f"User {username} already exists")
# Attach inline policy directly to user
iam_client.put_user_policy(
UserName=username,
PolicyName=f"{bucket_name}-access",
PolicyDocument=json.dumps(bucket_policy)
)
logger.info(f"Attached bucket access policy to user {username}")
# Create access key for the user
response = iam_client.create_access_key(UserName=username)
credentials = response['AccessKey']
# Output the credentials
click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}")
click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}")
# Save credentials to 1Password if requested
if op_vault:
item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [
{
'title': 'Access Key ID',
'value': credentials['AccessKeyId'],
'section_id': 's3_details'
},
{
'title': 'Secret Access Key',
'value': credentials['SecretAccessKey'],
'section_id': 's3_details'
},
{
'title': 'S3 Bucket',
'value': bucket_name,
'section_id': 's3_details'
},
])
if op_share:
share_link = share_item(item, [op_share])
click.echo(f"To share credentials with {op_share}, use the following link: {share_link}")
if __name__ == '__main__':
cli()

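One way to sanity-check the keys printed by create_user is to open a boto3 session with them and list the granted bucket. This is an illustrative sketch rather than part of the commit; the environment variable names match the command's output, the bucket name is a placeholder, and freshly created access keys can take a few seconds to become usable.

# Hypothetical smoke test for credentials produced by the create_user command above
import os
import boto3

session = boto3.Session(
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)
s3 = session.client('s3')
# Listing the granted bucket should succeed; other buckets should raise AccessDenied.
response = s3.list_objects_v2(Bucket='example-bucket', MaxKeys=1)
print(response.get('KeyCount', 0))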

@@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
help='Directory to write uncompressed files')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
def main(json_url: str, zip_url: str, output: Path = None):
"""Verify dataset from JSON and ZIP URLs"""
# Set up logging
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
verify_dataset(json_url, zip_url, output)
if __name__ == '__main__':


@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
@click.option('--compare-by', '-c',
default='id',
help='Field to compare by.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
"""Compare records between CSV and JSONL files."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
old_data = load_jsonl_data(old_path, compare_by=compare_by)
new_data = load_jsonl_data(new_path, compare_by=compare_by)


@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
from pathlib import Path
# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
for line in f:
data = json.loads(line)
org = json.loads(data.get('organization', '{}'))
identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
titles_by_org[org['title']].append(data["title"])
if __name__ == "__main__":
# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
print(f"{date}: {count}")
# sort each list of titles by org
for org, titles in titles_by_org.items():
titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
# print urls
for path in Path('data/').glob('glass*'):
print(path)
with open(path, 'r') as f:
# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
for line in f:
data = json.loads(line)
print("* " + data['name'])
resources = data.get('resources', [])
if type(resources) == str:
resources = json.loads(resources)
for resource in resources:
print(' * ' + resource['url'])
org = json.loads(data.get('organization', '{}'))
identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
titles_by_org[org['title']].append(data["title"])
# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
print(f"{date}: {count}")
# sort each list of titles by org
for org, titles in titles_by_org.items():
titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
# print urls
for path in Path('data/').glob('glass*'):
print(path)
with open(path, 'r') as f:
for line in f:
data = json.loads(line)
print("* " + data['name'])
resources = data.get('resources', [])
if type(resources) == str:
resources = json.loads(resources)
for resource in resources:
print(' * ' + resource['url'])


@@ -10,31 +10,17 @@ import os
from urllib.parse import urlparse
import re
from scripts.helpers.parallel import run_parallel
import zipfile
import struct
import boto3
import logging
from scripts.data_gov.models import db, Dataset
from playhouse.shortcuts import model_to_dict
from tqdm import tqdm
from datetime import datetime
from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload
logger = logging.getLogger(__name__)
## download data.gov datasets, create nabit archives, and upload to S3
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
# Already compressed archives
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
# Compressed images
'jpg', 'jpeg', 'png', 'gif', 'webp',
# Compressed video/audio
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
# Other compressed/binary formats
'pdf', 'docx', 'xlsx', 'pptx',
}
stats_counter = {}
def is_valid_url(url):
@@ -55,95 +41,6 @@ def extract_urls(data, urls = None):
extract_urls(item, urls)
return urls
def create_archive(bag_dir, dataset: Dataset, signatures):
data_dict = model_to_dict(dataset)
for key, value in data_dict.items():
if isinstance(value, datetime):
data_dict[key] = value.isoformat()
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
collect = [
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
]
logger.info(f" - Downloading {len(collect)} files")
# sort fields from dataset
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
# Create the archive
package(
output_path=bag_dir,
collect=collect,
collect_errors='ignore',
signed_metadata={
'id': str(uuid.uuid4()),
'url': data_gov_url,
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
'data_gov_metadata': data_gov_metadata,
'crawler_metadata': crawler_metadata,
},
signatures=signatures,
)
def zip_archive(bag_dir, archive_path):
# Create zip archive
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for file_path in bag_dir.rglob('*'):
if file_path.is_file():
arc_path = file_path.relative_to(bag_dir)
compression = (zipfile.ZIP_STORED
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
else zipfile.ZIP_DEFLATED)
zf.write(file_path, arc_path, compress_type=compression)
# Create metadata file
zip_info = []
with zipfile.ZipFile(archive_path, 'r') as zf:
for info in zf.filelist:
header_offset = info.header_offset
# Read header to calculate data offset
zf.fp.seek(header_offset)
header = zf.fp.read(zipfile.sizeFileHeader)
fheader = struct.unpack(zipfile.structFileHeader, header)
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
zip_info.append({
'filename': info.filename,
'file_size': info.file_size,
'compress_size': info.compress_size,
'compress_type': info.compress_type,
'header_offset': header_offset,
'data_offset': data_offset,
})
# Read the bag-info.txt and signed-metadata.json
bag_info = (bag_dir / 'bag-info.txt').read_text()
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
return {
'bag_info': bag_info,
'signed_metadata': signed_metadata,
'zip_entries': zip_info
}
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
s3 = boto3.Session(**session_args).client('s3')
bucket_name, s3_path = s3_path.split('/', 1)
# Upload zip file
s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
# Upload metadata file
s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
def run_pipeline(
dataset: Dataset,
output_path: Path,
@@ -162,36 +59,43 @@ def run_pipeline(
# set this here so it makes it into the metadata
dataset.crawler_downloaded_date = datetime.now()
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
logger.info("- Creating archive...")
# set up paths
temp_dir = Path(temp_dir)
bag_dir = temp_dir / 'bag'
archive_path = temp_dir / 'archive.zip'
def create_archive(temp_dir):
data_dict = model_to_dict(dataset)
for key, value in data_dict.items():
if isinstance(value, datetime):
data_dict[key] = value.isoformat()
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
collect = [
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
]
logger.info(f" - Downloading {len(collect)} files")
# download data with nabit
create_archive(bag_dir, dataset, signatures)
# sort fields from dataset
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
logger.info("- Zipping archive...")
# zip up data and create metadata
output_metadata = zip_archive(bag_dir, archive_path)
return {
'collect': collect,
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': data_gov_url,
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
'data_gov_metadata': data_gov_metadata,
'crawler_metadata': crawler_metadata,
},
}
logger.info("- Moving files to final location...")
# Move files to final location
collection_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
os.rename(str(archive_path), collection_path)
metadata_path.write_text(json.dumps(output_metadata) + '\n')
if s3_path:
logger.info("Uploading to S3...")
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
if not no_delete:
logger.info("- Deleting zip archive...")
os.remove(collection_path)
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
os.rmdir(collection_path.parent)
# Use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("- Setting crawler_downloaded_date...")
db.connect()
@@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int =
@click.option('--signatures', help='JSON string of signature configuration.')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many collections', type=int)
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False):
if dataset_name:
workers = 1


@@ -151,17 +151,8 @@ def cli():
help='Number of results to fetch per page.')
@click.option('--start-date', '-s', type=str, default=None,
help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='WARNING',
help='Logging level.')
def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
def fetch(output_path: Path, rows_per_page: int, start_date: str):
"""Fetch package data from data.gov API and save to database."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
save_packages_to_database(output_path, rows_per_page, start_date)
@cli.command()


@@ -22,19 +22,10 @@ def cli():
@click.argument('output_path', type=click.Path(path_type=Path))
@click.option('--rows-per-page', '-r', type=int, default=1000,
help='Number of results to fetch per page.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
help='Start date for fetching packages in YYYY-MM-DD format.')
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
"""Fetch all package data from data.gov API and save to gzipped JSONL file."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
if output_path.is_dir():
current_date = datetime.now().strftime('%Y%m%d')
output_path = output_path / f'data_{current_date}.jsonl.gz'
@@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str
@cli.command()
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def compare(file1: Path, file2: Path, log_level: str):
"""Compare two gzipped JSONL files by indexing on the 'name' key."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
# Check for pickle file
pickle_path = file_path.with_suffix('.pickle')


@@ -60,7 +60,7 @@ class Dataset(BaseModel):
# fields starting with crawler_ are added by our crawler
crawler_identified_date = DateTimeField(null=True, default=datetime.now)
crawler_downloaded_date = DateTimeField(null=True)
crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)
crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True)
class DatasetHistory(Dataset):


@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
import uuid
import requests
from datetime import datetime
from scripts.helpers.misc import load_config
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload
logger = logging.getLogger(__name__)
stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
output_file.write(f"{repo_link}\n")
return exists
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
def run_pipeline(
org_name,
repo_name,
collection_path,
include,
token,
metadata_path=None,
output_path=None,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
save_raw=False,
raw_dir=None,
check_exists=False,
output_deleted=None,
):
"""Process a single repository."""
# existing checking mode
if check_exists:
return check_repo_exists(org_name, repo_name, token, output_path)
logger.info(f"Processing repository: {org_name}/{repo_name}")
Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
return check_repo_exists(org_name, repo_name, token, output_deleted)
# raw saving mode
if save_raw:
raw_path = raw_dir / org_name / repo_name
Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
logger.info("Processing complete")
return
def create_archive_callback(temp_dir):
if raw_dir:
raw_path = raw_dir / org_name / repo_name
if raw_path.exists():
out_dir = raw_path
else:
Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
out_dir = temp_dir
return {
'collect': [
PathCollectionTask(path=out_dir)
],
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': f'https://github.com/{org_name}/{repo_name}',
'description': f'Archive of GitHub repository {org_name}/{repo_name}',
'github_metadata': {
'org': org_name,
'repo': repo_name,
'archived_date': datetime.now().isoformat()
},
},
}
# Archive mode - use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive_callback,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("Processing complete")
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
check_exists: bool = False):
def get_tasks(
csv_path: Path,
output_path: Path,
skip_rows: int = 0,
skip_existing: bool = False,
stop_after: int = None,
include: str = None,
archive_mode: bool = False,
signatures: list = None,
session_args: dict = None,
s3_path: str = None,
no_delete: bool = False,
save_raw: bool = False,
raw_dir: Path = None,
check_exists: bool = False,
output_deleted: Path = None,
):
"""Get repositories from CSV that haven't been processed yet."""
# Initialize progress bars
if not check_exists:
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
continue
org_name, repo_name = row['html_url'].split('/')[-2:]
collection_path = output_path / 'collections' / collection / org_name / repo_name
if skip_existing:
if collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
if skip_existing and collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
# use tokens round robin
token = tokens[processed % len(tokens)]
yield org_name, repo_name, collection_path, include, token, check_exists, output_path
yield (
org_name,
repo_name,
collection_path,
include,
token,
output_deleted,
archive_mode,
metadata_path,
output_path,
signatures,
session_args,
s3_path,
save_raw,
raw_dir,
check_exists,
no_delete,
)
processed += 1
if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
for counter in stats_counter.values():
counter.close()
@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
help='Output path.')
@click.option('--collection', '-c', type=str, default='github_raw',
help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
help='Path to the CSV file.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
@click.option('--signatures', help='JSON string of signature configuration.')
# upload settings
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
# raw saving
# useful if doing multiple runs with the same csv and different --include values
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
log_level=None, stop_after=None, skip_existing=False, check_exists=False):
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
def main(profile, workers, **kwargs):
session_args = {}
if profile:
session_args['profile_name'] = profile
if signatures := kwargs.get('signatures'):
signatures = json.loads(signatures)
for signature in signatures:
if signature['action'] == 'sign':
if is_encrypted_key(signature['params']['key']):
signature['params']['password'] = click.prompt(
f"Enter password for {signature['params']['key']}: ",
hide_input=True
)
elif signature['action'] == 'timestamp':
if known_tsa := signature.pop('known_tsa', None):
signature['params'] = KNOWN_TSAS[known_tsa]
kwargs['signatures'] = signatures
run_parallel(
run_pipeline,
get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
get_tasks(**kwargs),
workers,
log_level=log_level
)
if __name__ == "__main__":

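For reference, the --signatures option handled in main() above expects a JSON-encoded list of actions. The shape below is inferred from that handling; the key path and TSA name are placeholders.

# Hypothetical value for --signatures
import json

signatures = [
    {'action': 'sign', 'params': {'key': 'path/to/signing-key.pem'}},
    {'action': 'timestamp', 'known_tsa': 'example-tsa'},  # resolved via nabit's KNOWN_TSAS
]
print(json.dumps(signatures))  # pass this string as --signatures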
scripts/helpers/bag.py Normal file

@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package
logger = logging.getLogger(__name__)
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
# Already compressed archives
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
# Compressed images
'jpg', 'jpeg', 'png', 'gif', 'webp',
# Compressed video/audio
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
# Other compressed/binary formats
'pdf', 'docx', 'xlsx', 'pptx',
}
def zip_archive(bag_dir, archive_path):
"""Zip up a nabit archive and create metadata."""
# Create zip archive
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for file_path in bag_dir.rglob('*'):
if file_path.is_file():
arc_path = file_path.relative_to(bag_dir)
compression = (zipfile.ZIP_STORED
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
else zipfile.ZIP_DEFLATED)
zf.write(file_path, arc_path, compress_type=compression)
# Create metadata file
zip_info = []
with zipfile.ZipFile(archive_path, 'r') as zf:
for info in zf.filelist:
header_offset = info.header_offset
# Read header to calculate data offset
zf.fp.seek(header_offset)
header = zf.fp.read(zipfile.sizeFileHeader)
fheader = struct.unpack(zipfile.structFileHeader, header)
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
zip_info.append({
'filename': info.filename,
'file_size': info.file_size,
'compress_size': info.compress_size,
'compress_type': info.compress_type,
'header_offset': header_offset,
'data_offset': data_offset,
})
# Read the bag-info.txt and signed-metadata.json
bag_info = (bag_dir / 'bag-info.txt').read_text()
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
return {
'bag_info': bag_info,
'signed_metadata': signed_metadata,
'zip_entries': zip_info
}
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
"""Upload archive and metadata to S3."""
s3 = boto3.Session(**session_args).client('s3')
bucket_name, s3_key_prefix = s3_path.split('/', 1)
# Upload zip file
s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
# Upload metadata file
s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
def cleanup_files(collection_path, no_delete=False, s3_path=None):
"""Clean up local files after upload if needed."""
if not no_delete and s3_path:
logger.info("- Deleting local zip archive...")
if os.path.exists(collection_path):
os.remove(collection_path)
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
os.rmdir(collection_path.parent)
def fetch_and_upload(
output_path,
collection_path,
metadata_path,
create_archive_callback,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
):
"""
Common pipeline for creating and processing archives.
Args:
output_path: Base output directory
collection_path: Path where the final zip will be stored
metadata_path: Path where the metadata will be stored
create_archive_callback: Function that will create the archive
signatures: Signature configuration for nabit
session_args: AWS session arguments
s3_path: S3 path for uploads
no_delete: Whether to preserve local files
"""
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
logger.info("- Creating archive...")
# set up paths
temp_dir = Path(temp_dir)
bag_dir = temp_dir / 'bag'
archive_path = temp_dir / 'archive.zip'
source_files_dir = temp_dir / 'source_files'
source_files_dir.mkdir(parents=True, exist_ok=True)
# Call the callback to create the archive
package_kwargs = create_archive_callback(source_files_dir)
# create bag
package(
output_path=bag_dir,
collect_errors='ignore',
signatures=signatures,
**package_kwargs,
)
logger.info("- Zipping archive...")
# zip up data and create metadata
output_metadata = zip_archive(bag_dir, archive_path)
logger.info("- Moving files to final location...")
# Move files to final location
collection_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
shutil.move(str(archive_path), collection_path)
with open(metadata_path, 'w') as f:
json.dump(output_metadata, f)
f.write('\n')
if s3_path:
logger.info("Uploading to S3...")
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
cleanup_files(collection_path, no_delete, s3_path)

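fetch_and_upload() only requires a callback that stages source files in the temporary directory it provides and returns the keyword arguments forwarded to nabit's package(). A minimal hypothetical callback, mirroring the ones defined in the data.gov and GitHub pipelines in this commit, might look like this (the staged file, URL, and description are placeholders):

import uuid
from pathlib import Path
from nabit.lib.backends.path import PathCollectionTask

def create_archive_callback(source_files_dir: Path) -> dict:
    # Stage whatever should be archived in the directory fetch_and_upload provides.
    (source_files_dir / 'example.txt').write_text('example payload')
    return {
        'collect': [PathCollectionTask(path=source_files_dir)],
        'signed_metadata': {
            'id': str(uuid.uuid4()),
            'url': 'https://example.org/source',
            'description': 'Example archive created via fetch_and_upload',
        },
    }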
scripts/helpers/onepassword.py Normal file

@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio
logger = logging.getLogger(__name__)
async def get_client():
op_config = load_config().get("1password", {})
if not op_config.get('token'):
raise Exception("1Password token not found in config")
return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")
def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
return asyncio.run(save_item_async(vault_name, title, fields, notes, category))
async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
client = await get_client()
vault_id = None
vaults = await client.vaults.list_all()
async for vault in vaults:
if vault.title == vault_name:
vault_id = vault.id
break
else:
raise Exception(f"Vault {vault_name} not found")
field_objs = []
sections = []
for field in fields:
if field.get('concealed', False):
field['field_type'] = ItemFieldType.CONCEALED
else:
field['field_type'] = ItemFieldType.TEXT
field['id'] = field['title'].lower().replace(' ', '_')
field_objs.append(ItemField(**field))
if section_id := field.get('section_id'):
if section_id not in sections:
sections.append(section_id)
sections = [ItemSection(id=section, title=section) for section in sections]
# Create item parameters with sections
create_params = ItemCreateParams(
title=title,
category=category,
vault_id=vault_id,
fields=field_objs,
sections=sections,
)
if notes:
create_params.notes = notes
item = await client.items.create(create_params)
logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
return item
def share_item(
item: Item,
recipients: list[str] | None = None,
expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
one_time_only: bool = False
):
return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))
async def share_item_async(
item: Item,
recipients: list[str] | None,
expire_after: ItemShareDuration | None,
one_time_only: bool,
):
client = await get_client()
policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
share_params = ItemShareParams(
recipients=valid_recipients,
expire_after=expire_after,
one_time_only=one_time_only
)
share_link = await client.items.shares.create(item, policy, share_params)
logger.info(f"Created share link for '{item.title}'")
return share_link

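As a usage sketch mirroring the create_user command earlier in this commit: save_item() takes plain dicts describing the fields and returns the created 1Password item, which can then be passed to share_item(). The vault name, field values, and email below are placeholders.

from scripts.helpers.onepassword import save_item, share_item

item = save_item('Private', 'Example S3 Credentials', [
    {'title': 'Access Key ID', 'value': 'AKIA...', 'section_id': 's3_details'},
    {'title': 'Secret Access Key', 'value': 'example-secret', 'section_id': 's3_details'},
])
share_link = share_item(item, ['someone@example.org'])
print(share_link)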

@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
raise e
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
workers = workers or os.cpu_count() or 4
# Configure logging based on whether we're running in parallel or not
if log_level is None:
log_level = 'INFO' if workers == 1 else 'WARNING'
logging.basicConfig(
level=log_level,
format='[%(process)d] %(message)s'
)
logger.debug(f"Starting processing with {workers} workers")
if workers > 1:

uv.lock generated

@@ -169,6 +169,7 @@ dependencies = [
{ name = "httpx" },
{ name = "jsondiff" },
{ name = "nabit" },
{ name = "onepassword-sdk" },
{ name = "orjson" },
{ name = "peewee" },
{ name = "publicsuffixlist" },
@@ -192,6 +193,7 @@ requires-dist = [
{ name = "httpx", specifier = ">=0.27.2" },
{ name = "jsondiff", specifier = ">=2.2.1" },
{ name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
{ name = "onepassword-sdk", specifier = ">=0.1.7" },
{ name = "orjson", specifier = ">=3.10.15" },
{ name = "peewee", specifier = ">=3.17.8" },
{ name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
@@ -466,6 +468,27 @@ dependencies = [
{ name = "warcio" },
]
[[package]]
name = "onepassword-sdk"
version = "0.1.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
]
sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 },
{ url = "https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 },
{ url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 },
{ url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 },
{ url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 },
{ url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 },
]
[[package]]
name = "orderly-set"
version = "5.3.0"