Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-03-15 07:31:21 +00:00

Commit b245fd44eb (parent a7c99e264d): Refactoring, github pipeline, s3 creation

21 changed files with 718 additions and 281 deletions
@@ -18,6 +18,7 @@ dependencies = [
    "cloudflare>=4.0.0",
    "deepdiff>=8.2.0",
    "orjson>=3.10.15",
    "onepassword-sdk>=0.1.7",
]

[build-system]

@@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }

[tool.hatch.build.targets.wheel]
packages = ["scripts"]

[project.scripts]
vault = "scripts.cli:cli"
@@ -1,2 +0,0 @@
def hello() -> str:
    return "Hello from data-mirror!"

118  scripts/cli.py  Normal file
@@ -0,0 +1,118 @@
import click
import importlib
from pathlib import Path
import logging
import os
import sys

"""
Top level CLI that registers commands with this logic:

- Find all subdirectories WITH __init__.py
- Fetch a 'cli' click.group() from __init__.py if it exists, or create one
- For all python files in the subdirectory:
    - If the file defines a 'cli' click.group(), add it to the subdirectory's group
    - If the file defines a 'main' click.command(), add it to the subdirectory's group
- If any commands are added to the subdirectory's group, add the group to the main CLI
"""

def register_commands(cli: click.Group) -> None:
    """Find all command groups in the scripts directory."""
    scripts_dir = Path(__file__).parent

    # for each subdirectory, try to import subdir.__init__
    for subdir in scripts_dir.glob('**'):
        if not subdir.is_dir():
            continue

        # get group from __init__.py or create a new one
        subdir_import = f"scripts.{subdir.name}"
        try:
            init_module = importlib.import_module(subdir_import+'.__init__')
        except ImportError:
            continue
        if hasattr(init_module, 'cli'):
            group = init_module.cli
        else:
            @click.group()
            def group():
                pass

        # add commands from the subdirectory
        for item in subdir.glob('*.py'):
            if item.name == '__init__.py':
                continue
            file_import = f"{subdir_import}.{item.stem}"
            module = importlib.import_module(file_import)
            if type(getattr(module, 'cli', None)) == click.Group:
                group.add_command(module.cli, name=item.stem)
            elif type(getattr(module, 'main', None)) == click.Command:
                group.add_command(module.main, name=item.stem)

        # add the group to the main cli
        if group.commands:
            cli.add_command(group, name=subdir.name)

@click.group()
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='WARNING',
              help='Logging level.')
def cli(log_level):
    """Main CLI entry point for the vault tool."""
    logging.basicConfig(level=log_level)

@cli.command()
@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']),
              help='Shell to generate completion script for. Defaults to auto-detect.')
def completion(shell):
    """Install tab completion for the CLI.

    Examples:
        # Auto-detect shell and print instructions
        $ vault completion

        # Generate completion script for bash
        $ vault completion --shell=bash > ~/.vault-complete.bash

        # Generate completion script for zsh
        $ vault completion --shell=zsh > ~/.vault-complete.zsh

        # Generate completion script for fish
        $ vault completion --shell=fish > ~/.config/fish/completions/vault.fish
    """
    # Auto-detect shell if not specified
    if not shell:
        shell = os.environ.get('SHELL', '')
        if 'bash' in shell:
            shell = 'bash'
        elif 'zsh' in shell:
            shell = 'zsh'
        elif 'fish' in shell:
            shell = 'fish'
        else:
            click.echo("Could not auto-detect shell. Please specify with --shell option.")
            return

    # Get the script name (executable name)
    script_name = os.path.basename(sys.argv[0])
    env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE"

    if shell == 'bash':
        click.echo(f"# Save the completion script:")
        click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash")
        click.echo("\n# Then add this line to your ~/.bashrc:")
        click.echo(f". ~/.{script_name}-complete.bash")
    elif shell == 'zsh':
        click.echo(f"# Save the completion script:")
        click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh")
        click.echo("\n# Then add this line to your ~/.zshrc:")
        click.echo(f". ~/.{script_name}-complete.zsh")
    elif shell == 'fish':
        click.echo(f"# Save the completion script to the fish completions directory:")
        click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish")

register_commands(cli)

if __name__ == "__main__":
    cli()
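For illustration, a minimal sketch of a command module that the registration logic described in the scripts/cli.py docstring would pick up; the scripts/example/report.py path, the option, and the command body are hypothetical and not part of this commit:

# Hypothetical scripts/example/report.py (with an empty scripts/example/__init__.py).
# Because the module defines 'main' as a click.Command, register_commands() would
# attach it to the 'example' group and expose it as `vault example report`.
import click

@click.command()
@click.option('--name', default='world', help='Illustrative option.')
def main(name):
    """Toy command picked up by the auto-registration loop."""
    click.echo(f"hello {name}")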
@@ -1,2 +0,0 @@
def hello() -> str:
    return "Hello from data-mirror!"
@@ -54,15 +54,9 @@ def cli():
|
|||
help='Key prefixes to restrict access to. Can be specified multiple times.')
|
||||
@click.option('--objects', '-o', multiple=True,
|
||||
help='Specific object keys to restrict access to. Can be specified multiple times.')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
|
||||
objects: tuple[str, ...], log_level: str):
|
||||
"""Generate temporary Cloudflare R2 access credentials."""
|
||||
# Setup logging
|
||||
logging.basicConfig(level=log_level)
|
||||
|
||||
# Load config
|
||||
config = load_config().get("temp_tokens", {})
|
||||
|
|
|
@@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None:
|
|||
@click.command()
|
||||
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
|
||||
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
@click.option('--limit', '-n', type=int, default=None,
|
||||
help='Maximum number of rows to display. Default: all rows.')
|
||||
def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
|
||||
def main(db_path: Path, output_path: Path, limit: int | None):
|
||||
"""Render the Dataset table to an HTML file."""
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
logger.info(f"Connecting to database at {db_path}")
|
||||
db.init(db_path)
|
||||
|
|
|
@@ -2,14 +2,12 @@ import boto3
|
|||
import click
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
from itertools import islice
|
||||
import json
|
||||
import gzip
|
||||
from io import BytesIO
|
||||
import tempfile
|
||||
import os
|
||||
from scripts.helpers.misc import json_default
|
||||
import zipfile
|
||||
from scripts.helpers.onepassword import save_item, share_item
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -122,11 +120,8 @@ def cli():
|
|||
@click.argument('s3_path')
|
||||
@click.option('--profile', help='AWS profile name', default='sc-direct')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
|
||||
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO', help='Set logging level')
|
||||
def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
|
||||
def undelete(s3_path: str, profile: str = None, dry_run: bool = False):
|
||||
"""Remove delete markers from versioned S3 objects, effectively undeleting them."""
|
||||
logging.basicConfig(level=log_level)
|
||||
bucket, prefix = s3_path.split('/', 1)
|
||||
|
||||
session = boto3.Session(profile_name=profile)
|
||||
|
@@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level
|
|||
@click.argument('s3_path')
|
||||
@click.option('--profile', help='AWS profile name', default='sc-direct')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
|
||||
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO', help='Set logging level')
|
||||
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
|
||||
def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False):
|
||||
"""Delete all zero-size objects under the given prefix."""
|
||||
logging.basicConfig(level=log_level)
|
||||
bucket, prefix = s3_path.split('/', 1)
|
||||
|
||||
session = boto3.Session(profile_name=profile)
|
||||
|
@@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
|
|||
@click.argument('s3_path')
|
||||
@click.option('--profile', help='AWS profile name', default='sc-direct')
|
||||
@click.option('--output', '-o', help='Output path for index file', default=None)
|
||||
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO', help='Set logging level')
|
||||
def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
|
||||
def write_index(s3_path: str, profile: str = None, output: str | None = None):
|
||||
"""Write a JSONL index of all files under the given prefix."""
|
||||
logging.basicConfig(level=log_level)
|
||||
bucket, prefix = s3_path.split('/', 1)
|
||||
|
||||
if output is None:
|
||||
|
@@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo
|
|||
|
||||
write_file_listing(s3_client, bucket, prefix, output)
|
||||
|
||||
@cli.command()
|
||||
@click.argument('bucket_name')
|
||||
@click.option('--profile', '-p', help='AWS profile name')
|
||||
@click.option('--region', '-r', help='AWS region', default='us-east-1')
|
||||
@click.option('--tag', '-t', help='Tag the bucket with a name', default=None)
|
||||
def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None):
|
||||
"""Create a new S3 bucket with versioning enabled by default."""
|
||||
session = boto3.Session(profile_name=profile)
|
||||
s3_client = session.client('s3')
|
||||
|
||||
# Ensure bucket exists
|
||||
try:
|
||||
if region == 'us-east-1':
|
||||
s3_client.create_bucket(Bucket=bucket_name)
|
||||
else:
|
||||
s3_client.create_bucket(
|
||||
Bucket=bucket_name,
|
||||
CreateBucketConfiguration={'LocationConstraint': region},
|
||||
)
|
||||
except s3_client.exceptions.BucketAlreadyExists:
|
||||
logger.warning(f"Bucket {bucket_name} already exists. Updating settings.")
|
||||
|
||||
# Configure bucket
|
||||
s3_client.put_bucket_versioning(
|
||||
Bucket=bucket_name,
|
||||
VersioningConfiguration={'Status': 'Enabled'}
|
||||
)
|
||||
|
||||
logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled")
|
||||
|
||||
@cli.command()
|
||||
@click.argument('bucket_name')
|
||||
@click.argument('username')
|
||||
@click.option('--profile', '-p', help='AWS profile name')
|
||||
@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy')
|
||||
@click.option('--op-vault', help='1Password vault to store credentials in', default='Private')
|
||||
@click.option('--op-share', help='Share the credentials with the given email', default=None)
|
||||
def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None):
|
||||
"""Generate temporary S3 credentials with read/write/list access for a specific bucket."""
|
||||
session = boto3.Session(profile_name=profile)
|
||||
iam_client = session.client('iam')
|
||||
|
||||
# Define inline policy for bucket access
|
||||
bucket_policy = {
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"s3:GetObject",
|
||||
"s3:PutObject",
|
||||
"s3:DeleteObject",
|
||||
"s3:ListBucket"
|
||||
],
|
||||
"Resource": [
|
||||
f"arn:aws:s3:::{bucket_name}",
|
||||
f"arn:aws:s3:::{bucket_name}/*"
|
||||
]
|
||||
}]
|
||||
}
|
||||
|
||||
# Create the IAM user with permissions boundary
|
||||
try:
|
||||
iam_client.create_user(
|
||||
UserName=username,
|
||||
PermissionsBoundary=permissions_boundary
|
||||
)
|
||||
logger.info(f"Created IAM user: {username}")
|
||||
except iam_client.exceptions.EntityAlreadyExistsException:
|
||||
logger.warning(f"User {username} already exists")
|
||||
|
||||
# Attach inline policy directly to user
|
||||
iam_client.put_user_policy(
|
||||
UserName=username,
|
||||
PolicyName=f"{bucket_name}-access",
|
||||
PolicyDocument=json.dumps(bucket_policy)
|
||||
)
|
||||
logger.info(f"Attached bucket access policy to user {username}")
|
||||
|
||||
# Create access key for the user
|
||||
response = iam_client.create_access_key(UserName=username)
|
||||
credentials = response['AccessKey']
|
||||
|
||||
# Output the credentials
|
||||
click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}")
|
||||
click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}")
|
||||
|
||||
# Save credentials to 1Password if requested
|
||||
if op_vault:
|
||||
item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [
|
||||
{
|
||||
'title': 'Access Key ID',
|
||||
'value': credentials['AccessKeyId'],
|
||||
'section_id': 's3_details'
|
||||
},
|
||||
{
|
||||
'title': 'Secret Access Key',
|
||||
'value': credentials['SecretAccessKey'],
|
||||
'section_id': 's3_details'
|
||||
},
|
||||
{
|
||||
'title': 'S3 Bucket',
|
||||
'value': bucket_name,
|
||||
'section_id': 's3_details'
|
||||
},
|
||||
])
|
||||
if op_share:
|
||||
share_link = share_item(item, [op_share])
|
||||
click.echo(f"To share credentials with {op_share}, use the following link: {share_link}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cli()
|
||||
|
||||
|
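As a usage sketch for the credentials emitted by create_user above (the bucket name and object key are placeholders, and new IAM access keys can take a few seconds to become active), the printed AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY can be exercised with boto3:

# Quick check that the inline policy grants put/list on the target bucket.
import os
import boto3

s3 = boto3.client(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)
s3.put_object(Bucket='example-bucket', Key='healthcheck.txt', Body=b'ok')
print(s3.list_objects_v2(Bucket='example-bucket').get('KeyCount'))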
|
@@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
|
|||
@click.argument('zip_url', type=str)
|
||||
@click.option('--output', '-o', type=click.Path(path_type=Path),
|
||||
help='Directory to write uncompressed files')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
|
||||
def main(json_url: str, zip_url: str, output: Path = None):
|
||||
"""Verify dataset from JSON and ZIP URLs"""
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
verify_dataset(json_url, zip_url, output)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
0  scripts/data_gov/__init__.py  Normal file
@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
|
|||
@click.option('--compare-by', '-c',
|
||||
default='id',
|
||||
help='Field to compare by.')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
|
||||
"""Compare records between CSV and JSONL files."""
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
old_data = load_jsonl_data(old_path, compare_by=compare_by)
|
||||
new_data = load_jsonl_data(new_path, compare_by=compare_by)
|
||||
|
||||
|
|
|
@@ -3,6 +3,8 @@ from collections import Counter, defaultdict
|
|||
from pathlib import Path
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# Read the JSONL file and count crawler_identified_date values
|
||||
downloaded_counts = Counter()
|
||||
identified_counts = Counter()
|
||||
|
|
|
@@ -10,31 +10,17 @@ import os
|
|||
from urllib.parse import urlparse
|
||||
import re
|
||||
from scripts.helpers.parallel import run_parallel
|
||||
import zipfile
|
||||
import struct
|
||||
import boto3
|
||||
import logging
|
||||
from scripts.data_gov.models import db, Dataset
|
||||
from playhouse.shortcuts import model_to_dict
|
||||
from tqdm import tqdm
|
||||
from datetime import datetime
|
||||
from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
## download data.gov datasets, create nabit archives, and upload to S3
|
||||
|
||||
# File extensions that are already compressed or wouldn't benefit from additional compression
|
||||
UNCOMPRESSED_EXTENSIONS = {
|
||||
# Already compressed archives
|
||||
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
|
||||
# Compressed images
|
||||
'jpg', 'jpeg', 'png', 'gif', 'webp',
|
||||
# Compressed video/audio
|
||||
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
|
||||
# Other compressed/binary formats
|
||||
'pdf', 'docx', 'xlsx', 'pptx',
|
||||
}
|
||||
|
||||
stats_counter = {}
|
||||
|
||||
def is_valid_url(url):
|
||||
|
@@ -55,95 +41,6 @@ def extract_urls(data, urls = None):
|
|||
extract_urls(item, urls)
|
||||
return urls
|
||||
|
||||
def create_archive(bag_dir, dataset: Dataset, signatures):
|
||||
data_dict = model_to_dict(dataset)
|
||||
for key, value in data_dict.items():
|
||||
if isinstance(value, datetime):
|
||||
data_dict[key] = value.isoformat()
|
||||
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
|
||||
collect = [
|
||||
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
|
||||
]
|
||||
logger.info(f" - Downloading {len(collect)} files")
|
||||
|
||||
# sort fields from dataset
|
||||
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
|
||||
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
|
||||
|
||||
# Create the archive
|
||||
package(
|
||||
output_path=bag_dir,
|
||||
collect=collect,
|
||||
collect_errors='ignore',
|
||||
signed_metadata={
|
||||
'id': str(uuid.uuid4()),
|
||||
'url': data_gov_url,
|
||||
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
|
||||
'data_gov_metadata': data_gov_metadata,
|
||||
'crawler_metadata': crawler_metadata,
|
||||
},
|
||||
signatures=signatures,
|
||||
)
|
||||
|
||||
def zip_archive(bag_dir, archive_path):
|
||||
# Create zip archive
|
||||
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
for file_path in bag_dir.rglob('*'):
|
||||
if file_path.is_file():
|
||||
arc_path = file_path.relative_to(bag_dir)
|
||||
compression = (zipfile.ZIP_STORED
|
||||
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
|
||||
else zipfile.ZIP_DEFLATED)
|
||||
zf.write(file_path, arc_path, compress_type=compression)
|
||||
|
||||
# Create metadata file
|
||||
zip_info = []
|
||||
with zipfile.ZipFile(archive_path, 'r') as zf:
|
||||
for info in zf.filelist:
|
||||
header_offset = info.header_offset
|
||||
|
||||
# Read header to calculate data offset
|
||||
zf.fp.seek(header_offset)
|
||||
header = zf.fp.read(zipfile.sizeFileHeader)
|
||||
fheader = struct.unpack(zipfile.structFileHeader, header)
|
||||
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
|
||||
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
|
||||
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
|
||||
|
||||
zip_info.append({
|
||||
'filename': info.filename,
|
||||
'file_size': info.file_size,
|
||||
'compress_size': info.compress_size,
|
||||
'compress_type': info.compress_type,
|
||||
'header_offset': header_offset,
|
||||
'data_offset': data_offset,
|
||||
})
|
||||
|
||||
# Read the bag-info.txt and signed-metadata.json
|
||||
bag_info = (bag_dir / 'bag-info.txt').read_text()
|
||||
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
|
||||
|
||||
return {
|
||||
'bag_info': bag_info,
|
||||
'signed_metadata': signed_metadata,
|
||||
'zip_entries': zip_info
|
||||
}
|
||||
|
||||
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
|
||||
s3 = boto3.Session(**session_args).client('s3')
|
||||
bucket_name, s3_path = s3_path.split('/', 1)
|
||||
|
||||
# Upload zip file
|
||||
s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
|
||||
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
|
||||
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
|
||||
|
||||
# Upload metadata file
|
||||
s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
|
||||
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
|
||||
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
|
||||
|
||||
|
||||
def run_pipeline(
|
||||
dataset: Dataset,
|
||||
output_path: Path,
|
||||
|
@@ -162,36 +59,43 @@ def run_pipeline(
|
|||
# set this here so it makes it into the metadata
|
||||
dataset.crawler_downloaded_date = datetime.now()
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
|
||||
logger.info("- Creating archive...")
|
||||
# set up paths
|
||||
temp_dir = Path(temp_dir)
|
||||
bag_dir = temp_dir / 'bag'
|
||||
archive_path = temp_dir / 'archive.zip'
|
||||
def create_archive(temp_dir):
|
||||
data_dict = model_to_dict(dataset)
|
||||
for key, value in data_dict.items():
|
||||
if isinstance(value, datetime):
|
||||
data_dict[key] = value.isoformat()
|
||||
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
|
||||
collect = [
|
||||
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
|
||||
]
|
||||
logger.info(f" - Downloading {len(collect)} files")
|
||||
|
||||
# download data with nabit
|
||||
create_archive(bag_dir, dataset, signatures)
|
||||
# sort fields from dataset
|
||||
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
|
||||
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
|
||||
|
||||
logger.info("- Zipping archive...")
|
||||
# zip up data and create metadata
|
||||
output_metadata = zip_archive(bag_dir, archive_path)
|
||||
return {
|
||||
'collect': collect,
|
||||
'signed_metadata': {
|
||||
'id': str(uuid.uuid4()),
|
||||
'url': data_gov_url,
|
||||
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
|
||||
'data_gov_metadata': data_gov_metadata,
|
||||
'crawler_metadata': crawler_metadata,
|
||||
},
|
||||
}
|
||||
|
||||
logger.info("- Moving files to final location...")
|
||||
# Move files to final location
|
||||
collection_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
metadata_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
os.rename(str(archive_path), collection_path)
|
||||
metadata_path.write_text(json.dumps(output_metadata) + '\n')
|
||||
|
||||
if s3_path:
|
||||
logger.info("Uploading to S3...")
|
||||
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
|
||||
|
||||
if not no_delete:
|
||||
logger.info("- Deleting zip archive...")
|
||||
os.remove(collection_path)
|
||||
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
|
||||
os.rmdir(collection_path.parent)
|
||||
# Use common pipeline
|
||||
fetch_and_upload(
|
||||
output_path=output_path,
|
||||
collection_path=collection_path,
|
||||
metadata_path=metadata_path,
|
||||
create_archive_callback=create_archive,
|
||||
signatures=signatures,
|
||||
session_args=session_args,
|
||||
s3_path=s3_path,
|
||||
no_delete=no_delete
|
||||
)
|
||||
|
||||
logger.info("- Setting crawler_downloaded_date...")
|
||||
db.connect()
|
||||
|
@@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int =
|
|||
@click.option('--signatures', help='JSON string of signature configuration.')
|
||||
@click.option('--profile', '-p', help='AWS profile name')
|
||||
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
|
||||
@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
|
||||
help='Logging level.')
|
||||
@click.option('--stop-after', help='Stop after processing this many collections', type=int)
|
||||
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
|
||||
def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
|
||||
if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
|
||||
if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False):
|
||||
|
||||
if dataset_name:
|
||||
workers = 1
|
||||
|
|
|
@@ -151,17 +151,8 @@ def cli():
|
|||
help='Number of results to fetch per page.')
|
||||
@click.option('--start-date', '-s', type=str, default=None,
|
||||
help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='WARNING',
|
||||
help='Logging level.')
|
||||
def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
|
||||
def fetch(output_path: Path, rows_per_page: int, start_date: str):
|
||||
"""Fetch package data from data.gov API and save to database."""
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
save_packages_to_database(output_path, rows_per_page, start_date)
|
||||
|
||||
@cli.command()
|
||||
|
|
|
@@ -22,19 +22,10 @@ def cli():
|
|||
@click.argument('output_path', type=click.Path(path_type=Path))
|
||||
@click.option('--rows-per-page', '-r', type=int, default=1000,
|
||||
help='Number of results to fetch per page.')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
@click.option('--start-date', '-s', type=str, default=None,
|
||||
help='Start date for fetching packages in YYYY-MM-DD format.')
|
||||
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
|
||||
"""Fetch all package data from data.gov API and save to gzipped JSONL file."""
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
if output_path.is_dir():
|
||||
current_date = datetime.now().strftime('%Y%m%d')
|
||||
output_path = output_path / f'data_{current_date}.jsonl.gz'
|
||||
|
@@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str
|
|||
@cli.command()
|
||||
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
|
||||
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default='INFO',
|
||||
help='Logging level.')
|
||||
def compare(file1: Path, file2: Path, log_level: str):
|
||||
"""Compare two gzipped JSONL files by indexing on the 'name' key."""
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, log_level),
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
|
||||
# Check for pickle file
|
||||
pickle_path = file_path.with_suffix('.pickle')
|
||||
|
|
|
@@ -60,7 +60,7 @@ class Dataset(BaseModel):
|
|||
# fields starting with crawler_ are added by our crawler
|
||||
crawler_identified_date = DateTimeField(null=True, default=datetime.now)
|
||||
crawler_downloaded_date = DateTimeField(null=True)
|
||||
crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)
|
||||
crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True)
|
||||
|
||||
|
||||
class DatasetHistory(Dataset):
|
||||
|
|
0  scripts/github/__init__.py  Normal file
@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
|
|||
from gitspoke.cli import valid_include_items, get_token
|
||||
import os
|
||||
import json
|
||||
import uuid
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from scripts.helpers.misc import load_config
|
||||
from nabit.lib.archive import package
|
||||
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
|
||||
from nabit.lib.backends.path import PathCollectionTask
|
||||
from scripts.helpers.bag import fetch_and_upload
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
stats_counter = {}
|
||||
|
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
|
|||
output_file.write(f"{repo_link}\n")
|
||||
return exists
|
||||
|
||||
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
|
||||
def run_pipeline(
|
||||
org_name,
|
||||
repo_name,
|
||||
collection_path,
|
||||
include,
|
||||
token,
|
||||
metadata_path=None,
|
||||
output_path=None,
|
||||
signatures=None,
|
||||
session_args=None,
|
||||
s3_path=None,
|
||||
no_delete=False,
|
||||
save_raw=False,
|
||||
raw_dir=None,
|
||||
check_exists=False,
|
||||
output_deleted=None,
|
||||
):
|
||||
"""Process a single repository."""
|
||||
# existing checking mode
|
||||
if check_exists:
|
||||
return check_repo_exists(org_name, repo_name, token, output_path)
|
||||
return check_repo_exists(org_name, repo_name, token, output_deleted)
|
||||
|
||||
# raw saving mode
|
||||
if save_raw:
|
||||
raw_path = raw_dir / org_name / repo_name
|
||||
Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
|
||||
logger.info("Processing complete")
|
||||
return
|
||||
|
||||
def create_archive_callback(temp_dir):
|
||||
if raw_dir:
|
||||
raw_path = raw_dir / org_name / repo_name
|
||||
if raw_path.exists():
|
||||
out_dir = raw_path
|
||||
else:
|
||||
Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
|
||||
out_dir = temp_dir
|
||||
return {
|
||||
'collect': [
|
||||
PathCollectionTask(path=out_dir)
|
||||
],
|
||||
'signed_metadata': {
|
||||
'id': str(uuid.uuid4()),
|
||||
'url': f'https://github.com/{org_name}/{repo_name}',
|
||||
'description': f'Archive of GitHub repository {org_name}/{repo_name}',
|
||||
'github_metadata': {
|
||||
'org': org_name,
|
||||
'repo': repo_name,
|
||||
'archived_date': datetime.now().isoformat()
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Archive mode - use common pipeline
|
||||
fetch_and_upload(
|
||||
output_path=output_path,
|
||||
collection_path=collection_path,
|
||||
metadata_path=metadata_path,
|
||||
create_archive_callback=create_archive_callback,
|
||||
signatures=signatures,
|
||||
session_args=session_args,
|
||||
s3_path=s3_path,
|
||||
no_delete=no_delete
|
||||
)
|
||||
|
||||
logger.info(f"Processing repository: {org_name}/{repo_name}")
|
||||
Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
|
||||
logger.info("Processing complete")
|
||||
|
||||
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
|
||||
check_exists: bool = False):
|
||||
def get_tasks(
|
||||
csv_path: Path,
|
||||
output_path: Path,
|
||||
skip_rows: int = 0,
|
||||
skip_existing: bool = False,
|
||||
stop_after: int = None,
|
||||
include: str = None,
|
||||
archive_mode: bool = False,
|
||||
signatures: list = None,
|
||||
session_args: dict = None,
|
||||
s3_path: str = None,
|
||||
no_delete: bool = False,
|
||||
save_raw: bool = False,
|
||||
raw_dir: Path = None,
|
||||
check_exists: bool = False,
|
||||
output_deleted: Path = None,
|
||||
):
|
||||
"""Get repositories from CSV that haven't been processed yet."""
|
||||
# Initialize progress bars
|
||||
if not check_exists:
|
||||
|
@@ -85,10 +164,11 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
|
|||
continue
|
||||
|
||||
org_name, repo_name = row['html_url'].split('/')[-2:]
|
||||
collection_path = output_path / 'collections' / collection / org_name / repo_name
|
||||
|
||||
if skip_existing:
|
||||
if collection_path.exists():
|
||||
collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
|
||||
metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
|
||||
|
||||
if skip_existing and collection_path.exists():
|
||||
stats_counter['skipped'].update(1)
|
||||
continue
|
||||
else:
|
||||
|
@@ -97,7 +177,24 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
|
|||
# use tokens round robin
|
||||
token = tokens[processed % len(tokens)]
|
||||
|
||||
yield org_name, repo_name, collection_path, include, token, check_exists, output_path
|
||||
yield (
|
||||
org_name,
|
||||
repo_name,
|
||||
collection_path,
|
||||
include,
|
||||
token,
|
||||
output_deleted,
|
||||
archive_mode,
|
||||
metadata_path,
|
||||
output_path,
|
||||
signatures,
|
||||
session_args,
|
||||
s3_path,
|
||||
save_raw,
|
||||
raw_dir,
|
||||
check_exists,
|
||||
no_delete,
|
||||
)
|
||||
|
||||
processed += 1
|
||||
if stop_after and processed >= stop_after:
|
||||
|
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
|
|||
for counter in stats_counter.values():
|
||||
counter.close()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
|
||||
help='Output path.')
|
||||
@click.option('--collection', '-c', type=str, default='github_raw',
|
||||
help='Collection name.')
|
||||
@click.option('--workers', '-w', type=int, default=None,
|
||||
help='Number of worker processes. Defaults to CPU count.')
|
||||
@click.option('--skip-rows', type=int, default=0,
|
||||
|
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
|
|||
help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
|
||||
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
|
||||
help='Path to the CSV file.')
|
||||
@click.option('--log-level', '-l',
|
||||
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
|
||||
default=None,
|
||||
help='Logging level.')
|
||||
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
|
||||
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
|
||||
@click.option('--signatures', help='JSON string of signature configuration.')
|
||||
# upload settings
|
||||
@click.option('--profile', '-p', help='AWS profile name')
|
||||
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
|
||||
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
|
||||
# raw saving
|
||||
# useful if doing multiple runs with the same csv and different --include values
|
||||
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
|
||||
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
|
||||
# deletion checking
|
||||
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
|
||||
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
|
||||
log_level=None, stop_after=None, skip_existing=False, check_exists=False):
|
||||
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
|
||||
def main(profile, workers, **kwargs):
|
||||
|
||||
session_args = {}
|
||||
if profile:
|
||||
session_args['profile_name'] = profile
|
||||
|
||||
if signatures := kwargs.get('signatures'):
|
||||
signatures = json.loads(signatures)
|
||||
for signature in signatures:
|
||||
if signature['action'] == 'sign':
|
||||
if is_encrypted_key(signature['params']['key']):
|
||||
signature['params']['password'] = click.prompt(
|
||||
f"Enter password for {signature['params']['key']}: ",
|
||||
hide_input=True
|
||||
)
|
||||
elif signature['action'] == 'timestamp':
|
||||
if known_tsa := signature.pop('known_tsa', None):
|
||||
signature['params'] = KNOWN_TSAS[known_tsa]
|
||||
kwargs['signatures'] = signatures
|
||||
|
||||
run_parallel(
|
||||
run_pipeline,
|
||||
get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
|
||||
get_tasks(**kwargs),
|
||||
workers,
|
||||
log_level=log_level
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
156  scripts/helpers/bag.py  Normal file

@@ -0,0 +1,156 @@
|
|||
import os
|
||||
import json
|
||||
import zipfile
|
||||
import struct
|
||||
import boto3
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import tempfile
|
||||
import shutil
|
||||
from nabit.lib.archive import package
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# File extensions that are already compressed or wouldn't benefit from additional compression
|
||||
UNCOMPRESSED_EXTENSIONS = {
|
||||
# Already compressed archives
|
||||
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
|
||||
# Compressed images
|
||||
'jpg', 'jpeg', 'png', 'gif', 'webp',
|
||||
# Compressed video/audio
|
||||
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
|
||||
# Other compressed/binary formats
|
||||
'pdf', 'docx', 'xlsx', 'pptx',
|
||||
}
|
||||
|
||||
def zip_archive(bag_dir, archive_path):
|
||||
"""Zip up a nabit archive and create metadata."""
|
||||
# Create zip archive
|
||||
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
for file_path in bag_dir.rglob('*'):
|
||||
if file_path.is_file():
|
||||
arc_path = file_path.relative_to(bag_dir)
|
||||
compression = (zipfile.ZIP_STORED
|
||||
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
|
||||
else zipfile.ZIP_DEFLATED)
|
||||
zf.write(file_path, arc_path, compress_type=compression)
|
||||
|
||||
# Create metadata file
|
||||
zip_info = []
|
||||
with zipfile.ZipFile(archive_path, 'r') as zf:
|
||||
for info in zf.filelist:
|
||||
header_offset = info.header_offset
|
||||
|
||||
# Read header to calculate data offset
|
||||
zf.fp.seek(header_offset)
|
||||
header = zf.fp.read(zipfile.sizeFileHeader)
|
||||
fheader = struct.unpack(zipfile.structFileHeader, header)
|
||||
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
|
||||
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
|
||||
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
|
||||
|
||||
zip_info.append({
|
||||
'filename': info.filename,
|
||||
'file_size': info.file_size,
|
||||
'compress_size': info.compress_size,
|
||||
'compress_type': info.compress_type,
|
||||
'header_offset': header_offset,
|
||||
'data_offset': data_offset,
|
||||
})
|
||||
|
||||
# Read the bag-info.txt and signed-metadata.json
|
||||
bag_info = (bag_dir / 'bag-info.txt').read_text()
|
||||
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
|
||||
|
||||
return {
|
||||
'bag_info': bag_info,
|
||||
'signed_metadata': signed_metadata,
|
||||
'zip_entries': zip_info
|
||||
}
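The data_offset recorded for each entry makes zip members independently retrievable. As a sketch (the helper name is hypothetical; 'entry' is one dict from the 'zip_entries' list returned above), a reader can pull and inflate a single member without parsing the whole archive, and the same offset/size pair could equally drive a ranged read against an uploaded copy:

import zlib
import zipfile

def read_member(archive_path, entry):
    # seek straight to the member's compressed bytes using the stored offsets
    with open(archive_path, 'rb') as f:
        f.seek(entry['data_offset'])
        raw = f.read(entry['compress_size'])
    if entry['compress_type'] == zipfile.ZIP_DEFLATED:
        # zip members are raw deflate streams, hence the -15 window bits
        return zlib.decompress(raw, -15)
    return raw  # ZIP_STORED members are written verbatim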
|
||||
|
||||
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
|
||||
"""Upload archive and metadata to S3."""
|
||||
s3 = boto3.Session(**session_args).client('s3')
|
||||
bucket_name, s3_key_prefix = s3_path.split('/', 1)
|
||||
|
||||
# Upload zip file
|
||||
s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
|
||||
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
|
||||
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
|
||||
|
||||
# Upload metadata file
|
||||
s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
|
||||
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
|
||||
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
|
||||
|
||||
def cleanup_files(collection_path, no_delete=False, s3_path=None):
|
||||
"""Clean up local files after upload if needed."""
|
||||
if not no_delete and s3_path:
|
||||
logger.info("- Deleting local zip archive...")
|
||||
if os.path.exists(collection_path):
|
||||
os.remove(collection_path)
|
||||
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
|
||||
os.rmdir(collection_path.parent)
|
||||
|
||||
def fetch_and_upload(
|
||||
output_path,
|
||||
collection_path,
|
||||
metadata_path,
|
||||
create_archive_callback,
|
||||
signatures=None,
|
||||
session_args=None,
|
||||
s3_path=None,
|
||||
no_delete=False,
|
||||
):
|
||||
"""
|
||||
Common pipeline for creating and processing archives.
|
||||
|
||||
Args:
|
||||
output_path: Base output directory
|
||||
collection_path: Path where the final zip will be stored
|
||||
metadata_path: Path where the metadata will be stored
|
||||
create_archive_callback: Function that will create the archive
|
||||
signatures: Signature configuration for nabit
|
||||
session_args: AWS session arguments
|
||||
s3_path: S3 path for uploads
|
||||
no_delete: Whether to preserve local files
|
||||
"""
|
||||
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
|
||||
logger.info("- Creating archive...")
|
||||
# set up paths
|
||||
temp_dir = Path(temp_dir)
|
||||
bag_dir = temp_dir / 'bag'
|
||||
archive_path = temp_dir / 'archive.zip'
|
||||
source_files_dir = temp_dir / 'source_files'
|
||||
source_files_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Call the callback to create the archive
|
||||
package_kwargs = create_archive_callback(source_files_dir)
|
||||
|
||||
# create bag
|
||||
package(
|
||||
output_path=bag_dir,
|
||||
collect_errors='ignore',
|
||||
signatures=signatures,
|
||||
**package_kwargs,
|
||||
)
|
||||
|
||||
logger.info("- Zipping archive...")
|
||||
# zip up data and create metadata
|
||||
output_metadata = zip_archive(bag_dir, archive_path)
|
||||
|
||||
logger.info("- Moving files to final location...")
|
||||
# Move files to final location
|
||||
collection_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
metadata_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.move(str(archive_path), collection_path)
|
||||
with open(metadata_path, 'w') as f:
|
||||
json.dump(output_metadata, f)
|
||||
f.write('\n')
|
||||
|
||||
if s3_path:
|
||||
logger.info("Uploading to S3...")
|
||||
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
|
||||
|
||||
cleanup_files(collection_path, no_delete, s3_path)
|
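To make the fetch_and_upload callback contract concrete, here is a minimal sketch of a caller; the paths, URL, and callback body are hypothetical, and only the returned 'collect' / 'signed_metadata' keys mirror the real callers in this commit:

from pathlib import Path
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload

def archive_snapshot(output_path: Path, url: str):
    def create_archive(source_files_dir):
        # fetch or generate source material into source_files_dir...
        (Path(source_files_dir) / 'readme.txt').write_text(f'snapshot of {url}\n')
        # ...then return the kwargs forwarded to nabit's package()
        return {
            'collect': [PathCollectionTask(path=source_files_dir)],
            'signed_metadata': {'url': url},
        }

    fetch_and_upload(
        output_path=output_path,
        collection_path=output_path / 'data' / 'snapshot' / 'v1.zip',
        metadata_path=output_path / 'metadata' / 'snapshot' / 'v1.json',
        create_archive_callback=create_archive,
        s3_path=None,  # skip the S3 upload in this sketch
    )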
85  scripts/helpers/onepassword.py  Normal file

@@ -0,0 +1,85 @@
|
|||
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
|
||||
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
|
||||
from .misc import load_config
|
||||
import logging
|
||||
import asyncio
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def get_client():
|
||||
op_config = load_config().get("1password", {})
|
||||
if not op_config.get('token'):
|
||||
raise Exception("1Password token not found in config")
|
||||
|
||||
return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")
|
||||
|
||||
def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
|
||||
return asyncio.run(save_item_async(vault_name, title, fields, notes, category))
|
||||
|
||||
async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
|
||||
client = await get_client()
|
||||
vault_id = None
|
||||
vaults = await client.vaults.list_all()
|
||||
async for vault in vaults:
|
||||
if vault.title == vault_name:
|
||||
vault_id = vault.id
|
||||
break
|
||||
else:
|
||||
raise Exception(f"Vault {vault} not found")
|
||||
|
||||
field_objs = []
|
||||
sections = []
|
||||
for field in fields:
|
||||
if field.get('concealed', False):
|
||||
field['field_type'] = ItemFieldType.CONCEALED
|
||||
else:
|
||||
field['field_type'] = ItemFieldType.TEXT
|
||||
field['id'] = field['title'].lower().replace(' ', '_')
|
||||
field_objs.append(ItemField(**field))
|
||||
if section_id := field.get('section_id'):
|
||||
if section_id not in sections:
|
||||
sections.append(section_id)
|
||||
sections = [ItemSection(id=section, title=section) for section in sections]
|
||||
|
||||
# Create item parameters with sections
|
||||
create_params = ItemCreateParams(
|
||||
title=title,
|
||||
category=category,
|
||||
vault_id=vault_id,
|
||||
fields=field_objs,
|
||||
sections=sections,
|
||||
)
|
||||
|
||||
if notes:
|
||||
create_params.notes = notes
|
||||
|
||||
item = await client.items.create(create_params)
|
||||
logger.info(f"Stored credentials in 1Password vault '{vault}' with title '{title}'")
|
||||
return item
|
||||
|
||||
def share_item(
|
||||
item: Item,
|
||||
recipients: list[str] | None = None,
|
||||
expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
|
||||
one_time_only: bool = False
|
||||
):
|
||||
return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))
|
||||
|
||||
async def share_item_async(
|
||||
item: Item,
|
||||
recipients: list[str] | None,
|
||||
expire_after: ItemShareDuration | None,
|
||||
one_time_only: bool,
|
||||
):
|
||||
client = await get_client()
|
||||
policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
|
||||
valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
|
||||
share_params = ItemShareParams(
|
||||
recipients=valid_recipients,
|
||||
expire_after=expire_after,
|
||||
one_time_only=one_time_only
|
||||
)
|
||||
share_link = await client.items.shares.create(item, policy, share_params)
|
||||
logger.info(f"Created share link for '{item.title}'")
|
||||
return share_link
|
||||
|
|
@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
|
|||
raise e
|
||||
|
||||
|
||||
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
|
||||
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
|
||||
workers = workers or os.cpu_count() or 4
|
||||
|
||||
# Configure logging based on whether we're running in parallel or not
|
||||
if log_level is None:
|
||||
log_level = 'INFO' if workers == 1 else 'WARNING'
|
||||
logging.basicConfig(
|
||||
level=log_level,
|
||||
format='[%(process)d] %(message)s'
|
||||
)
|
||||
|
||||
logger.debug(f"Starting processing with {workers} workers")
|
||||
|
||||
if workers > 1:
|
||||
|
|
23  uv.lock  generated

@@ -169,6 +169,7 @@ dependencies = [
|
|||
{ name = "httpx" },
|
||||
{ name = "jsondiff" },
|
||||
{ name = "nabit" },
|
||||
{ name = "onepassword-sdk" },
|
||||
{ name = "orjson" },
|
||||
{ name = "peewee" },
|
||||
{ name = "publicsuffixlist" },
|
||||
|
@@ -192,6 +193,7 @@ requires-dist = [
|
|||
{ name = "httpx", specifier = ">=0.27.2" },
|
||||
{ name = "jsondiff", specifier = ">=2.2.1" },
|
||||
{ name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
|
||||
{ name = "onepassword-sdk", specifier = ">=0.1.7" },
|
||||
{ name = "orjson", specifier = ">=3.10.15" },
|
||||
{ name = "peewee", specifier = ">=3.17.8" },
|
||||
{ name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
|
||||
|
@@ -466,6 +468,27 @@ dependencies = [
|
|||
{ name = "warcio" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "onepassword-sdk"
|
||||
version = "0.1.7"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pydantic" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 },
|
||||
{ url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 },
|
||||
{ url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 },
|
||||
{ url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "orderly-set"
|
||||
version = "5.3.0"
|
||||
|
|