diff --git a/pyproject.toml b/pyproject.toml index 91bfc8a..820daf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "cloudflare>=4.0.0", "deepdiff>=8.2.0", "orjson>=3.10.15", + "onepassword-sdk>=0.1.7", ] [build-system] @@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" } [tool.hatch.build.targets.wheel] packages = ["scripts"] + +[project.scripts] +vault = "scripts.cli:cli" diff --git a/scripts/__init__.py b/scripts/__init__.py index 4251662..e69de29 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -1,2 +0,0 @@ -def hello() -> str: - return "Hello from data-mirror!" diff --git a/scripts/cli.py b/scripts/cli.py new file mode 100644 index 0000000..9751b99 --- /dev/null +++ b/scripts/cli.py @@ -0,0 +1,118 @@ +import click +import importlib +from pathlib import Path +import logging +import os +import sys + +""" +Top level CLI that registers commands with this logic: + +- Find all subdirectories WITH __init__.py +- Fetch a 'cli' click.group() from __init__.py if it exists, or create one +- For all python files in the subdirectory: + - If the file defines a 'cli' click.group(), add it to the subdirectory's group + - If the file defines a 'main' click.command(), add it to the subdirectory's group +- If any commands are added to the subdirectory's group, add the group to the main CLI +""" + +def register_commands(cli: click.Group) -> None: + """Find all command groups in the scripts directory.""" + scripts_dir = Path(__file__).parent + + # for each subdirectory, try to import subdir.__init__ + for subdir in scripts_dir.glob('**'): + if not subdir.is_dir(): + continue + + # get group from __init__.py or create a new one + subdir_import = f"scripts.{subdir.name}" + try: + init_module = importlib.import_module(subdir_import+'.__init__') + except ImportError: + continue + if hasattr(init_module, 'cli'): + group = init_module.cli + else: + @click.group() + def group(): + pass + + # add commands from the subdirectory + for item in subdir.glob('*.py'): + if item.name == '__init__.py': + continue + file_import = f"{subdir_import}.{item.stem}" + module = importlib.import_module(file_import) + if type(getattr(module, 'cli', None)) == click.Group: + group.add_command(module.cli, name=item.stem) + elif type(getattr(module, 'main', None)) == click.Command: + group.add_command(module.main, name=item.stem) + + # add the group to the main cli + if group.commands: + cli.add_command(group, name=subdir.name) + +@click.group() +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='WARNING', + help='Logging level.') +def cli(log_level): + """Main CLI entry point for the vault tool.""" + logging.basicConfig(level=log_level) + +@cli.command() +@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']), + help='Shell to generate completion script for. Defaults to auto-detect.') +def completion(shell): + """Install tab completion for the CLI. 
+ + Examples: + # Auto-detect shell and print instructions + $ vault completion + + # Generate completion script for bash + $ vault completion --shell=bash > ~/.vault-complete.bash + + # Generate completion script for zsh + $ vault completion --shell=zsh > ~/.vault-complete.zsh + + # Generate completion script for fish + $ vault completion --shell=fish > ~/.config/fish/completions/vault.fish + """ + # Auto-detect shell if not specified + if not shell: + shell = os.environ.get('SHELL', '') + if 'bash' in shell: + shell = 'bash' + elif 'zsh' in shell: + shell = 'zsh' + elif 'fish' in shell: + shell = 'fish' + else: + click.echo("Could not auto-detect shell. Please specify with --shell option.") + return + + # Get the script name (executable name) + script_name = os.path.basename(sys.argv[0]) + env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE" + + if shell == 'bash': + click.echo(f"# Save the completion script:") + click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash") + click.echo("\n# Then add this line to your ~/.bashrc:") + click.echo(f". ~/.{script_name}-complete.bash") + elif shell == 'zsh': + click.echo(f"# Save the completion script:") + click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh") + click.echo("\n# Then add this line to your ~/.zshrc:") + click.echo(f". ~/.{script_name}-complete.zsh") + elif shell == 'fish': + click.echo(f"# Save the completion script to the fish completions directory:") + click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish") + +register_commands(cli) + +if __name__ == "__main__": + cli() \ No newline at end of file diff --git a/scripts/collection/__init__.py b/scripts/collection/__init__.py index 4251662..e69de29 100644 --- a/scripts/collection/__init__.py +++ b/scripts/collection/__init__.py @@ -1,2 +0,0 @@ -def hello() -> str: - return "Hello from data-mirror!" diff --git a/scripts/collection/cloudflare_tools.py b/scripts/collection/cloudflare_tools.py index 6050d6c..d0b7547 100644 --- a/scripts/collection/cloudflare_tools.py +++ b/scripts/collection/cloudflare_tools.py @@ -54,15 +54,9 @@ def cli(): help='Key prefixes to restrict access to. Can be specified multiple times.') @click.option('--objects', '-o', multiple=True, help='Specific object keys to restrict access to. Can be specified multiple times.') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...], objects: tuple[str, ...], log_level: str): """Generate temporary Cloudflare R2 access credentials.""" - # Setup logging - logging.basicConfig(level=log_level) # Load config config = load_config().get("temp_tokens", {}) diff --git a/scripts/collection/render.py b/scripts/collection/render.py index c06767b..9f0a32a 100644 --- a/scripts/collection/render.py +++ b/scripts/collection/render.py @@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None: @click.command() @click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db') @click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') @click.option('--limit', '-n', type=int, default=None, help='Maximum number of rows to display. 
Default: all rows.') -def main(db_path: Path, output_path: Path, log_level: str, limit: int | None): +def main(db_path: Path, output_path: Path, limit: int | None): """Render the Dataset table to an HTML file.""" - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) logger.info(f"Connecting to database at {db_path}") db.init(db_path) diff --git a/scripts/collection/s3_tools.py b/scripts/collection/s3.py similarity index 58% rename from scripts/collection/s3_tools.py rename to scripts/collection/s3.py index 35e704f..45ae15a 100644 --- a/scripts/collection/s3_tools.py +++ b/scripts/collection/s3.py @@ -2,14 +2,12 @@ import boto3 import click from tqdm import tqdm import logging -from itertools import islice import json -import gzip -from io import BytesIO import tempfile import os from scripts.helpers.misc import json_default import zipfile +from scripts.helpers.onepassword import save_item, share_item logger = logging.getLogger(__name__) @@ -122,11 +120,8 @@ def cli(): @click.argument('s3_path') @click.option('--profile', help='AWS profile name', default='sc-direct') @click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it') -@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', help='Set logging level') -def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'): +def undelete(s3_path: str, profile: str = None, dry_run: bool = False): """Remove delete markers from versioned S3 objects, effectively undeleting them.""" - logging.basicConfig(level=log_level) bucket, prefix = s3_path.split('/', 1) session = boto3.Session(profile_name=profile) @@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level @click.argument('s3_path') @click.option('--profile', help='AWS profile name', default='sc-direct') @click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it') -@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', help='Set logging level') -def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'): +def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False): """Delete all zero-size objects under the given prefix.""" - logging.basicConfig(level=log_level) bucket, prefix = s3_path.split('/', 1) session = boto3.Session(profile_name=profile) @@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l @click.argument('s3_path') @click.option('--profile', help='AWS profile name', default='sc-direct') @click.option('--output', '-o', help='Output path for index file', default=None) -@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', help='Set logging level') -def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'): +def write_index(s3_path: str, profile: str = None, output: str | None = None): """Write a JSONL index of all files under the given prefix.""" - logging.basicConfig(level=log_level) bucket, prefix = s3_path.split('/', 1) if output is None: @@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo write_file_listing(s3_client, bucket, prefix, output) +@cli.command() +@click.argument('bucket_name') 
+@click.option('--profile', '-p', help='AWS profile name') +@click.option('--region', '-r', help='AWS region', default='us-east-1') +@click.option('--tag', '-t', help='Tag the bucket with a name', default=None) +def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None): + """Create a new S3 bucket with versioning enabled by default.""" + session = boto3.Session(profile_name=profile) + s3_client = session.client('s3') + + # Ensure bucket exists + try: + if region == 'us-east-1': + s3_client.create_bucket(Bucket=bucket_name) + else: + s3_client.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': region}, + ) + except s3_client.exceptions.BucketAlreadyExists: + logger.warning(f"Bucket {bucket_name} already exists. Updating settings.") + + # Configure bucket + s3_client.put_bucket_versioning( + Bucket=bucket_name, + VersioningConfiguration={'Status': 'Enabled'} + ) + + logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled") + +@cli.command() +@click.argument('bucket_name') +@click.argument('username') +@click.option('--profile', '-p', help='AWS profile name') +@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy') +@click.option('--op-vault', help='1Password vault to store credentials in', default='Private') +@click.option('--op-share', help='Share the credentials with the given email', default=None) +def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None): + """Generate temporary S3 credentials with read/write/list access for a specific bucket.""" + session = boto3.Session(profile_name=profile) + iam_client = session.client('iam') + + # Define inline policy for bucket access + bucket_policy = { + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + f"arn:aws:s3:::{bucket_name}", + f"arn:aws:s3:::{bucket_name}/*" + ] + }] + } + + # Create the IAM user with permissions boundary + try: + iam_client.create_user( + UserName=username, + PermissionsBoundary=permissions_boundary + ) + logger.info(f"Created IAM user: {username}") + except iam_client.exceptions.EntityAlreadyExistsException: + logger.warning(f"User {username} already exists") + + # Attach inline policy directly to user + iam_client.put_user_policy( + UserName=username, + PolicyName=f"{bucket_name}-access", + PolicyDocument=json.dumps(bucket_policy) + ) + logger.info(f"Attached bucket access policy to user {username}") + + # Create access key for the user + response = iam_client.create_access_key(UserName=username) + credentials = response['AccessKey'] + + # Output the credentials + click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}") + click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}") + + # Save credentials to 1Password if requested + if op_vault: + item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [ + { + 'title': 'Access Key ID', + 'value': credentials['AccessKeyId'], + 'section_id': 's3_details' + }, + { + 'title': 'Secret Access Key', + 'value': credentials['SecretAccessKey'], + 'section_id': 's3_details' + }, + { + 'title': 'S3 Bucket', + 'value': bucket_name, + 'section_id': 's3_details' + }, + ]) + if op_share: + share_link = share_item(item, [op_share]) + click.echo(f"To share credentials with {op_share}, use the following link: {share_link}") 
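
Taken together with the `vault` entry point added in pyproject.toml and the auto-registration in scripts/cli.py, these commands should be reachable under the `collection s3` group. A possible invocation, assuming Click's default underscore-to-dash command naming and placeholder bucket, user, boundary-ARN, and email values:

    # create a versioned bucket, then a scoped IAM user whose keys go to 1Password and are shared by link
    $ vault collection s3 create-bucket example-bucket
    $ vault collection s3 create-user example-bucket example-user \
        --permissions-boundary arn:aws:iam::123456789012:policy/example-boundary \
        --op-vault Private --op-share user@example.org
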
+ + if __name__ == '__main__': cli() + diff --git a/scripts/collection/verify_upload.py b/scripts/collection/verify_upload.py index 1d4ff2f..1243b68 100644 --- a/scripts/collection/verify_upload.py +++ b/scripts/collection/verify_upload.py @@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None): @click.argument('zip_url', type=str) @click.option('--output', '-o', type=click.Path(path_type=Path), help='Directory to write uncompressed files') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') -def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'): +def main(json_url: str, zip_url: str, output: Path = None): """Verify dataset from JSON and ZIP URLs""" - # Set up logging - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) - verify_dataset(json_url, zip_url, output) if __name__ == '__main__': diff --git a/scripts/data_gov/__init__.py b/scripts/data_gov/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/data_gov/diff/diff.py b/scripts/data_gov/diff/diff.py index 887beea..f10332b 100644 --- a/scripts/data_gov/diff/diff.py +++ b/scripts/data_gov/diff/diff.py @@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict], @click.option('--compare-by', '-c', default='id', help='Field to compare by.') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') def main(old_path: Path, new_path: Path, compare_by: str, log_level: str): """Compare records between CSV and JSONL files.""" - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) - old_data = load_jsonl_data(old_path, compare_by=compare_by) new_data = load_jsonl_data(new_path, compare_by=compare_by) diff --git a/scripts/data_gov/diff/diff_analyze.py b/scripts/data_gov/diff/diff_analyze.py index 0c70d75..1d393bc 100644 --- a/scripts/data_gov/diff/diff_analyze.py +++ b/scripts/data_gov/diff/diff_analyze.py @@ -3,36 +3,38 @@ from collections import Counter, defaultdict from pathlib import Path -# Read the JSONL file and count crawler_identified_date values -downloaded_counts = Counter() -identified_counts = Counter() -titles_by_org = defaultdict(list) -with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f: - for line in f: - data = json.loads(line) - org = json.loads(data.get('organization', '{}')) - identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1 - titles_by_org[org['title']].append(data["title"]) +if __name__ == "__main__": -# Print the counts sorted by date -for date, count in sorted(identified_counts.items()): - print(f"{date}: {count}") - -# sort each list of titles by org -for org, titles in titles_by_org.items(): - titles_by_org[org].sort() -Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2)) - - -# print urls -for path in Path('data/').glob('glass*'): - print(path) - with open(path, 'r') as f: + # Read the JSONL file and count crawler_identified_date values + downloaded_counts = Counter() + identified_counts = Counter() + titles_by_org = defaultdict(list) + with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f: for line in f: data = json.loads(line) - print("* " + data['name']) - resources = data.get('resources', []) - if type(resources) == str: - 
resources = json.loads(resources) - for resource in resources: - print(' * ' + resource['url']) + org = json.loads(data.get('organization', '{}')) + identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1 + titles_by_org[org['title']].append(data["title"]) + + # Print the counts sorted by date + for date, count in sorted(identified_counts.items()): + print(f"{date}: {count}") + + # sort each list of titles by org + for org, titles in titles_by_org.items(): + titles_by_org[org].sort() + Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2)) + + + # print urls + for path in Path('data/').glob('glass*'): + print(path) + with open(path, 'r') as f: + for line in f: + data = json.loads(line) + print("* " + data['name']) + resources = data.get('resources', []) + if type(resources) == str: + resources = json.loads(resources) + for resource in resources: + print(' * ' + resource['url']) diff --git a/scripts/data_gov/fetch_data.py b/scripts/data_gov/fetch_data.py index 8bd1eda..122ff1d 100644 --- a/scripts/data_gov/fetch_data.py +++ b/scripts/data_gov/fetch_data.py @@ -10,31 +10,17 @@ import os from urllib.parse import urlparse import re from scripts.helpers.parallel import run_parallel -import zipfile -import struct -import boto3 import logging from scripts.data_gov.models import db, Dataset from playhouse.shortcuts import model_to_dict from tqdm import tqdm from datetime import datetime +from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload logger = logging.getLogger(__name__) ## download data.gov datasets, create nabit archives, and upload to S3 -# File extensions that are already compressed or wouldn't benefit from additional compression -UNCOMPRESSED_EXTENSIONS = { - # Already compressed archives - 'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz', - # Compressed images - 'jpg', 'jpeg', 'png', 'gif', 'webp', - # Compressed video/audio - 'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a', - # Other compressed/binary formats - 'pdf', 'docx', 'xlsx', 'pptx', -} - stats_counter = {} def is_valid_url(url): @@ -55,95 +41,6 @@ def extract_urls(data, urls = None): extract_urls(item, urls) return urls -def create_archive(bag_dir, dataset: Dataset, signatures): - data_dict = model_to_dict(dataset) - for key, value in data_dict.items(): - if isinstance(value, datetime): - data_dict[key] = value.isoformat() - data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}' - collect = [ - *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)], - ] - logger.info(f" - Downloading {len(collect)} files") - - # sort fields from dataset - data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')} - crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')} - - # Create the archive - package( - output_path=bag_dir, - collect=collect, - collect_errors='ignore', - signed_metadata={ - 'id': str(uuid.uuid4()), - 'url': data_gov_url, - 'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. 
Full metadata stored in data_gov_metadata key.', - 'data_gov_metadata': data_gov_metadata, - 'crawler_metadata': crawler_metadata, - }, - signatures=signatures, - ) - -def zip_archive(bag_dir, archive_path): - # Create zip archive - with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf: - for file_path in bag_dir.rglob('*'): - if file_path.is_file(): - arc_path = file_path.relative_to(bag_dir) - compression = (zipfile.ZIP_STORED - if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS - else zipfile.ZIP_DEFLATED) - zf.write(file_path, arc_path, compress_type=compression) - - # Create metadata file - zip_info = [] - with zipfile.ZipFile(archive_path, 'r') as zf: - for info in zf.filelist: - header_offset = info.header_offset - - # Read header to calculate data offset - zf.fp.seek(header_offset) - header = zf.fp.read(zipfile.sizeFileHeader) - fheader = struct.unpack(zipfile.structFileHeader, header) - fname_length = fheader[zipfile._FH_FILENAME_LENGTH] - extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH] - data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length - - zip_info.append({ - 'filename': info.filename, - 'file_size': info.file_size, - 'compress_size': info.compress_size, - 'compress_type': info.compress_type, - 'header_offset': header_offset, - 'data_offset': data_offset, - }) - - # Read the bag-info.txt and signed-metadata.json - bag_info = (bag_dir / 'bag-info.txt').read_text() - signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text()) - - return { - 'bag_info': bag_info, - 'signed_metadata': signed_metadata, - 'zip_entries': zip_info - } - -def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args): - s3 = boto3.Session(**session_args).client('s3') - bucket_name, s3_path = s3_path.split('/', 1) - - # Upload zip file - s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path))) - s3.upload_file(str(collection_path), bucket_name, s3_collection_key) - logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}") - - # Upload metadata file - s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path))) - s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key) - logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}") - - def run_pipeline( dataset: Dataset, output_path: Path, @@ -162,36 +59,43 @@ def run_pipeline( # set this here so it makes it into the metadata dataset.crawler_downloaded_date = datetime.now() - with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir: - logger.info("- Creating archive...") - # set up paths - temp_dir = Path(temp_dir) - bag_dir = temp_dir / 'bag' - archive_path = temp_dir / 'archive.zip' + def create_archive(temp_dir): + data_dict = model_to_dict(dataset) + for key, value in data_dict.items(): + if isinstance(value, datetime): + data_dict[key] = value.isoformat() + data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}' + collect = [ + *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)], + ] + logger.info(f" - Downloading {len(collect)} files") - # download data with nabit - create_archive(bag_dir, dataset, signatures) + # sort fields from dataset + data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')} + crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')} - logger.info("- Zipping archive...") - # zip up data and create 
metadata - output_metadata = zip_archive(bag_dir, archive_path) + return { + 'collect': collect, + 'signed_metadata': { + 'id': str(uuid.uuid4()), + 'url': data_gov_url, + 'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.', + 'data_gov_metadata': data_gov_metadata, + 'crawler_metadata': crawler_metadata, + }, + } - logger.info("- Moving files to final location...") - # Move files to final location - collection_path.parent.mkdir(parents=True, exist_ok=True) - metadata_path.parent.mkdir(parents=True, exist_ok=True) - os.rename(str(archive_path), collection_path) - metadata_path.write_text(json.dumps(output_metadata) + '\n') - - if s3_path: - logger.info("Uploading to S3...") - upload_archive(output_path, collection_path, metadata_path, s3_path, session_args) - - if not no_delete: - logger.info("- Deleting zip archive...") - os.remove(collection_path) - if collection_path.parent.exists() and not os.listdir(collection_path.parent): - os.rmdir(collection_path.parent) + # Use common pipeline + fetch_and_upload( + output_path=output_path, + collection_path=collection_path, + metadata_path=metadata_path, + create_archive_callback=create_archive, + signatures=signatures, + session_args=session_args, + s3_path=s3_path, + no_delete=no_delete + ) logger.info("- Setting crawler_downloaded_date...") db.connect() @@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int = @click.option('--signatures', help='JSON string of signature configuration.') @click.option('--profile', '-p', help='AWS profile name') @click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "/"') -@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None, - help='Logging level.') @click.option('--stop-after', help='Stop after processing this many collections', type=int) @click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata') def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None, - if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False): + if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False): if dataset_name: workers = 1 diff --git a/scripts/data_gov/fetch_index.py b/scripts/data_gov/fetch_index.py index 0ffbdf9..8b54c10 100644 --- a/scripts/data_gov/fetch_index.py +++ b/scripts/data_gov/fetch_index.py @@ -151,17 +151,8 @@ def cli(): help='Number of results to fetch per page.') @click.option('--start-date', '-s', type=str, default=None, help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='WARNING', - help='Logging level.') -def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str): +def fetch(output_path: Path, rows_per_page: int, start_date: str): """Fetch package data from data.gov API and save to database.""" - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) - save_packages_to_database(output_path, rows_per_page, start_date) @cli.command() diff --git a/scripts/data_gov/fetch_jsonl.py b/scripts/data_gov/fetch_jsonl.py index e0ac5b3..c4e8228 100644 --- a/scripts/data_gov/fetch_jsonl.py +++ 
b/scripts/data_gov/fetch_jsonl.py @@ -22,19 +22,10 @@ def cli(): @click.argument('output_path', type=click.Path(path_type=Path)) @click.option('--rows-per-page', '-r', type=int, default=1000, help='Number of results to fetch per page.') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') @click.option('--start-date', '-s', type=str, default=None, help='Start date for fetching packages in YYYY-MM-DD format.') def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str): """Fetch all package data from data.gov API and save to gzipped JSONL file.""" - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) - if output_path.is_dir(): current_date = datetime.now().strftime('%Y%m%d') output_path = output_path / f'data_{current_date}.jsonl.gz' @@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str @cli.command() @click.argument('file1', type=click.Path(exists=True, path_type=Path)) @click.argument('file2', type=click.Path(exists=True, path_type=Path)) -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default='INFO', - help='Logging level.') def compare(file1: Path, file2: Path, log_level: str): """Compare two gzipped JSONL files by indexing on the 'name' key.""" - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s' - ) - def load_jsonl_index(file_path: Path) -> Dict[str, Any]: # Check for pickle file pickle_path = file_path.with_suffix('.pickle') diff --git a/scripts/data_gov/models.py b/scripts/data_gov/models.py index 21743a5..b397e08 100644 --- a/scripts/data_gov/models.py +++ b/scripts/data_gov/models.py @@ -60,7 +60,7 @@ class Dataset(BaseModel): # fields starting with crawler_ are added by our crawler crawler_identified_date = DateTimeField(null=True, default=datetime.now) crawler_downloaded_date = DateTimeField(null=True) - crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True) + crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True) class DatasetHistory(Dataset): diff --git a/scripts/github/__init__.py b/scripts/github/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/github/download_git.py b/scripts/github/download_git.py index 8aafc25..f1623b0 100644 --- a/scripts/github/download_git.py +++ b/scripts/github/download_git.py @@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI from gitspoke.cli import valid_include_items, get_token import os import json +import uuid import requests +from datetime import datetime from scripts.helpers.misc import load_config +from nabit.lib.archive import package +from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key +from nabit.lib.backends.path import PathCollectionTask +from scripts.helpers.bag import fetch_and_upload logger = logging.getLogger(__name__) stats_counter = {} @@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None): output_file.write(f"{repo_link}\n") return exists -def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None): +def run_pipeline( + org_name, + repo_name, + collection_path, + include, + token, + metadata_path=None, + output_path=None, + signatures=None, + session_args=None, + s3_path=None, + no_delete=False, + save_raw=False, + 
raw_dir=None, + check_exists=False, + output_deleted=None, +): """Process a single repository.""" + # existing checking mode if check_exists: - return check_repo_exists(org_name, repo_name, token, output_path) - - logger.info(f"Processing repository: {org_name}/{repo_name}") - Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include) + return check_repo_exists(org_name, repo_name, token, output_deleted) + + # raw saving mode + if save_raw: + raw_path = raw_dir / org_name / repo_name + Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include) + logger.info("Processing complete") + return + + def create_archive_callback(temp_dir): + if raw_dir: + raw_path = raw_dir / org_name / repo_name + if raw_path.exists(): + out_dir = raw_path + else: + Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include) + out_dir = temp_dir + return { + 'collect': [ + PathCollectionTask(path=out_dir) + ], + 'signed_metadata': { + 'id': str(uuid.uuid4()), + 'url': f'https://github.com/{org_name}/{repo_name}', + 'description': f'Archive of GitHub repository {org_name}/{repo_name}', + 'github_metadata': { + 'org': org_name, + 'repo': repo_name, + 'archived_date': datetime.now().isoformat() + }, + }, + } + + # Archive mode - use common pipeline + fetch_and_upload( + output_path=output_path, + collection_path=collection_path, + metadata_path=metadata_path, + create_archive_callback=create_archive_callback, + signatures=signatures, + session_args=session_args, + s3_path=s3_path, + no_delete=no_delete + ) + logger.info("Processing complete") -def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None, - check_exists: bool = False): +def get_tasks( + csv_path: Path, + output_path: Path, + skip_rows: int = 0, + skip_existing: bool = False, + stop_after: int = None, + include: str = None, + archive_mode: bool = False, + signatures: list = None, + session_args: dict = None, + s3_path: str = None, + no_delete: bool = False, + save_raw: bool = False, + raw_dir: Path = None, + check_exists: bool = False, + output_deleted: Path = None, +): """Get repositories from CSV that haven't been processed yet.""" # Initialize progress bars if not check_exists: @@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int continue org_name, repo_name = row['html_url'].split('/')[-2:] - collection_path = output_path / 'collections' / collection / org_name / repo_name - if skip_existing: - if collection_path.exists(): - stats_counter['skipped'].update(1) - continue - else: - stats_counter['yielded'].update(1) + collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip' + metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json' + + if skip_existing and collection_path.exists(): + stats_counter['skipped'].update(1) + continue + else: + stats_counter['yielded'].update(1) # use tokens round robin token = tokens[processed % len(tokens)] - yield org_name, repo_name, collection_path, include, token, check_exists, output_path + yield ( + org_name, + repo_name, + collection_path, + include, + token, + output_deleted, + archive_mode, + metadata_path, + output_path, + signatures, + session_args, + s3_path, + save_raw, + raw_dir, + check_exists, + no_delete, + ) processed += 1 if stop_after and processed >= stop_after: @@ -107,11 +204,10 @@ def get_tasks(csv_path: 
Path, output_path: Path, collection: str, skip_rows: int for counter in stats_counter.values(): counter.close() + @click.command() @click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed', help='Output path.') -@click.option('--collection', '-c', type=str, default='github_raw', - help='Collection name.') @click.option('--workers', '-w', type=int, default=None, help='Number of worker processes. Defaults to CPU count.') @click.option('--skip-rows', type=int, default=0, @@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items)) @click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv', help='Path to the CSV file.') -@click.option('--log-level', '-l', - type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), - default=None, - help='Logging level.') @click.option('--stop-after', help='Stop after processing this many repositories', type=int) @click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories') +@click.option('--signatures', help='JSON string of signature configuration.') +# upload settings +@click.option('--profile', '-p', help='AWS profile name') +@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "/"') +@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata') +# raw saving +# useful if doing multiple runs with the same csv and different --include values +@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading') +@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to') +# deletion checking @click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub') -def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None, - log_level=None, stop_after=None, skip_existing=False, check_exists=False): +@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to') +def main(profile, workers, **kwargs): + + session_args = {} + if profile: + session_args['profile_name'] = profile + + if signatures := kwargs.get('signatures'): + signatures = json.loads(signatures) + for signature in signatures: + if signature['action'] == 'sign': + if is_encrypted_key(signature['params']['key']): + signature['params']['password'] = click.prompt( + f"Enter password for {signature['params']['key']}: ", + hide_input=True + ) + elif signature['action'] == 'timestamp': + if known_tsa := signature.pop('known_tsa', None): + signature['params'] = KNOWN_TSAS[known_tsa] + kwargs['signatures'] = signatures run_parallel( run_pipeline, - get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists), + get_tasks(**kwargs), workers, - log_level=log_level ) if __name__ == "__main__": diff --git a/scripts/helpers/bag.py b/scripts/helpers/bag.py new file mode 100644 index 0000000..08168f3 --- /dev/null +++ b/scripts/helpers/bag.py @@ -0,0 +1,156 @@ +import os +import json +import zipfile +import struct +import boto3 +import logging +from pathlib import Path +from datetime import datetime +import tempfile +import shutil +from nabit.lib.archive import package + +logger = logging.getLogger(__name__) + +# File extensions that are 
already compressed or wouldn't benefit from additional compression +UNCOMPRESSED_EXTENSIONS = { + # Already compressed archives + 'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz', + # Compressed images + 'jpg', 'jpeg', 'png', 'gif', 'webp', + # Compressed video/audio + 'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a', + # Other compressed/binary formats + 'pdf', 'docx', 'xlsx', 'pptx', +} + +def zip_archive(bag_dir, archive_path): + """Zip up a nabit archive and create metadata.""" + # Create zip archive + with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf: + for file_path in bag_dir.rglob('*'): + if file_path.is_file(): + arc_path = file_path.relative_to(bag_dir) + compression = (zipfile.ZIP_STORED + if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS + else zipfile.ZIP_DEFLATED) + zf.write(file_path, arc_path, compress_type=compression) + + # Create metadata file + zip_info = [] + with zipfile.ZipFile(archive_path, 'r') as zf: + for info in zf.filelist: + header_offset = info.header_offset + + # Read header to calculate data offset + zf.fp.seek(header_offset) + header = zf.fp.read(zipfile.sizeFileHeader) + fheader = struct.unpack(zipfile.structFileHeader, header) + fname_length = fheader[zipfile._FH_FILENAME_LENGTH] + extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH] + data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length + + zip_info.append({ + 'filename': info.filename, + 'file_size': info.file_size, + 'compress_size': info.compress_size, + 'compress_type': info.compress_type, + 'header_offset': header_offset, + 'data_offset': data_offset, + }) + + # Read the bag-info.txt and signed-metadata.json + bag_info = (bag_dir / 'bag-info.txt').read_text() + signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text()) + + return { + 'bag_info': bag_info, + 'signed_metadata': signed_metadata, + 'zip_entries': zip_info + } + +def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args): + """Upload archive and metadata to S3.""" + s3 = boto3.Session(**session_args).client('s3') + bucket_name, s3_key_prefix = s3_path.split('/', 1) + + # Upload zip file + s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path))) + s3.upload_file(str(collection_path), bucket_name, s3_collection_key) + logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}") + + # Upload metadata file + s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path))) + s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key) + logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}") + +def cleanup_files(collection_path, no_delete=False, s3_path=None): + """Clean up local files after upload if needed.""" + if not no_delete and s3_path: + logger.info("- Deleting local zip archive...") + if os.path.exists(collection_path): + os.remove(collection_path) + if collection_path.parent.exists() and not os.listdir(collection_path.parent): + os.rmdir(collection_path.parent) + +def fetch_and_upload( + output_path, + collection_path, + metadata_path, + create_archive_callback, + signatures=None, + session_args=None, + s3_path=None, + no_delete=False, + ): + """ + Common pipeline for creating and processing archives. 
+ + Args: + output_path: Base output directory + collection_path: Path where the final zip will be stored + metadata_path: Path where the metadata will be stored + create_archive_callback: Function that will create the archive + signatures: Signature configuration for nabit + session_args: AWS session arguments + s3_path: S3 path for uploads + no_delete: Whether to preserve local files + """ + with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir: + logger.info("- Creating archive...") + # set up paths + temp_dir = Path(temp_dir) + bag_dir = temp_dir / 'bag' + archive_path = temp_dir / 'archive.zip' + source_files_dir = temp_dir / 'source_files' + source_files_dir.mkdir(parents=True, exist_ok=True) + + # Call the callback to create the archive + package_kwargs = create_archive_callback(source_files_dir) + + # create bag + package( + output_path=bag_dir, + collect_errors='ignore', + signatures=signatures, + **package_kwargs, + ) + + logger.info("- Zipping archive...") + # zip up data and create metadata + output_metadata = zip_archive(bag_dir, archive_path) + + logger.info("- Moving files to final location...") + # Move files to final location + collection_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(archive_path), collection_path) + with open(metadata_path, 'w') as f: + json.dump(output_metadata, f) + f.write('\n') + + if s3_path: + logger.info("Uploading to S3...") + upload_archive(output_path, collection_path, metadata_path, s3_path, session_args) + + cleanup_files(collection_path, no_delete, s3_path) \ No newline at end of file diff --git a/scripts/helpers/onepassword.py b/scripts/helpers/onepassword.py new file mode 100644 index 0000000..7af3843 --- /dev/null +++ b/scripts/helpers/onepassword.py @@ -0,0 +1,85 @@ +from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item +from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient +from .misc import load_config +import logging +import asyncio + +logger = logging.getLogger(__name__) + +async def get_client(): + op_config = load_config().get("1password", {}) + if not op_config.get('token'): + raise Exception("1Password token not found in config") + + return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0") + +def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS): + return asyncio.run(save_item_async(vault_name, title, fields, notes, category)) + +async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS): + client = await get_client() + vault_id = None + vaults = await client.vaults.list_all() + async for vault in vaults: + if vault.title == vault_name: + vault_id = vault.id + break + else: + raise Exception(f"Vault {vault} not found") + + field_objs = [] + sections = [] + for field in fields: + if field.get('concealed', False): + field['field_type'] = ItemFieldType.CONCEALED + else: + field['field_type'] = ItemFieldType.TEXT + field['id'] = field['title'].lower().replace(' ', '_') + field_objs.append(ItemField(**field)) + if section_id := field.get('section_id'): + if section_id not in sections: + sections.append(section_id) + sections = [ItemSection(id=section, title=section) for section in sections] + + # Create item parameters with sections + create_params = ItemCreateParams( + title=title, + category=category, + vault_id=vault_id, 
+ fields=field_objs, + sections=sections, + ) + + if notes: + create_params.notes = notes + + item = await client.items.create(create_params) + logger.info(f"Stored credentials in 1Password vault '{vault}' with title '{title}'") + return item + +def share_item( + item: Item, + recipients: list[str] | None = None, + expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS, + one_time_only: bool = False + ): + return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only)) + +async def share_item_async( + item: Item, + recipients: list[str] | None, + expire_after: ItemShareDuration | None, + one_time_only: bool, + ): + client = await get_client() + policy = await client.items.shares.get_account_policy(item.vault_id, item.id) + valid_recipients = await client.items.shares.validate_recipients(policy, recipients) + share_params = ItemShareParams( + recipients=valid_recipients, + expire_after=expire_after, + one_time_only=one_time_only + ) + share_link = await client.items.shares.create(item, policy, share_params) + logger.info(f"Created share link for '{item.title}'") + return share_link + diff --git a/scripts/helpers/parallel.py b/scripts/helpers/parallel.py index b875da0..4a7d7ac 100644 --- a/scripts/helpers/parallel.py +++ b/scripts/helpers/parallel.py @@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True): raise e -def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None): +def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None): workers = workers or os.cpu_count() or 4 - # Configure logging based on whether we're running in parallel or not - if log_level is None: - log_level = 'INFO' if workers == 1 else 'WARNING' - logging.basicConfig( - level=log_level, - format='[%(process)d] %(message)s' - ) - logger.debug(f"Starting processing with {workers} workers") if workers > 1: diff --git a/uv.lock b/uv.lock index dc1ccba..515eba5 100644 --- a/uv.lock +++ b/uv.lock @@ -169,6 +169,7 @@ dependencies = [ { name = "httpx" }, { name = "jsondiff" }, { name = "nabit" }, + { name = "onepassword-sdk" }, { name = "orjson" }, { name = "peewee" }, { name = "publicsuffixlist" }, @@ -192,6 +193,7 @@ requires-dist = [ { name = "httpx", specifier = ">=0.27.2" }, { name = "jsondiff", specifier = ">=2.2.1" }, { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" }, + { name = "onepassword-sdk", specifier = ">=0.1.7" }, { name = "orjson", specifier = ">=3.10.15" }, { name = "peewee", specifier = ">=3.17.8" }, { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" }, @@ -466,6 +468,27 @@ dependencies = [ { name = "warcio" }, ] +[[package]] +name = "onepassword-sdk" +version = "0.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 }, + { url = 
"https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 }, + { url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 }, + { url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 }, + { url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 }, + { url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 }, + { url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 }, + { url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 }, + { url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 }, + { url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 }, +] + [[package]] name = "orderly-set" version = "5.3.0"