commit 404c3627f794b10bd8dfc156830ab1459799ced2
Author: Jack Cushman
Date:   Wed Feb 5 10:21:50 2025 -0500

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..90613f1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+
+/data/*
+!/data/README.md
+*.pyc
+__pycache__
+.DS_Store
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9774df6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,50 @@
+This repository collects scripts to support the Library Innovation Lab's
+[public data preservation project](https://lil.law.harvard.edu/blog/2025/01/30/preserving-public-u-s-federal-data/).
+
+These scripts are used as part of internal pipelines, so they may not be usable
+by others as-is, but they are available as a reference for how our data is
+created and as a starting point for remixing. We also welcome contributions if
+they fit our internal pipelines and goals.
+
+## Scripts
+
+Scripts are organized into subfolders for general categories of tasks:
+
+### collection
+
+Scripts for working with a "collection," meaning a set of files stored on
+cloud storage that were all gathered with a similar collection strategy.
+This folder is for scripts that apply to multiple collections rather than
+a single collection.
+
+* sync.py: upload static files from collections/ to each collection's cloud storage.
+* render.py: generate static indexes of files in a collection.
+* verify_upload.py: fetch and verify integrity of a BagIt archive in a collection.
+* cloudflare_tools.py: manage Cloudflare R2 buckets.
+* s3_tools.py: manage S3 buckets.
+
+### helpers
+
+Utility libraries used by other scripts.
+
+* parallel.py: run tasks in parallel.
+* config.py: load configuration from the user's config directory.
+
+### data_gov
+
+Scripts for working with the [data.gov](https://data.gov) collection.
+
+* fetch_jsonl.py: fetch the full data.gov API to a JSONL file.
+* fetch_index.py: fetch the full API and store updates in a sqlite database.
+* models.py: database models for the sqlite database.
+* fetch_data.py: use the sqlite database to fetch any datasets that require updating,
+  package them with nabit, and upload them to cloud storage.
+* diff/: scripts for identifying changes in past data created by fetch_jsonl.py or fetch_index.py (WIP).
+
+### github
+
+Scripts for working with the [github](https://github.com) collection.
+
+* download_git.py: use [gitspoke](https://github.com/harvard-lil/gitspoke) to download all repositories listed in a CSV.
+
diff --git a/collections/README.md b/collections/README.md
new file mode 100644
index 0000000..51a723b
--- /dev/null
+++ b/collections/README.md
@@ -0,0 +1 @@
+Static files to be added to individual collections.
\ No newline at end of file
diff --git a/collections/collections.json b/collections/collections.json
new file mode 100644
index 0000000..02f2236
--- /dev/null
+++ b/collections/collections.json
@@ -0,0 +1,7 @@
+[
+  {
+    "directory": "data_gov",
+    "aws_profile": "sc",
+    "s3_path": "us-west-2.opendata.source.coop/harvard-lil/gov-data"
+  }
+]
\ No newline at end of file
diff --git a/collections/data_gov/README.md b/collections/data_gov/README.md
new file mode 100644
index 0000000..bc51fe3
--- /dev/null
+++ b/collections/data_gov/README.md
@@ -0,0 +1,119 @@
+Harvard Law School Library Innovation Lab logo
+
+This is a regularly updated mirror of all data files linked from [data.gov](https://data.gov).
+
+The repository is maintained by the Harvard Law School Library Innovation Lab as part
+of our [project to preserve U.S. federal public data](https://lil.law.harvard.edu/blog/2025/01/30/preserving-public-u-s-federal-data/).
+
+Collection Format
+-----------------
+
+Each dataset on data.gov has a unique slug known as its `name`. We store each dataset
+in this repository as:
+
+```
+collections/data_gov/<name>/<version>.zip
+```
+
+We also store a metadata file for each dataset in the `metadata` directory:
+
+```
+metadata/data_gov/<name>/<version>.json
+```
+
+`<version>` is a `v` followed by the number of times we have downloaded the dataset
+(v1, v2, etc.).
+
+For example, the data.gov dataset [https://catalog.data.gov/dataset/fruit-and-vegetable-prices](https://catalog.data.gov/dataset/fruit-and-vegetable-prices)
+is stored in this repository as:
+
+* [collections/data_gov/fruit-and-vegetable-prices/v1.zip](https://source.coop/harvard-lil/gov-data/collections/data_gov/fruit-and-vegetable-prices)
+* [metadata/data_gov/fruit-and-vegetable-prices/v1.json](https://source.coop/harvard-lil/gov-data/metadata/data_gov/fruit-and-vegetable-prices)
+
+
+Dataset Format
+--------------
+
+Each dataset zip file is a BagIt package created by our [bag-nabit](https://github.com/harvard-lil/bag-nabit) tool.
+
+[BagIt](https://en.wikipedia.org/wiki/BagIt) is a simple file format, established by the
+Library of Congress, consisting of a folder of metadata and text files. Our BagIt
+files follow this directory structure:
+
+* `data/`
+  * `files/`:
+    * `...`: these are the actual files you likely want to use as a researcher,
+      downloaded from the data.gov listing.
+  * `headers.warc`: request and response headers from HTTP fetches for files in `files/`
+  * `signed-metadata.json`: metadata including data.gov's API description of the dataset
+
+The bags also contain these files, which are useful for authenticating the
+provenance of the data:
+
+* `bagit.txt`: standard BagIt file
+* `bag-info.txt`: standard BagIt file
+* `manifest-sha256.txt`: standard BagIt file
+* `tagmanifest-sha256.txt`: standard BagIt file
+* `signatures/`: directory of signature files
+
+Metadata File Format
+--------------------
+
+Each metadata JSON file contains three main sections:
+
+1. `bag_info`: Contains the BagIt metadata including:
+   - Bag-Software-Agent: The version of nabit used to create the archive
+   - Bagging-Date: When the archive was created
+
+2. `signed_metadata`: Contains detailed information about the dataset including:
+   - `id`: A UUID for this specific archive
+   - `url`: The data.gov URL for the dataset
+   - `description`: A brief description including the dataset title and creating organization
+   - `data_gov_metadata`: The complete metadata from data.gov's API, including:
+     - Dataset details (title, description, etc.)
+     - Organization information
+     - Resource listings
+     - Tags and other metadata
+   - `collection_tasks`: Records of the HTTP requests made to collect the dataset
+
+3. `zip_entries`: Listing of each entry in the collection zip file, which can be used to fetch
+   individual files from the zip file via range request without downloading the entire archive
+   (see the sketch below).
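+
+For example, here is a rough Python sketch of fetching a single file out of a dataset
+zip via a range request, using the offsets recorded in `zip_entries`. It uses the
+`requests` library and assumes the hosting service honors HTTP `Range` headers:
+
+```
+import zipfile
+import zlib
+
+import requests
+
+base = "https://source.coop/harvard-lil/gov-data"
+name = "fruit-and-vegetable-prices"
+
+metadata = requests.get(f"{base}/metadata/data_gov/{name}/v1.json").json()
+
+# Pick the first payload file stored under data/files/ in the bag.
+entry = next(e for e in metadata["zip_entries"] if e["filename"].startswith("data/files/"))
+
+# Request only the compressed bytes for this entry.
+start = entry["data_offset"]
+end = start + entry["compress_size"] - 1
+compressed = requests.get(
+    f"{base}/collections/data_gov/{name}/v1.zip",
+    headers={"Range": f"bytes={start}-{end}"},
+).content
+
+if entry["compress_type"] == zipfile.ZIP_STORED:
+    data = compressed
+else:
+    # Deflated zip entries are raw DEFLATE streams (no zlib header).
+    data = zlib.decompressobj(-15).decompress(compressed)
+
+print(entry["filename"], len(data), "bytes")
+```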
+
+Rollup files
+------------
+
+There are several rollup files at the top level to help with finding datasets
+of interest:
+
+* `metadata.jsonl.zip`: zipped JSON lines file of all files contained in metadata/
+* `file_listing.jsonl.zip`: zipped JSON lines file showing the s3 listing of all files in the repository
+* `collections.html`: human-readable HTML file showing the title and link to each dataset (warning, very large file that may not load in some browsers)
+
+Downloading data
+----------------
+
+To download an individual dataset by name you can construct its URL, such as:
+
+```
+https://source.coop/harvard-lil/gov-data/collections/data_gov/fruit-and-vegetable-prices/v1.zip
+https://source.coop/harvard-lil/gov-data/metadata/data_gov/fruit-and-vegetable-prices/v1.json
+```
+
+To download large numbers of files, we recommend the `aws` or `rclone` command line tools:
+
+```
+aws s3 cp s3://us-west-2.opendata.source.coop/harvard-lil/gov-data/collections/data_gov/<name>/v1.zip . --no-sign-request
+```
+
+Data Limitations
+----------------
+
+data.gov includes multiple kinds of datasets, including some that link to actual data
+files, such as CSV files, and some that link to HTML landing pages. Our process
+runs a "shallow crawl" that collects only the directly linked files. Datasets
+that link only to a landing page will need to be collected separately.
+
+Source code
+-----------
+
+The source code used to generate this and other repositories is available at [https://github.com/harvard-lil/data-mirror](https://github.com/harvard-lil/data-mirror).
+We welcome conversation and collaboration in the issue tracker for that project.
\ No newline at end of file
diff --git a/collections/data_gov/docs/LIL_HLSL_logos.png b/collections/data_gov/docs/LIL_HLSL_logos.png
new file mode 100644
index 0000000..8966ae7
Binary files /dev/null and b/collections/data_gov/docs/LIL_HLSL_logos.png differ
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..d79a8ef
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,2 @@
+Directory for local data files. Files in this directory are not tracked by git,
+but may be used by scripts.
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d2e4609
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,35 @@
+[project]
+name = "data-vault"
+version = "0.1.0"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "httpx>=0.27.2",
+    "tqdm>=4.67.0",
+    "pyarrow>=18.0.0",
+    "aiosqlite>=0.20.0",
+    "publicsuffixlist>=1.0.2.20241121",
+    "bagit>=1.8.1",
+    "boto3>=1.35.80",
+    "jsondiff>=2.2.1",
+    "peewee>=3.17.8",
+    "nabit",
+    "gitspoke",
+    "cloudflare>=4.0.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.uv]
+dev-dependencies = [
+    "memray>=1.15.0",
+]
+
+[tool.uv.sources]
+nabit = { git = "https://github.com/harvard-lil/bag-nabit" }
+gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }
+
+[tool.hatch.build.targets.wheel]
+packages = ["scripts"]
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..4251662
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1,2 @@
+def hello() -> str:
+    return "Hello from data-mirror!"
diff --git a/scripts/collection/__init__.py b/scripts/collection/__init__.py
new file mode 100644
index 0000000..4251662
--- /dev/null
+++ b/scripts/collection/__init__.py
@@ -0,0 +1,2 @@
+def hello() -> str:
+    return "Hello from data-mirror!"
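Note: `pyproject.toml` above packages the `scripts` directory and declares its dependencies
(including the git-sourced `nabit` and `gitspoke`) with uv-specific tables, and each script
exposes a Click CLI. A plausible way to invoke the scripts from a checkout, sketched here as
an assumption rather than something this commit documents, is:

```
uv sync
uv run python -m scripts.data_gov.fetch_index fetch data/data.db
uv run python -m scripts.collection.render data/data.db data/processed/web
```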
diff --git a/scripts/collection/cloudflare_tools.py b/scripts/collection/cloudflare_tools.py new file mode 100644 index 0000000..77bb40d --- /dev/null +++ b/scripts/collection/cloudflare_tools.py @@ -0,0 +1,100 @@ +import logging +from pathlib import Path +import click +from cloudflare import Cloudflare +import os +from scripts.helpers.config import load_config + +logger = logging.getLogger(__name__) + +def generate_temp_key(account_id: str, bucket: str, parent_access_key_id: str, token: str, + permission: str = "object-read-write", ttl_seconds: int = 3600, + prefixes: list[str] | None = None, objects: list[str] | None = None): + """Generate a temporary R2 access key using the Cloudflare API. + + Args: + account_id: Cloudflare account ID + bucket: R2 bucket name + parent_access_key_id: Parent access key ID + token: Cloudflare API token + permission: Permission level ('object-read-write' or 'object-read') + ttl_seconds: Time-to-live in seconds + prefixes: Optional list of key prefixes to restrict access to + objects: Optional list of specific object keys to restrict access to + """ + params = { + "account_id": account_id, + "bucket": bucket, + "parent_access_key_id": parent_access_key_id, + "permission": permission, + "ttl_seconds": ttl_seconds, + } + + if prefixes: + params["prefixes"] = prefixes + if objects: + params["objects"] = objects + + return Cloudflare(api_token=token).r2.temporary_credentials.create(**params) + +@click.group() +def cli(): + """Cloudflare R2 utility commands.""" + pass + +@cli.command() +@click.option('--bucket', '-b', type=str, required=True, + help='R2 bucket name.') +@click.option('--permission', '-p', type=click.Choice(['object-read-write', 'object-read']), + default='object-read-write', + help='Permission level for the temporary key.') +@click.option('--ttl', '-t', type=int, default=1, + help='Time-to-live in hours for the temporary key.') +@click.option('--prefixes', '-x', multiple=True, + help='Key prefixes to restrict access to. Can be specified multiple times.') +@click.option('--objects', '-o', multiple=True, + help='Specific object keys to restrict access to. 
Can be specified multiple times.') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', + help='Logging level.') +def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...], + objects: tuple[str, ...], log_level: str): + """Generate temporary Cloudflare R2 access credentials.""" + # Setup logging + logging.basicConfig(level=log_level) + + # Load config + config = load_config().get("temp_tokens", {}) + + if not config or any(key not in config for key in ['parent_access_key_id', 'account_id', 'token']): + raise click.ClickException("Config file must have 'temp_tokens' dict with 'parent_access_key_id', 'account_id', and 'token' keys") + + # Generate temporary key + temp_cred = generate_temp_key( + account_id=config['account_id'], + bucket=bucket, + parent_access_key_id=config['parent_access_key_id'], + token=config['token'], + permission=permission, + ttl_seconds=ttl * 3600, + prefixes=list(prefixes) if prefixes else None, + objects=list(objects) if objects else None + ) + + # Output AWS config format + click.echo("\n# Add this to ~/.aws/config:") + click.echo("[profile r2-temp]") + click.echo(f"aws_access_key_id = {temp_cred.access_key_id}") + click.echo(f"aws_secret_access_key = {temp_cred.secret_access_key}") + click.echo(f"aws_session_token = {temp_cred.session_token}") + click.echo("region = auto") + click.echo(f"endpoint_url = https://{config['account_id']}.r2.cloudflarestorage.com") + + # Output sample command using first prefix if available + click.echo("\n# Sample upload command:") + sample_path = objects[0] if objects else f"{prefixes[0].strip('/')}/" if prefixes else "" + click.echo(f"aws s3 cp local-file.txt s3://{bucket}/{sample_path} --profile r2-temp") + +if __name__ == "__main__": + cli() diff --git a/scripts/collection/render.py b/scripts/collection/render.py new file mode 100644 index 0000000..c06767b --- /dev/null +++ b/scripts/collection/render.py @@ -0,0 +1,109 @@ +import click +from pathlib import Path +from scripts.data_gov.models import db, Dataset +import logging +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +# Header template with styles +HEADER_TEMPLATE = ''' + + + Data.gov Dataset Mirror + + + +

+        <h1>Data.gov Dataset Mirror</h1>
+'''
+
+TABLE_START = '''
+    <table>
+        <thead>
+            <tr>
+                <th>Name</th>
+                <th>Organization</th>
+                <th>Description</th>
+            </tr>
+        </thead>
+        <tbody>
+'''
+
+ROW_TEMPLATE = '''
+        <tr>
+            <td>{name}</td>
+            <td>{org}</td>
+            <td>{title}</td>
+        </tr>
+'''
+
+TABLE_END = '''
+        </tbody>
+    </table>
+ + +''' + +def render_html(datasets_query, output_path: Path) -> None: + """Render the datasets to an HTML file, streaming content.""" + with open(output_path / 'index.html', 'w', encoding='utf-8') as f: + # Write header + f.write(HEADER_TEMPLATE) + + # Write table start + f.write(TABLE_START) + + # Stream each dataset row + rows = [] + for dataset in tqdm(datasets_query.iterator(), desc="Rendering datasets"): + org_title = dataset.organization.get('title') if dataset.organization else 'N/A' + row = ROW_TEMPLATE.format( + name=dataset.name or '', + org=org_title, + title=dataset.title, + ) + rows.append(row) + if len(rows) >= 1000: + f.write('\n'.join(rows)) + rows = [] + + if rows: + f.write('\n'.join(rows)) + + # Write table end + f.write(TABLE_END) + +@click.command() +@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db') +@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', + help='Logging level.') +@click.option('--limit', '-n', type=int, default=None, + help='Maximum number of rows to display. Default: all rows.') +def main(db_path: Path, output_path: Path, log_level: str, limit: int | None): + """Render the Dataset table to an HTML file.""" + logging.basicConfig( + level=getattr(logging, log_level), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + logger.info(f"Connecting to database at {db_path}") + db.init(db_path) + db.connect() + + try: + logger.info("Starting HTML generation...") + datasets_query = Dataset.select().order_by(Dataset.id) + if limit: + datasets_query = datasets_query.limit(limit) + logger.info(f"Limited to {limit} rows") + + logger.info(f"Rendering HTML to {output_path}") + render_html(datasets_query, output_path) + logger.info("Done!") + + finally: + db.close() + +if __name__ == "__main__": + main() diff --git a/scripts/collection/s3_tools.py b/scripts/collection/s3_tools.py new file mode 100644 index 0000000..625485a --- /dev/null +++ b/scripts/collection/s3_tools.py @@ -0,0 +1,118 @@ +import boto3 +import click +from tqdm import tqdm +import logging +from itertools import islice + +logger = logging.getLogger(__name__) + +def get_delete_markers(s3_client, bucket: str, prefix: str): + """Get all delete markers for objects with the given prefix.""" + paginator = s3_client.get_paginator('list_object_versions') + for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"): + if 'DeleteMarkers' in page: + yield [ + { + 'Key': marker['Key'], + 'VersionId': marker['VersionId'] + } + for marker in page['DeleteMarkers'] + if marker['IsLatest'] + ] + +def remove_delete_markers(s3_client, bucket: str, prefix: str, dry_run: bool = False): + """Remove all delete markers for objects with the given prefix.""" + for marker_batch in get_delete_markers(s3_client, bucket, prefix): + response = s3_client.delete_objects( + Bucket=bucket, + Delete={ + 'Objects': marker_batch, + 'Quiet': True + } + ) + + # Log any errors + if 'Errors' in response: + for error in response['Errors']: + logger.error(f"Failed to remove marker for {error['Key']}: {error['Message']}") + +def get_empty_files(s3_client, bucket: str, prefix: str): + """Get all objects with size zero under the given prefix.""" + paginator = s3_client.get_paginator('list_objects_v2') + for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"): + if 'Contents' in page: + yield [ + 
{'Key': obj['Key']} + for obj in page['Contents'] + if obj['Size'] == 0 + ] + +def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = False): + """Delete all zero-size objects under the given prefix.""" + pbar = tqdm(desc="deleted") + for empty_batch in get_empty_files(s3_client, bucket, prefix): + if not empty_batch: + continue + + if dry_run: + for obj in empty_batch: + logger.info(f"Would delete empty file: {obj['Key']}") + continue + + pbar.update(len(empty_batch)) + + response = s3_client.delete_objects( + Bucket=bucket, + Delete={ + 'Objects': empty_batch, + 'Quiet': True + } + ) + + # Log any errors + if 'Errors' in response: + for error in response['Errors']: + logger.error(f"Failed to delete {error['Key']}: {error['Message']}") + + pbar.close() + +@click.group() +def cli(): + """S3 object management commands.""" + pass + +@cli.command() +@click.argument('s3_path') +@click.option('--profile', help='AWS profile name', default='sc-direct') +@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it') +@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', help='Set logging level') +def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'): + """Remove delete markers from versioned S3 objects, effectively undeleting them.""" + logging.basicConfig(level=log_level) + bucket, prefix = s3_path.split('/', 1) + + session = boto3.Session(profile_name=profile) + s3_client = session.client('s3') + + remove_delete_markers(s3_client, bucket, prefix, dry_run) + +@cli.command() +@click.argument('s3_path') +@click.option('--profile', help='AWS profile name', default='sc-direct') +@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it') +@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', help='Set logging level') +def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'): + """Delete all zero-size objects under the given prefix.""" + logging.basicConfig(level=log_level) + bucket, prefix = s3_path.split('/', 1) + + session = boto3.Session(profile_name=profile) + s3_client = session.client('s3') + + delete_empty_files(s3_client, bucket, prefix, dry_run) + +if __name__ == '__main__': + cli() + diff --git a/scripts/collection/sync.py b/scripts/collection/sync.py new file mode 100644 index 0000000..69eba48 --- /dev/null +++ b/scripts/collection/sync.py @@ -0,0 +1,31 @@ +import boto3 +import click +import json +from pathlib import Path +import logging + +logger = logging.getLogger(__name__) + +@click.command() +@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path), + default='collections/collections.json', + help='Path to collections configuration file.') +def main(collections_file: Path): + # Load collections config + collections = json.loads(collections_file.read_text()) + collections_dir = collections_file.parent + + for collection in collections: + s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3') + collection_path = collections_dir / collection['directory'] + bucket_name, s3_prefix = collection['s3_path'].split('/', 1) + + for file_path in collection_path.rglob('*'): + if file_path.is_file(): + relative_path = file_path.relative_to(collection_path) + s3_key = f"{s3_prefix}/{relative_path}" + print(f"Uploading {file_path} to 
s3://{bucket_name}/{s3_key}") + s3.upload_file(str(file_path), bucket_name, s3_key) + +if __name__ == '__main__': + main() diff --git a/scripts/collection/verify_upload.py b/scripts/collection/verify_upload.py new file mode 100644 index 0000000..1d4ff2f --- /dev/null +++ b/scripts/collection/verify_upload.py @@ -0,0 +1,91 @@ +from pathlib import Path +import json +import zipfile +import tempfile +import requests +import click +import logging +from nabit.bin.utils import cli_validate +logger = logging.getLogger(__name__) + +def download_file(url: str, target_path: Path): + """Download a file from URL to target path""" + response = requests.get(url, stream=True) + response.raise_for_status() + with target_path.open('wb') as f: + for chunk in response.iter_content(chunk_size=2**20): + f.write(chunk) + +def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None): + """ + Verify a dataset by downloading and checking its JSON metadata and ZIP contents. + If output_dir is provided, write the uncompressed contents there. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Download files + logger.info(f"Downloading metadata from {json_url}...") + json_path = tmpdir / "metadata.json" + download_file(json_url, json_path) + + logger.info(f"Downloading archive from {zip_url}...") + zip_path = tmpdir / "data.zip" + download_file(zip_url, zip_path) + + # Load metadata + metadata = json.loads(json_path.read_text()) + + # Create output directory + if not output_dir: + output_dir = tmpdir / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + # Verify file contents + logger.info("Verifying file contents...") + with zip_path.open('rb') as f: + for entry in metadata['zip_entries']: + logger.info(f"Checking {entry['filename']}...") + f.seek(entry['data_offset']) + zip_data = f.read(entry['compress_size']) + + if entry['compress_type'] == zipfile.ZIP_STORED: + uncompressed = zip_data + else: + decompressor = zipfile._get_decompressor(entry['compress_type']) + uncompressed = decompressor.decompress(zip_data) + + # write the file + output_file = output_dir / entry['filename'] + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_bytes(uncompressed) + + logger.info("All files extracted successfully") + + # verify dataset with nabit + cli_validate(output_dir) + + # Return metadata for potential further use + return metadata + +@click.command() +@click.argument('json_url', type=str) +@click.argument('zip_url', type=str) +@click.option('--output', '-o', type=click.Path(path_type=Path), + help='Directory to write uncompressed files') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', + help='Logging level.') +def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'): + """Verify dataset from JSON and ZIP URLs""" + # Set up logging + logging.basicConfig( + level=getattr(logging, log_level), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + verify_dataset(json_url, zip_url, output) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/data_gov/diff/diff.py b/scripts/data_gov/diff/diff.py new file mode 100644 index 0000000..887beea --- /dev/null +++ b/scripts/data_gov/diff/diff.py @@ -0,0 +1,127 @@ +import json +import click +from pathlib import Path +from typing import Dict, List, Set, Tuple +import logging +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def 
load_jsonl_data(jsonl_path: Path, keep_fields=None, compare_by: str = 'id') -> Dict[str, dict]: + """ + Load data from JSONL file into a dictionary keyed by id. + Only includes fields that match the CSV format. + + Args: + jsonl_path: Path to the JSONL file + + Returns: + Dictionary mapping id to filtered record data + """ + # Fields to keep from JSONL records + + data = {} + with open(jsonl_path, 'r', encoding='utf-8') as f: + for line in tqdm(f, desc="Loading JSONL"): + if line.strip(): # Skip empty lines + record = json.loads(line) + if keep_fields: + record = {k: v for k, v in record.items() if k in keep_fields} + data[record[compare_by]] = record + + return data + +def find_differences(csv_data: Dict[str, dict], + jsonl_data: Dict[str, dict]) -> Tuple[Set[str], Set[str], Set[str]]: + """ + Find records that differ between CSV and JSONL data. + + Args: + csv_data: Dictionary of CSV records keyed by id + jsonl_data: Dictionary of JSONL records keyed by id + + Returns: + Tuple of (csv_only_ids, jsonl_only_ids, different_ids) + """ + csv_ids = set(csv_data.keys()) + jsonl_ids = set(jsonl_data.keys()) + + # Find records only in CSV + csv_only = csv_ids - jsonl_ids + + # Find records only in JSONL + jsonl_only = jsonl_ids - csv_ids + + return csv_only, jsonl_only + +@click.command() +@click.argument('old_path', type=click.Path(exists=True, path_type=Path)) +@click.argument('new_path', type=click.Path(exists=True, path_type=Path)) +@click.option('--compare-by', '-c', + default='id', + help='Field to compare by.') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', + help='Logging level.') +def main(old_path: Path, new_path: Path, compare_by: str, log_level: str): + """Compare records between CSV and JSONL files.""" + logging.basicConfig( + level=getattr(logging, log_level), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + old_data = load_jsonl_data(old_path, compare_by=compare_by) + new_data = load_jsonl_data(new_path, compare_by=compare_by) + + # Find differences + old_only, new_only = find_differences(old_data, new_data) + + old_only_path = old_path.with_suffix(f'.only_{compare_by}.jsonl') + new_only_path = new_path.with_suffix(f'.only_{compare_by}.jsonl') + + logger.info(f"Writing {len(old_only)} records to {old_only_path}") + with open(old_only_path, 'w', encoding='utf-8') as f: + for id in old_only: + f.write(json.dumps(old_data[id]) + '\n') + + logger.info(f"Writing {len(new_only)} records to {new_only_path}") + with open(new_only_path, 'w', encoding='utf-8') as f: + for id in new_only: + f.write(json.dumps(new_data[id]) + '\n') + +if __name__ == '__main__': + main() + + + +# import sqlite3 +# import json + +# # Connect to the database +# conn = sqlite3.connect('data/data.db') +# conn.row_factory = sqlite3.Row # This allows us to access columns by name + +# # Open the output file +# with open('data/data_db_dump_20250130.jsonl', 'w') as f: +# # Execute the query and fetch rows in chunks +# cursor = conn.execute(''' +# SELECT * +# FROM dataset +# ''') + +# written = 0 +# while True: +# rows = cursor.fetchmany(1000) # Fetch 1000 rows at a time +# if not rows: +# break +# written += len(rows) +# # Write each row as a JSON line +# for row in rows: +# # Convert row to dict and write to file +# json_line = json.dumps(dict(row)) +# f.write(json_line + '\n') +# print(f"Wrote {written} rows") + +# conn.close() \ No newline at end of file diff --git a/scripts/data_gov/diff/diff_analyze.py 
b/scripts/data_gov/diff/diff_analyze.py new file mode 100644 index 0000000..0c70d75 --- /dev/null +++ b/scripts/data_gov/diff/diff_analyze.py @@ -0,0 +1,38 @@ +import json +from collections import Counter, defaultdict +from pathlib import Path + + +# Read the JSONL file and count crawler_identified_date values +downloaded_counts = Counter() +identified_counts = Counter() +titles_by_org = defaultdict(list) +with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f: + for line in f: + data = json.loads(line) + org = json.loads(data.get('organization', '{}')) + identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1 + titles_by_org[org['title']].append(data["title"]) + +# Print the counts sorted by date +for date, count in sorted(identified_counts.items()): + print(f"{date}: {count}") + +# sort each list of titles by org +for org, titles in titles_by_org.items(): + titles_by_org[org].sort() +Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2)) + + +# print urls +for path in Path('data/').glob('glass*'): + print(path) + with open(path, 'r') as f: + for line in f: + data = json.loads(line) + print("* " + data['name']) + resources = data.get('resources', []) + if type(resources) == str: + resources = json.loads(resources) + for resource in resources: + print(' * ' + resource['url']) diff --git a/scripts/data_gov/fetch_data.py b/scripts/data_gov/fetch_data.py new file mode 100644 index 0000000..8bd1eda --- /dev/null +++ b/scripts/data_gov/fetch_data.py @@ -0,0 +1,318 @@ +from nabit.lib.archive import package +from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key +from nabit.lib.backends.url import UrlCollectionTask +from pathlib import Path +import json +import uuid +import tempfile +import click +import os +from urllib.parse import urlparse +import re +from scripts.helpers.parallel import run_parallel +import zipfile +import struct +import boto3 +import logging +from scripts.data_gov.models import db, Dataset +from playhouse.shortcuts import model_to_dict +from tqdm import tqdm +from datetime import datetime + +logger = logging.getLogger(__name__) + +## download data.gov datasets, create nabit archives, and upload to S3 + +# File extensions that are already compressed or wouldn't benefit from additional compression +UNCOMPRESSED_EXTENSIONS = { + # Already compressed archives + 'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz', + # Compressed images + 'jpg', 'jpeg', 'png', 'gif', 'webp', + # Compressed video/audio + 'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a', + # Other compressed/binary formats + 'pdf', 'docx', 'xlsx', 'pptx', +} + +stats_counter = {} + +def is_valid_url(url): + parsed = urlparse(url) + return parsed.scheme in ['http', 'https'] and re.search(r'[^\.]\.[^\.]', parsed.netloc) + +def extract_urls(data, urls = None): + urls = set() if urls is None else urls + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, str): + if is_valid_url(value): + urls.add(value) + elif isinstance(value, (dict, list)): + extract_urls(value, urls) + elif isinstance(data, list): + for item in data: + extract_urls(item, urls) + return urls + +def create_archive(bag_dir, dataset: Dataset, signatures): + data_dict = model_to_dict(dataset) + for key, value in data_dict.items(): + if isinstance(value, datetime): + data_dict[key] = value.isoformat() + data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}' + collect = [ + *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)], + ] + logger.info(f" - 
Downloading {len(collect)} files") + + # sort fields from dataset + data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')} + crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')} + + # Create the archive + package( + output_path=bag_dir, + collect=collect, + collect_errors='ignore', + signed_metadata={ + 'id': str(uuid.uuid4()), + 'url': data_gov_url, + 'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.', + 'data_gov_metadata': data_gov_metadata, + 'crawler_metadata': crawler_metadata, + }, + signatures=signatures, + ) + +def zip_archive(bag_dir, archive_path): + # Create zip archive + with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf: + for file_path in bag_dir.rglob('*'): + if file_path.is_file(): + arc_path = file_path.relative_to(bag_dir) + compression = (zipfile.ZIP_STORED + if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS + else zipfile.ZIP_DEFLATED) + zf.write(file_path, arc_path, compress_type=compression) + + # Create metadata file + zip_info = [] + with zipfile.ZipFile(archive_path, 'r') as zf: + for info in zf.filelist: + header_offset = info.header_offset + + # Read header to calculate data offset + zf.fp.seek(header_offset) + header = zf.fp.read(zipfile.sizeFileHeader) + fheader = struct.unpack(zipfile.structFileHeader, header) + fname_length = fheader[zipfile._FH_FILENAME_LENGTH] + extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH] + data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length + + zip_info.append({ + 'filename': info.filename, + 'file_size': info.file_size, + 'compress_size': info.compress_size, + 'compress_type': info.compress_type, + 'header_offset': header_offset, + 'data_offset': data_offset, + }) + + # Read the bag-info.txt and signed-metadata.json + bag_info = (bag_dir / 'bag-info.txt').read_text() + signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text()) + + return { + 'bag_info': bag_info, + 'signed_metadata': signed_metadata, + 'zip_entries': zip_info + } + +def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args): + s3 = boto3.Session(**session_args).client('s3') + bucket_name, s3_path = s3_path.split('/', 1) + + # Upload zip file + s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path))) + s3.upload_file(str(collection_path), bucket_name, s3_collection_key) + logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}") + + # Upload metadata file + s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path))) + s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key) + logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}") + + +def run_pipeline( + dataset: Dataset, + output_path: Path, + metadata_path: Path, + collection_path: Path, + signatures: list = None, + session_args: dict = None, + s3_path: str = None, + no_delete: bool = False, + ): + logger.info(f"Processing dataset: {dataset.name}") + + # we have a db forked from the main process, so we need to close it and reopen if needed + db.close() + + # set this here so it makes it into the metadata + dataset.crawler_downloaded_date = datetime.now() + + with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir: + logger.info("- Creating archive...") + # set up paths + temp_dir = 
Path(temp_dir) + bag_dir = temp_dir / 'bag' + archive_path = temp_dir / 'archive.zip' + + # download data with nabit + create_archive(bag_dir, dataset, signatures) + + logger.info("- Zipping archive...") + # zip up data and create metadata + output_metadata = zip_archive(bag_dir, archive_path) + + logger.info("- Moving files to final location...") + # Move files to final location + collection_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + os.rename(str(archive_path), collection_path) + metadata_path.write_text(json.dumps(output_metadata) + '\n') + + if s3_path: + logger.info("Uploading to S3...") + upload_archive(output_path, collection_path, metadata_path, s3_path, session_args) + + if not no_delete: + logger.info("- Deleting zip archive...") + os.remove(collection_path) + if collection_path.parent.exists() and not os.listdir(collection_path.parent): + os.rmdir(collection_path.parent) + + logger.info("- Setting crawler_downloaded_date...") + db.connect() + dataset.save() + + logger.info("Processing complete") + +def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int = 0, dataset_name: str = None): + """Get datasets from SQLite that don't have metadata files yet.""" + query = Dataset.select() + + if dataset_name: + query = query.where(Dataset.name == dataset_name) + if min_size: + query = query.where(Dataset.size >= min_size) + + # Initialize progress bars + stats_counter['total'] = tqdm(desc="Total records", unit="pkg") + stats_counter['skipped'] = tqdm(desc="Already processed", unit="pkg") + stats_counter['yielded'] = tqdm(desc="Processing", unit="pkg") + + for dataset in query: + stats_counter['total'].update(1) + + # Check if metadata file exists + name = dataset.name + metadata_path = output_path / 'metadata' / collection / name / 'v1.json' + + if metadata_path.exists(): + stats_counter['skipped'].update(1) + continue + + stats_counter['yielded'].update(1) + yield dataset + + +@click.command() +@click.option('--db-path', '-d', type=click.Path(exists=True, path_type=Path), default='data/data.db') +@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed', + help='Output path.') +@click.option('--collection', '-c', type=str, default='data_gov', + help='Collection name.') +@click.option('--workers', '-w', type=int, default=None, + help='Number of worker processes. Defaults to CPU count.') +@click.option('--min-size', '-s', type=int, default=0, + help='Minimum size of dataset to process.') +@click.option('--dataset-name', help='Dataset name to process.') +@click.option('--if-exists', '-e', type=click.Choice(['skip', 'replace', 'version']), default='skip', + help='Whether to skip, replace, or add a version if dataset already exists.') +@click.option('--signatures', help='JSON string of signature configuration.') +@click.option('--profile', '-p', help='AWS profile name') +@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. 
"/"') +@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None, + help='Logging level.') +@click.option('--stop-after', help='Stop after processing this many collections', type=int) +@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata') +def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None, + if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False): + + if dataset_name: + workers = 1 + stop_after = 1 + + if signatures: + signatures = json.loads(signatures) + for signature in signatures: + if signature['action'] == 'sign': + if is_encrypted_key(signature['params']['key']): + signature['params']['password'] = click.prompt( + f"Enter password for {signature['params']['key']}: ", + hide_input=True + ) + elif signature['action'] == 'timestamp': + if known_tsa := signature.pop('known_tsa', None): + signature['params'] = KNOWN_TSAS[known_tsa] + + session_args = {} + if profile: + session_args['profile_name'] = profile + + # Initialize database connection + db.init(db_path) + db.connect() + + def get_tasks(): + processed = 0 + for dataset in get_unprocessed_datasets(output_path, collection, min_size, dataset_name): + # handle existing datasets + name = dataset.name + collection_path = output_path / 'collections' / collection / name / 'v1.zip' + metadata_path = output_path / 'metadata' / collection / name / 'v1.json' + + if metadata_path.exists(): + if if_exists == 'skip': + continue + elif if_exists == 'replace': + metadata_path.unlink() + if collection_path.exists(): + collection_path.unlink() + elif if_exists == 'version': + version = 2 + while True: + collection_path = output_path / 'collections' / collection / name / f'v{version}.zip' + metadata_path = output_path / 'metadata' / collection / name / f'v{version}.json' + if not metadata_path.exists(): + break + version += 1 + + yield dataset, output_path, metadata_path, collection_path, signatures, session_args, s3_path, no_delete + + processed += 1 + if stop_after and processed >= stop_after: + break + + try: + run_parallel(run_pipeline, get_tasks(), workers, log_level=log_level, catch_errors=False) + finally: + # Close progress bars + for counter in stats_counter.values(): + counter.close() + db.close() + +if __name__ == '__main__': + main() diff --git a/scripts/data_gov/fetch_index.py b/scripts/data_gov/fetch_index.py new file mode 100644 index 0000000..4ee2b02 --- /dev/null +++ b/scripts/data_gov/fetch_index.py @@ -0,0 +1,299 @@ +import httpx +from typing import Iterator, Dict, Any, List +import time +import click +from pathlib import Path +import logging +from datetime import datetime +from scripts.data_gov.models import db, Dataset, DatasetHistory +from tqdm import tqdm +from playhouse.shortcuts import model_to_dict +from jsondiff import diff + +logger = logging.getLogger(__name__) + +stats_counter = {} + +def init_database(db_path: Path) -> None: + """Initialize the database connection and create tables.""" + db.init(db_path) + db.connect() + db.create_tables([Dataset, DatasetHistory]) + +def save_to_database(results: List[Dict[str, Any]]) -> None: + """ + Save a batch of packages to the database using Peewee. 
+ """ + if not results: + return + + # Process datetime fields in incoming records + for package in results: + for field in ['metadata_created', 'metadata_modified']: + if package.get(field): + try: + package[field] = datetime.fromisoformat( + package[field].replace('Z', '+00:00') + ) + except ValueError: + package[field] = None + + # Get all IDs from incoming packages + incoming_ids = [pkg['id'] for pkg in results] + + # Fetch existing records as model instances + existing_records = { + record.id: record + for record in Dataset.select().where(Dataset.id << incoming_ids) + } + + # Prepare bulk operations + history_records = [] + new_records = [] + + # Compare records and prepare operations + for package_data in results: + # Create a new model instance from the package data + new_package = Dataset(**package_data) + existing = existing_records.get(package_data['id']) + + if existing: + # Compare model instances using their dict representations + if diff(model_to_dict(existing), model_to_dict(new_package)): + # Record changed - add to history and update + history_records.append(existing) + new_records.append(new_package) + stats_counter['updated'].update(1) + else: + # Record unchanged - skip + stats_counter['skipped'].update(1) + continue + else: + # New record - just add it + new_records.append(new_package) + stats_counter['new'].update(1) + + with db.atomic(): + # Bulk move history records if any exist + if history_records: + DatasetHistory.bulk_create(history_records) + Dataset.delete().where(Dataset.id << [h.id for h in history_records]).execute() + + # Bulk insert new records + if new_records: + Dataset.bulk_create(new_records) + +def save_packages_to_database(output_path: Path, rows_per_page: int = 1000, start_date: str | None = None) -> None: + """ + Save fetched data to the database, resuming from last position if needed. + + Args: + output_path: Path to save the database + rows_per_page: Number of results to fetch per page + start_date: Optional date to start fetching from + """ + stats_counter['new'] = tqdm(desc="New records", unit="pkg") + stats_counter['updated'] = tqdm(desc="Updated records", unit="pkg") + stats_counter['skipped'] = tqdm(desc="Unchanged records", unit="pkg") + + init_database(output_path) + + try: + for results in tqdm(fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date, max_retries=10)): + save_to_database(results) + finally: + db.close() + +def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]: + """ + Fetch package data from data.gov API using date-based pagination. 
+ + Args: + rows_per_page: Number of results to fetch per page + start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm) + max_retries: Maximum number of retry attempts for 5xx errors + + Yields: + Dict containing package data for each result + """ + + base_url = "https://catalog.data.gov/api/3/action/package_search" + current_date = start_date + total_records = 0 + + while True: + logger.info(f"Current date offset: {current_date}") + + # Build date filter query + url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc" + if current_date: + # Format date to match Solr's expected format (dropping microseconds) + formatted_date = current_date.split('.')[0] + 'Z' + date_filter = f"+metadata_modified:[* TO {formatted_date}]" + url += f"&fq={date_filter}" + + for attempt in range(max_retries): + try: + start_time = time.time() + response = httpx.get(url, timeout=60.0) + request_time = time.time() - start_time + + response.raise_for_status() + break # Success, exit retry loop + + except httpx.HTTPStatusError as e: + if e.response.status_code >= 500 and attempt < max_retries - 1: + retry_wait = 2 ** attempt # Exponential backoff + logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})") + logger.warning(f"Error URL: {url}") + time.sleep(retry_wait) + continue + # If not a 5xx error or we're out of retries, re-raise + logger.error(f"Error URL: {url}") + logger.error(f"Response content: {response.text}") + raise + + data = response.json() + results = data["result"]["results"] + + if not results: + break + + # Get date of last result for next query + current_date = results[-1]["metadata_modified"] + + total_records += len(results) + logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}") + + yield results + + time.sleep(1) + +def get_dataset_history(dataset_name: str) -> None: + """ + Fetch and display all versions of a dataset with the given ID, + from oldest to newest, showing only changed fields between versions. 
+ """ + # Get all versions including current + versions = [ + model_to_dict(record, recurse=True) + for record in (DatasetHistory + .select() + .where(DatasetHistory.name == dataset_name) + .order_by(DatasetHistory.metadata_modified)) + ] + current_record = Dataset.select().where(Dataset.name == dataset_name).first() + if current_record: + versions.append(model_to_dict(current_record, recurse=True)) + + if not versions: + print(f"No dataset found with name: {dataset_name}") + return + + # Print each version with changed fields + prev = None + for curr in versions: + history_id = curr.pop('history_id', None) + if prev: + diff_fields = diff(prev, curr) + else: + diff_fields = curr + + print(f"*** Version: {curr.get('metadata_modified')} ***") + for k, v in diff_fields.items(): + print(f"- {k}: {v}") + print("\n") + prev = curr + +@click.group() +def cli(): + """Data.gov dataset mirroring tools.""" + pass + +# Modify the existing main function to be a command in the group +@cli.command() +@click.argument('output_path', type=click.Path(path_type=Path), default='data/data.db') +@click.option('--rows-per-page', '-r', type=int, default=1000, + help='Number of results to fetch per page.') +@click.option('--start-date', '-s', type=str, default=None, + help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='WARNING', + help='Logging level.') +def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str): + """Fetch package data from data.gov API and save to database.""" + logging.basicConfig( + level=getattr(logging, log_level), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + save_packages_to_database(output_path, rows_per_page, start_date) + +@cli.command() +@click.argument('dataset_name') +@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db') +def history(dataset_name: str, db_path: Path): + """Show version history for a dataset with the given ID.""" + init_database(db_path) + try: + get_dataset_history(dataset_name) + finally: + db.close() + +@cli.command() +@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db') +def delete_duplicate_history(db_path: Path): + """Delete duplicate history records.""" + init_database(db_path) + try: + # Get all unique dataset names in history + unique_names = (DatasetHistory + .select(DatasetHistory.name) + .distinct() + .tuples()) + + total_deleted = 0 + for (name,) in tqdm(unique_names, desc="Processing datasets"): + # Get all versions for this dataset ordered by modification date + versions = [ + model_to_dict(record) + for record in (DatasetHistory + .select() + .where(DatasetHistory.name == name) + .order_by(DatasetHistory.metadata_modified)) + ] + current_record = Dataset.select().where(Dataset.name == name).first() + if current_record: + versions.append(model_to_dict(current_record)) + + # Track IDs of duplicate records to delete + to_delete = [] + + # Compare adjacent versions + prev = versions[0] + prev_id = prev.pop('history_id') + for curr in versions[1:]: + curr_id = curr.pop('history_id', None) + + # If versions are identical, mark current version for deletion + if not diff(prev, curr): + to_delete.append(prev_id) + prev = curr + prev_id = curr_id + + # Bulk delete duplicate records + if to_delete: + deleted = (DatasetHistory + .delete() + .where(DatasetHistory.history_id << to_delete) + .execute()) + total_deleted += deleted + + 
click.echo(f"Deleted {total_deleted} duplicate history records") + finally: + db.close() + +if __name__ == "__main__": + cli() + diff --git a/scripts/data_gov/fetch_jsonl.py b/scripts/data_gov/fetch_jsonl.py new file mode 100644 index 0000000..8a75c0d --- /dev/null +++ b/scripts/data_gov/fetch_jsonl.py @@ -0,0 +1,35 @@ +import httpx +import json +import time +import logging +from pathlib import Path +from typing import Iterator, Dict, Any, List +import click +from scripts.data_gov.fetch_index import fetch_data_gov_packages + +logger = logging.getLogger(__name__) + +@click.command() +@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl') +@click.option('--rows-per-page', '-r', type=int, default=1000, + help='Number of results to fetch per page.') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default='INFO', + help='Logging level.') +@click.option('--start-date', '-s', type=str, default=None, + help='Start date for fetching packages in YYYY-MM-DD format.') +def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str): + """Fetch all package data from data.gov API and save to JSONL file.""" + logging.basicConfig( + level=getattr(logging, log_level), + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + with open(output_path, 'a') as f: + for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date): + for package in results: + f.write(json.dumps(package) + '\n') + +if __name__ == "__main__": + main() diff --git a/scripts/data_gov/migrate.py b/scripts/data_gov/migrate.py new file mode 100644 index 0000000..5d843b6 --- /dev/null +++ b/scripts/data_gov/migrate.py @@ -0,0 +1,18 @@ +from playhouse.migrate import * +from scripts.data_gov.models import db + +migrator = SqliteMigrator(db) + +def do_migrate(): + crawler_identified_date = DateTimeField(null=True) + crawler_downloaded_date = DateTimeField(null=True) + with db.atomic(): + migrate( + # migrator.add_column('dataset', 'crawler_identified_date', crawler_identified_date), + # migrator.add_column('dataset', 'crawler_downloaded_date', crawler_downloaded_date), + # migrator.add_column('datasethistory', 'crawler_identified_date', crawler_identified_date), + # migrator.add_column('datasethistory', 'crawler_downloaded_date', crawler_downloaded_date), + ) + +if __name__ == '__main__': + do_migrate() \ No newline at end of file diff --git a/scripts/data_gov/models.py b/scripts/data_gov/models.py new file mode 100644 index 0000000..272ce4c --- /dev/null +++ b/scripts/data_gov/models.py @@ -0,0 +1,61 @@ +from peewee import * +from playhouse.sqlite_ext import JSONField +from pathlib import Path +from datetime import datetime + +db = SqliteDatabase(Path(__file__).parent.parent.parent / 'data/data.db', pragmas={ + # tuning suggested by Claude: + 'journal_mode': 'wal', # Write-Ahead Logging for better concurrency + 'cache_size': -1024 * 64, # 64MB cache (negative number means kibibytes) + 'synchronous': 'normal', # Good balance between safety and speed + 'busy_timeout': 30000, # Wait up to 30 seconds when database is locked + 'temp_store': 'memory', # Store temp tables in memory + 'mmap_size': 268435456, # Memory-mapped I/O (256MB) + 'page_size': 4096, # Optimal for most systems +}) + +class BaseModel(Model): + class Meta: + database = db + +class Dataset(BaseModel): + # fields from data.gov + id = CharField(primary_key=True) + name = CharField(null=True) + title = CharField(null=True) + notes 
= TextField(null=True) + metadata_created = DateTimeField(null=True) + metadata_modified = DateTimeField(null=True) + private = BooleanField(null=True) + state = CharField(null=True) + version = CharField(null=True) + type = CharField(null=True) + num_resources = IntegerField(null=True) + num_tags = IntegerField(null=True) + isopen = BooleanField(null=True) + author = CharField(null=True) + author_email = CharField(null=True) + creator_user_id = CharField(null=True) + license_id = CharField(null=True) + license_url = CharField(null=True) + license_title = CharField(null=True) + maintainer = CharField(null=True) + maintainer_email = CharField(null=True) + owner_org = CharField(null=True) + url = CharField(null=True) + organization = JSONField(null=True) + extras = JSONField(null=True) + resources = JSONField(null=True) + tags = JSONField(null=True) + groups = JSONField(null=True) + relationships_as_subject = JSONField(null=True) + relationships_as_object = JSONField(null=True) + + # fields starting with crawler_ are added by our crawler + crawler_identified_date = DateTimeField(null=True, default=datetime.now) + crawler_downloaded_date = DateTimeField(null=True) + +class DatasetHistory(Dataset): + history_id = AutoField(primary_key=True) + id = CharField() # Regular CharField, not primary key + #deleted_by_date = DateTimeField(null=True) # New field to track deletion date diff --git a/scripts/github/download_git.py b/scripts/github/download_git.py new file mode 100644 index 0000000..3ba6d43 --- /dev/null +++ b/scripts/github/download_git.py @@ -0,0 +1,141 @@ +import csv +import logging +from pathlib import Path +from scripts.helpers.parallel import run_parallel +import click +from tqdm import tqdm +from gitspoke import Downloader, GitHubAPI +from gitspoke.cli import valid_include_items, get_token +import os +import json +import requests +from scripts.helpers.config import load_config + +logger = logging.getLogger(__name__) +stats_counter = {} + +CONFIG_PATH = (os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config")) / "data-mirror" / "config.json" + +def check_repo_exists(org_name, repo_name, token, output_path=None): + """Check if a repository still exists on GitHub.""" + exists = True + try: + GitHubAPI(token).request(f"repos/{org_name}/{repo_name}", method="HEAD") + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + exists = False + else: + raise e + if not exists: + repo_link = f"https://github.com/{org_name}/{repo_name}" + print(repo_link) + if output_path: + with open(output_path, 'a') as output_file: + output_file.write(f"{repo_link}\n") + return exists + +def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None): + """Process a single repository.""" + if check_exists: + return check_repo_exists(org_name, repo_name, token, output_path) + + logger.info(f"Processing repository: {org_name}/{repo_name}") + Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include) + logger.info("Processing complete") + +def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None, + check_exists: bool = False): + """Get repositories from CSV that haven't been processed yet.""" + # Initialize progress bars + if not check_exists: + stats_counter['total'] = tqdm(desc="Total records", unit="repo") + if skip_existing: + stats_counter['skipped'] = tqdm(desc="Skipped", unit="repo") + 
stats_counter['yielded'] = tqdm(desc="Processing", unit="repo") + + # handle --include + if include: + include = include.split(',') + else: + include = ['repo_info'] + + # import token or tokens + config = load_config() + if config.get('tokens'): + tokens = config['tokens'] + else: + tokens = [get_token(None)] + if tokens != [None]: + logger.warning(f"Using {len(tokens)} tokens") + else: + logger.warning("Using unauthenticated rate limits") + + with open(csv_path, 'r') as file: + reader = csv.DictReader(file) + # Skip specified number of rows + for _ in range(skip_rows): + next(reader) + + processed = 0 + for row in reader: + if not check_exists: + stats_counter['total'].update(1) + + if not row['html_url']: # Skip empty rows + continue + + org_name, repo_name = row['html_url'].split('/')[-2:] + collection_path = output_path / 'collections' / collection / org_name / repo_name + + if skip_existing: + if collection_path.exists(): + stats_counter['skipped'].update(1) + continue + else: + stats_counter['yielded'].update(1) + + # use tokens round robin + token = tokens[processed % len(tokens)] + + yield org_name, repo_name, collection_path, include, token, check_exists, output_path + + processed += 1 + if stop_after and processed >= stop_after: + break + + # Close progress bars + for counter in stats_counter.values(): + counter.close() + +@click.command() +@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed', + help='Output path.') +@click.option('--collection', '-c', type=str, default='github_raw', + help='Collection name.') +@click.option('--workers', '-w', type=int, default=None, + help='Number of worker processes. Defaults to CPU count.') +@click.option('--skip-rows', type=int, default=0, + help='Number of rows to skip in the CSV.') +@click.option('--include', + help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items)) +@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv', + help='Path to the CSV file.') +@click.option('--log-level', '-l', + type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), + default=None, + help='Logging level.') +@click.option('--stop-after', help='Stop after processing this many repositories', type=int) +@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories') +@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub') +def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None, + log_level=None, stop_after=None, skip_existing=False, check_exists=False): + + run_parallel( + run_pipeline, + get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists), + workers, + log_level=log_level + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/helpers/config.py b/scripts/helpers/config.py new file mode 100644 index 0000000..d1bcc1c --- /dev/null +++ b/scripts/helpers/config.py @@ -0,0 +1,13 @@ +import json +import os +from pathlib import Path + +CONFIG_PATH = (os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config")) / "data-mirror" / "config.json" + +def load_config(): + """Load configuration from config file.""" + if CONFIG_PATH.exists(): + config = json.loads(CONFIG_PATH.read_text()) + else: + config = {} + return config \ No newline at end of file diff --git a/scripts/helpers/parallel.py b/scripts/helpers/parallel.py 
new file mode 100644 index 0000000..b875da0 --- /dev/null +++ b/scripts/helpers/parallel.py @@ -0,0 +1,65 @@ +from multiprocessing import Queue, Process +from queue import Empty +import os +from tqdm import tqdm +from typing import Callable, Iterable +import logging + +# Set up logger +logger = logging.getLogger(__name__) + +def worker(task_queue, task, catch_errors: bool = True): + while True: + try: + args = task_queue.get(timeout=1) + if args is None: + break + logger.debug(f"[PID {os.getpid()}] Processing task") + task(*args) + except Empty: + continue + except Exception as e: + if catch_errors: + logger.error(f"[PID {os.getpid()}] Worker error: {e}") + else: + raise e + + +def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None): + workers = workers or os.cpu_count() or 4 + + # Configure logging based on whether we're running in parallel or not + if log_level is None: + log_level = 'INFO' if workers == 1 else 'WARNING' + logging.basicConfig( + level=log_level, + format='[%(process)d] %(message)s' + ) + + logger.debug(f"Starting processing with {workers} workers") + + if workers > 1: + task_queue = Queue(maxsize=100) + + # Start worker processes + processes = [] + for _ in range(workers): + p = Process(target=worker, args=(task_queue, processor, catch_errors)) + p.start() + processes.append(p) + + # Load tasks into queue + for task_item in tqdm(tasks, total=task_count): + if workers > 1: + task_queue.put(task_item) + else: + processor(*task_item) + + if workers > 1: + # Signal workers to exit + for _ in range(workers): + task_queue.put(None) + + # Wait for all processes to complete + for p in processes: + p.join() \ No newline at end of file diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..ab28181 --- /dev/null +++ b/uv.lock @@ -0,0 +1,745 @@ +version = 1 +requires-python = ">=3.12" + +[[package]] +name = "aiosqlite" +version = "0.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0d/3a/22ff5415bf4d296c1e92b07fd746ad42c96781f13295a074d58e77747848/aiosqlite-0.20.0.tar.gz", hash = "sha256:6d35c8c256637f4672f843c31021464090805bf925385ac39473fb16eaaca3d7", size = 21691 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c4/c93eb22025a2de6b83263dfe3d7df2e19138e345bca6f18dba7394120930/aiosqlite-0.20.0-py3-none-any.whl", hash = "sha256:36a1deaca0cac40ebe32aac9977a6e2bbc7f5189f23f4a54d5908986729e5bd6", size = 15564 }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + +[[package]] +name = "anyio" +version = "4.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/f6/40/318e58f669b1a9e00f5c4453910682e2d9dd594334539c7b7817dabb765f/anyio-4.7.0.tar.gz", hash = "sha256:2f834749c602966b7d456a7567cafcb309f96482b5081d14ac93ccd457f9dd48", size = 177076 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/7a/4daaf3b6c08ad7ceffea4634ec206faeff697526421c20f07628c7372156/anyio-4.7.0-py3-none-any.whl", hash = "sha256:ea60c3723ab42ba6fff7e8ccb0488c898ec538ff4df1f1d5e642c3601d07e352", size = 93052 }, +] + +[[package]] +name = "bagit" +version = "1.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/99/927b704237a1286f1022ea02a2fdfd82d5567cfbca97a4c343e2de7e37c4/bagit-1.8.1.tar.gz", hash = "sha256:37df1330d2e8640c8dee8ab6d0073ac701f0614d25f5252f9e05263409cee60c", size = 26229 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/fc/58b3c209fdd383744b27914d0b88d0f9db72aa043e1475618d981d7089d9/bagit-1.8.1-py2.py3-none-any.whl", hash = "sha256:d14dd7e373dd24d41f6748c42f123f7db77098dfa4a0125dbacb4c8bdf767c09", size = 35137 }, +] + +[[package]] +name = "boto3" +version = "1.35.80" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/81/cd/58de9a4e792176bca4f2e4b4248a9db4a47ae217bc20ac6adb3052b029d3/boto3-1.35.80.tar.gz", hash = "sha256:50dae461ab5fbedfb81b690895d48a918fed0d5fdff37be1c4232770c0dc9712", size = 111009 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/72/f6724a19acaac7a7cdfc088ac95d2d0ea3626c00d5a5197a99e49bde474d/boto3-1.35.80-py3-none-any.whl", hash = "sha256:21a3b18c3a7fd20e463708fe3fa035983105dc7f3a1c274e1903e1583ab91159", size = 139179 }, +] + +[[package]] +name = "botocore" +version = "1.35.80" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/81/e3/b10f8c2c58fd144d99de10e5f964bd1b0e609d27cb05513bebfdfa47e3eb/botocore-1.35.80.tar.gz", hash = "sha256:b8dfceca58891cb2711bd6455ec4f7159051f3796e0f64adef9bb334f19d8a92", size = 13456944 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/cc/7ecef0f0e883f4bd8e23a04e86d98f8c7d6aa6a821efcec67b3547388d2e/botocore-1.35.80-py3-none-any.whl", hash = "sha256:36e589dccb62380abd628b08fecfa2f7c89b99f41ec9fc42c467c94008c0be4a", size = 13263229 }, +] + +[[package]] +name = "certifi" +version = "2024.8.30" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/ee/9b19140fe824b367c04c5e1b369942dd754c4c5462d5674002f75c4dedc1/certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9", size = 168507 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188 } +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105 }, + { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404 }, + { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423 }, + { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184 }, + { url = "https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268 }, + { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601 }, + { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098 }, + { url = "https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520 }, + { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852 }, + { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488 }, + { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192 }, + { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550 }, + { url = 
"https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785 }, + { url = "https://files.pythonhosted.org/packages/38/94/ce8e6f63d18049672c76d07d119304e1e2d7c6098f0841b51c666e9f44a0/charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda", size = 195698 }, + { url = "https://files.pythonhosted.org/packages/24/2e/dfdd9770664aae179a96561cc6952ff08f9a8cd09a908f259a9dfa063568/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313", size = 140162 }, + { url = "https://files.pythonhosted.org/packages/24/4e/f646b9093cff8fc86f2d60af2de4dc17c759de9d554f130b140ea4738ca6/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9", size = 150263 }, + { url = "https://files.pythonhosted.org/packages/5e/67/2937f8d548c3ef6e2f9aab0f6e21001056f692d43282b165e7c56023e6dd/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b", size = 142966 }, + { url = "https://files.pythonhosted.org/packages/52/ed/b7f4f07de100bdb95c1756d3a4d17b90c1a3c53715c1a476f8738058e0fa/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11", size = 144992 }, + { url = "https://files.pythonhosted.org/packages/96/2c/d49710a6dbcd3776265f4c923bb73ebe83933dfbaa841c5da850fe0fd20b/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f", size = 147162 }, + { url = "https://files.pythonhosted.org/packages/b4/41/35ff1f9a6bd380303dea55e44c4933b4cc3c4850988927d4082ada230273/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd", size = 140972 }, + { url = "https://files.pythonhosted.org/packages/fb/43/c6a0b685fe6910d08ba971f62cd9c3e862a85770395ba5d9cad4fede33ab/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2", size = 149095 }, + { url = "https://files.pythonhosted.org/packages/4c/ff/a9a504662452e2d2878512115638966e75633519ec11f25fca3d2049a94a/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886", size = 152668 }, + { url = "https://files.pythonhosted.org/packages/6c/71/189996b6d9a4b932564701628af5cee6716733e9165af1d5e1b285c530ed/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601", size = 150073 }, + { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, + { url = 
"https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, + { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, + { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, +] + +[[package]] +name = "click" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "platform_system == 'Windows'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, +] + +[[package]] +name = "cloudflare" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/2d/f5bad5e86898a650d1ceb7ddd04b4520d1bdbc82916afe7a154004b477cb/cloudflare-4.0.0.tar.gz", hash = "sha256:78b1222d532084bb29ab700257617fafd802a24c5af9b056f379b994e929af7e", size = 1638959 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/b2/54c768c0fe42fa14c62ac1a801e6d88cce3c82721aa10b5bee35ec877478/cloudflare-4.0.0-py3-none-any.whl", hash = "sha256:d8aa75b2e92f6a5f24ea368e7c19df0e42a76c528464acdc034528dd2597cb64", size = 3640599 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "data-mirror" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "aiosqlite" }, + { name = "bagit" }, + { name = "boto3" }, + { name = "cloudflare" }, + { name = "gitspoke" }, + { name = "httpx" }, + { name = "jsondiff" }, + { name = "nabit" }, + { name = "peewee" }, + { name = "publicsuffixlist" }, + { name = "pyarrow" }, + { name = "tqdm" }, +] + +[package.dev-dependencies] +dev = [ + { name = "memray" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiosqlite", specifier = ">=0.20.0" }, + { name = "bagit", specifier = ">=1.8.1" }, + { name = "boto3", specifier = ">=1.35.80" }, + { name = "cloudflare", specifier = ">=4.0.0" }, + { name = "gitspoke", git = "https://github.com/harvard-lil/gitspoke" }, + { name = "httpx", specifier = ">=0.27.2" }, + { name = "jsondiff", specifier = ">=2.2.1" }, + { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" }, + { name = "peewee", specifier = ">=3.17.8" }, + { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" }, + { name = "pyarrow", specifier = ">=18.0.0" }, + { name = "tqdm", specifier = ">=4.67.0" }, +] + +[package.metadata.requires-dev] +dev = [{ name = "memray", specifier = ">=1.15.0" }] + +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, +] + +[[package]] +name = "fastcore" +version = "1.7.28" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/56/a6/54133fa79c46c4873b82439539b02fcceedf06327b7b96aca48926642059/fastcore-1.7.28.tar.gz", hash = "sha256:606e4507eb4b8892e4c83ddf5462fbcf32f4bde4fa6caf56ca67ee5e2dbe2b1e", size = 80387 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/16/b911d4c40eddeed8ffe195763f23fb68867210b4e347ed968ea45d2c9d4f/fastcore-1.7.28-py3-none-any.whl", hash = "sha256:ffa1ab1b34518795a4342b85ebb9cd2b30588210c21df028a11e420678a59e20", size = 84053 }, +] + +[[package]] +name = "ghapi" +version = "1.0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastcore" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/88/97e6b0c94885db3530d04ccab7016c606dcaf08bf0581ced1193b9668d06/ghapi-1.0.6.tar.gz", hash = "sha256:64fdd9f06d8e3373065c42c2a03e067e2bbb9ca18b583cd6e38a28aaad0224f6", size = 65518 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/ad/f7204c0c38175f300621af7880737ca6379dd633e9b7d1c0a8fc2748f0dc/ghapi-1.0.6-py3-none-any.whl", hash = "sha256:b3d96bf18fcaa2cb7131bad9de2948e2a1c2bb226377a25826f6c80950c57854", size = 62391 }, +] + +[[package]] +name = "gitspoke" +version = "0.1.0" +source = { git = "https://github.com/harvard-lil/gitspoke#d53df4d0d3870265bf8aa7173e9e45a9a320d378" } +dependencies = [ + { name = "click" }, + { name = "ghapi" }, + { name = "requests" }, +] + +[[package]] +name = "h11" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, +] + +[[package]] +name = "httpcore" +version = "1.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "jinja2" +version = "3.1.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/55/39036716d19cab0747a5020fc7e907f362fbf48c984b14e62127f7e68e5d/jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369", size = 240245 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d", size = 133271 }, +] + +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256 }, +] + +[[package]] +name = "jsondiff" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/48/841137f1843fa215ea284834d1514b8e9e20962bda63a636c7417e02f8fb/jsondiff-2.2.1.tar.gz", hash = "sha256:658d162c8a86ba86de26303cd86a7b37e1b2c1ec98b569a60e2ca6180545f7fe", size = 26649 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/94/a8066f84d62ab666d61ef97deba1a33126e3e5c0c0da2c458ada17053ed6/jsondiff-2.2.1-py3-none-any.whl", hash = "sha256:b1f0f7e2421881848b1d556d541ac01a91680cfcc14f51a9b62cdf4da0e56722", size = 13440 }, +] + +[[package]] +name = "linkify-it-py" +version = "2.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "uc-micro-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/ae/bb56c6828e4797ba5a4821eec7c43b8bf40f69cda4d4f5f8c8a2810ec96a/linkify-it-py-2.0.3.tar.gz", hash = "sha256:68cda27e162e9215c17d786649d1da0021a451bdc436ef9e0fa0ba5234b9b048", size = 27946 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/1e/b832de447dee8b582cac175871d2f6c3d5077cc56d5575cadba1fd1cccfa/linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79", size = 19820 }, +] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, +] + +[package.optional-dependencies] +linkify = [ + { name = "linkify-it-py" }, +] +plugins = [ + { name = "mdit-py-plugins" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274 }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348 }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149 }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118 }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993 }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178 }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319 }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352 }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097 }, + { url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601 }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274 }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352 }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122 }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085 }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978 }, + { url = 
"https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208 }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357 }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344 }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101 }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603 }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510 }, + { url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486 }, + { url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480 }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914 }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796 }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473 }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114 }, + { url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098 }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208 }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, +] + +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/03/a2ecab526543b152300717cf232bb4bb8605b6edb946c845016fa9c9c9fd/mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5", size = 43542 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/f7/7782a043553ee469c1ff49cfa1cdace2d6bf99a1f333cf38676b3ddf30da/mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636", size = 55316 }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, +] + +[[package]] +name = "memray" +version = "1.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jinja2" }, + { name = "rich" }, + { name = "textual" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e8/d3/b2a01137e2391917928187c4c2837c2750cc832c99a6aecd6e0d6ea07c58/memray-1.15.0.tar.gz", hash = "sha256:1beffa2bcba3dbe0f095d547927286eca46e272798b83026dd1b5db58e16ed56", size = 1025344 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/87/9c62e12fa59967852d41df32fe5a0117d2bcd789b72960051c22a2052782/memray-1.15.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:d13554a25129593872b5fbcd55ac34453239e51d9b6ace258329596ccce22bb3", size = 927561 }, + { url = "https://files.pythonhosted.org/packages/0a/9e/8f88ef0e037ca9f11fd1e25e5abcc220bd368adfd9185630b37c405e6aa7/memray-1.15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8cfe15962a9002ede8b1f8b4f045d95855100a8a60a9bf0d9f2b92950f914189", size = 899042 }, + { url = "https://files.pythonhosted.org/packages/06/ae/107ce4d557b6a6598c6a037108b5591abcdde48d92470d722b4a63e82cac/memray-1.15.0-cp312-cp312-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e84b39adca05e720bdbf950cc92ef4bafefa2d6160111e5fc427cf59c6c16d1a", size = 8417003 }, + { url = "https://files.pythonhosted.org/packages/46/35/151684bd2635f955f3381e0739e3abd13baa621e855bc3cc8a336f5e9587/memray-1.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7745d2c58dfc33ef77f8827053cb957131420051b67e2d5642b605d0e65a586", size = 8015335 }, + { url = 
"https://files.pythonhosted.org/packages/e9/17/b30e0bcb799bf2b7383d2133067ee50aee7312cdd785c3a7347b7a7db6bf/memray-1.15.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:412225d85db0ec22142a82646d85ecc1e8680d33adbfd15789c7eaa356ad4107", size = 8133111 }, + { url = "https://files.pythonhosted.org/packages/03/13/71ad108bece1c13e876a8d103dfafb9cebef66f799719ff2c12d1d5f5446/memray-1.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25ab7a7e32fedab46219121dfb6ec3e42c66984b217572fdd4cddc37359c521", size = 8405380 }, + { url = "https://files.pythonhosted.org/packages/6f/01/eafaa4f9fed4d03c5817965f22dac280de0f1e58f9c0c9654c119ab42ad3/memray-1.15.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fb885f92833279d34addc607831352e91267b8e547ea861ad561a3dba64f6757", size = 8357767 }, + { url = "https://files.pythonhosted.org/packages/b9/c2/a4b5cabfe1389dffbc724e21dac2b454cf76e4e9446e2ec50d74124fd666/memray-1.15.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:c1308e6a5fc5bc4e183bc0fdf5e241ddd9fb374338f32d77a4d5e74ccf611ef1", size = 922782 }, + { url = "https://files.pythonhosted.org/packages/53/5d/c2968656dc33cc7ef9121b6b30da5a37a0497fe526ff0818d3ce06418085/memray-1.15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0794227dfa4b86a56137211fd5b8ec131e0bc4a5dc41c2f5a318ca56a22c9331", size = 894514 }, + { url = "https://files.pythonhosted.org/packages/8a/59/10efbb5e35221fe2097717391bece4bcc089f0c7cdc77c7d285f9dc0a4b0/memray-1.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f184e82debd4f0c8ecf8e6034efddccdd9fac22909553a7f094eabf0902cd53f", size = 8006898 }, + { url = "https://files.pythonhosted.org/packages/3e/96/7cc05356c2e4e1b1965c2fcd6ad89307dadb7bc531c8da44abcea94b213e/memray-1.15.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3493c5ac1ae1353fd0d24481bc9f30da8960ef703bf4af966cefff9dd1234d38", size = 8126116 }, + { url = "https://files.pythonhosted.org/packages/fa/ba/7056f86ee16b8598288f652edc5c3c7df51eda15d3ecfc5c9f5bf7c578d3/memray-1.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:145a3062d8bf631aa8dc4b0928585e201282634afc369799dae1a0b9ece59fd4", size = 8397263 }, + { url = "https://files.pythonhosted.org/packages/c5/30/8410d26b9ea64c942a23fcd9e46c6daae841bc7b451676e5b671346d4955/memray-1.15.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:59a4ade09cfe46e85cdb3a1976e9768e4674a6e448533c415dbe84e5a834f7c3", size = 8340658 }, +] + +[[package]] +name = "nabit" +version = "0.1.2" +source = { git = "https://github.com/harvard-lil/bag-nabit#f1fd7331f5e8188e60447018c4a154efc24a21b1" } +dependencies = [ + { name = "bagit" }, + { name = "click" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "warcio" }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "peewee" +version = "3.17.8" +source = { registry = "https://pypi.org/simple" } +sdist = { 
url = "https://files.pythonhosted.org/packages/b4/dc/832bcf4ea5ee2ebc4ea42ef36e44a451de5d80f8b9858bf2066e30738c67/peewee-3.17.8.tar.gz", hash = "sha256:ce1d05db3438830b989a1b9d0d0aa4e7f6134d5f6fd57686eeaa26a3e6485a8c", size = 948249 } + +[[package]] +name = "platformdirs" +version = "4.3.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/fc/128cc9cb8f03208bdbf93d3aa862e16d376844a14f9a0ce5cf4507372de4/platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907", size = 21302 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 }, +] + +[[package]] +name = "publicsuffixlist" +version = "1.0.2.20241207" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5d/93/2704becd38f0ada367a56bb058f918b6b3332d63b48fad821f13260284c8/publicsuffixlist-1.0.2.20241207.tar.gz", hash = "sha256:2b6d70074b00886d3098e7ed5f8eba8c3d1f3c2429eb8ecaf98362595496de04", size = 104784 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/4c/b3bc04cc61e3f00415bcacff5703786540c9538d7d26d16b81e44652f499/publicsuffixlist-1.0.2.20241207-py2.py3-none-any.whl", hash = "sha256:7213e69d0a2c9d7948b9bc304dbffa17a1450eccd2ba1de30c278a07134f39fd", size = 104587 }, +] + +[[package]] +name = "pyarrow" +version = "18.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/50/12829e7111b932581e51dda51d5cb39207a056c30fe31ef43f14c63c4d7e/pyarrow-18.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d", size = 29514620 }, + { url = "https://files.pythonhosted.org/packages/d1/41/468c944eab157702e96abab3d07b48b8424927d4933541ab43788bb6964d/pyarrow-18.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee", size = 30856494 }, + { url = "https://files.pythonhosted.org/packages/68/f9/29fb659b390312a7345aeb858a9d9c157552a8852522f2c8bad437c29c0a/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992", size = 39203624 }, + { url = "https://files.pythonhosted.org/packages/6e/f6/19360dae44200e35753c5c2889dc478154cd78e61b1f738514c9f131734d/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54", size = 40139341 }, + { url = "https://files.pythonhosted.org/packages/bb/e6/9b3afbbcf10cc724312e824af94a2e993d8ace22994d823f5c35324cebf5/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33", size = 38618629 }, + { url = "https://files.pythonhosted.org/packages/3a/2e/3b99f8a3d9e0ccae0e961978a0d0089b25fb46ebbcfb5ebae3cca179a5b3/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = 
"sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30", size = 40078661 }, + { url = "https://files.pythonhosted.org/packages/76/52/f8da04195000099d394012b8d42c503d7041b79f778d854f410e5f05049a/pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99", size = 25092330 }, + { url = "https://files.pythonhosted.org/packages/cb/87/aa4d249732edef6ad88899399047d7e49311a55749d3c373007d034ee471/pyarrow-18.1.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b", size = 29497406 }, + { url = "https://files.pythonhosted.org/packages/3c/c7/ed6adb46d93a3177540e228b5ca30d99fc8ea3b13bdb88b6f8b6467e2cb7/pyarrow-18.1.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2", size = 30835095 }, + { url = "https://files.pythonhosted.org/packages/41/d7/ed85001edfb96200ff606943cff71d64f91926ab42828676c0fc0db98963/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191", size = 39194527 }, + { url = "https://files.pythonhosted.org/packages/59/16/35e28eab126342fa391593415d79477e89582de411bb95232f28b131a769/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa", size = 40131443 }, + { url = "https://files.pythonhosted.org/packages/0c/95/e855880614c8da20f4cd74fa85d7268c725cf0013dc754048593a38896a0/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c", size = 38608750 }, + { url = "https://files.pythonhosted.org/packages/54/9d/f253554b1457d4fdb3831b7bd5f8f00f1795585a606eabf6fec0a58a9c38/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c", size = 40066690 }, + { url = "https://files.pythonhosted.org/packages/2f/58/8912a2563e6b8273e8aa7b605a345bba5a06204549826f6493065575ebc0/pyarrow-18.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181", size = 25081054 }, + { url = "https://files.pythonhosted.org/packages/82/f9/d06ddc06cab1ada0c2f2fd205ac8c25c2701182de1b9c4bf7a0a44844431/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc", size = 29525542 }, + { url = "https://files.pythonhosted.org/packages/ab/94/8917e3b961810587ecbdaa417f8ebac0abb25105ae667b7aa11c05876976/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386", size = 30829412 }, + { url = "https://files.pythonhosted.org/packages/5e/e3/3b16c3190f3d71d3b10f6758d2d5f7779ef008c4fd367cedab3ed178a9f7/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324", size = 39119106 }, + { url = "https://files.pythonhosted.org/packages/1d/d6/5d704b0d25c3c79532f8c0639f253ec2803b897100f64bcb3f53ced236e5/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8", size = 40090940 }, + { url = 
"https://files.pythonhosted.org/packages/37/29/366bc7e588220d74ec00e497ac6710c2833c9176f0372fe0286929b2d64c/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9", size = 38548177 }, + { url = "https://files.pythonhosted.org/packages/c8/11/fabf6ecabb1fe5b7d96889228ca2a9158c4c3bb732e3b8ee3f7f6d40b703/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba", size = 40043567 }, +] + +[[package]] +name = "pydantic" +version = "2.10.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696 }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, + { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, + { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, + { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, + { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, + { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, + { url = 
"https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, + { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, + { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, + { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, + { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, + { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, + { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, + { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, + { url = "https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709 }, + { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273 }, + { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027 }, + { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888 }, + { url = 
"https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738 }, + { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138 }, + { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025 }, + { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633 }, + { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404 }, + { url = "https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130 }, + { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946 }, + { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387 }, + { url = "https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453 }, + { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186 }, +] + +[[package]] +name = "pygments" +version = "2.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" 
} +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, + { url = 
"https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309 }, + { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679 }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428 }, + { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361 }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523 }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660 }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "rich" +version = "13.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = 
"sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, +] + +[[package]] +name = "s3transfer" +version = "0.10.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/0a/1cdbabf9edd0ea7747efdf6c9ab4e7061b085aa7f9bfc36bb1601563b069/s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7", size = 145287 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/05/7957af15543b8c9799209506df4660cba7afc4cf94bfb60513827e96bed6/s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e", size = 83175 }, +] + +[[package]] +name = "setuptools" +version = "75.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] + +[[package]] +name = "textual" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py", extra = ["linkify", "plugins"] }, + { name = "platformdirs" }, + { name = "rich" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/b6/59b1de04bb4dca0f21ed7ba0b19309ed7f3f5de4396edf20cc2855e53085/textual-1.0.0.tar.gz", hash = "sha256:bec9fe63547c1c552569d1b75d309038b7d456c03f86dfa3706ddb099b151399", size = 1532733 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/bb/5fb6656c625019cd653d5215237d7cd6e0b12e7eae4195c3d1c91b2136fc/textual-1.0.0-py3-none-any.whl", 
hash = "sha256:2d4a701781c05104925e463ae370c630567c70c2880e92ab838052e3e23c986f", size = 660456 }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "platform_system == 'Windows'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "uc-micro-py" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/7a/146a99696aee0609e3712f2b44c6274566bc368dfe8375191278045186b8/uc-micro-py-1.0.3.tar.gz", hash = "sha256:d321b92cff673ec58027c04015fcaa8bb1e005478643ff4a500882eaab88c48a", size = 6043 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/87/1f677586e8ac487e29672e4b17455758fce261de06a0d086167bb760361a/uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5", size = 6229 }, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, +] + +[[package]] +name = "warcio" +version = "1.7.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/2b/d825506924cb4508c90cd950dbda2a4dbfa9f5609e2ae76b53deaba656db/warcio-1.7.5.tar.gz", hash = "sha256:7247b57e68074cfd9433cb6dc226f8567d6777052abec2d3c78346cffa4d19b9", size = 61691 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/f0/3f19085980f8a4485f4265cc9dba1099b2fa35ef7552390a8446e149c293/warcio-1.7.5-py2.py3-none-any.whl", hash = "sha256:ca96130bde7747e49da714097d144c6ff939458d4f93e1beb1e42455db4326d4", size = 40568 }, +]