Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-03-15 07:31:21 +00:00
Cleanup to prep for diffing
parent 7af7f9cf3e
commit a7c99e264d
9 changed files with 290 additions and 122 deletions
@@ -84,9 +84,9 @@ Rollup files

There are several rollup files at the top level to help with finding datasets
of interest:

* `metadata.jsonl.zip`: zipped JSON lines file of all files contained in metadata/
* `metadata.csv.zip`: CSV listing the name, organization, title, date, metadata path, and collection path for each dataset
* `metadata.jsonl.zip`: JSON lines file with complete metadata for each dataset, including the `signed_metadata` and `zip_entries` sections (equivalent to downloading the metadata/ directory as a single file)
* `file_listing.jsonl.zip`: zipped JSON lines file showing the s3 listing of all files in the repository
* `collections.html`: human-readable HTML file showing the title and link to each dataset (warning, very large file that may not load in some browsers)

Downloading data
----------------

@@ -117,3 +117,10 @@ Source code

The source code used to generate this and other repositories is available at [https://github.com/harvard-lil/data-vault](https://github.com/harvard-lil/data-vault).
We welcome conversation and collaboration in the issue tracker for that project.

Collection Dates and Update Schedule
------------------------------------

Files in this repository were collected intermittently between 2024-11-19 and 2025-02-06.

Beginning on 2025-02-06, we will update the repository daily.
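A minimal sketch of how the rollup files described above might be read with the Python standard library. The internal member names (`metadata.csv`, `metadata.jsonl`) are assumed to be the archive names with `.zip` stripped, and the column names follow the README text:

    import csv
    import io
    import json
    import zipfile

    # Scan the CSV rollup for dataset names and titles.
    with zipfile.ZipFile("metadata.csv.zip") as zf, zf.open("metadata.csv") as raw:
        for row in csv.DictReader(io.TextIOWrapper(raw)):
            print(row["name"], row["title"])

    # Stream the full-metadata JSON lines rollup one record at a time.
    with zipfile.ZipFile("metadata.jsonl.zip") as zf, zf.open("metadata.jsonl") as f:
        for line in f:
            record = json.loads(line)
            signed = record.get("signed_metadata", {})  # complete dataset metadata, per the README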
@@ -16,6 +16,8 @@ dependencies = [
    "nabit",
    "gitspoke",
    "cloudflare>=4.0.0",
    "deepdiff>=8.2.0",
    "orjson>=3.10.15",
]

[build-system]
@@ -6,6 +6,8 @@ import logging
import csv
import zipfile
from tqdm import tqdm
import io

logger = logging.getLogger(__name__)

@click.group()

@@ -27,6 +29,10 @@ def write_readme(collections_file: Path):
    bucket_name, s3_prefix = collection['s3_path'].split('/', 1)

    for file_path in collection_path.rglob('*'):
        # Skip dotfiles and files in dot directories
        if any(part.startswith('.') for part in file_path.parts):
            continue

        if file_path.is_file():
            relative_path = file_path.relative_to(collection_path)
            s3_key = f"{s3_prefix}/{relative_path}"

@@ -38,32 +44,37 @@ def write_readme(collections_file: Path):
@click.argument('output_file', type=click.Path(path_type=Path))
def write_csv(metadata_file: Path, output_file: Path):
    """
    Read a zipped JSONL file of metadata and write dataset info to CSV.
    Read a zipped JSONL file of metadata and write dataset info to a zipped CSV.

    metadata_file: Path to the zip file containing metadata JSONL
    output_file: Path where the CSV should be written
    output_file: Path where the zipped CSV should be written
    """
    with zipfile.ZipFile(metadata_file, 'r') as zf, \
        open(output_file, 'w', newline='') as csvfile:

    # Get the base filename without .zip extension for the internal CSV file
    internal_filename = output_file.name.replace('.zip', '')
    jsonl_name = metadata_file.name.replace('.zip', '')
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'title'])  # Write header

        with zf.open(jsonl_name) as f:
            for line in tqdm(f, desc="Writing CSV"):
                try:
    with zipfile.ZipFile(metadata_file, 'r') as input_zf, \
        zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as output_zf, \
        output_zf.open(internal_filename, 'w', force_zip64=True) as csvfile, \
        input_zf.open(jsonl_name) as jsonlfile:

        # Create a text wrapper around the binary file
        text_wrapper = io.TextIOWrapper(csvfile, write_through=True, newline='')
        writer = csv.writer(text_wrapper)
        writer.writerow(['name', 'organization', 'title', 'date', 'metadata_path', 'collection_path'])

        # Read from input zip and write to output zip
        for line in tqdm(jsonlfile, desc="Writing CSV"):
            metadata = json.loads(line)
                except json.JSONDecodeError:
                    print(line)
                    breakpoint()
                    print(line)
                    continue
            dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
            dataset_info = metadata['signed_metadata']['data_gov_metadata']
            if dataset_info:
                writer.writerow([
                    dataset_info.get('name', ''),
                    dataset_info.get('title', '')
                    dataset_info['name'],
                    dataset_info['organization']['title'],
                    dataset_info['title'],
                    dataset_info['metadata_modified'],
                    metadata['metadata_path'],
                    metadata['collection_path'],
                ])

@cli.command()

@@ -71,25 +82,20 @@ def write_csv(metadata_file: Path, output_file: Path):
@click.argument('output_file', type=click.Path(path_type=Path))
def write_jsonl(metadata_dir: Path, output_file: Path):
    """
    Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
    Read each .json file, recursively, in metadata directory and write to a single compressed zipped JSONL file.
    All records are written to a single JSONL file within the zip, named same as output_file without .zip
    """
    # Get the base filename without .zip extension for the internal file
    internal_filename = output_file.name.replace('.zip', '')
    output_dir = output_file.parent

    # Use force_zip64=True to handle files larger than 2GB
    with zipfile.ZipFile(output_file, 'w') as zf:
    with zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        # Create a single file in the zip archive
        with zf.open(internal_filename, 'w', force_zip64=True) as f:
            # Iterate through all JSON files
            for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
                with open(file_path, 'r') as json_file:
                    try:
                        metadata = json.load(json_file)
                    except json.JSONDecodeError:
                        print(file_path)
                        raise
                metadata['metadata_path'] = str(file_path.relative_to(output_dir))
                metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
                # Write each record to the same file, with newline
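The pattern the rewritten write_csv and write_jsonl lean on — streaming text straight into a zip member through io.TextIOWrapper, with force_zip64=True so the member may grow past the 4 GB zip32 limit — looks like this in isolation (a minimal sketch with hypothetical filenames, not code from the repository):

    import csv
    import io
    import zipfile

    with zipfile.ZipFile("example.csv.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Open a member for writing; force_zip64=True permits entries larger than 4 GB.
        with zf.open("example.csv", "w", force_zip64=True) as binary_entry:
            # Wrap the binary member so csv.writer can write text into it.
            text = io.TextIOWrapper(binary_entry, write_through=True, newline="")
            writer = csv.writer(text)
            writer.writerow(["name", "title"])
            writer.writerow(["example-dataset", "Example Dataset"])
            text.detach()  # flush and release the wrapper without closing the zip member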
@@ -1,10 +1,9 @@
import httpx
from typing import Iterator, Dict, Any, List
import time
from typing import Dict, Any, List
import click
from pathlib import Path
import logging
from datetime import datetime
from scripts.data_gov.helpers import fetch_data_gov_packages
from scripts.data_gov.models import db, Dataset, DatasetHistory
from tqdm import tqdm
from playhouse.shortcuts import model_to_dict

@@ -104,71 +103,6 @@ def save_packages_to_database(output_path: Path, rows_per_page: int = 1000, star
    finally:
        db.close()

def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
    """
    Fetch package data from data.gov API using date-based pagination.

    Args:
        rows_per_page: Number of results to fetch per page
        start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
        max_retries: Maximum number of retry attempts for 5xx errors

    Yields:
        Dict containing package data for each result
    """

    base_url = "https://catalog.data.gov/api/3/action/package_search"
    current_date = start_date
    total_records = 0

    while True:
        logger.info(f"Current date offset: {current_date}")

        # Build date filter query
        url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
        if current_date:
            # Format date to match Solr's expected format (dropping microseconds)
            formatted_date = current_date.split('.')[0] + 'Z'
            date_filter = f"+metadata_modified:[* TO {formatted_date}]"
            url += f"&fq={date_filter}"

        for attempt in range(max_retries):
            try:
                start_time = time.time()
                response = httpx.get(url, timeout=60.0)
                request_time = time.time() - start_time

                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.HTTPStatusError as e:
                if e.response.status_code >= 500 and attempt < max_retries - 1:
                    retry_wait = 2 ** attempt  # Exponential backoff
                    logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
                    logger.warning(f"Error URL: {url}")
                    time.sleep(retry_wait)
                    continue
                # If not a 5xx error or we're out of retries, re-raise
                logger.error(f"Error URL: {url}")
                logger.error(f"Response content: {response.text}")
                raise

        data = response.json()
        results = data["result"]["results"]

        if not results:
            break

        # Get date of last result for next query
        current_date = results[-1]["metadata_modified"]

        total_records += len(results)
        logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")

        yield results

        time.sleep(1)

def get_dataset_history(dataset_name: str) -> None:
    """
    Fetch and display all versions of a dataset with the given ID,
@@ -1,16 +1,25 @@
import httpx
import json
import time
import logging
import gzip
import pickle
from pathlib import Path
from typing import Iterator, Dict, Any, List
import click
from scripts.data_gov.fetch_index import fetch_data_gov_packages
from scripts.data_gov.helpers import fetch_data_gov_packages
from datetime import datetime
from typing import Dict, Any
from tqdm import tqdm
import deepdiff
import orjson

logger = logging.getLogger(__name__)

@click.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
@click.group()
def cli():
    """Data.gov package management commands."""
    pass

@cli.command()
@click.argument('output_path', type=click.Path(path_type=Path))
@click.option('--rows-per-page', '-r', type=int, default=1000,
              help='Number of results to fetch per page.')
@click.option('--log-level', '-l',

@@ -19,17 +28,87 @@ logger = logging.getLogger(__name__)
              help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Start date for fetching packages in YYYY-MM-DD format.')
def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to JSONL file."""
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to gzipped JSONL file."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    with open(output_path, 'a') as f:
    if output_path.is_dir():
        current_date = datetime.now().strftime('%Y%m%d')
        output_path = output_path / f'data_{current_date}.jsonl.gz'

    logger.info(f"Writing to {output_path}")

    with gzip.open(output_path, 'at') as f:
        for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
            for package in results:
                f.write(json.dumps(package) + '\n')

@cli.command()
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def compare(file1: Path, file2: Path, log_level: str):
    """Compare two gzipped JSONL files by indexing on the 'name' key."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
        # Check for pickle file
        pickle_path = file_path.with_suffix('.pickle')
        if pickle_path.exists():
            logger.info(f"Loading cached index from {pickle_path}")
            with open(pickle_path, 'rb') as f:
                return pickle.load(f)

        # If no pickle file exists, load from JSONL and create pickle
        index = {}
        with gzip.open(file_path, 'rt') as f:
            for line in tqdm(f, desc=f"Loading {file_path}"):
                record = orjson.loads(line)
                index[record['name']] = record

        # Save to pickle for future runs
        logger.info(f"Saving index to {pickle_path}")
        with open(pickle_path, 'wb') as f:
            pickle.dump(index, f)

        return index

    logger.info(f"Loading {file1}")
    index1 = load_jsonl_index(file1)
    logger.info(f"Loading {file2}")
    index2 = load_jsonl_index(file2)

    names1 = set(index1.keys())
    names2 = set(index2.keys())

    only_in_file1 = [index1[name] for name in names1 - names2]
    only_in_file2 = [index2[name] for name in names2 - names1]
    names_in_both = names1 & names2
    changed = [[index1[name], index2[name]] for name in tqdm(names_in_both, desc="Changed") if index1[name] != index2[name]]
    changed_deep = [[diff.to_json(), item1, item2] for item1, item2 in tqdm(changed[:1000], desc="Changed (deep)") if (diff := deepdiff.DeepDiff(item1, item2, ignore_order=True))]

    # for suffix, items in [
    #     ('added', only_in_file2),
    #     ('removed', only_in_file1),
    #     ('changed', changed),
    #     ('changed_deep', changed_deep)
    # ]:
    #     logger.info(f"Writing {suffix}: {len(items)}")
    #     output_path = file2.parent / f'{file2.stem}_{suffix}.jsonl.gz'
    #     with gzip.open(output_path, 'wt') as f:
    #         for item in tqdm(items, desc=suffix):
    #             f.write(json.dumps(item) + '\n')

    breakpoint()

if __name__ == "__main__":
    main()
    cli()
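Since the compare command above leans on deepdiff for the record-level comparison, here is a tiny standalone illustration of that call (hypothetical records, not data from the repository):

    import deepdiff

    old = {"name": "example-dataset", "title": "Example", "tags": ["a", "b"]}
    new = {"name": "example-dataset", "title": "Example (updated)", "tags": ["b", "a"]}

    # ignore_order=True treats the reordered tags list as unchanged, so only the title is reported.
    diff = deepdiff.DeepDiff(old, new, ignore_order=True)
    print(diff.to_json())  # e.g. {"values_changed": {"root['title']": {...}}}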
scripts/data_gov/helpers.py (new file, 71 lines)

@@ -0,0 +1,71 @@
import httpx
import time
from typing import Any, Dict, Iterator
import logging

logger = logging.getLogger(__name__)

def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
    """
    Fetch package data from data.gov API using date-based pagination.

    Args:
        rows_per_page: Number of results to fetch per page
        start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
        max_retries: Maximum number of retry attempts for 5xx errors

    Yields:
        Dict containing package data for each result
    """

    base_url = "https://catalog.data.gov/api/3/action/package_search"
    current_date = start_date
    total_records = 0

    while True:
        logger.info(f"Current date offset: {current_date}")

        # Build date filter query
        url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
        if current_date:
            # Format date to match Solr's expected format (dropping microseconds)
            formatted_date = current_date.split('.')[0] + 'Z'
            date_filter = f"+metadata_modified:[* TO {formatted_date}]"
            url += f"&fq={date_filter}"

        for attempt in range(max_retries):
            try:
                start_time = time.time()
                response = httpx.get(url, timeout=60.0)
                request_time = time.time() - start_time

                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.HTTPStatusError as e:
                if e.response.status_code >= 500 and attempt < max_retries - 1:
                    retry_wait = 2 ** attempt  # Exponential backoff
                    logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
                    logger.warning(f"Error URL: {url}")
                    time.sleep(retry_wait)
                    continue
                # If not a 5xx error or we're out of retries, re-raise
                logger.error(f"Error URL: {url}")
                logger.error(f"Response content: {response.text}")
                raise

        data = response.json()
        results = data["result"]["results"]

        if not results:
            break

        # Get date of last result for next query
        current_date = results[-1]["metadata_modified"]

        total_records += len(results)
        logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")

        yield results

        time.sleep(1)
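The generator in helpers.py yields one page of package dicts at a time, newest metadata_modified first. A minimal driver might look like this (a sketch mirroring the fetch command above, with a made-up output filename):

    import gzip
    import json

    from scripts.data_gov.helpers import fetch_data_gov_packages

    # Each yielded item is a list of package dicts for one page of results.
    with gzip.open("packages.jsonl.gz", "wt") as out:
        for page in fetch_data_gov_packages(rows_per_page=1000):
            for package in page:
                out.write(json.dumps(package) + "\n")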
@@ -1,17 +1,19 @@
from playhouse.migrate import *
from scripts.data_gov.models import db
from scripts.data_gov.models import db, Crawl

migrator = SqliteMigrator(db)

def do_migrate():
    crawler_identified_date = DateTimeField(null=True)
    crawler_downloaded_date = DateTimeField(null=True)
    crawler_last_run_id = ForeignKeyField(Crawl, null=True)
    deleted_by = ForeignKeyField(Crawl, null=True)

    with db.atomic():
        # Create the Run table first
        db.create_tables([Crawl])

        migrate(
            # migrator.add_column('dataset', 'crawler_identified_date', crawler_identified_date),
            # migrator.add_column('dataset', 'crawler_downloaded_date', crawler_downloaded_date),
            # migrator.add_column('datasethistory', 'crawler_identified_date', crawler_identified_date),
            # migrator.add_column('datasethistory', 'crawler_downloaded_date', crawler_downloaded_date),
            migrator.add_column('dataset', 'crawler_last_run_id', crawler_last_run_id),
            migrator.add_column('datasethistory', 'deleted_by', deleted_by),
        )

if __name__ == '__main__':
@@ -18,6 +18,12 @@ class BaseModel(Model):
    class Meta:
        database = db

class Crawl(BaseModel):
    id = AutoField(primary_key=True)
    start_date = DateTimeField()
    end_date = DateTimeField(null=True)


class Dataset(BaseModel):
    # fields from data.gov
    id = CharField(primary_key=True)

@@ -54,8 +60,10 @@ class Dataset(BaseModel):
    # fields starting with crawler_ are added by our crawler
    crawler_identified_date = DateTimeField(null=True, default=datetime.now)
    crawler_downloaded_date = DateTimeField(null=True)
    crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)


class DatasetHistory(Dataset):
    history_id = AutoField(primary_key=True)
    id = CharField()  # Regular CharField, not primary key
    #deleted_by_date = DateTimeField(null=True)  # New field to track deletion date
    deleted_by_date = DateTimeField(null=True)
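The migration and model changes above add a Crawl table and link each dataset to the crawl that last touched it. A rough usage sketch with peewee (an assumed workflow, not code from the repository):

    from datetime import datetime

    from scripts.data_gov.models import db, Crawl, Dataset

    # Hypothetical workflow: open a crawl, stamp the datasets it touched, then close it.
    with db.atomic():
        crawl = Crawl.create(start_date=datetime.now())
        Dataset.update(crawler_last_crawl_id=crawl).where(Dataset.id == "example-id").execute()
        crawl.end_date = datetime.now()
        crawl.save()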
uv.lock (generated, 63 lines changed)

@@ -122,7 +122,7 @@ name = "click"
version = "8.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "platform_system == 'Windows'" },
    { name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
wheels = [

@@ -164,10 +164,12 @@ dependencies = [
    { name = "bagit" },
    { name = "boto3" },
    { name = "cloudflare" },
    { name = "deepdiff" },
    { name = "gitspoke" },
    { name = "httpx" },
    { name = "jsondiff" },
    { name = "nabit" },
    { name = "orjson" },
    { name = "peewee" },
    { name = "publicsuffixlist" },
    { name = "pyarrow" },

@@ -185,10 +187,12 @@ requires-dist = [
    { name = "bagit", specifier = ">=1.8.1" },
    { name = "boto3", specifier = ">=1.35.80" },
    { name = "cloudflare", specifier = ">=4.0.0" },
    { name = "deepdiff", specifier = ">=8.2.0" },
    { name = "gitspoke", git = "https://github.com/harvard-lil/gitspoke" },
    { name = "httpx", specifier = ">=0.27.2" },
    { name = "jsondiff", specifier = ">=2.2.1" },
    { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
    { name = "orjson", specifier = ">=3.10.15" },
    { name = "peewee", specifier = ">=3.17.8" },
    { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
    { name = "pyarrow", specifier = ">=18.0.0" },

@@ -198,6 +202,18 @@ requires-dist = [
[package.metadata.requires-dev]
dev = [{ name = "memray", specifier = ">=1.15.0" }]

[[package]]
name = "deepdiff"
version = "8.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "orderly-set" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/12/207d2ec96a526cf9d04fc2423ff9832e93b665e94b9d7c9b5198903e18a7/deepdiff-8.2.0.tar.gz", hash = "sha256:6ec78f65031485735545ffbe7a61e716c3c2d12ca6416886d5e9291fc76c46c3", size = 432573 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/6c/13/d7dd6b8c297b1d5cfea4f1ebd678e68d90ab04b6613d005c0a7c506d11e1/deepdiff-8.2.0-py3-none-any.whl", hash = "sha256:5091f2cdfd372b1b9f6bfd8065ba323ae31118dc4e42594371b38c8bea3fd0a4", size = 83672 },
]

[[package]]
name = "distro"
version = "1.9.0"

@@ -450,6 +466,49 @@ dependencies = [
    { name = "warcio" },
]

[[package]]
name = "orderly-set"
version = "5.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e7/0e/ef328b512c2595831304e51f25e9287697b7bf13be0527ca9592a2659c16/orderly_set-5.3.0.tar.gz", hash = "sha256:80b3d8fdd3d39004d9aad389eaa0eab02c71f0a0511ba3a6d54a935a6c6a0acc", size = 20026 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/df/fe/8009ebb64a19cf4bdf51b16d3074375010735d8c30408efada6ce02bf37e/orderly_set-5.3.0-py3-none-any.whl", hash = "sha256:c2c0bfe604f5d3d9b24e8262a06feb612594f37aa3845650548befd7772945d1", size = 12179 },
]

[[package]]
name = "orjson"
version = "3.10.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/5dea21763eeff8c1590076918a446ea3d6140743e0e36f58f369928ed0f4/orjson-3.10.15.tar.gz", hash = "sha256:05ca7fe452a2e9d8d9d706a2984c95b9c2ebc5db417ce0b7a49b91d50642a23e", size = 5282482 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/66/85/22fe737188905a71afcc4bf7cc4c79cd7f5bbe9ed1fe0aac4ce4c33edc30/orjson-3.10.15-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9d11c0714fc85bfcf36ada1179400862da3288fc785c30e8297844c867d7505a", size = 249504 },
    { url = "https://files.pythonhosted.org/packages/48/b7/2622b29f3afebe938a0a9037e184660379797d5fd5234e5998345d7a5b43/orjson-3.10.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dba5a1e85d554e3897fa9fe6fbcff2ed32d55008973ec9a2b992bd9a65d2352d", size = 125080 },
    { url = "https://files.pythonhosted.org/packages/ce/8f/0b72a48f4403d0b88b2a41450c535b3e8989e8a2d7800659a967efc7c115/orjson-3.10.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7723ad949a0ea502df656948ddd8b392780a5beaa4c3b5f97e525191b102fff0", size = 150121 },
    { url = "https://files.pythonhosted.org/packages/06/ec/acb1a20cd49edb2000be5a0404cd43e3c8aad219f376ac8c60b870518c03/orjson-3.10.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6fd9bc64421e9fe9bd88039e7ce8e58d4fead67ca88e3a4014b143cec7684fd4", size = 139796 },
    { url = "https://files.pythonhosted.org/packages/33/e1/f7840a2ea852114b23a52a1c0b2bea0a1ea22236efbcdb876402d799c423/orjson-3.10.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dadba0e7b6594216c214ef7894c4bd5f08d7c0135f4dd0145600be4fbcc16767", size = 154636 },
    { url = "https://files.pythonhosted.org/packages/fa/da/31543337febd043b8fa80a3b67de627669b88c7b128d9ad4cc2ece005b7a/orjson-3.10.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48f59114fe318f33bbaee8ebeda696d8ccc94c9e90bc27dbe72153094e26f41", size = 130621 },
    { url = "https://files.pythonhosted.org/packages/ed/78/66115dc9afbc22496530d2139f2f4455698be444c7c2475cb48f657cefc9/orjson-3.10.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:035fb83585e0f15e076759b6fedaf0abb460d1765b6a36f48018a52858443514", size = 138516 },
    { url = "https://files.pythonhosted.org/packages/22/84/cd4f5fb5427ffcf823140957a47503076184cb1ce15bcc1165125c26c46c/orjson-3.10.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d13b7fe322d75bf84464b075eafd8e7dd9eae05649aa2a5354cfa32f43c59f17", size = 130762 },
    { url = "https://files.pythonhosted.org/packages/93/1f/67596b711ba9f56dd75d73b60089c5c92057f1130bb3a25a0f53fb9a583b/orjson-3.10.15-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7066b74f9f259849629e0d04db6609db4cf5b973248f455ba5d3bd58a4daaa5b", size = 414700 },
    { url = "https://files.pythonhosted.org/packages/7c/0c/6a3b3271b46443d90efb713c3e4fe83fa8cd71cda0d11a0f69a03f437c6e/orjson-3.10.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:88dc3f65a026bd3175eb157fea994fca6ac7c4c8579fc5a86fc2114ad05705b7", size = 141077 },
    { url = "https://files.pythonhosted.org/packages/3b/9b/33c58e0bfc788995eccd0d525ecd6b84b40d7ed182dd0751cd4c1322ac62/orjson-3.10.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b342567e5465bd99faa559507fe45e33fc76b9fb868a63f1642c6bc0735ad02a", size = 129898 },
    { url = "https://files.pythonhosted.org/packages/01/c1/d577ecd2e9fa393366a1ea0a9267f6510d86e6c4bb1cdfb9877104cac44c/orjson-3.10.15-cp312-cp312-win32.whl", hash = "sha256:0a4f27ea5617828e6b58922fdbec67b0aa4bb844e2d363b9244c47fa2180e665", size = 142566 },
    { url = "https://files.pythonhosted.org/packages/ed/eb/a85317ee1732d1034b92d56f89f1de4d7bf7904f5c8fb9dcdd5b1c83917f/orjson-3.10.15-cp312-cp312-win_amd64.whl", hash = "sha256:ef5b87e7aa9545ddadd2309efe6824bd3dd64ac101c15dae0f2f597911d46eaa", size = 133732 },
    { url = "https://files.pythonhosted.org/packages/06/10/fe7d60b8da538e8d3d3721f08c1b7bff0491e8fa4dd3bf11a17e34f4730e/orjson-3.10.15-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:bae0e6ec2b7ba6895198cd981b7cca95d1487d0147c8ed751e5632ad16f031a6", size = 249399 },
    { url = "https://files.pythonhosted.org/packages/6b/83/52c356fd3a61abd829ae7e4366a6fe8e8863c825a60d7ac5156067516edf/orjson-3.10.15-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f93ce145b2db1252dd86af37d4165b6faa83072b46e3995ecc95d4b2301b725a", size = 125044 },
    { url = "https://files.pythonhosted.org/packages/55/b2/d06d5901408e7ded1a74c7c20d70e3a127057a6d21355f50c90c0f337913/orjson-3.10.15-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7c203f6f969210128af3acae0ef9ea6aab9782939f45f6fe02d05958fe761ef9", size = 150066 },
    { url = "https://files.pythonhosted.org/packages/75/8c/60c3106e08dc593a861755781c7c675a566445cc39558677d505878d879f/orjson-3.10.15-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8918719572d662e18b8af66aef699d8c21072e54b6c82a3f8f6404c1f5ccd5e0", size = 139737 },
    { url = "https://files.pythonhosted.org/packages/6a/8c/ae00d7d0ab8a4490b1efeb01ad4ab2f1982e69cc82490bf8093407718ff5/orjson-3.10.15-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f71eae9651465dff70aa80db92586ad5b92df46a9373ee55252109bb6b703307", size = 154804 },
    { url = "https://files.pythonhosted.org/packages/22/86/65dc69bd88b6dd254535310e97bc518aa50a39ef9c5a2a5d518e7a223710/orjson-3.10.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e117eb299a35f2634e25ed120c37c641398826c2f5a3d3cc39f5993b96171b9e", size = 130583 },
    { url = "https://files.pythonhosted.org/packages/bb/00/6fe01ededb05d52be42fabb13d93a36e51f1fd9be173bd95707d11a8a860/orjson-3.10.15-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:13242f12d295e83c2955756a574ddd6741c81e5b99f2bef8ed8d53e47a01e4b7", size = 138465 },
    { url = "https://files.pythonhosted.org/packages/db/2f/4cc151c4b471b0cdc8cb29d3eadbce5007eb0475d26fa26ed123dca93b33/orjson-3.10.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7946922ada8f3e0b7b958cc3eb22cfcf6c0df83d1fe5521b4a100103e3fa84c8", size = 130742 },
    { url = "https://files.pythonhosted.org/packages/9f/13/8a6109e4b477c518498ca37963d9c0eb1508b259725553fb53d53b20e2ea/orjson-3.10.15-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:b7155eb1623347f0f22c38c9abdd738b287e39b9982e1da227503387b81b34ca", size = 414669 },
    { url = "https://files.pythonhosted.org/packages/22/7b/1d229d6d24644ed4d0a803de1b0e2df832032d5beda7346831c78191b5b2/orjson-3.10.15-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:208beedfa807c922da4e81061dafa9c8489c6328934ca2a562efa707e049e561", size = 141043 },
    { url = "https://files.pythonhosted.org/packages/cc/d3/6dc91156cf12ed86bed383bcb942d84d23304a1e57b7ab030bf60ea130d6/orjson-3.10.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eca81f83b1b8c07449e1d6ff7074e82e3fd6777e588f1a6632127f286a968825", size = 129826 },
    { url = "https://files.pythonhosted.org/packages/b3/38/c47c25b86f6996f1343be721b6ea4367bc1c8bc0fc3f6bbcd995d18cb19d/orjson-3.10.15-cp313-cp313-win32.whl", hash = "sha256:c03cd6eea1bd3b949d0d007c8d57049aa2b39bd49f58b4b2af571a5d3833d890", size = 142542 },
    { url = "https://files.pythonhosted.org/packages/27/f1/1d7ec15b20f8ce9300bc850de1e059132b88990e46cd0ccac29cbf11e4f9/orjson-3.10.15-cp313-cp313-win_amd64.whl", hash = "sha256:fd56a26a04f6ba5fb2045b0acc487a63162a958ed837648c5781e1fe3316cfbf", size = 133444 },
]

[[package]]
name = "packaging"
version = "24.2"

@@ -698,7 +757,7 @@ name = "tqdm"
version = "4.67.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "platform_system == 'Windows'" },
    { name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 }
wheels = [