Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-03-15 07:31:21 +00:00
Cleanup to prep for diffing
parent 7af7f9cf3e
commit a7c99e264d
9 changed files with 290 additions and 122 deletions
@@ -84,9 +84,9 @@ Rollup files

There are several rollup files at the top level to help with finding datasets
of interest:

* `metadata.jsonl.zip`: zipped JSON lines file of all files contained in metadata/
* `metadata.csv.zip`: CSV listing the name, organization, title, date, metadata path, and collection path for each dataset
* `metadata.jsonl.zip`: JSON lines file with complete metadata for each dataset, including the `signed_metadata` and `zip_entries` sections (equivalent to downloading the metadata/ directory as a single file)
* `file_listing.jsonl.zip`: zipped JSON lines file showing the s3 listing of all files in the repository
* `collections.html`: human-readable HTML file showing the title and link to each dataset (warning, very large file that may not load in some browsers)

Downloading data
----------------

@@ -117,3 +117,10 @@ Source code

The source code used to generate this and other repositories is available at [https://github.com/harvard-lil/data-vault](https://github.com/harvard-lil/data-vault).
We welcome conversation and collaboration in the issue tracker for that project.

Collection Dates and Update Schedule
------------------------------------

Files in this repository were collected intermittently between 2024-11-19 and 2025-02-06.

Beginning on 2025-02-06, we will update the repository daily.
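A minimal sketch of how the rollup files described above might be read with the Python standard library. The internal member names (`metadata.csv`, `metadata.jsonl`) are assumed to be the archive names with `.zip` stripped, and the column names follow the README text:

    import csv
    import io
    import json
    import zipfile

    # Scan the CSV rollup for dataset names and titles.
    with zipfile.ZipFile("metadata.csv.zip") as zf, zf.open("metadata.csv") as raw:
        for row in csv.DictReader(io.TextIOWrapper(raw)):
            print(row["name"], row["title"])

    # Stream the full-metadata JSON lines rollup one record at a time.
    with zipfile.ZipFile("metadata.jsonl.zip") as zf, zf.open("metadata.jsonl") as f:
        for line in f:
            record = json.loads(line)
            signed = record.get("signed_metadata", {})  # complete dataset metadata, per the README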
@@ -16,6 +16,8 @@ dependencies = [
    "nabit",
    "gitspoke",
    "cloudflare>=4.0.0",
    "deepdiff>=8.2.0",
    "orjson>=3.10.15",
]

[build-system]
@@ -6,6 +6,8 @@ import logging
import csv
import zipfile
from tqdm import tqdm
import io

logger = logging.getLogger(__name__)

@click.group()

@@ -27,6 +29,10 @@ def write_readme(collections_file: Path):
    bucket_name, s3_prefix = collection['s3_path'].split('/', 1)

    for file_path in collection_path.rglob('*'):
        # Skip dotfiles and files in dot directories
        if any(part.startswith('.') for part in file_path.parts):
            continue

        if file_path.is_file():
            relative_path = file_path.relative_to(collection_path)
            s3_key = f"{s3_prefix}/{relative_path}"

@@ -38,32 +44,37 @@ def write_readme(collections_file: Path):
@click.argument('output_file', type=click.Path(path_type=Path))
def write_csv(metadata_file: Path, output_file: Path):
    """
    Read a zipped JSONL file of metadata and write dataset info to CSV.
    Read a zipped JSONL file of metadata and write dataset info to a zipped CSV.

    metadata_file: Path to the zip file containing metadata JSONL
    output_file: Path where the CSV should be written
    output_file: Path where the zipped CSV should be written
    """
    with zipfile.ZipFile(metadata_file, 'r') as zf, \
        open(output_file, 'w', newline='') as csvfile:

    # Get the base filename without .zip extension for the internal CSV file
    internal_filename = output_file.name.replace('.zip', '')
    jsonl_name = metadata_file.name.replace('.zip', '')
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'title'])  # Write header

        with zf.open(jsonl_name) as f:
            for line in tqdm(f, desc="Writing CSV"):
                try:
    with zipfile.ZipFile(metadata_file, 'r') as input_zf, \
        zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as output_zf, \
        output_zf.open(internal_filename, 'w', force_zip64=True) as csvfile, \
        input_zf.open(jsonl_name) as jsonlfile:

        # Create a text wrapper around the binary file
        text_wrapper = io.TextIOWrapper(csvfile, write_through=True, newline='')
        writer = csv.writer(text_wrapper)
        writer.writerow(['name', 'organization', 'title', 'date', 'metadata_path', 'collection_path'])

        # Read from input zip and write to output zip
        for line in tqdm(jsonlfile, desc="Writing CSV"):
            metadata = json.loads(line)
                except json.JSONDecodeError:
                    print(line)
                    breakpoint()
                    print(line)
                    continue
            dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
            dataset_info = metadata['signed_metadata']['data_gov_metadata']
            if dataset_info:
                writer.writerow([
                    dataset_info.get('name', ''),
                    dataset_info.get('title', '')
                    dataset_info['name'],
                    dataset_info['organization']['title'],
                    dataset_info['title'],
                    dataset_info['metadata_modified'],
                    metadata['metadata_path'],
                    metadata['collection_path'],
                ])

@cli.command()

@@ -71,25 +82,20 @@ def write_csv(metadata_file: Path, output_file: Path):
@click.argument('output_file', type=click.Path(path_type=Path))
def write_jsonl(metadata_dir: Path, output_file: Path):
    """
    Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
    Read each .json file, recursively, in metadata directory and write to a single compressed zipped JSONL file.
    All records are written to a single JSONL file within the zip, named same as output_file without .zip
    """
    # Get the base filename without .zip extension for the internal file
    internal_filename = output_file.name.replace('.zip', '')
    output_dir = output_file.parent

    # Use force_zip64=True to handle files larger than 2GB
    with zipfile.ZipFile(output_file, 'w') as zf:
    with zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        # Create a single file in the zip archive
        with zf.open(internal_filename, 'w', force_zip64=True) as f:
            # Iterate through all JSON files
            for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
                with open(file_path, 'r') as json_file:
                    try:
                        metadata = json.load(json_file)
                    except json.JSONDecodeError:
                        print(file_path)
                        raise
                metadata['metadata_path'] = str(file_path.relative_to(output_dir))
                metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
                # Write each record to the same file, with newline
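The pattern the rewritten write_csv and write_jsonl lean on — streaming text straight into a zip member through io.TextIOWrapper, with force_zip64=True so the member may grow past the 4 GB zip32 limit — looks like this in isolation (a minimal sketch with hypothetical filenames, not code from the repository):

    import csv
    import io
    import zipfile

    with zipfile.ZipFile("example.csv.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Open a member for writing; force_zip64=True permits entries larger than 4 GB.
        with zf.open("example.csv", "w", force_zip64=True) as binary_entry:
            # Wrap the binary member so csv.writer can write text into it.
            text = io.TextIOWrapper(binary_entry, write_through=True, newline="")
            writer = csv.writer(text)
            writer.writerow(["name", "title"])
            writer.writerow(["example-dataset", "Example Dataset"])
            text.detach()  # flush and release the wrapper without closing the zip member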
@@ -1,10 +1,9 @@
import httpx
from typing import Iterator, Dict, Any, List
import time
from typing import Dict, Any, List
import click
from pathlib import Path
import logging
from datetime import datetime
from scripts.data_gov.helpers import fetch_data_gov_packages
from scripts.data_gov.models import db, Dataset, DatasetHistory
from tqdm import tqdm
from playhouse.shortcuts import model_to_dict

@@ -104,71 +103,6 @@ def save_packages_to_database(output_path: Path, rows_per_page: int = 1000, star
    finally:
        db.close()

def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
    """
    Fetch package data from data.gov API using date-based pagination.

    Args:
        rows_per_page: Number of results to fetch per page
        start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
        max_retries: Maximum number of retry attempts for 5xx errors

    Yields:
        Dict containing package data for each result
    """

    base_url = "https://catalog.data.gov/api/3/action/package_search"
    current_date = start_date
    total_records = 0

    while True:
        logger.info(f"Current date offset: {current_date}")

        # Build date filter query
        url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
        if current_date:
            # Format date to match Solr's expected format (dropping microseconds)
            formatted_date = current_date.split('.')[0] + 'Z'
            date_filter = f"+metadata_modified:[* TO {formatted_date}]"
            url += f"&fq={date_filter}"

        for attempt in range(max_retries):
            try:
                start_time = time.time()
                response = httpx.get(url, timeout=60.0)
                request_time = time.time() - start_time

                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.HTTPStatusError as e:
                if e.response.status_code >= 500 and attempt < max_retries - 1:
                    retry_wait = 2 ** attempt  # Exponential backoff
                    logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
                    logger.warning(f"Error URL: {url}")
                    time.sleep(retry_wait)
                    continue
                # If not a 5xx error or we're out of retries, re-raise
                logger.error(f"Error URL: {url}")
                logger.error(f"Response content: {response.text}")
                raise

        data = response.json()
        results = data["result"]["results"]

        if not results:
            break

        # Get date of last result for next query
        current_date = results[-1]["metadata_modified"]

        total_records += len(results)
        logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")

        yield results

        time.sleep(1)

def get_dataset_history(dataset_name: str) -> None:
    """
    Fetch and display all versions of a dataset with the given ID,
@@ -1,16 +1,25 @@
import httpx
import json
import time
import logging
import gzip
import pickle
from pathlib import Path
from typing import Iterator, Dict, Any, List
import click
from scripts.data_gov.fetch_index import fetch_data_gov_packages
from scripts.data_gov.helpers import fetch_data_gov_packages
from datetime import datetime
from typing import Dict, Any
from tqdm import tqdm
import deepdiff
import orjson

logger = logging.getLogger(__name__)

@click.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
@click.group()
def cli():
    """Data.gov package management commands."""
    pass

@cli.command()
@click.argument('output_path', type=click.Path(path_type=Path))
@click.option('--rows-per-page', '-r', type=int, default=1000,
              help='Number of results to fetch per page.')
@click.option('--log-level', '-l',

@@ -19,17 +28,87 @@ logger = logging.getLogger(__name__)
              help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Start date for fetching packages in YYYY-MM-DD format.')
def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to JSONL file."""
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to gzipped JSONL file."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    with open(output_path, 'a') as f:
    if output_path.is_dir():
        current_date = datetime.now().strftime('%Y%m%d')
        output_path = output_path / f'data_{current_date}.jsonl.gz'

    logger.info(f"Writing to {output_path}")

    with gzip.open(output_path, 'at') as f:
        for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
            for package in results:
                f.write(json.dumps(package) + '\n')

@cli.command()
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def compare(file1: Path, file2: Path, log_level: str):
    """Compare two gzipped JSONL files by indexing on the 'name' key."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
        # Check for pickle file
        pickle_path = file_path.with_suffix('.pickle')
        if pickle_path.exists():
            logger.info(f"Loading cached index from {pickle_path}")
            with open(pickle_path, 'rb') as f:
                return pickle.load(f)

        # If no pickle file exists, load from JSONL and create pickle
        index = {}
        with gzip.open(file_path, 'rt') as f:
            for line in tqdm(f, desc=f"Loading {file_path}"):
                record = orjson.loads(line)
                index[record['name']] = record

        # Save to pickle for future runs
        logger.info(f"Saving index to {pickle_path}")
        with open(pickle_path, 'wb') as f:
            pickle.dump(index, f)

        return index

    logger.info(f"Loading {file1}")
    index1 = load_jsonl_index(file1)
    logger.info(f"Loading {file2}")
    index2 = load_jsonl_index(file2)

    names1 = set(index1.keys())
    names2 = set(index2.keys())

    only_in_file1 = [index1[name] for name in names1 - names2]
    only_in_file2 = [index2[name] for name in names2 - names1]
    names_in_both = names1 & names2
    changed = [[index1[name], index2[name]] for name in tqdm(names_in_both, desc="Changed") if index1[name] != index2[name]]
    changed_deep = [[diff.to_json(), item1, item2] for item1, item2 in tqdm(changed[:1000], desc="Changed (deep)") if (diff := deepdiff.DeepDiff(item1, item2, ignore_order=True))]

    # for suffix, items in [
    #     ('added', only_in_file2),
    #     ('removed', only_in_file1),
    #     ('changed', changed),
    #     ('changed_deep', changed_deep)
    # ]:
    #     logger.info(f"Writing {suffix}: {len(items)}")
    #     output_path = file2.parent / f'{file2.stem}_{suffix}.jsonl.gz'
    #     with gzip.open(output_path, 'wt') as f:
    #         for item in tqdm(items, desc=suffix):
    #             f.write(json.dumps(item) + '\n')

    breakpoint()

if __name__ == "__main__":
    main()
    cli()
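Since the compare command above leans on deepdiff for the record-level comparison, here is a tiny standalone illustration of that call (hypothetical records, not data from the repository):

    import deepdiff

    old = {"name": "example-dataset", "title": "Example", "tags": ["a", "b"]}
    new = {"name": "example-dataset", "title": "Example (updated)", "tags": ["b", "a"]}

    # ignore_order=True treats the reordered tags list as unchanged, so only the title is reported.
    diff = deepdiff.DeepDiff(old, new, ignore_order=True)
    print(diff.to_json())  # e.g. {"values_changed": {"root['title']": {...}}}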
scripts/data_gov/helpers.py (new file, 71 lines)

@@ -0,0 +1,71 @@
import httpx
import time
from typing import Any, Dict, Iterator
import logging

logger = logging.getLogger(__name__)

def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
    """
    Fetch package data from data.gov API using date-based pagination.

    Args:
        rows_per_page: Number of results to fetch per page
        start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
        max_retries: Maximum number of retry attempts for 5xx errors

    Yields:
        Dict containing package data for each result
    """

    base_url = "https://catalog.data.gov/api/3/action/package_search"
    current_date = start_date
    total_records = 0

    while True:
        logger.info(f"Current date offset: {current_date}")

        # Build date filter query
        url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
        if current_date:
            # Format date to match Solr's expected format (dropping microseconds)
            formatted_date = current_date.split('.')[0] + 'Z'
            date_filter = f"+metadata_modified:[* TO {formatted_date}]"
            url += f"&fq={date_filter}"

        for attempt in range(max_retries):
            try:
                start_time = time.time()
                response = httpx.get(url, timeout=60.0)
                request_time = time.time() - start_time

                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.HTTPStatusError as e:
                if e.response.status_code >= 500 and attempt < max_retries - 1:
                    retry_wait = 2 ** attempt  # Exponential backoff
                    logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
                    logger.warning(f"Error URL: {url}")
                    time.sleep(retry_wait)
                    continue
                # If not a 5xx error or we're out of retries, re-raise
                logger.error(f"Error URL: {url}")
                logger.error(f"Response content: {response.text}")
                raise

        data = response.json()
        results = data["result"]["results"]

        if not results:
            break

        # Get date of last result for next query
        current_date = results[-1]["metadata_modified"]

        total_records += len(results)
        logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")

        yield results

        time.sleep(1)
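The generator in helpers.py yields one page of package dicts at a time, newest metadata_modified first. A minimal driver might look like this (a sketch mirroring the fetch command above, with a made-up output filename):

    import gzip
    import json

    from scripts.data_gov.helpers import fetch_data_gov_packages

    # Each yielded item is a list of package dicts for one page of results.
    with gzip.open("packages.jsonl.gz", "wt") as out:
        for page in fetch_data_gov_packages(rows_per_page=1000):
            for package in page:
                out.write(json.dumps(package) + "\n")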
@@ -1,17 +1,19 @@
from playhouse.migrate import *
from scripts.data_gov.models import db
from scripts.data_gov.models import db, Crawl

migrator = SqliteMigrator(db)

def do_migrate():
    crawler_identified_date = DateTimeField(null=True)
    crawler_downloaded_date = DateTimeField(null=True)
    crawler_last_run_id = ForeignKeyField(Crawl, null=True)
    deleted_by = ForeignKeyField(Crawl, null=True)

    with db.atomic():
        # Create the Run table first
        db.create_tables([Crawl])

        migrate(
            # migrator.add_column('dataset', 'crawler_identified_date', crawler_identified_date),
            # migrator.add_column('dataset', 'crawler_downloaded_date', crawler_downloaded_date),
            # migrator.add_column('datasethistory', 'crawler_identified_date', crawler_identified_date),
            # migrator.add_column('datasethistory', 'crawler_downloaded_date', crawler_downloaded_date),
            migrator.add_column('dataset', 'crawler_last_run_id', crawler_last_run_id),
            migrator.add_column('datasethistory', 'deleted_by', deleted_by),
        )

if __name__ == '__main__':
@@ -18,6 +18,12 @@ class BaseModel(Model):
    class Meta:
        database = db

class Crawl(BaseModel):
    id = AutoField(primary_key=True)
    start_date = DateTimeField()
    end_date = DateTimeField(null=True)


class Dataset(BaseModel):
    # fields from data.gov
    id = CharField(primary_key=True)

@@ -54,8 +60,10 @@ class Dataset(BaseModel):
    # fields starting with crawler_ are added by our crawler
    crawler_identified_date = DateTimeField(null=True, default=datetime.now)
    crawler_downloaded_date = DateTimeField(null=True)
    crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)


class DatasetHistory(Dataset):
    history_id = AutoField(primary_key=True)
    id = CharField()  # Regular CharField, not primary key
    #deleted_by_date = DateTimeField(null=True)  # New field to track deletion date
    deleted_by_date = DateTimeField(null=True)
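The migration and model changes above add a Crawl table and link each dataset to the crawl that last touched it. A rough usage sketch with peewee (an assumed workflow, not code from the repository):

    from datetime import datetime

    from scripts.data_gov.models import db, Crawl, Dataset

    # Hypothetical workflow: open a crawl, stamp the datasets it touched, then close it.
    with db.atomic():
        crawl = Crawl.create(start_date=datetime.now())
        Dataset.update(crawler_last_crawl_id=crawl).where(Dataset.id == "example-id").execute()
        crawl.end_date = datetime.now()
        crawl.save()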
uv.lock (generated, 63 lines changed)

@@ -122,7 +122,7 @@ name = "click"
version = "8.1.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "platform_system == 'Windows'" },
    { name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
wheels = [

@@ -164,10 +164,12 @@ dependencies = [
    { name = "bagit" },
    { name = "boto3" },
    { name = "cloudflare" },
    { name = "deepdiff" },
    { name = "gitspoke" },
    { name = "httpx" },
    { name = "jsondiff" },
    { name = "nabit" },
    { name = "orjson" },
    { name = "peewee" },
    { name = "publicsuffixlist" },
    { name = "pyarrow" },

@@ -185,10 +187,12 @@ requires-dist = [
    { name = "bagit", specifier = ">=1.8.1" },
    { name = "boto3", specifier = ">=1.35.80" },
    { name = "cloudflare", specifier = ">=4.0.0" },
    { name = "deepdiff", specifier = ">=8.2.0" },
    { name = "gitspoke", git = "https://github.com/harvard-lil/gitspoke" },
    { name = "httpx", specifier = ">=0.27.2" },
    { name = "jsondiff", specifier = ">=2.2.1" },
    { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
    { name = "orjson", specifier = ">=3.10.15" },
    { name = "peewee", specifier = ">=3.17.8" },
    { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
    { name = "pyarrow", specifier = ">=18.0.0" },

@@ -198,6 +202,18 @@ requires-dist = [
[package.metadata.requires-dev]
dev = [{ name = "memray", specifier = ">=1.15.0" }]

[[package]]
name = "deepdiff"
version = "8.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "orderly-set" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/12/207d2ec96a526cf9d04fc2423ff9832e93b665e94b9d7c9b5198903e18a7/deepdiff-8.2.0.tar.gz", hash = "sha256:6ec78f65031485735545ffbe7a61e716c3c2d12ca6416886d5e9291fc76c46c3", size = 432573 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/6c/13/d7dd6b8c297b1d5cfea4f1ebd678e68d90ab04b6613d005c0a7c506d11e1/deepdiff-8.2.0-py3-none-any.whl", hash = "sha256:5091f2cdfd372b1b9f6bfd8065ba323ae31118dc4e42594371b38c8bea3fd0a4", size = 83672 },
]

[[package]]
name = "distro"
version = "1.9.0"

@@ -450,6 +466,49 @@ dependencies = [
    { name = "warcio" },
]

[[package]]
name = "orderly-set"
version = "5.3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e7/0e/ef328b512c2595831304e51f25e9287697b7bf13be0527ca9592a2659c16/orderly_set-5.3.0.tar.gz", hash = "sha256:80b3d8fdd3d39004d9aad389eaa0eab02c71f0a0511ba3a6d54a935a6c6a0acc", size = 20026 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/df/fe/8009ebb64a19cf4bdf51b16d3074375010735d8c30408efada6ce02bf37e/orderly_set-5.3.0-py3-none-any.whl", hash = "sha256:c2c0bfe604f5d3d9b24e8262a06feb612594f37aa3845650548befd7772945d1", size = 12179 },
]

[[package]]
name = "orjson"
version = "3.10.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ae/f9/5dea21763eeff8c1590076918a446ea3d6140743e0e36f58f369928ed0f4/orjson-3.10.15.tar.gz", hash = "sha256:05ca7fe452a2e9d8d9d706a2984c95b9c2ebc5db417ce0b7a49b91d50642a23e", size = 5282482 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/66/85/22fe737188905a71afcc4bf7cc4c79cd7f5bbe9ed1fe0aac4ce4c33edc30/orjson-3.10.15-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9d11c0714fc85bfcf36ada1179400862da3288fc785c30e8297844c867d7505a", size = 249504 },
    { url = "https://files.pythonhosted.org/packages/48/b7/2622b29f3afebe938a0a9037e184660379797d5fd5234e5998345d7a5b43/orjson-3.10.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dba5a1e85d554e3897fa9fe6fbcff2ed32d55008973ec9a2b992bd9a65d2352d", size = 125080 },
    { url = "https://files.pythonhosted.org/packages/ce/8f/0b72a48f4403d0b88b2a41450c535b3e8989e8a2d7800659a967efc7c115/orjson-3.10.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7723ad949a0ea502df656948ddd8b392780a5beaa4c3b5f97e525191b102fff0", size = 150121 },
    { url = "https://files.pythonhosted.org/packages/06/ec/acb1a20cd49edb2000be5a0404cd43e3c8aad219f376ac8c60b870518c03/orjson-3.10.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6fd9bc64421e9fe9bd88039e7ce8e58d4fead67ca88e3a4014b143cec7684fd4", size = 139796 },
    { url = "https://files.pythonhosted.org/packages/33/e1/f7840a2ea852114b23a52a1c0b2bea0a1ea22236efbcdb876402d799c423/orjson-3.10.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dadba0e7b6594216c214ef7894c4bd5f08d7c0135f4dd0145600be4fbcc16767", size = 154636 },
    { url = "https://files.pythonhosted.org/packages/fa/da/31543337febd043b8fa80a3b67de627669b88c7b128d9ad4cc2ece005b7a/orjson-3.10.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48f59114fe318f33bbaee8ebeda696d8ccc94c9e90bc27dbe72153094e26f41", size = 130621 },
    { url = "https://files.pythonhosted.org/packages/ed/78/66115dc9afbc22496530d2139f2f4455698be444c7c2475cb48f657cefc9/orjson-3.10.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:035fb83585e0f15e076759b6fedaf0abb460d1765b6a36f48018a52858443514", size = 138516 },
    { url = "https://files.pythonhosted.org/packages/22/84/cd4f5fb5427ffcf823140957a47503076184cb1ce15bcc1165125c26c46c/orjson-3.10.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d13b7fe322d75bf84464b075eafd8e7dd9eae05649aa2a5354cfa32f43c59f17", size = 130762 },
    { url = "https://files.pythonhosted.org/packages/93/1f/67596b711ba9f56dd75d73b60089c5c92057f1130bb3a25a0f53fb9a583b/orjson-3.10.15-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7066b74f9f259849629e0d04db6609db4cf5b973248f455ba5d3bd58a4daaa5b", size = 414700 },
    { url = "https://files.pythonhosted.org/packages/7c/0c/6a3b3271b46443d90efb713c3e4fe83fa8cd71cda0d11a0f69a03f437c6e/orjson-3.10.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:88dc3f65a026bd3175eb157fea994fca6ac7c4c8579fc5a86fc2114ad05705b7", size = 141077 },
    { url = "https://files.pythonhosted.org/packages/3b/9b/33c58e0bfc788995eccd0d525ecd6b84b40d7ed182dd0751cd4c1322ac62/orjson-3.10.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b342567e5465bd99faa559507fe45e33fc76b9fb868a63f1642c6bc0735ad02a", size = 129898 },
    { url = "https://files.pythonhosted.org/packages/01/c1/d577ecd2e9fa393366a1ea0a9267f6510d86e6c4bb1cdfb9877104cac44c/orjson-3.10.15-cp312-cp312-win32.whl", hash = "sha256:0a4f27ea5617828e6b58922fdbec67b0aa4bb844e2d363b9244c47fa2180e665", size = 142566 },
    { url = "https://files.pythonhosted.org/packages/ed/eb/a85317ee1732d1034b92d56f89f1de4d7bf7904f5c8fb9dcdd5b1c83917f/orjson-3.10.15-cp312-cp312-win_amd64.whl", hash = "sha256:ef5b87e7aa9545ddadd2309efe6824bd3dd64ac101c15dae0f2f597911d46eaa", size = 133732 },
    { url = "https://files.pythonhosted.org/packages/06/10/fe7d60b8da538e8d3d3721f08c1b7bff0491e8fa4dd3bf11a17e34f4730e/orjson-3.10.15-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:bae0e6ec2b7ba6895198cd981b7cca95d1487d0147c8ed751e5632ad16f031a6", size = 249399 },
    { url = "https://files.pythonhosted.org/packages/6b/83/52c356fd3a61abd829ae7e4366a6fe8e8863c825a60d7ac5156067516edf/orjson-3.10.15-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f93ce145b2db1252dd86af37d4165b6faa83072b46e3995ecc95d4b2301b725a", size = 125044 },
    { url = "https://files.pythonhosted.org/packages/55/b2/d06d5901408e7ded1a74c7c20d70e3a127057a6d21355f50c90c0f337913/orjson-3.10.15-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7c203f6f969210128af3acae0ef9ea6aab9782939f45f6fe02d05958fe761ef9", size = 150066 },
    { url = "https://files.pythonhosted.org/packages/75/8c/60c3106e08dc593a861755781c7c675a566445cc39558677d505878d879f/orjson-3.10.15-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8918719572d662e18b8af66aef699d8c21072e54b6c82a3f8f6404c1f5ccd5e0", size = 139737 },
    { url = "https://files.pythonhosted.org/packages/6a/8c/ae00d7d0ab8a4490b1efeb01ad4ab2f1982e69cc82490bf8093407718ff5/orjson-3.10.15-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f71eae9651465dff70aa80db92586ad5b92df46a9373ee55252109bb6b703307", size = 154804 },
    { url = "https://files.pythonhosted.org/packages/22/86/65dc69bd88b6dd254535310e97bc518aa50a39ef9c5a2a5d518e7a223710/orjson-3.10.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e117eb299a35f2634e25ed120c37c641398826c2f5a3d3cc39f5993b96171b9e", size = 130583 },
    { url = "https://files.pythonhosted.org/packages/bb/00/6fe01ededb05d52be42fabb13d93a36e51f1fd9be173bd95707d11a8a860/orjson-3.10.15-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:13242f12d295e83c2955756a574ddd6741c81e5b99f2bef8ed8d53e47a01e4b7", size = 138465 },
    { url = "https://files.pythonhosted.org/packages/db/2f/4cc151c4b471b0cdc8cb29d3eadbce5007eb0475d26fa26ed123dca93b33/orjson-3.10.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7946922ada8f3e0b7b958cc3eb22cfcf6c0df83d1fe5521b4a100103e3fa84c8", size = 130742 },
    { url = "https://files.pythonhosted.org/packages/9f/13/8a6109e4b477c518498ca37963d9c0eb1508b259725553fb53d53b20e2ea/orjson-3.10.15-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:b7155eb1623347f0f22c38c9abdd738b287e39b9982e1da227503387b81b34ca", size = 414669 },
    { url = "https://files.pythonhosted.org/packages/22/7b/1d229d6d24644ed4d0a803de1b0e2df832032d5beda7346831c78191b5b2/orjson-3.10.15-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:208beedfa807c922da4e81061dafa9c8489c6328934ca2a562efa707e049e561", size = 141043 },
    { url = "https://files.pythonhosted.org/packages/cc/d3/6dc91156cf12ed86bed383bcb942d84d23304a1e57b7ab030bf60ea130d6/orjson-3.10.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eca81f83b1b8c07449e1d6ff7074e82e3fd6777e588f1a6632127f286a968825", size = 129826 },
    { url = "https://files.pythonhosted.org/packages/b3/38/c47c25b86f6996f1343be721b6ea4367bc1c8bc0fc3f6bbcd995d18cb19d/orjson-3.10.15-cp313-cp313-win32.whl", hash = "sha256:c03cd6eea1bd3b949d0d007c8d57049aa2b39bd49f58b4b2af571a5d3833d890", size = 142542 },
    { url = "https://files.pythonhosted.org/packages/27/f1/1d7ec15b20f8ce9300bc850de1e059132b88990e46cd0ccac29cbf11e4f9/orjson-3.10.15-cp313-cp313-win_amd64.whl", hash = "sha256:fd56a26a04f6ba5fb2045b0acc487a63162a958ed837648c5781e1fe3316cfbf", size = 133444 },
]

[[package]]
name = "packaging"
version = "24.2"

@@ -698,7 +757,7 @@ name = "tqdm"
version = "4.67.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "colorama", marker = "platform_system == 'Windows'" },
    { name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 }
wheels = [