initial commit

This commit is contained in:
Jack Cushman 2025-02-05 10:21:50 -05:00
commit 404c3627f7
26 changed files with 2534 additions and 0 deletions

View file

@@ -0,0 +1,127 @@
import json
import click
from pathlib import Path
from typing import Dict, Set, Tuple
import logging
from tqdm import tqdm
logger = logging.getLogger(__name__)
def load_jsonl_data(jsonl_path: Path, keep_fields=None, compare_by: str = 'id') -> Dict[str, dict]:
"""
Load data from JSONL file into a dictionary keyed by id.
Only includes fields that match the CSV format.
Args:
jsonl_path: Path to the JSONL file
Returns:
Dictionary mapping id to filtered record data
"""
# Fields to keep from JSONL records
data = {}
with open(jsonl_path, 'r', encoding='utf-8') as f:
for line in tqdm(f, desc="Loading JSONL"):
if line.strip(): # Skip empty lines
record = json.loads(line)
if keep_fields:
record = {k: v for k, v in record.items() if k in keep_fields}
data[record[compare_by]] = record
return data
def find_differences(old_data: Dict[str, dict],
                     new_data: Dict[str, dict]) -> Tuple[Set[str], Set[str]]:
    """
    Find records that appear in only one of the two datasets.
    Args:
        old_data: Dictionary of records from the old file, keyed by the compare field
        new_data: Dictionary of records from the new file, keyed by the compare field
    Returns:
        Tuple of (old_only_ids, new_only_ids)
    """
    old_ids = set(old_data.keys())
    new_ids = set(new_data.keys())
    # Find records only in the old file
    old_only = old_ids - new_ids
    # Find records only in the new file
    new_only = new_ids - old_ids
    return old_only, new_only
@click.command()
@click.argument('old_path', type=click.Path(exists=True, path_type=Path))
@click.argument('new_path', type=click.Path(exists=True, path_type=Path))
@click.option('--compare-by', '-c',
default='id',
help='Field to compare by.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
"""Compare records between CSV and JSONL files."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
old_data = load_jsonl_data(old_path, compare_by=compare_by)
new_data = load_jsonl_data(new_path, compare_by=compare_by)
# Find differences
old_only, new_only = find_differences(old_data, new_data)
old_only_path = old_path.with_suffix(f'.only_{compare_by}.jsonl')
new_only_path = new_path.with_suffix(f'.only_{compare_by}.jsonl')
logger.info(f"Writing {len(old_only)} records to {old_only_path}")
with open(old_only_path, 'w', encoding='utf-8') as f:
for id in old_only:
f.write(json.dumps(old_data[id]) + '\n')
logger.info(f"Writing {len(new_only)} records to {new_only_path}")
with open(new_only_path, 'w', encoding='utf-8') as f:
for id in new_only:
f.write(json.dumps(new_data[id]) + '\n')
if __name__ == '__main__':
main()
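# Example invocation (hypothetical paths; both inputs are JSONL dumps compared on
# the field passed to --compare-by):
#   python <this_script>.py data/old_dump.jsonl data/new_dump.jsonl --compare-by name
# Records present in only one input are written alongside it as <input>.only_<field>.jsonl.
# The commented-out block below appears to be a one-off helper, kept for reference,
# that dumped the SQLite `dataset` table to JSONL.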
# import sqlite3
# import json
# # Connect to the database
# conn = sqlite3.connect('data/data.db')
# conn.row_factory = sqlite3.Row # This allows us to access columns by name
# # Open the output file
# with open('data/data_db_dump_20250130.jsonl', 'w') as f:
# # Execute the query and fetch rows in chunks
# cursor = conn.execute('''
# SELECT *
# FROM dataset
# ''')
# written = 0
# while True:
# rows = cursor.fetchmany(1000) # Fetch 1000 rows at a time
# if not rows:
# break
# written += len(rows)
# # Write each row as a JSON line
# for row in rows:
# # Convert row to dict and write to file
# json_line = json.dumps(dict(row))
# f.write(json_line + '\n')
# print(f"Wrote {written} rows")
# conn.close()

View file

@@ -0,0 +1,38 @@
import json
from collections import Counter, defaultdict
from pathlib import Path
# Scratch analysis of the JSONL dumps: count crawler_identified_date values, group dataset titles by organization, and print resource URLs from the data/glass* files.
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
for line in f:
data = json.loads(line)
        org = json.loads(data.get('organization') or '{}')
        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
        titles_by_org[org.get('title', '')].append(data.get('title', ''))
# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
print(f"{date}: {count}")
# sort each list of titles by org
for org, titles in titles_by_org.items():
titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
# print urls
for path in Path('data/').glob('glass*'):
print(path)
with open(path, 'r') as f:
for line in f:
data = json.loads(line)
print("* " + data['name'])
resources = data.get('resources', [])
            if isinstance(resources, str):
                resources = json.loads(resources)
for resource in resources:
print(' * ' + resource['url'])

View file

@@ -0,0 +1,318 @@
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.url import UrlCollectionTask
from pathlib import Path
import json
import uuid
import tempfile
import click
import os
from urllib.parse import urlparse
import re
from scripts.helpers.parallel import run_parallel
import zipfile
import struct
import boto3
import logging
from scripts.data_gov.models import db, Dataset
from playhouse.shortcuts import model_to_dict
from tqdm import tqdm
from datetime import datetime
logger = logging.getLogger(__name__)
## download data.gov datasets, create nabit archives, and upload to S3
# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
# Already compressed archives
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
# Compressed images
'jpg', 'jpeg', 'png', 'gif', 'webp',
# Compressed video/audio
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
# Other compressed/binary formats
'pdf', 'docx', 'xlsx', 'pptx',
}
stats_counter = {}
def is_valid_url(url):
parsed = urlparse(url)
return parsed.scheme in ['http', 'https'] and re.search(r'[^\.]\.[^\.]', parsed.netloc)
def extract_urls(data, urls = None):
urls = set() if urls is None else urls
if isinstance(data, dict):
for key, value in data.items():
if isinstance(value, str):
if is_valid_url(value):
urls.add(value)
elif isinstance(value, (dict, list)):
extract_urls(value, urls)
elif isinstance(data, list):
for item in data:
extract_urls(item, urls)
return urls
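# e.g. (hypothetical input):
#   extract_urls({'resources': [{'url': 'https://example.com/data.csv'}], 'notes': 'n/a'})
#   -> {'https://example.com/data.csv'}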
def create_archive(bag_dir, dataset: Dataset, signatures):
data_dict = model_to_dict(dataset)
for key, value in data_dict.items():
if isinstance(value, datetime):
data_dict[key] = value.isoformat()
data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
collect = [
*[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
]
logger.info(f" - Downloading {len(collect)} files")
    # split the dataset fields into data.gov metadata and crawler metadata
data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
# Create the archive
package(
output_path=bag_dir,
collect=collect,
collect_errors='ignore',
signed_metadata={
'id': str(uuid.uuid4()),
'url': data_gov_url,
'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
'data_gov_metadata': data_gov_metadata,
'crawler_metadata': crawler_metadata,
},
signatures=signatures,
)
def zip_archive(bag_dir, archive_path):
# Create zip archive
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for file_path in bag_dir.rglob('*'):
if file_path.is_file():
arc_path = file_path.relative_to(bag_dir)
compression = (zipfile.ZIP_STORED
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
else zipfile.ZIP_DEFLATED)
zf.write(file_path, arc_path, compress_type=compression)
    # Collect per-entry metadata (sizes and byte offsets) for the sidecar metadata file
zip_info = []
with zipfile.ZipFile(archive_path, 'r') as zf:
for info in zf.filelist:
header_offset = info.header_offset
# Read header to calculate data offset
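            # NOTE: this relies on private zipfile internals (sizeFileHeader,
            # structFileHeader, and the _FH_* indices); the data offset is the header
            # offset plus the fixed local-file-header size plus the variable-length
            # filename and extra fields.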
zf.fp.seek(header_offset)
header = zf.fp.read(zipfile.sizeFileHeader)
fheader = struct.unpack(zipfile.structFileHeader, header)
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
zip_info.append({
'filename': info.filename,
'file_size': info.file_size,
'compress_size': info.compress_size,
'compress_type': info.compress_type,
'header_offset': header_offset,
'data_offset': data_offset,
})
# Read the bag-info.txt and signed-metadata.json
bag_info = (bag_dir / 'bag-info.txt').read_text()
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
return {
'bag_info': bag_info,
'signed_metadata': signed_metadata,
'zip_entries': zip_info
}
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
s3 = boto3.Session(**session_args).client('s3')
bucket_name, s3_path = s3_path.split('/', 1)
# Upload zip file
s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
# Upload metadata file
s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
def run_pipeline(
dataset: Dataset,
output_path: Path,
metadata_path: Path,
collection_path: Path,
signatures: list = None,
session_args: dict = None,
s3_path: str = None,
no_delete: bool = False,
):
logger.info(f"Processing dataset: {dataset.name}")
    # the db connection was inherited from the parent process via fork, so close it and reconnect when needed
db.close()
# set this here so it makes it into the metadata
dataset.crawler_downloaded_date = datetime.now()
with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
logger.info("- Creating archive...")
# set up paths
temp_dir = Path(temp_dir)
bag_dir = temp_dir / 'bag'
archive_path = temp_dir / 'archive.zip'
# download data with nabit
create_archive(bag_dir, dataset, signatures)
logger.info("- Zipping archive...")
# zip up data and create metadata
output_metadata = zip_archive(bag_dir, archive_path)
logger.info("- Moving files to final location...")
# Move files to final location
collection_path.parent.mkdir(parents=True, exist_ok=True)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
os.rename(str(archive_path), collection_path)
metadata_path.write_text(json.dumps(output_metadata) + '\n')
if s3_path:
logger.info("Uploading to S3...")
upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
if not no_delete:
logger.info("- Deleting zip archive...")
os.remove(collection_path)
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
os.rmdir(collection_path.parent)
logger.info("- Setting crawler_downloaded_date...")
db.connect()
dataset.save()
logger.info("Processing complete")
def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int = 0, dataset_name: str = None):
"""Get datasets from SQLite that don't have metadata files yet."""
query = Dataset.select()
if dataset_name:
query = query.where(Dataset.name == dataset_name)
if min_size:
query = query.where(Dataset.size >= min_size)
# Initialize progress bars
stats_counter['total'] = tqdm(desc="Total records", unit="pkg")
stats_counter['skipped'] = tqdm(desc="Already processed", unit="pkg")
stats_counter['yielded'] = tqdm(desc="Processing", unit="pkg")
for dataset in query:
stats_counter['total'].update(1)
# Check if metadata file exists
name = dataset.name
metadata_path = output_path / 'metadata' / collection / name / 'v1.json'
if metadata_path.exists():
stats_counter['skipped'].update(1)
continue
stats_counter['yielded'].update(1)
yield dataset
@click.command()
@click.option('--db-path', '-d', type=click.Path(exists=True, path_type=Path), default='data/data.db')
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
help='Output path.')
@click.option('--collection', '-c', type=str, default='data_gov',
help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
help='Number of worker processes. Defaults to CPU count.')
@click.option('--min-size', type=int, default=0,
help='Minimum size of dataset to process.')
@click.option('--dataset-name', help='Dataset name to process.')
@click.option('--if-exists', '-e', type=click.Choice(['skip', 'replace', 'version']), default='skip',
help='Whether to skip, replace, or add a version if dataset already exists.')
@click.option('--signatures', help='JSON string of signature configuration.')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many datasets', type=int)
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
if dataset_name:
workers = 1
stop_after = 1
if signatures:
signatures = json.loads(signatures)
for signature in signatures:
if signature['action'] == 'sign':
if is_encrypted_key(signature['params']['key']):
signature['params']['password'] = click.prompt(
f"Enter password for {signature['params']['key']}: ",
hide_input=True
)
elif signature['action'] == 'timestamp':
if known_tsa := signature.pop('known_tsa', None):
signature['params'] = KNOWN_TSAS[known_tsa]
session_args = {}
if profile:
session_args['profile_name'] = profile
# Initialize database connection
db.init(db_path)
db.connect()
def get_tasks():
processed = 0
for dataset in get_unprocessed_datasets(output_path, collection, min_size, dataset_name):
# handle existing datasets
name = dataset.name
collection_path = output_path / 'collections' / collection / name / 'v1.zip'
metadata_path = output_path / 'metadata' / collection / name / 'v1.json'
if metadata_path.exists():
if if_exists == 'skip':
continue
elif if_exists == 'replace':
metadata_path.unlink()
if collection_path.exists():
collection_path.unlink()
elif if_exists == 'version':
version = 2
while True:
collection_path = output_path / 'collections' / collection / name / f'v{version}.zip'
metadata_path = output_path / 'metadata' / collection / name / f'v{version}.json'
if not metadata_path.exists():
break
version += 1
yield dataset, output_path, metadata_path, collection_path, signatures, session_args, s3_path, no_delete
processed += 1
if stop_after and processed >= stop_after:
break
try:
run_parallel(run_pipeline, get_tasks(), workers, log_level=log_level, catch_errors=False)
finally:
# Close progress bars
for counter in stats_counter.values():
counter.close()
db.close()
if __name__ == '__main__':
main()
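# Example invocation (hypothetical values; the SQLite index is the one produced by
# scripts/data_gov/fetch_index.py, and uploads require AWS credentials):
#   python -m scripts.data_gov.<this_module> \
#       --db-path data/data.db \
#       --output-path data/processed \
#       --collection data_gov \
#       --workers 4 \
#       --s3-path my-bucket/collections \
#       --profile my-aws-profile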

View file

@@ -0,0 +1,299 @@
import httpx
from typing import Iterator, Dict, Any, List
import time
import click
from pathlib import Path
import logging
from datetime import datetime
from scripts.data_gov.models import db, Dataset, DatasetHistory
from tqdm import tqdm
from playhouse.shortcuts import model_to_dict
from jsondiff import diff
logger = logging.getLogger(__name__)
stats_counter = {}
def init_database(db_path: Path) -> None:
"""Initialize the database connection and create tables."""
db.init(db_path)
db.connect()
db.create_tables([Dataset, DatasetHistory])
def save_to_database(results: List[Dict[str, Any]]) -> None:
"""
Save a batch of packages to the database using Peewee.
"""
if not results:
return
# Process datetime fields in incoming records
for package in results:
for field in ['metadata_created', 'metadata_modified']:
if package.get(field):
try:
package[field] = datetime.fromisoformat(
package[field].replace('Z', '+00:00')
)
except ValueError:
package[field] = None
# Get all IDs from incoming packages
incoming_ids = [pkg['id'] for pkg in results]
# Fetch existing records as model instances
existing_records = {
record.id: record
for record in Dataset.select().where(Dataset.id << incoming_ids)
}
# Prepare bulk operations
history_records = []
new_records = []
# Compare records and prepare operations
for package_data in results:
# Create a new model instance from the package data
new_package = Dataset(**package_data)
existing = existing_records.get(package_data['id'])
if existing:
# Compare model instances using their dict representations
if diff(model_to_dict(existing), model_to_dict(new_package)):
# Record changed - add to history and update
history_records.append(existing)
new_records.append(new_package)
stats_counter['updated'].update(1)
else:
# Record unchanged - skip
stats_counter['skipped'].update(1)
continue
else:
# New record - just add it
new_records.append(new_package)
stats_counter['new'].update(1)
with db.atomic():
# Bulk move history records if any exist
if history_records:
DatasetHistory.bulk_create(history_records)
Dataset.delete().where(Dataset.id << [h.id for h in history_records]).execute()
# Bulk insert new records
if new_records:
Dataset.bulk_create(new_records)
def save_packages_to_database(output_path: Path, rows_per_page: int = 1000, start_date: str | None = None) -> None:
"""
Save fetched data to the database, resuming from last position if needed.
Args:
output_path: Path to save the database
rows_per_page: Number of results to fetch per page
start_date: Optional date to start fetching from
"""
stats_counter['new'] = tqdm(desc="New records", unit="pkg")
stats_counter['updated'] = tqdm(desc="Updated records", unit="pkg")
stats_counter['skipped'] = tqdm(desc="Unchanged records", unit="pkg")
init_database(output_path)
try:
for results in tqdm(fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date, max_retries=10)):
save_to_database(results)
finally:
db.close()
def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
"""
Fetch package data from data.gov API using date-based pagination.
Args:
rows_per_page: Number of results to fetch per page
start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
max_retries: Maximum number of retry attempts for 5xx errors
Yields:
Dict containing package data for each result
"""
base_url = "https://catalog.data.gov/api/3/action/package_search"
current_date = start_date
total_records = 0
while True:
logger.info(f"Current date offset: {current_date}")
# Build date filter query
url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
if current_date:
# Format date to match Solr's expected format (dropping microseconds)
formatted_date = current_date.split('.')[0] + 'Z'
date_filter = f"+metadata_modified:[* TO {formatted_date}]"
url += f"&fq={date_filter}"
for attempt in range(max_retries):
try:
start_time = time.time()
response = httpx.get(url, timeout=60.0)
request_time = time.time() - start_time
response.raise_for_status()
break # Success, exit retry loop
except httpx.HTTPStatusError as e:
if e.response.status_code >= 500 and attempt < max_retries - 1:
retry_wait = 2 ** attempt # Exponential backoff
logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
logger.warning(f"Error URL: {url}")
time.sleep(retry_wait)
continue
# If not a 5xx error or we're out of retries, re-raise
logger.error(f"Error URL: {url}")
logger.error(f"Response content: {response.text}")
raise
data = response.json()
results = data["result"]["results"]
if not results:
break
# Get date of last result for next query
current_date = results[-1]["metadata_modified"]
total_records += len(results)
logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")
yield results
time.sleep(1)
def get_dataset_history(dataset_name: str) -> None:
"""
    Fetch and display all versions of the dataset with the given name,
from oldest to newest, showing only changed fields between versions.
"""
# Get all versions including current
versions = [
model_to_dict(record, recurse=True)
for record in (DatasetHistory
.select()
.where(DatasetHistory.name == dataset_name)
.order_by(DatasetHistory.metadata_modified))
]
current_record = Dataset.select().where(Dataset.name == dataset_name).first()
if current_record:
versions.append(model_to_dict(current_record, recurse=True))
if not versions:
print(f"No dataset found with name: {dataset_name}")
return
# Print each version with changed fields
prev = None
for curr in versions:
history_id = curr.pop('history_id', None)
if prev:
diff_fields = diff(prev, curr)
else:
diff_fields = curr
print(f"*** Version: {curr.get('metadata_modified')} ***")
for k, v in diff_fields.items():
print(f"- {k}: {v}")
print("\n")
prev = curr
@click.group()
def cli():
"""Data.gov dataset mirroring tools."""
pass
@cli.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data.db')
@click.option('--rows-per-page', '-r', type=int, default=1000,
help='Number of results to fetch per page.')
@click.option('--start-date', '-s', type=str, default=None,
help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='WARNING',
help='Logging level.')
def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
"""Fetch package data from data.gov API and save to database."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
save_packages_to_database(output_path, rows_per_page, start_date)
@cli.command()
@click.argument('dataset_name')
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
def history(dataset_name: str, db_path: Path):
"""Show version history for a dataset with the given ID."""
init_database(db_path)
try:
get_dataset_history(dataset_name)
finally:
db.close()
@cli.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
def delete_duplicate_history(db_path: Path):
"""Delete duplicate history records."""
init_database(db_path)
try:
# Get all unique dataset names in history
unique_names = (DatasetHistory
.select(DatasetHistory.name)
.distinct()
.tuples())
total_deleted = 0
for (name,) in tqdm(unique_names, desc="Processing datasets"):
# Get all versions for this dataset ordered by modification date
versions = [
model_to_dict(record)
for record in (DatasetHistory
.select()
.where(DatasetHistory.name == name)
.order_by(DatasetHistory.metadata_modified))
]
current_record = Dataset.select().where(Dataset.name == name).first()
if current_record:
versions.append(model_to_dict(current_record))
# Track IDs of duplicate records to delete
to_delete = []
# Compare adjacent versions
prev = versions[0]
prev_id = prev.pop('history_id')
for curr in versions[1:]:
curr_id = curr.pop('history_id', None)
# If versions are identical, mark current version for deletion
if not diff(prev, curr):
to_delete.append(prev_id)
prev = curr
prev_id = curr_id
# Bulk delete duplicate records
if to_delete:
deleted = (DatasetHistory
.delete()
.where(DatasetHistory.history_id << to_delete)
.execute())
total_deleted += deleted
click.echo(f"Deleted {total_deleted} duplicate history records")
finally:
db.close()
if __name__ == "__main__":
cli()
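# Example invocations (hypothetical data paths; with recent click versions the
# delete_duplicate_history function is exposed as the delete-duplicate-history command):
#   python -m scripts.data_gov.fetch_index fetch data/data.db --rows-per-page 1000
#   python -m scripts.data_gov.fetch_index history <dataset-name>
#   python -m scripts.data_gov.fetch_index delete-duplicate-history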

View file

@@ -0,0 +1,35 @@
import json
import logging
from pathlib import Path
import click
from scripts.data_gov.fetch_index import fetch_data_gov_packages
logger = logging.getLogger(__name__)
@click.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
@click.option('--rows-per-page', '-r', type=int, default=1000,
help='Number of results to fetch per page.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default='INFO',
help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm).')
def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
"""Fetch all package data from data.gov API and save to JSONL file."""
logging.basicConfig(
level=getattr(logging, log_level),
format='%(asctime)s - %(levelname)s - %(message)s'
)
with open(output_path, 'a') as f:
for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
for package in results:
f.write(json.dumps(package) + '\n')
if __name__ == "__main__":
main()
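# Example (module path is hypothetical; the output file is opened in append mode, so a
# run can be resumed by passing the last metadata_modified date as --start-date):
#   python -m scripts.data_gov.<this_module> data/data_20250130.jsonl -r 1000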

View file

@@ -0,0 +1,18 @@
from playhouse.migrate import *
from scripts.data_gov.models import db
migrator = SqliteMigrator(db)
def do_migrate():
crawler_identified_date = DateTimeField(null=True)
crawler_downloaded_date = DateTimeField(null=True)
with db.atomic():
migrate(
# migrator.add_column('dataset', 'crawler_identified_date', crawler_identified_date),
# migrator.add_column('dataset', 'crawler_downloaded_date', crawler_downloaded_date),
# migrator.add_column('datasethistory', 'crawler_identified_date', crawler_identified_date),
# migrator.add_column('datasethistory', 'crawler_downloaded_date', crawler_downloaded_date),
)
if __name__ == '__main__':
do_migrate()

View file

@@ -0,0 +1,61 @@
from peewee import *
from playhouse.sqlite_ext import JSONField
from pathlib import Path
from datetime import datetime
db = SqliteDatabase(Path(__file__).parent.parent.parent / 'data/data.db', pragmas={
# tuning suggested by Claude:
'journal_mode': 'wal', # Write-Ahead Logging for better concurrency
'cache_size': -1024 * 64, # 64MB cache (negative number means kibibytes)
'synchronous': 'normal', # Good balance between safety and speed
'busy_timeout': 30000, # Wait up to 30 seconds when database is locked
'temp_store': 'memory', # Store temp tables in memory
'mmap_size': 268435456, # Memory-mapped I/O (256MB)
'page_size': 4096, # Optimal for most systems
})
class BaseModel(Model):
class Meta:
database = db
class Dataset(BaseModel):
# fields from data.gov
id = CharField(primary_key=True)
name = CharField(null=True)
title = CharField(null=True)
notes = TextField(null=True)
metadata_created = DateTimeField(null=True)
metadata_modified = DateTimeField(null=True)
private = BooleanField(null=True)
state = CharField(null=True)
version = CharField(null=True)
type = CharField(null=True)
num_resources = IntegerField(null=True)
num_tags = IntegerField(null=True)
isopen = BooleanField(null=True)
author = CharField(null=True)
author_email = CharField(null=True)
creator_user_id = CharField(null=True)
license_id = CharField(null=True)
license_url = CharField(null=True)
license_title = CharField(null=True)
maintainer = CharField(null=True)
maintainer_email = CharField(null=True)
owner_org = CharField(null=True)
url = CharField(null=True)
organization = JSONField(null=True)
extras = JSONField(null=True)
resources = JSONField(null=True)
tags = JSONField(null=True)
groups = JSONField(null=True)
relationships_as_subject = JSONField(null=True)
relationships_as_object = JSONField(null=True)
# fields starting with crawler_ are added by our crawler
crawler_identified_date = DateTimeField(null=True, default=datetime.now)
crawler_downloaded_date = DateTimeField(null=True)
class DatasetHistory(Dataset):
history_id = AutoField(primary_key=True)
id = CharField() # Regular CharField, not primary key
#deleted_by_date = DateTimeField(null=True) # New field to track deletion date
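# Example query (a minimal sketch, assuming the default data/data.db exists):
#   db.connect()
#   for ds in (Dataset
#              .select(Dataset.name, Dataset.metadata_modified)
#              .order_by(Dataset.metadata_modified.desc())
#              .limit(10)):
#       print(ds.name, ds.metadata_modified)
#   db.close()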