data-vault/scripts/data_gov/fetch_jsonl.py
2025-02-05 10:21:50 -05:00

35 lines
1.3 KiB
Python

import httpx
import json
import time
import logging
from pathlib import Path
from typing import Iterator, Dict, Any, List
import click
from scripts.data_gov.fetch_index import fetch_data_gov_packages
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
@click.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
@click.option('--rows-per-page', '-r', type=int, default=1000,
              help='Number of results to fetch per page.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Start date for fetching packages in YYYY-MM-DD format.')
def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to JSONL file."""
    # NOTE: the docstring above is what click prints for --help, so the
    # longer description lives in comments instead.
    #
    # output_path:   JSONL file to write; opened in append mode, so records
    #                are added after any content already in the file.
    # rows_per_page: forwarded unchanged to fetch_data_gov_packages.
    # log_level:     name of a logging level constant (restricted by the
    #                click.Choice above), resolved with getattr(logging, ...).
    # start_date:    forwarded unchanged to fetch_data_gov_packages; None
    #                when the option is not given.
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # The default target lives under data/, which may not exist yet; create
    # the parent directory so open() does not fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    written = 0
    # Explicit encoding keeps the output independent of the platform locale.
    with open(output_path, 'a', encoding='utf-8') as f:
        # Each item yielded by fetch_data_gov_packages is iterated as a batch
        # of packages; every package becomes one JSON document per line.
        for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
            for package in results:
                f.write(json.dumps(package) + '\n')
                written += 1

    logger.info("Wrote %d packages to %s", written, output_path)
# Script entry point: invoke the click command only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()