mirror of
https://github.com/harvard-lil/data-vault.git
synced 2025-03-21 02:01:22 +00:00
35 lines
1.3 KiB
Python
35 lines
1.3 KiB
Python
import httpx
|
|
import json
|
|
import time
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Iterator, Dict, Any, List
|
|
import click
|
|
from scripts.data_gov.fetch_index import fetch_data_gov_packages
|
|
|
|
# Module-level logger named after this module. It is not referenced in the
# visible part of this file; logging output is configured inside main().
logger = logging.getLogger(__name__)
|
|
|
|
@click.command()
@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
@click.option('--rows-per-page', '-r', type=int, default=1000,
              help='Number of results to fetch per page.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Start date for fetching packages in YYYY-MM-DD format.')
def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to JSONL file."""
    # NOTE: click shows the docstring above as the --help text, so it is kept
    # to one line and the details are documented here instead.
    #
    # Parameters (all supplied by click from the command line):
    #   output_path   -- JSONL file to write to; converted to a Path by click.
    #   rows_per_page -- passed straight through to fetch_data_gov_packages.
    #   log_level     -- name of a logging level constant (e.g. 'INFO').
    #   start_date    -- passed straight through to fetch_data_gov_packages;
    #                    None when the option is not given.
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    # The default target lives under data/, which may not exist yet. Create
    # the parent directory up front instead of failing inside open().
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Append mode: anything already in the file is kept and new records are
    # added after it. One JSON document is written per line.
    with open(output_path, 'a', encoding='utf-8') as f:
        for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
            f.writelines(json.dumps(package) + '\n' for package in results)
|
|
|
|
# Run the CLI only when this file is executed as a script; click reads the
# command-line arguments itself, so main() is called without any.
if __name__ == "__main__":
    main()
|