initial commit

2025-09-02 11:26:00 -04:00 · 2025-02-05 10:21:50 -05:00 · 2025-02-05 10:21:50 -05:00 · 404c3627f7
commit 404c3627f7
26 changed files with 2534 additions and 0 deletions
--- a/scripts/init.py
+++ b/scripts/init.py
@ -0,0 +1,2 @@
+def hello() -> str:
+    return "Hello from data-mirror!"
--- a/scripts/collection/init.py
+++ b/scripts/collection/init.py
@ -0,0 +1,2 @@
+def hello() -> str:
+    return "Hello from data-mirror!"
--- a/scripts/collection/cloudflare_tools.py
+++ b/scripts/collection/cloudflare_tools.py
@ -0,0 +1,100 @@
+import logging
+from pathlib import Path
+import click
+from cloudflare import Cloudflare
+import os
+from scripts.helpers.config import load_config
+
+logger = logging.getLogger(__name__)
+
+def generate_temp_key(account_id: str, bucket: str, parent_access_key_id: str, token: str,
+                     permission: str = "object-read-write", ttl_seconds: int = 3600,
+                     prefixes: list[str] | None = None, objects: list[str] | None = None):
+    """Generate a temporary R2 access key using the Cloudflare API.
+    
+    Args:
+        account_id: Cloudflare account ID
+        bucket: R2 bucket name
+        parent_access_key_id: Parent access key ID
+        token: Cloudflare API token
+        permission: Permission level ('object-read-write' or 'object-read')
+        ttl_seconds: Time-to-live in seconds
+        prefixes: Optional list of key prefixes to restrict access to
+        objects: Optional list of specific object keys to restrict access to
+    """
+    params = {
+        "account_id": account_id,
+        "bucket": bucket,
+        "parent_access_key_id": parent_access_key_id,
+        "permission": permission,
+        "ttl_seconds": ttl_seconds,
+    }
+    
+    if prefixes:
+        params["prefixes"] = prefixes
+    if objects:
+        params["objects"] = objects
+        
+    return Cloudflare(api_token=token).r2.temporary_credentials.create(**params)
+
+@click.group()
+def cli():
+    """Cloudflare R2 utility commands."""
+    pass
+
+@cli.command()
+@click.option('--bucket', '-b', type=str, required=True,
+              help='R2 bucket name.')
+@click.option('--permission', '-p', type=click.Choice(['object-read-write', 'object-read']), 
+              default='object-read-write',
+              help='Permission level for the temporary key.')
+@click.option('--ttl', '-t', type=int, default=1,
+              help='Time-to-live in hours for the temporary key.')
+@click.option('--prefixes', '-x', multiple=True,
+              help='Key prefixes to restrict access to. Can be specified multiple times.')
+@click.option('--objects', '-o', multiple=True,
+              help='Specific object keys to restrict access to. Can be specified multiple times.')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO',
+              help='Logging level.')
+def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...], 
+                objects: tuple[str, ...], log_level: str):
+    """Generate temporary Cloudflare R2 access credentials."""
+    # Setup logging
+    logging.basicConfig(level=log_level)
+    
+    # Load config
+    config = load_config().get("temp_tokens", {})
+
+    if not config or any(key not in config for key in ['parent_access_key_id', 'account_id', 'token']):
+        raise click.ClickException("Config file must have 'temp_tokens' dict with 'parent_access_key_id', 'account_id', and 'token' keys")
+    
+    # Generate temporary key
+    temp_cred = generate_temp_key(
+        account_id=config['account_id'],
+        bucket=bucket,
+        parent_access_key_id=config['parent_access_key_id'],
+        token=config['token'],
+        permission=permission,
+        ttl_seconds=ttl * 3600,
+        prefixes=list(prefixes) if prefixes else None,
+        objects=list(objects) if objects else None
+    )
+    
+    # Output AWS config format
+    click.echo("\n# Add this to ~/.aws/config:")
+    click.echo("[profile r2-temp]")
+    click.echo(f"aws_access_key_id = {temp_cred.access_key_id}")
+    click.echo(f"aws_secret_access_key = {temp_cred.secret_access_key}")
+    click.echo(f"aws_session_token = {temp_cred.session_token}")
+    click.echo("region = auto")
+    click.echo(f"endpoint_url = https://{config['account_id']}.r2.cloudflarestorage.com")
+
+    # Output sample command using first prefix if available
+    click.echo("\n# Sample upload command:")
+    sample_path = objects[0] if objects else f"{prefixes[0].strip('/')}/" if prefixes else ""
+    click.echo(f"aws s3 cp local-file.txt s3://{bucket}/{sample_path} --profile r2-temp")
+
+if __name__ == "__main__":
+    cli()
--- a/scripts/collection/render.py
+++ b/scripts/collection/render.py
@ -0,0 +1,109 @@
+import click
+from pathlib import Path
+from scripts.data_gov.models import db, Dataset
+import logging
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+# Header template with styles
+HEADER_TEMPLATE = '''<!DOCTYPE html>
+<html>
+<head>
+    <title>Data.gov Dataset Mirror</title>
+    <link rel="stylesheet" href="style.css">
+</head>
+<body>
+    <h1>Data.gov Dataset Mirror</h1>
+'''
+
+TABLE_START = '''    <table>
+        <thead>
+            <tr>
+                <th>Name</th>
+                <th>Organization</th>
+                <th>Description</th>
+            </tr>
+        </thead>
+        <tbody>
+'''
+
+ROW_TEMPLATE = '''            <tr>
+                <td>{name}</td>
+                <td>{org}</td>
+                <td>{title}</td>
+            </tr>
+'''
+
+TABLE_END = '''        </tbody>
+    </table>
+</body>
+</html>
+'''
+
+def render_html(datasets_query, output_path: Path) -> None:
+    """Render the datasets to an HTML file, streaming content."""
+    with open(output_path / 'index.html', 'w', encoding='utf-8') as f:
+        # Write header
+        f.write(HEADER_TEMPLATE)
+        
+        # Write table start
+        f.write(TABLE_START)
+        
+        # Stream each dataset row
+        rows = []
+        for dataset in tqdm(datasets_query.iterator(), desc="Rendering datasets"):
+            org_title = dataset.organization.get('title') if dataset.organization else 'N/A'
+            row = ROW_TEMPLATE.format(
+                name=dataset.name or '',
+                org=org_title,
+                title=dataset.title,
+            )
+            rows.append(row)
+            if len(rows) >= 1000:
+                f.write('\n'.join(rows))
+                rows = []
+
+        if rows:
+            f.write('\n'.join(rows))
+        
+        # Write table end
+        f.write(TABLE_END)
+
+@click.command()
+@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
+@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO',
+              help='Logging level.')
+@click.option('--limit', '-n', type=int, default=None,
+              help='Maximum number of rows to display. Default: all rows.')
+def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
+    """Render the Dataset table to an HTML file."""
+    logging.basicConfig(
+        level=getattr(logging, log_level),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    logger.info(f"Connecting to database at {db_path}")
+    db.init(db_path)
+    db.connect()
+    
+    try:
+        logger.info("Starting HTML generation...")
+        datasets_query = Dataset.select().order_by(Dataset.id)
+        if limit:
+            datasets_query = datasets_query.limit(limit)
+            logger.info(f"Limited to {limit} rows")
+        
+        logger.info(f"Rendering HTML to {output_path}")
+        render_html(datasets_query, output_path)
+        logger.info("Done!")
+        
+    finally:
+        db.close()
+
+if __name__ == "__main__":
+    main()
--- a/scripts/collection/s3_tools.py
+++ b/scripts/collection/s3_tools.py
@ -0,0 +1,118 @@
+import boto3
+import click
+from tqdm import tqdm
+import logging
+from itertools import islice
+
+logger = logging.getLogger(__name__)
+
+def get_delete_markers(s3_client, bucket: str, prefix: str):
+    """Get all delete markers for objects with the given prefix."""
+    paginator = s3_client.get_paginator('list_object_versions')
+    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
+        if 'DeleteMarkers' in page:
+            yield [
+                {
+                    'Key': marker['Key'],
+                    'VersionId': marker['VersionId']
+                }
+                for marker in page['DeleteMarkers']
+                if marker['IsLatest']
+            ]
+
+def remove_delete_markers(s3_client, bucket: str, prefix: str, dry_run: bool = False):
+    """Remove all delete markers for objects with the given prefix."""
+    for marker_batch in get_delete_markers(s3_client, bucket, prefix):
+        response = s3_client.delete_objects(
+            Bucket=bucket,
+            Delete={
+                'Objects': marker_batch,
+                'Quiet': True
+            }
+        )
+        
+        # Log any errors
+        if 'Errors' in response:
+            for error in response['Errors']:
+                logger.error(f"Failed to remove marker for {error['Key']}: {error['Message']}")
+
+def get_empty_files(s3_client, bucket: str, prefix: str):
+    """Get all objects with size zero under the given prefix."""
+    paginator = s3_client.get_paginator('list_objects_v2')
+    for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="pages"):
+        if 'Contents' in page:
+            yield [
+                {'Key': obj['Key']}
+                for obj in page['Contents']
+                if obj['Size'] == 0
+            ]
+
+def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = False):
+    """Delete all zero-size objects under the given prefix."""
+    pbar = tqdm(desc="deleted")
+    for empty_batch in get_empty_files(s3_client, bucket, prefix):
+        if not empty_batch:
+            continue
+
+        if dry_run:
+            for obj in empty_batch:
+                logger.info(f"Would delete empty file: {obj['Key']}")
+            continue
+
+        pbar.update(len(empty_batch))
+
+        response = s3_client.delete_objects(
+            Bucket=bucket,
+            Delete={
+                'Objects': empty_batch,
+                'Quiet': True
+            }
+        )
+        
+        # Log any errors
+        if 'Errors' in response:
+            for error in response['Errors']:
+                logger.error(f"Failed to delete {error['Key']}: {error['Message']}")
+
+    pbar.close()
+
+@click.group()
+def cli():
+    """S3 object management commands."""
+    pass
+
+@cli.command()
+@click.argument('s3_path')
+@click.option('--profile', help='AWS profile name', default='sc-direct')
+@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
+@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO', help='Set logging level')
+def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+    """Remove delete markers from versioned S3 objects, effectively undeleting them."""
+    logging.basicConfig(level=log_level)
+    bucket, prefix = s3_path.split('/', 1)
+    
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+    
+    remove_delete_markers(s3_client, bucket, prefix, dry_run)
+
+@cli.command()
+@click.argument('s3_path')
+@click.option('--profile', help='AWS profile name', default='sc-direct')
+@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
+@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO', help='Set logging level')
+def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+    """Delete all zero-size objects under the given prefix."""
+    logging.basicConfig(level=log_level)
+    bucket, prefix = s3_path.split('/', 1)
+    
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+    
+    delete_empty_files(s3_client, bucket, prefix, dry_run)
+
+if __name__ == '__main__':
+    cli()
+
--- a/scripts/collection/sync.py
+++ b/scripts/collection/sync.py
@ -0,0 +1,31 @@
+import boto3
+import click
+import json
+from pathlib import Path
+import logging
+
+logger = logging.getLogger(__name__)
+
+@click.command()
+@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
+              default='collections/collections.json',
+              help='Path to collections configuration file.')
+def main(collections_file: Path):
+    # Load collections config
+    collections = json.loads(collections_file.read_text())
+    collections_dir = collections_file.parent
+
+    for collection in collections:
+        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
+        collection_path = collections_dir / collection['directory']
+        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
+
+        for file_path in collection_path.rglob('*'):
+            if file_path.is_file():
+                relative_path = file_path.relative_to(collection_path)
+                s3_key = f"{s3_prefix}/{relative_path}"
+                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
+                s3.upload_file(str(file_path), bucket_name, s3_key)
+
+if __name__ == '__main__':
+    main()
--- a/scripts/collection/verify_upload.py
+++ b/scripts/collection/verify_upload.py
@ -0,0 +1,91 @@
+from pathlib import Path
+import json
+import zipfile
+import tempfile
+import requests
+import click
+import logging
+from nabit.bin.utils import cli_validate
+logger = logging.getLogger(__name__)
+
+def download_file(url: str, target_path: Path):
+    """Download a file from URL to target path"""
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    with target_path.open('wb') as f:
+        for chunk in response.iter_content(chunk_size=2**20):
+            f.write(chunk)
+
+def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
+    """
+    Verify a dataset by downloading and checking its JSON metadata and ZIP contents.
+    If output_dir is provided, write the uncompressed contents there.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir = Path(tmpdir)
+        
+        # Download files
+        logger.info(f"Downloading metadata from {json_url}...")
+        json_path = tmpdir / "metadata.json"
+        download_file(json_url, json_path)
+        
+        logger.info(f"Downloading archive from {zip_url}...")
+        zip_path = tmpdir / "data.zip"
+        download_file(zip_url, zip_path)
+        
+        # Load metadata
+        metadata = json.loads(json_path.read_text())
+        
+        # Create output directory
+        if not output_dir:
+            output_dir = tmpdir / "output"
+        output_dir.mkdir(parents=True, exist_ok=True)
+        
+        # Verify file contents
+        logger.info("Verifying file contents...")
+        with zip_path.open('rb') as f:
+            for entry in metadata['zip_entries']:
+                logger.info(f"Checking {entry['filename']}...")
+                f.seek(entry['data_offset'])
+                zip_data = f.read(entry['compress_size'])
+                
+                if entry['compress_type'] == zipfile.ZIP_STORED:
+                    uncompressed = zip_data
+                else:
+                    decompressor = zipfile._get_decompressor(entry['compress_type'])
+                    uncompressed = decompressor.decompress(zip_data)
+                
+                # write the file
+                output_file = output_dir / entry['filename']
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+                output_file.write_bytes(uncompressed)
+
+        logger.info("All files extracted successfully")
+
+        # verify dataset with nabit
+        cli_validate(output_dir)
+        
+        # Return metadata for potential further use
+        return metadata
+
+@click.command()
+@click.argument('json_url', type=str)
+@click.argument('zip_url', type=str)
+@click.option('--output', '-o', type=click.Path(path_type=Path), 
+              help='Directory to write uncompressed files')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO',
+              help='Logging level.')
+def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
+    """Verify dataset from JSON and ZIP URLs"""
+    # Set up logging
+    logging.basicConfig(
+        level=getattr(logging, log_level),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    verify_dataset(json_url, zip_url, output)
+
+if __name__ == '__main__':
+    main()
--- a/scripts/data_gov/diff/diff.py
+++ b/scripts/data_gov/diff/diff.py
@ -0,0 +1,127 @@
+import json
+import click
+from pathlib import Path
+from typing import Dict, List, Set, Tuple
+import logging
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def load_jsonl_data(jsonl_path: Path, keep_fields=None, compare_by: str = 'id') -> Dict[str, dict]:
+    """
+    Load data from JSONL file into a dictionary keyed by id.
+    Only includes fields that match the CSV format.
+    
+    Args:
+        jsonl_path: Path to the JSONL file
+        
+    Returns:
+        Dictionary mapping id to filtered record data
+    """
+    # Fields to keep from JSONL records
+    
+    data = {}
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line in tqdm(f, desc="Loading JSONL"):
+            if line.strip():  # Skip empty lines
+                record = json.loads(line)
+                if keep_fields:
+                    record = {k: v for k, v in record.items() if k in keep_fields}
+                data[record[compare_by]] = record
+                
+    return data
+
+def find_differences(csv_data: Dict[str, dict], 
+                    jsonl_data: Dict[str, dict]) -> Tuple[Set[str], Set[str], Set[str]]:
+    """
+    Find records that differ between CSV and JSONL data.
+    
+    Args:
+        csv_data: Dictionary of CSV records keyed by id
+        jsonl_data: Dictionary of JSONL records keyed by id
+        
+    Returns:
+        Tuple of (csv_only_ids, jsonl_only_ids, different_ids)
+    """
+    csv_ids = set(csv_data.keys())
+    jsonl_ids = set(jsonl_data.keys())
+    
+    # Find records only in CSV
+    csv_only = csv_ids - jsonl_ids
+    
+    # Find records only in JSONL
+    jsonl_only = jsonl_ids - csv_ids
+    
+    return csv_only, jsonl_only
+
+@click.command()
+@click.argument('old_path', type=click.Path(exists=True, path_type=Path))
+@click.argument('new_path', type=click.Path(exists=True, path_type=Path))
+@click.option('--compare-by', '-c',
+              default='id',
+              help='Field to compare by.')
+@click.option('--log-level', '-l',
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
+              default='INFO',
+              help='Logging level.')
+def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
+    """Compare records between CSV and JSONL files."""
+    logging.basicConfig(
+        level=getattr(logging, log_level),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    old_data = load_jsonl_data(old_path, compare_by=compare_by)
+    new_data = load_jsonl_data(new_path, compare_by=compare_by)
+    
+    # Find differences
+    old_only, new_only = find_differences(old_data, new_data)
+
+    old_only_path = old_path.with_suffix(f'.only_{compare_by}.jsonl')
+    new_only_path = new_path.with_suffix(f'.only_{compare_by}.jsonl')
+
+    logger.info(f"Writing {len(old_only)} records to {old_only_path}")
+    with open(old_only_path, 'w', encoding='utf-8') as f:
+        for id in old_only:
+            f.write(json.dumps(old_data[id]) + '\n')
+
+    logger.info(f"Writing {len(new_only)} records to {new_only_path}")
+    with open(new_only_path, 'w', encoding='utf-8') as f:
+        for id in new_only:
+            f.write(json.dumps(new_data[id]) + '\n')
+
+if __name__ == '__main__':
+    main() 
+
+
+
+# import sqlite3
+# import json
+
+# # Connect to the database
+# conn = sqlite3.connect('data/data.db')
+# conn.row_factory = sqlite3.Row  # This allows us to access columns by name
+
+# # Open the output file
+# with open('data/data_db_dump_20250130.jsonl', 'w') as f:
+#     # Execute the query and fetch rows in chunks
+#     cursor = conn.execute('''
+#         SELECT *
+#         FROM dataset
+#     ''')
+    
+#     written = 0
+#     while True:
+#         rows = cursor.fetchmany(1000)  # Fetch 1000 rows at a time
+#         if not rows:
+#             break
+#         written += len(rows)
+#         # Write each row as a JSON line
+#         for row in rows:
+#             # Convert row to dict and write to file
+#             json_line = json.dumps(dict(row))
+#             f.write(json_line + '\n')
+#         print(f"Wrote {written} rows")
+
+# conn.close()
--- a/scripts/data_gov/diff/diff_analyze.py
+++ b/scripts/data_gov/diff/diff_analyze.py
@ -0,0 +1,38 @@
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+
+# Read the JSONL file and count crawler_identified_date values
+downloaded_counts = Counter()
+identified_counts = Counter()
+titles_by_org = defaultdict(list)
+with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
+    for line in f:
+          data = json.loads(line)
+          org = json.loads(data.get('organization', '{}'))
+          identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
+          titles_by_org[org['title']].append(data["title"])
+
+# Print the counts sorted by date
+for date, count in sorted(identified_counts.items()):
+    print(f"{date}: {count}")
+
+# sort each list of titles by org
+for org, titles in titles_by_org.items():
+    titles_by_org[org].sort()
+Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
+
+
+# print urls
+for path in Path('data/').glob('glass*'):
+     print(path)
+     with open(path, 'r') as f:
+          for line in f:
+               data = json.loads(line)
+               print("* " + data['name'])
+               resources = data.get('resources', [])
+               if type(resources) == str:
+                    resources = json.loads(resources)
+               for resource in resources:
+                    print(' * ' + resource['url'])
--- a/scripts/data_gov/fetch_data.py
+++ b/scripts/data_gov/fetch_data.py
@ -0,0 +1,318 @@
+from nabit.lib.archive import package
+from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
+from nabit.lib.backends.url import UrlCollectionTask
+from pathlib import Path
+import json
+import uuid
+import tempfile
+import click
+import os
+from urllib.parse import urlparse
+import re
+from scripts.helpers.parallel import run_parallel
+import zipfile
+import struct
+import boto3
+import logging
+from scripts.data_gov.models import db, Dataset
+from playhouse.shortcuts import model_to_dict
+from tqdm import tqdm
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+## download data.gov datasets, create nabit archives, and upload to S3
+
+# File extensions that are already compressed or wouldn't benefit from additional compression
+UNCOMPRESSED_EXTENSIONS = {
+    # Already compressed archives
+    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
+    # Compressed images
+    'jpg', 'jpeg', 'png', 'gif', 'webp',
+    # Compressed video/audio
+    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
+    # Other compressed/binary formats
+    'pdf', 'docx', 'xlsx', 'pptx',
+}
+
+stats_counter = {}
+
+def is_valid_url(url):
+    parsed = urlparse(url)
+    return parsed.scheme in ['http', 'https'] and re.search(r'[^\.]\.[^\.]', parsed.netloc)
+
+def extract_urls(data, urls = None):
+    urls = set() if urls is None else urls
+    if isinstance(data, dict):
+        for key, value in data.items():
+            if isinstance(value, str):
+                if is_valid_url(value):
+                    urls.add(value)
+            elif isinstance(value, (dict, list)):
+                extract_urls(value, urls)
+    elif isinstance(data, list):
+        for item in data:
+            extract_urls(item, urls)
+    return urls
+
+def create_archive(bag_dir, dataset: Dataset, signatures):
+    data_dict = model_to_dict(dataset)
+    for key, value in data_dict.items():
+        if isinstance(value, datetime):
+            data_dict[key] = value.isoformat()
+    data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
+    collect = [
+        *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
+    ]
+    logger.info(f"  - Downloading {len(collect)} files")
+
+    # sort fields from dataset
+    data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
+    crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
+
+    # Create the archive
+    package(
+        output_path=bag_dir,
+        collect=collect,
+        collect_errors='ignore',
+        signed_metadata={
+            'id': str(uuid.uuid4()),
+            'url': data_gov_url,
+            'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
+            'data_gov_metadata': data_gov_metadata,
+            'crawler_metadata': crawler_metadata,
+        },
+        signatures=signatures,
+    )
+
+def zip_archive(bag_dir, archive_path):
+    # Create zip archive
+    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for file_path in bag_dir.rglob('*'):
+            if file_path.is_file():
+                arc_path = file_path.relative_to(bag_dir)
+                compression = (zipfile.ZIP_STORED 
+                            if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS 
+                            else zipfile.ZIP_DEFLATED)
+                zf.write(file_path, arc_path, compress_type=compression)
+
+    # Create metadata file
+    zip_info = []
+    with zipfile.ZipFile(archive_path, 'r') as zf:
+        for info in zf.filelist:
+            header_offset = info.header_offset
+            
+            # Read header to calculate data offset
+            zf.fp.seek(header_offset)
+            header = zf.fp.read(zipfile.sizeFileHeader)
+            fheader = struct.unpack(zipfile.structFileHeader, header)
+            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
+            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
+            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
+            
+            zip_info.append({
+                'filename': info.filename,
+                'file_size': info.file_size,
+                'compress_size': info.compress_size,
+                'compress_type': info.compress_type,
+                'header_offset': header_offset,
+                'data_offset': data_offset,
+            })
+        
+    # Read the bag-info.txt and signed-metadata.json
+    bag_info = (bag_dir / 'bag-info.txt').read_text()
+    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
+
+    return {
+        'bag_info': bag_info,
+        'signed_metadata': signed_metadata,
+        'zip_entries': zip_info
+    }
+
+def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
+    s3 = boto3.Session(**session_args).client('s3')
+    bucket_name, s3_path = s3_path.split('/', 1)
+
+    # Upload zip file
+    s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
+    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
+    logger.info(f"  - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
+
+    # Upload metadata file
+    s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
+    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
+    logger.info(f"  - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
+
+
+def run_pipeline(
+        dataset: Dataset, 
+        output_path: Path, 
+        metadata_path: Path, 
+        collection_path: Path, 
+        signatures: list = None,
+        session_args: dict = None,
+        s3_path: str = None,
+        no_delete: bool = False,
+    ):
+    logger.info(f"Processing dataset: {dataset.name}")
+    
+    # we have a db forked from the main process, so we need to close it and reopen if needed
+    db.close()
+
+    # set this here so it makes it into the metadata
+    dataset.crawler_downloaded_date = datetime.now()
+
+    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
+        logger.info("- Creating archive...")
+        # set up paths
+        temp_dir = Path(temp_dir)
+        bag_dir = temp_dir / 'bag'
+        archive_path = temp_dir / 'archive.zip'
+
+        # download data with nabit
+        create_archive(bag_dir, dataset, signatures)
+
+        logger.info("- Zipping archive...")
+        # zip up data and create metadata
+        output_metadata = zip_archive(bag_dir, archive_path)
+
+        logger.info("- Moving files to final location...")
+        # Move files to final location
+        collection_path.parent.mkdir(parents=True, exist_ok=True)
+        metadata_path.parent.mkdir(parents=True, exist_ok=True)
+        os.rename(str(archive_path), collection_path)
+        metadata_path.write_text(json.dumps(output_metadata) + '\n')
+
+    if s3_path:
+        logger.info("Uploading to S3...")
+        upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
+
+    if not no_delete:
+        logger.info("- Deleting zip archive...")
+        os.remove(collection_path)
+        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
+            os.rmdir(collection_path.parent)
+
+    logger.info("- Setting crawler_downloaded_date...")
+    db.connect()
+    dataset.save()
+    
+    logger.info("Processing complete")
+
+def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int = 0, dataset_name: str = None):
+    """Get datasets from SQLite that don't have metadata files yet."""
+    query = Dataset.select()
+    
+    if dataset_name:
+        query = query.where(Dataset.name == dataset_name)
+    if min_size:
+        query = query.where(Dataset.size >= min_size)
+
+    # Initialize progress bars
+    stats_counter['total'] = tqdm(desc="Total records", unit="pkg")
+    stats_counter['skipped'] = tqdm(desc="Already processed", unit="pkg")
+    stats_counter['yielded'] = tqdm(desc="Processing", unit="pkg")
+
+    for dataset in query:
+        stats_counter['total'].update(1)
+        
+        # Check if metadata file exists
+        name = dataset.name
+        metadata_path = output_path / 'metadata' / collection / name / 'v1.json'
+        
+        if metadata_path.exists():
+            stats_counter['skipped'].update(1)
+            continue
+            
+        stats_counter['yielded'].update(1)
+        yield dataset
+
+
+@click.command()
+@click.option('--db-path', '-d', type=click.Path(exists=True, path_type=Path), default='data/data.db')
+@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
+              help='Output path.')
+@click.option('--collection', '-c', type=str, default='data_gov',
+              help='Collection name.')
+@click.option('--workers', '-w', type=int, default=None, 
+              help='Number of worker processes. Defaults to CPU count.')
+@click.option('--min-size', '-s', type=int, default=0,
+              help='Minimum size of dataset to process.')
+@click.option('--dataset-name', help='Dataset name to process.')
+@click.option('--if-exists', '-e', type=click.Choice(['skip', 'replace', 'version']), default='skip',
+              help='Whether to skip, replace, or add a version if dataset already exists.')
+@click.option('--signatures', help='JSON string of signature configuration.')
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
+@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
+              help='Logging level.')
+@click.option('--stop-after', help='Stop after processing this many collections', type=int)
+@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
+def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
+         if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
+    
+    if dataset_name:
+        workers = 1
+        stop_after = 1
+
+    if signatures:
+        signatures = json.loads(signatures)
+        for signature in signatures:
+            if signature['action'] == 'sign':
+                if is_encrypted_key(signature['params']['key']):
+                    signature['params']['password'] = click.prompt(
+                        f"Enter password for {signature['params']['key']}: ", 
+                        hide_input=True
+                    )
+            elif signature['action'] == 'timestamp':
+                if known_tsa := signature.pop('known_tsa', None):
+                    signature['params'] = KNOWN_TSAS[known_tsa]
+
+    session_args = {}
+    if profile:
+        session_args['profile_name'] = profile
+
+    # Initialize database connection
+    db.init(db_path)
+    db.connect()
+
+    def get_tasks():
+        processed = 0
+        for dataset in get_unprocessed_datasets(output_path, collection, min_size, dataset_name):
+            # handle existing datasets
+            name = dataset.name
+            collection_path = output_path / 'collections' / collection / name / 'v1.zip'
+            metadata_path = output_path / 'metadata' / collection / name / 'v1.json'
+            
+            if metadata_path.exists():
+                if if_exists == 'skip':
+                    continue
+                elif if_exists == 'replace':
+                    metadata_path.unlink()
+                    if collection_path.exists():
+                        collection_path.unlink()
+                elif if_exists == 'version':
+                    version = 2
+                    while True:
+                        collection_path = output_path / 'collections' / collection / name / f'v{version}.zip'
+                        metadata_path = output_path / 'metadata' / collection / name / f'v{version}.json'
+                        if not metadata_path.exists():
+                            break
+                        version += 1
+
+            yield dataset, output_path, metadata_path, collection_path, signatures, session_args, s3_path, no_delete
+
+            processed += 1
+            if stop_after and processed >= stop_after:
+                break
+
+    try:
+        run_parallel(run_pipeline, get_tasks(), workers, log_level=log_level, catch_errors=False)
+    finally:
+        # Close progress bars
+        for counter in stats_counter.values():
+            counter.close()
+        db.close()
+
+if __name__ == '__main__':
+    main()
--- a/scripts/data_gov/fetch_index.py
+++ b/scripts/data_gov/fetch_index.py
@ -0,0 +1,299 @@
+import httpx
+from typing import Iterator, Dict, Any, List
+import time
+import click
+from pathlib import Path
+import logging
+from datetime import datetime
+from scripts.data_gov.models import db, Dataset, DatasetHistory
+from tqdm import tqdm
+from playhouse.shortcuts import model_to_dict
+from jsondiff import diff
+
+logger = logging.getLogger(__name__)
+
+stats_counter = {}
+
+def init_database(db_path: Path) -> None:
+    """Initialize the database connection and create tables."""
+    db.init(db_path)
+    db.connect()
+    db.create_tables([Dataset, DatasetHistory])
+
+def save_to_database(results: List[Dict[str, Any]]) -> None:
+    """
+    Save a batch of packages to the database using Peewee.
+    """
+    if not results:
+        return
+
+    # Process datetime fields in incoming records
+    for package in results:
+        for field in ['metadata_created', 'metadata_modified']:
+            if package.get(field):
+                try:
+                    package[field] = datetime.fromisoformat(
+                        package[field].replace('Z', '+00:00')
+                    )
+                except ValueError:
+                    package[field] = None
+
+    # Get all IDs from incoming packages
+    incoming_ids = [pkg['id'] for pkg in results]
+    
+    # Fetch existing records as model instances
+    existing_records = {
+        record.id: record
+        for record in Dataset.select().where(Dataset.id << incoming_ids)
+    }
+
+    # Prepare bulk operations
+    history_records = []
+    new_records = []
+
+    # Compare records and prepare operations
+    for package_data in results:
+        # Create a new model instance from the package data
+        new_package = Dataset(**package_data)
+        existing = existing_records.get(package_data['id'])
+        
+        if existing:
+            # Compare model instances using their dict representations
+            if diff(model_to_dict(existing), model_to_dict(new_package)):
+                # Record changed - add to history and update
+                history_records.append(existing)
+                new_records.append(new_package)
+                stats_counter['updated'].update(1)
+            else:
+                # Record unchanged - skip
+                stats_counter['skipped'].update(1)
+                continue
+        else:
+            # New record - just add it
+            new_records.append(new_package)
+            stats_counter['new'].update(1)
+
+    with db.atomic():
+        # Bulk move history records if any exist
+        if history_records:
+            DatasetHistory.bulk_create(history_records)
+            Dataset.delete().where(Dataset.id << [h.id for h in history_records]).execute()
+            
+        # Bulk insert new records
+        if new_records:
+            Dataset.bulk_create(new_records)
+
+def save_packages_to_database(output_path: Path, rows_per_page: int = 1000, start_date: str | None = None) -> None:
+    """
+    Save fetched data to the database, resuming from last position if needed.
+    
+    Args:
+        output_path: Path to save the database
+        rows_per_page: Number of results to fetch per page
+        start_date: Optional date to start fetching from
+    """
+    stats_counter['new'] = tqdm(desc="New records", unit="pkg")
+    stats_counter['updated'] = tqdm(desc="Updated records", unit="pkg")
+    stats_counter['skipped'] = tqdm(desc="Unchanged records", unit="pkg")
+
+    init_database(output_path)
+    
+    try:
+        for results in tqdm(fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date, max_retries=10)):
+            save_to_database(results)
+    finally:
+        db.close()
+
+def fetch_data_gov_packages(rows_per_page: int = 1000, start_date: str = None, max_retries: int = 3) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch package data from data.gov API using date-based pagination.
+    
+    Args:
+        rows_per_page: Number of results to fetch per page
+        start_date: Optional date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)
+        max_retries: Maximum number of retry attempts for 5xx errors
+    
+    Yields:
+        Dict containing package data for each result
+    """
+    
+    base_url = "https://catalog.data.gov/api/3/action/package_search"
+    current_date = start_date
+    total_records = 0
+    
+    while True:
+        logger.info(f"Current date offset: {current_date}")
+
+        # Build date filter query
+        url = f"{base_url}?rows={rows_per_page}&sort=metadata_modified+desc"
+        if current_date:
+            # Format date to match Solr's expected format (dropping microseconds)
+            formatted_date = current_date.split('.')[0] + 'Z'
+            date_filter = f"+metadata_modified:[* TO {formatted_date}]"
+            url += f"&fq={date_filter}"
+        
+        for attempt in range(max_retries):
+            try:
+                start_time = time.time()
+                response = httpx.get(url, timeout=60.0)
+                request_time = time.time() - start_time
+                
+                response.raise_for_status()
+                break  # Success, exit retry loop
+                
+            except httpx.HTTPStatusError as e:
+                if e.response.status_code >= 500 and attempt < max_retries - 1:
+                    retry_wait = 2 ** attempt  # Exponential backoff
+                    logger.warning(f"Got {e.response.status_code}, retrying in {retry_wait}s... (attempt {attempt + 1}/{max_retries})")
+                    logger.warning(f"Error URL: {url}")
+                    time.sleep(retry_wait)
+                    continue
+                # If not a 5xx error or we're out of retries, re-raise
+                logger.error(f"Error URL: {url}")
+                logger.error(f"Response content: {response.text}")
+                raise
+        
+        data = response.json()
+        results = data["result"]["results"]
+        
+        if not results:
+            break
+            
+        # Get date of last result for next query
+        current_date = results[-1]["metadata_modified"]
+
+        total_records += len(results)
+        logger.info(f"Request took {request_time:.2f}s. Total records: {total_records}")
+            
+        yield results
+        
+        time.sleep(1)
+
+def get_dataset_history(dataset_name: str) -> None:
+    """
+    Fetch and display all versions of a dataset with the given ID,
+    from oldest to newest, showing only changed fields between versions.
+    """
+    # Get all versions including current
+    versions = [
+        model_to_dict(record, recurse=True)
+        for record in (DatasetHistory
+                      .select()
+                      .where(DatasetHistory.name == dataset_name)
+                      .order_by(DatasetHistory.metadata_modified))
+    ]
+    current_record = Dataset.select().where(Dataset.name == dataset_name).first()
+    if current_record:
+        versions.append(model_to_dict(current_record, recurse=True))
+
+    if not versions:
+        print(f"No dataset found with name: {dataset_name}")
+        return
+    
+    # Print each version with changed fields
+    prev = None
+    for curr in versions:
+        history_id = curr.pop('history_id', None)
+        if prev:
+            diff_fields = diff(prev, curr)
+        else:
+            diff_fields = curr
+
+        print(f"*** Version: {curr.get('metadata_modified')} ***")
+        for k, v in diff_fields.items():
+            print(f"- {k}: {v}")
+        print("\n")
+        prev = curr
+
+@click.group()
+def cli():
+    """Data.gov dataset mirroring tools."""
+    pass
+
+# Modify the existing main function to be a command in the group
+@cli.command()
+@click.argument('output_path', type=click.Path(path_type=Path), default='data/data.db')
+@click.option('--rows-per-page', '-r', type=int, default=1000,
+              help='Number of results to fetch per page.')
+@click.option('--start-date', '-s', type=str, default=None,
+              help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='WARNING',
+              help='Logging level.')
+def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
+    """Fetch package data from data.gov API and save to database."""
+    logging.basicConfig(
+        level=getattr(logging, log_level),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    save_packages_to_database(output_path, rows_per_page, start_date)
+
+@cli.command()
+@click.argument('dataset_name')
+@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
+def history(dataset_name: str, db_path: Path):
+    """Show version history for a dataset with the given ID."""
+    init_database(db_path)
+    try:
+        get_dataset_history(dataset_name)
+    finally:
+        db.close()
+
+@cli.command()
+@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
+def delete_duplicate_history(db_path: Path):
+    """Delete duplicate history records."""
+    init_database(db_path)
+    try:
+        # Get all unique dataset names in history
+        unique_names = (DatasetHistory
+                       .select(DatasetHistory.name)
+                       .distinct()
+                       .tuples())
+
+        total_deleted = 0
+        for (name,) in tqdm(unique_names, desc="Processing datasets"):
+            # Get all versions for this dataset ordered by modification date
+            versions = [
+                model_to_dict(record)
+                for record in (DatasetHistory
+                             .select()
+                             .where(DatasetHistory.name == name)
+                             .order_by(DatasetHistory.metadata_modified))
+            ]
+            current_record = Dataset.select().where(Dataset.name == name).first()
+            if current_record:
+                versions.append(model_to_dict(current_record))
+
+            # Track IDs of duplicate records to delete
+            to_delete = []
+            
+            # Compare adjacent versions
+            prev = versions[0]
+            prev_id = prev.pop('history_id')
+            for curr in versions[1:]:
+                curr_id = curr.pop('history_id', None)
+                
+                # If versions are identical, mark current version for deletion
+                if not diff(prev, curr):
+                    to_delete.append(prev_id)
+                prev = curr
+                prev_id = curr_id
+
+            # Bulk delete duplicate records
+            if to_delete:
+                deleted = (DatasetHistory
+                          .delete()
+                          .where(DatasetHistory.history_id << to_delete)
+                          .execute())
+                total_deleted += deleted
+
+        click.echo(f"Deleted {total_deleted} duplicate history records")
+    finally:
+        db.close()
+
+if __name__ == "__main__":
+    cli()
+
--- a/scripts/data_gov/fetch_jsonl.py
+++ b/scripts/data_gov/fetch_jsonl.py
@ -0,0 +1,35 @@
+import httpx
+import json
+import time
+import logging
+from pathlib import Path
+from typing import Iterator, Dict, Any, List
+import click
+from scripts.data_gov.fetch_index import fetch_data_gov_packages
+
+logger = logging.getLogger(__name__)
+
+@click.command()
+@click.argument('output_path', type=click.Path(path_type=Path), default='data/data_20250130.jsonl')
+@click.option('--rows-per-page', '-r', type=int, default=1000,
+              help='Number of results to fetch per page.')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default='INFO',
+              help='Logging level.')
+@click.option('--start-date', '-s', type=str, default=None,
+              help='Start date for fetching packages in YYYY-MM-DD format.')
+def main(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
+    """Fetch all package data from data.gov API and save to JSONL file."""
+    logging.basicConfig(
+        level=getattr(logging, log_level),
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+    
+    with open(output_path, 'a') as f:
+        for results in fetch_data_gov_packages(rows_per_page=rows_per_page, start_date=start_date):
+            for package in results:
+                f.write(json.dumps(package) + '\n')
+
+if __name__ == "__main__":
+    main()
--- a/scripts/data_gov/migrate.py
+++ b/scripts/data_gov/migrate.py
@ -0,0 +1,18 @@
+from playhouse.migrate import *
+from scripts.data_gov.models import db
+
+migrator = SqliteMigrator(db)
+
+def do_migrate():
+    crawler_identified_date = DateTimeField(null=True)
+    crawler_downloaded_date = DateTimeField(null=True)
+    with db.atomic():
+        migrate(
+            # migrator.add_column('dataset', 'crawler_identified_date', crawler_identified_date),
+            # migrator.add_column('dataset', 'crawler_downloaded_date', crawler_downloaded_date),
+            # migrator.add_column('datasethistory', 'crawler_identified_date', crawler_identified_date),
+            # migrator.add_column('datasethistory', 'crawler_downloaded_date', crawler_downloaded_date),
+        )
+
+if __name__ == '__main__':
+    do_migrate()
--- a/scripts/data_gov/models.py
+++ b/scripts/data_gov/models.py
@ -0,0 +1,61 @@
+from peewee import *
+from playhouse.sqlite_ext import JSONField
+from pathlib import Path
+from datetime import datetime
+
+db = SqliteDatabase(Path(__file__).parent.parent.parent / 'data/data.db', pragmas={
+    # tuning suggested by Claude:
+    'journal_mode': 'wal',          # Write-Ahead Logging for better concurrency
+    'cache_size': -1024 * 64,       # 64MB cache (negative number means kibibytes)
+    'synchronous': 'normal',        # Good balance between safety and speed
+    'busy_timeout': 30000,          # Wait up to 30 seconds when database is locked
+    'temp_store': 'memory',         # Store temp tables in memory
+    'mmap_size': 268435456,         # Memory-mapped I/O (256MB)
+    'page_size': 4096,              # Optimal for most systems
+})
+
+class BaseModel(Model):
+    class Meta:
+        database = db
+
+class Dataset(BaseModel):
+    # fields from data.gov
+    id = CharField(primary_key=True)
+    name = CharField(null=True)
+    title = CharField(null=True)
+    notes = TextField(null=True)
+    metadata_created = DateTimeField(null=True)
+    metadata_modified = DateTimeField(null=True)
+    private = BooleanField(null=True)
+    state = CharField(null=True)
+    version = CharField(null=True)
+    type = CharField(null=True)
+    num_resources = IntegerField(null=True)
+    num_tags = IntegerField(null=True)
+    isopen = BooleanField(null=True)
+    author = CharField(null=True)
+    author_email = CharField(null=True)
+    creator_user_id = CharField(null=True)
+    license_id = CharField(null=True)
+    license_url = CharField(null=True)
+    license_title = CharField(null=True)
+    maintainer = CharField(null=True)
+    maintainer_email = CharField(null=True)
+    owner_org = CharField(null=True)
+    url = CharField(null=True)
+    organization = JSONField(null=True)
+    extras = JSONField(null=True)
+    resources = JSONField(null=True)
+    tags = JSONField(null=True)
+    groups = JSONField(null=True)
+    relationships_as_subject = JSONField(null=True)
+    relationships_as_object = JSONField(null=True)
+
+    # fields starting with crawler_ are added by our crawler
+    crawler_identified_date = DateTimeField(null=True, default=datetime.now)
+    crawler_downloaded_date = DateTimeField(null=True)
+
+class DatasetHistory(Dataset):
+    history_id = AutoField(primary_key=True)
+    id = CharField()  # Regular CharField, not primary key
+    #deleted_by_date = DateTimeField(null=True)  # New field to track deletion date
--- a/scripts/github/download_git.py
+++ b/scripts/github/download_git.py
@ -0,0 +1,141 @@
+import csv
+import logging
+from pathlib import Path
+from scripts.helpers.parallel import run_parallel
+import click
+from tqdm import tqdm
+from gitspoke import Downloader, GitHubAPI
+from gitspoke.cli import valid_include_items, get_token
+import os
+import json
+import requests
+from scripts.helpers.config import load_config
+
+logger = logging.getLogger(__name__)
+stats_counter = {}
+
+CONFIG_PATH = (os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config")) / "data-mirror" / "config.json"
+
+def check_repo_exists(org_name, repo_name, token, output_path=None):
+    """Check if a repository still exists on GitHub."""
+    exists = True
+    try:
+        GitHubAPI(token).request(f"repos/{org_name}/{repo_name}", method="HEAD")
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 404:
+            exists = False
+        else:
+            raise e
+    if not exists:
+        repo_link = f"https://github.com/{org_name}/{repo_name}"
+        print(repo_link)
+        if output_path:
+            with open(output_path, 'a') as output_file:
+                output_file.write(f"{repo_link}\n")
+    return exists
+
+def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
+    """Process a single repository."""
+    if check_exists:
+        return check_repo_exists(org_name, repo_name, token, output_path)
+        
+    logger.info(f"Processing repository: {org_name}/{repo_name}")
+    Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
+    logger.info("Processing complete")
+
+def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
+              check_exists: bool = False):
+    """Get repositories from CSV that haven't been processed yet."""
+    # Initialize progress bars
+    if not check_exists:
+        stats_counter['total'] = tqdm(desc="Total records", unit="repo")
+        if skip_existing:
+            stats_counter['skipped'] = tqdm(desc="Skipped", unit="repo")
+            stats_counter['yielded'] = tqdm(desc="Processing", unit="repo")
+
+    # handle --include
+    if include:
+        include = include.split(',')
+    else:
+        include = ['repo_info']
+    
+    # import token or tokens
+    config = load_config()
+    if config.get('tokens'):
+        tokens = config['tokens']
+    else:
+        tokens = [get_token(None)]
+    if tokens != [None]:
+        logger.warning(f"Using {len(tokens)} tokens")
+    else:
+        logger.warning("Using unauthenticated rate limits")
+
+    with open(csv_path, 'r') as file:
+        reader = csv.DictReader(file)
+        # Skip specified number of rows
+        for _ in range(skip_rows):
+            next(reader)
+            
+        processed = 0
+        for row in reader:
+            if not check_exists:
+                stats_counter['total'].update(1)
+            
+            if not row['html_url']:  # Skip empty rows
+                continue
+                
+            org_name, repo_name = row['html_url'].split('/')[-2:]
+            collection_path = output_path / 'collections' / collection / org_name / repo_name
+            
+            if skip_existing:
+                if collection_path.exists():
+                    stats_counter['skipped'].update(1)
+                    continue
+                else:
+                    stats_counter['yielded'].update(1)
+
+            # use tokens round robin
+            token = tokens[processed % len(tokens)]
+
+            yield org_name, repo_name, collection_path, include, token, check_exists, output_path
+
+            processed += 1
+            if stop_after and processed >= stop_after:
+                break
+
+    # Close progress bars
+    for counter in stats_counter.values():
+        counter.close()
+
+@click.command()
+@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
+              help='Output path.')
+@click.option('--collection', '-c', type=str, default='github_raw',
+              help='Collection name.')
+@click.option('--workers', '-w', type=int, default=None, 
+              help='Number of worker processes. Defaults to CPU count.')
+@click.option('--skip-rows', type=int, default=0,
+              help='Number of rows to skip in the CSV.')
+@click.option('--include', 
+              help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
+@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
+              help='Path to the CSV file.')
+@click.option('--log-level', '-l', 
+              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), 
+              default=None,
+              help='Logging level.')
+@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
+@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
+@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
+def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
+         log_level=None, stop_after=None, skip_existing=False, check_exists=False):
+    
+    run_parallel(
+        run_pipeline,
+        get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
+        workers,
+        log_level=log_level
+    )
+
+if __name__ == "__main__":
+    main()
--- a/scripts/helpers/config.py
+++ b/scripts/helpers/config.py
@ -0,0 +1,13 @@
+import json
+import os
+from pathlib import Path
+
+CONFIG_PATH = (os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config")) / "data-mirror" / "config.json"
+
+def load_config():
+    """Load configuration from config file."""
+    if CONFIG_PATH.exists():
+        config = json.loads(CONFIG_PATH.read_text())
+    else:
+        config = {}
+    return config 
--- a/scripts/helpers/parallel.py
+++ b/scripts/helpers/parallel.py
@ -0,0 +1,65 @@
+from multiprocessing import Queue, Process
+from queue import Empty
+import os
+from tqdm import tqdm
+from typing import Callable, Iterable
+import logging
+
+# Set up logger
+logger = logging.getLogger(__name__)
+
+def worker(task_queue, task, catch_errors: bool = True):
+    while True:
+        try:
+            args = task_queue.get(timeout=1)
+            if args is None:
+                break
+            logger.debug(f"[PID {os.getpid()}] Processing task")
+            task(*args)
+        except Empty:
+            continue
+        except Exception as e:
+            if catch_errors:
+                logger.error(f"[PID {os.getpid()}] Worker error: {e}")
+            else:
+                raise e
+
+
+def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
+    workers = workers or os.cpu_count() or 4
+    
+    # Configure logging based on whether we're running in parallel or not
+    if log_level is None:
+        log_level = 'INFO' if workers == 1 else 'WARNING'
+    logging.basicConfig(
+        level=log_level,
+        format='[%(process)d] %(message)s'
+    )
+    
+    logger.debug(f"Starting processing with {workers} workers")
+    
+    if workers > 1:
+        task_queue = Queue(maxsize=100)
+        
+        # Start worker processes
+        processes = []
+        for _ in range(workers):
+            p = Process(target=worker, args=(task_queue, processor, catch_errors))
+            p.start()
+            processes.append(p)
+
+    # Load tasks into queue
+    for task_item in tqdm(tasks, total=task_count):
+        if workers > 1:
+            task_queue.put(task_item)
+        else:
+            processor(*task_item)
+
+    if workers > 1:
+        # Signal workers to exit
+        for _ in range(workers):
+            task_queue.put(None)
+
+        # Wait for all processes to complete
+        for p in processes:
+            p.join()