metadata updates

2025-09-02 20:26:01 -04:00 · 2025-02-05 15:47:05 -05:00 · 2025-02-05 15:47:05 -05:00 · 9ccee0d422
commit 9ccee0d422
parent a1532df719
7 changed files with 165 additions and 34 deletions
--- a/collections/data_gov/docs/LIL_HLSL_logos.png
+++ b/collections/data_gov/docs/LIL_HLSL_logos.png
--- a/scripts/collection/cloudflare_tools.py
+++ b/scripts/collection/cloudflare_tools.py
@ -3,7 +3,7 @@ from pathlib import Path
 import click
 from cloudflare import Cloudflare
 import os
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config

 logger = logging.getLogger(__name__)

--- a/scripts/collection/s3_tools.py
+++ b/scripts/collection/s3_tools.py
@ -3,6 +3,13 @@ import click
 from tqdm import tqdm
 import logging
 from itertools import islice
+import json
+import gzip
+from io import BytesIO
+import tempfile
+import os
+from scripts.helpers.misc import json_default
+import zipfile

 logger = logging.getLogger(__name__)

@ -76,6 +83,36 @@ def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = Fals

    pbar.close()

+def write_file_listing(s3_client, bucket: str, prefix: str, index_key: str):
+    """Upload a zipped JSONL listing of every object under prefix to index_key.
+
+    Each object record yielded by the ``list_objects_v2`` paginator is written as
+    one JSON line to ``file_listing.jsonl``; that file is deflated into a zip
+    archive, which is then uploaded to ``index_key`` in ``bucket``.
+    """
+    with tempfile.NamedTemporaryFile(mode='wb', suffix='.zip', delete=True) as archive_file:
+        # The ZipFile wraps the outer temp file; the JSONL temp file only lives
+        # long enough to be copied into the archive.
+        with zipfile.ZipFile(archive_file, mode='w', compression=zipfile.ZIP_DEFLATED) as archive, \
+             tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=True) as listing:
+            pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
+            for page in tqdm(pages, desc="indexing"):
+                # Pages without a 'Contents' key contribute nothing.
+                for entry in page.get('Contents', ()):
+                    # json_default handles values json cannot encode on its own.
+                    listing.write(json.dumps(entry, default=json_default) + '\n')
+
+            # The archive reads the listing back by path, so buffered lines must reach disk first.
+            listing.flush()
+            archive.write(listing.name, arcname='file_listing.jsonl')
+
+        # The ZipFile is closed at this point; flush before upload_file reads the archive by name.
+        archive_file.flush()
+        s3_client.upload_file(
+            archive_file.name,
+            bucket,
+            index_key,
+            ExtraArgs={'ContentType': 'application/zip'},
+        )
+
+    logger.info(f"Wrote index to s3://{bucket}/{index_key}")
+
@click.group()
 def cli():
    """S3 object management commands."""
@ -113,6 +150,25 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
    
    delete_empty_files(s3_client, bucket, prefix, dry_run)

+@cli.command()
+@click.argument('s3_path')
+@click.option('--profile', help='AWS profile name', default='sc-direct')
+@click.option('--output', '-o', help='Output path for index file', default=None)
+@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
+              default='INFO', help='Set logging level')
+def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
+    """Write a JSONL index of all files under the given prefix."""
+    logging.basicConfig(level=log_level)
+
+    # s3_path is "<bucket>/<prefix>"; a value without '/' fails to unpack here.
+    bucket, prefix = s3_path.split('/', 1)
+
+    # Without --output, the index key sits directly under the prefix.
+    index_key = output if output is not None else prefix.rstrip('/') + '/file_listing.jsonl.zip'
+
+    s3_client = boto3.Session(profile_name=profile).client('s3')
+    write_file_listing(s3_client, bucket, prefix, index_key)
+
 if __name__ == '__main__':
    cli()

--- a/scripts/collection/sync.py
+++ b/scripts/collection/sync.py
@ -1,31 +0,0 @@
-import boto3
-import click
-import json
-from pathlib import Path
-import logging
-
-logger = logging.getLogger(__name__)
-
-@click.command()
-@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
-              default='collections/collections.json',
-              help='Path to collections configuration file.')
-def main(collections_file: Path):
-    # Load collections config
-    collections = json.loads(collections_file.read_text())
-    collections_dir = collections_file.parent
-
-    for collection in collections:
-        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
-        collection_path = collections_dir / collection['directory']
-        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
-
-        for file_path in collection_path.rglob('*'):
-            if file_path.is_file():
-                relative_path = file_path.relative_to(collection_path)
-                s3_key = f"{s3_prefix}/{relative_path}"
-                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
-                s3.upload_file(str(file_path), bucket_name, s3_key)
-
-if __name__ == '__main__':
-    main()
--- a/scripts/collection/write_metadata.py
+++ b/scripts/collection/write_metadata.py
@ -0,0 +1,99 @@
+import boto3
+import click
+import json
+from pathlib import Path
+import logging
+import csv
+import zipfile
+from tqdm import tqdm
+logger = logging.getLogger(__name__)
+
+# Root click group; functions decorated with @cli.command() in this module
+# register as its subcommands.
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
+              default='collections/collections.json',
+              help='Path to collections configuration file.')
+def write_readme(collections_file: Path):
+    # Upload every file found under each configured collection directory to
+    # that collection's S3 prefix, keeping the relative path as the key suffix.
+    entries = json.loads(collections_file.read_text())
+    base_dir = collections_file.parent
+
+    for entry in entries:
+        # Each collection names its own AWS profile, so a client is built per entry.
+        client = boto3.Session(profile_name=entry['aws_profile']).client('s3')
+        local_root = base_dir / entry['directory']
+        bucket_name, s3_prefix = entry['s3_path'].split('/', 1)
+
+        # rglob also yields directories; only regular files are uploaded.
+        for file_path in (p for p in local_root.rglob('*') if p.is_file()):
+            s3_key = f"{s3_prefix}/{file_path.relative_to(local_root)}"
+            print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
+            client.upload_file(str(file_path), bucket_name, s3_key)
+
+@cli.command()
+@click.argument('metadata_file', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_csv(metadata_file: Path, output_file: Path):
+    """
+    Read a zipped JSONL file of metadata and write dataset info to CSV.
+    
+    metadata_file: Path to the zip file containing metadata JSONL
+    output_file: Path where the CSV should be written
+    """
+    # The archive member is looked up under the zip's own file name minus '.zip'.
+    jsonl_name = metadata_file.name.replace('.zip', '')
+
+    # utf-8 is pinned so non-ASCII names/titles do not depend on the platform's
+    # locale encoding.
+    with zipfile.ZipFile(metadata_file, 'r') as zf, \
+         open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['name', 'title'])  # Write header
+
+        with zf.open(jsonl_name) as f:
+            for line_number, line in enumerate(tqdm(f, desc="Writing CSV"), start=1):
+                try:
+                    metadata = json.loads(line)
+                except json.JSONDecodeError:
+                    # Report and skip malformed lines; stopping in a debugger here
+                    # would hang any non-interactive run.
+                    logger.warning("Skipping malformed JSON on line %d: %r", line_number, line)
+                    continue
+                # `or {}` also covers keys that are present but set to null.
+                signed = metadata.get('signed_metadata') or {}
+                dataset_info = signed.get('data_gov_metadata') or {}
+                if dataset_info:
+                    writer.writerow([
+                        dataset_info.get('name', ''),
+                        dataset_info.get('title', '')
+                    ])
+
+@cli.command()
+@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_jsonl(metadata_dir: Path, output_file: Path):
+    """
+    Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
+    All records are written to a single JSONL file within the zip, named same as output_file without .zip
+    """
+    # The archive member carries the output file's name with '.zip' removed.
+    member_name = output_file.name.replace('.zip', '')
+    base_dir = output_file.parent
+
+    # force_zip64=True lets the single member grow past 2GB.
+    with zipfile.ZipFile(output_file, 'w') as archive, \
+         archive.open(member_name, 'w', force_zip64=True) as sink:
+        for json_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
+            try:
+                record = json.loads(json_path.read_text())
+            except json.JSONDecodeError:
+                # Name the offending file before letting the error propagate.
+                print(json_path)
+                raise
+            # Paths are recorded relative to the directory holding the output zip.
+            relative = str(json_path.relative_to(base_dir))
+            record['metadata_path'] = relative
+            # Only the first 'metadata' occurrence in the path is swapped.
+            record['collection_path'] = relative.replace('metadata', 'collections', 1)
+            # One record per line, encoded because the zip member is a binary stream.
+            sink.write((json.dumps(record) + '\n').encode('utf-8'))
+
+# Run the click group when this file is executed as a script.
+if __name__ == '__main__':
+    cli()
--- a/scripts/github/download_git.py
+++ b/scripts/github/download_git.py
@ -9,7 +9,7 @@ from gitspoke.cli import valid_include_items, get_token
 import os
 import json
 import requests
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config

 logger = logging.getLogger(__name__)
 stats_counter = {}
--- a/scripts/helpers/config.py
+++ b/scripts/helpers/config.py
@ -10,4 +10,11 @@ def load_config():
        config = json.loads(CONFIG_PATH.read_text())
    else:
        config = {}
-    return config 
+    return config 
+
+
+def json_default(obj):
+    """Serialize objects that the json module cannot encode natively.
+
+    Intended to be passed as the ``default=`` argument of ``json.dump`` /
+    ``json.dumps``.
+
+    Args:
+        obj: The object json could not serialize.
+
+    Returns:
+        ``obj.isoformat()`` when the object provides that method
+        (e.g. ``datetime`` and ``date`` instances).
+
+    Raises:
+        TypeError: If the object has no ``isoformat`` attribute.
+    """
+    if hasattr(obj, 'isoformat'):
+        return obj.isoformat()
+    # Zero-argument super() only works inside a class body; in a plain function
+    # it raises RuntimeError. A default hook is expected to raise TypeError for
+    # objects it cannot handle.
+    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")