diff --git a/collections/data_gov/docs/LIL_HLSL_logos.png b/collections/data_gov/docs/LIL_HLSL_logos.png
index 8966ae7..e9bcd0f 100644
Binary files a/collections/data_gov/docs/LIL_HLSL_logos.png and b/collections/data_gov/docs/LIL_HLSL_logos.png differ
diff --git a/scripts/collection/cloudflare_tools.py b/scripts/collection/cloudflare_tools.py
index 77bb40d..6050d6c 100644
--- a/scripts/collection/cloudflare_tools.py
+++ b/scripts/collection/cloudflare_tools.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 import click
 from cloudflare import Cloudflare
 import os
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
 
 logger = logging.getLogger(__name__)
diff --git a/scripts/collection/s3_tools.py b/scripts/collection/s3_tools.py
index 625485a..35e704f 100644
--- a/scripts/collection/s3_tools.py
+++ b/scripts/collection/s3_tools.py
@@ -3,6 +3,13 @@
 import click
 from tqdm import tqdm
 import logging
 from itertools import islice
+import json
+import gzip
+from io import BytesIO
+import tempfile
+import os
+from scripts.helpers.misc import json_default
+import zipfile
 
 logger = logging.getLogger(__name__)
@@ -76,6 +83,36 @@ def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = Fals
     pbar.close()
 
 
+def write_file_listing(s3_client, bucket: str, prefix: str, index_key: str):
+    """Write a JSONL listing of all files under prefix to index_key."""
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(mode='wb', suffix='.zip', delete=True) as tmp:
+        with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
+            # Create a temporary file for the JSONL content
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=True) as jsonl:
+                paginator = s3_client.get_paginator('list_objects_v2')
+                for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="indexing"):
+                    if 'Contents' in page:
+                        for obj in page['Contents']:
+                            # Write each object as a JSON line using custom encoder
+                            line = json.dumps(obj, default=json_default) + '\n'
+                            jsonl.write(line)
+
+                # Flush the JSONL file and add it to the zip
+                jsonl.flush()
+                zf.write(jsonl.name, arcname='file_listing.jsonl')
+
+        # Upload the zip file
+        tmp.flush()
+        s3_client.upload_file(
+            tmp.name,
+            bucket,
+            index_key,
+            ExtraArgs={'ContentType': 'application/zip'}
+        )
+
+        logger.info(f"Wrote index to s3://{bucket}/{index_key}")
+
 @click.group()
 def cli():
     """S3 object management commands."""
@@ -113,6 +150,25 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
     delete_empty_files(s3_client, bucket, prefix, dry_run)
 
 
+@cli.command()
+@click.argument('s3_path')
+@click.option('--profile', help='AWS profile name', default='sc-direct')
+@click.option('--output', '-o', help='Output path for index file', default=None)
+@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
+              default='INFO', help='Set logging level')
+def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
+    """Write a JSONL index of all files under the given prefix."""
+    logging.basicConfig(level=log_level)
+    bucket, prefix = s3_path.split('/', 1)
+
+    if output is None:
+        output = prefix.rstrip('/') + '/file_listing.jsonl.zip'
+
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+
+    write_file_listing(s3_client, bucket, prefix, output)
+
 if __name__ == '__main__':
     cli()
 
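For reference, a minimal sketch (not part of the patch) of reading back the listing that write_file_listing uploads. The bucket and key below are placeholders; the archive's single member is named file_listing.jsonl, matching the arcname used above, and LastModified values arrive as ISO 8601 strings because they were serialized with json_default.

```python
import io
import json
import zipfile

import boto3

# Placeholder bucket/key; the real values come from the write_index command's arguments.
s3 = boto3.client("s3")
obj = s3.get_object(Bucket="example-bucket", Key="example/prefix/file_listing.jsonl.zip")

with zipfile.ZipFile(io.BytesIO(obj["Body"].read())) as zf:
    # One list_objects_v2 record per line in the archive's single member.
    with zf.open("file_listing.jsonl") as f:
        for line in f:
            record = json.loads(line)
            print(record["Key"], record["Size"], record["LastModified"])
```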
diff --git a/scripts/collection/sync.py b/scripts/collection/sync.py
deleted file mode 100644
index 69eba48..0000000
--- a/scripts/collection/sync.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import boto3
-import click
-import json
-from pathlib import Path
-import logging
-
-logger = logging.getLogger(__name__)
-
-@click.command()
-@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
-              default='collections/collections.json',
-              help='Path to collections configuration file.')
-def main(collections_file: Path):
-    # Load collections config
-    collections = json.loads(collections_file.read_text())
-    collections_dir = collections_file.parent
-
-    for collection in collections:
-        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
-        collection_path = collections_dir / collection['directory']
-        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
-
-        for file_path in collection_path.rglob('*'):
-            if file_path.is_file():
-                relative_path = file_path.relative_to(collection_path)
-                s3_key = f"{s3_prefix}/{relative_path}"
-                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
-                s3.upload_file(str(file_path), bucket_name, s3_key)
-
-if __name__ == '__main__':
-    main()
diff --git a/scripts/collection/write_metadata.py b/scripts/collection/write_metadata.py
new file mode 100644
index 0000000..6870156
--- /dev/null
+++ b/scripts/collection/write_metadata.py
@@ -0,0 +1,99 @@
+import boto3
+import click
+import json
+from pathlib import Path
+import logging
+import csv
+import zipfile
+from tqdm import tqdm
+logger = logging.getLogger(__name__)
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
+              default='collections/collections.json',
+              help='Path to collections configuration file.')
+def write_readme(collections_file: Path):
+    # Load collections config
+    collections = json.loads(collections_file.read_text())
+    collections_dir = collections_file.parent
+
+    for collection in collections:
+        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
+        collection_path = collections_dir / collection['directory']
+        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
+
+        for file_path in collection_path.rglob('*'):
+            if file_path.is_file():
+                relative_path = file_path.relative_to(collection_path)
+                s3_key = f"{s3_prefix}/{relative_path}"
+                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
+                s3.upload_file(str(file_path), bucket_name, s3_key)
+
+@cli.command()
+@click.argument('metadata_file', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_csv(metadata_file: Path, output_file: Path):
+    """
+    Read a zipped JSONL file of metadata and write dataset info to CSV.
+
+    metadata_file: Path to the zip file containing metadata JSONL
+    output_file: Path where the CSV should be written
+    """
+    with zipfile.ZipFile(metadata_file, 'r') as zf, \
+         open(output_file, 'w', newline='') as csvfile:
+
+        jsonl_name = metadata_file.name.replace('.zip', '')
+        writer = csv.writer(csvfile)
+        writer.writerow(['name', 'title'])  # Write header
+
+        with zf.open(jsonl_name) as f:
+            for line in tqdm(f, desc="Writing CSV"):
+                try:
+                    metadata = json.loads(line)
+                except json.JSONDecodeError:
+                    print(line)
+                    breakpoint()
+                    print(line)
+                    continue
+                dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
+                if dataset_info:
+                    writer.writerow([
+                        dataset_info.get('name', ''),
+                        dataset_info.get('title', '')
+                    ])
+
+@cli.command()
+@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_jsonl(metadata_dir: Path, output_file: Path):
+    """
+    Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
+    All records are written to a single JSONL file within the zip, named same as output_file without .zip
+    """
+    # Get the base filename without .zip extension for the internal file
+    internal_filename = output_file.name.replace('.zip', '')
+    output_dir = output_file.parent
+
+    # Use force_zip64=True to handle files larger than 2GB
+    with zipfile.ZipFile(output_file, 'w') as zf:
+        # Create a single file in the zip archive
+        with zf.open(internal_filename, 'w', force_zip64=True) as f:
+            # Iterate through all JSON files
+            for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
+                with open(file_path, 'r') as json_file:
+                    try:
+                        metadata = json.load(json_file)
+                    except json.JSONDecodeError:
+                        print(file_path)
+                        raise
+                metadata['metadata_path'] = str(file_path.relative_to(output_dir))
+                metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
+                # Write each record to the same file, with newline
+                f.write((json.dumps(metadata) + '\n').encode('utf-8'))
+
+if __name__ == '__main__':
+    cli()
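As a reading aid (not part of the patch), this is the approximate shape of one record in the zipped JSONL that write_jsonl produces and write_csv consumes; only the keys those commands actually touch are shown, and every value is a made-up placeholder.

```python
# Hypothetical record, for illustration only: the nested
# signed_metadata.data_gov_metadata keys feed the CSV columns,
# and the two *_path keys are added by write_jsonl.
example_record = {
    "signed_metadata": {
        "data_gov_metadata": {
            "name": "example-dataset",   # -> CSV 'name' column
            "title": "Example Dataset",  # -> CSV 'title' column
        }
    },
    "metadata_path": "metadata/example/dataset.json",
    "collection_path": "collections/example/dataset.json",
}
```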
diff --git a/scripts/github/download_git.py b/scripts/github/download_git.py
index 3ba6d43..8aafc25 100644
--- a/scripts/github/download_git.py
+++ b/scripts/github/download_git.py
@@ -9,7 +9,7 @@
 from gitspoke.cli import valid_include_items, get_token
 import os
 import json
 import requests
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
 logger = logging.getLogger(__name__)
 stats_counter = {}
diff --git a/scripts/helpers/config.py b/scripts/helpers/misc.py
similarity index 62%
rename from scripts/helpers/config.py
rename to scripts/helpers/misc.py
index d1bcc1c..00340ee 100644
--- a/scripts/helpers/config.py
+++ b/scripts/helpers/misc.py
@@ -10,4 +10,11 @@
         config = json.loads(CONFIG_PATH.read_text())
     else:
         config = {}
-    return config
\ No newline at end of file
+    return config
+
+
+def json_default(obj):
+    """Default JSON encoder for serializing datetime objects."""
+    if hasattr(obj, 'isoformat'):
+        return obj.isoformat()
+    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
\ No newline at end of file
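Finally, a quick sketch of how the new json_default helper pairs with json.dumps, which is how s3_tools serializes list_objects_v2 records; the record values here are invented.

```python
import json
from datetime import datetime, timezone

from scripts.helpers.misc import json_default

# Anything with an isoformat() method (e.g. datetime) becomes an ISO 8601 string;
# other non-serializable objects raise TypeError as json.dumps normally would.
record = {"Key": "example/object.txt", "Size": 123, "LastModified": datetime.now(timezone.utc)}
print(json.dumps(record, default=json_default))
# e.g. {"Key": "example/object.txt", "Size": 123, "LastModified": "2025-01-01T00:00:00+00:00"}
```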