# data-vault/scripts/collection/write_metadata.py

import boto3
import click
import json
from pathlib import Path
import logging
import csv
import zipfile
from tqdm import tqdm
2025-02-24 16:45:50 -05:00
import io
2025-02-05 15:47:05 -05:00
logger = logging.getLogger(__name__)
@click.group()
def cli():
    # Root click group; it does no work itself. The commands defined in this
    # file register on it through @cli.command().
    return None
@cli.command()
@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
              default='collections/collections.json',
              help='Path to collections configuration file.')
def write_readme(collections_file: Path):
    """
    Upload the non-hidden files of every configured collection to S3.

    The collections file is a JSON list. Each entry is read for three keys:
    'aws_profile' (the boto3 session profile), 'directory' (resolved relative
    to the collections file) and 's3_path' (bucket name and key prefix
    separated by the first '/').
    """
    # Load collections config
    collections = json.loads(collections_file.read_text(encoding='utf-8'))
    collections_dir = collections_file.parent

    for collection in collections:
        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
        collection_path = collections_dir / collection['directory']
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)

        for file_path in collection_path.rglob('*'):
            if not file_path.is_file():
                continue

            relative_path = file_path.relative_to(collection_path)

            # Skip dotfiles and files in dot directories. Only the part of the
            # path inside the collection is inspected: checking the full path
            # would skip every file whenever the collections file is reached
            # through '..' or lives below a hidden directory.
            if any(part.startswith('.') for part in relative_path.parts):
                continue

            # S3 keys always use '/', whatever the local path separator is.
            s3_key = f"{s3_prefix}/{relative_path.as_posix()}"
            print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
            s3.upload_file(str(file_path), bucket_name, s3_key)
@cli.command()
@click.argument('metadata_file', type=click.Path(exists=True, path_type=Path))
@click.argument('output_file', type=click.Path(path_type=Path))
def write_csv(metadata_file: Path, output_file: Path):
    """
    Read a zipped JSONL file of metadata and write dataset info to a zipped CSV.

    metadata_file: Path to the zip file containing metadata JSONL

    output_file: Path where the zipped CSV should be written

    The archive members are named after their zip files with '.zip' removed.
    Records whose 'data_gov_metadata' is empty are left out of the CSV.
    """
    # Get the base filename without .zip extension for the internal CSV file
    internal_filename = output_file.name.replace('.zip', '')
    jsonl_name = metadata_file.name.replace('.zip', '')

    with zipfile.ZipFile(metadata_file, 'r') as input_zf, \
            zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as output_zf, \
            output_zf.open(internal_filename, 'w', force_zip64=True) as csvfile, \
            input_zf.open(jsonl_name) as jsonlfile:
        # Create a text wrapper around the binary file. The encoding is pinned
        # to UTF-8 so the output does not depend on the machine's locale and
        # non-ASCII titles cannot raise UnicodeEncodeError.
        text_wrapper = io.TextIOWrapper(
            csvfile, encoding='utf-8', write_through=True, newline='')
        writer = csv.writer(text_wrapper)
        writer.writerow(['name', 'organization', 'title', 'date', 'metadata_path', 'collection_path'])

        # Read from input zip and write to output zip
        for line in tqdm(jsonlfile, desc="Writing CSV"):
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
            if not line.strip():
                continue
            metadata = json.loads(line)
            dataset_info = metadata['signed_metadata']['data_gov_metadata']
            if not dataset_info:
                continue
            writer.writerow([
                dataset_info['name'],
                dataset_info['organization']['title'],
                dataset_info['title'],
                dataset_info['metadata_modified'],
                metadata['metadata_path'],
                metadata['collection_path'],
            ])

        # Hand the zip member back before the with-block closes it, so the
        # wrapper never touches an already-closed stream when it is collected.
        text_wrapper.flush()
        text_wrapper.detach()
2025-02-05 15:47:05 -05:00
@cli.command()
@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
@click.argument('output_file', type=click.Path(path_type=Path))
def write_jsonl(metadata_dir: Path, output_file: Path):
    """
    Read each .json file, recursively, in metadata directory and write to a single compressed zipped JSONL file.

    All records are written to a single JSONL file within the zip, named same as output_file without .zip

    Each record gains a 'metadata_path' (the JSON file's path relative to the
    directory holding output_file) and a 'collection_path' (that same path
    with its first 'metadata' replaced by 'collections'). metadata_dir must
    therefore be located under output_file's directory.
    """
    # Get the base filename without .zip extension for the internal file
    internal_filename = output_file.name.replace('.zip', '')
    output_dir = output_file.parent

    # Create a single file in the zip archive
    with zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf, \
            zf.open(internal_filename, 'w', force_zip64=True) as f:
        # Iterate through all JSON files
        for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
            # Parse the raw bytes: json detects UTF-8/16/32 by itself, so the
            # result does not depend on the platform's default text encoding.
            metadata = json.loads(file_path.read_bytes())
            metadata['metadata_path'] = str(file_path.relative_to(output_dir))
            metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
            # Write each record to the same file, with newline
            f.write((json.dumps(metadata) + '\n').encode('utf-8'))
# Script entry point: when executed directly, hand control to the click group.
if __name__ == '__main__':
    cli()