metadata updates

Jack Cushman 2025-02-05 15:47:05 -05:00
parent a1532df719
commit 9ccee0d422
7 changed files with 165 additions and 34 deletions

Binary file not shown (image; 45 KiB before, 36 KiB after).

View file

@@ -3,7 +3,7 @@ from pathlib import Path
import click
from cloudflare import Cloudflare
import os
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
logger = logging.getLogger(__name__)

View file

@@ -3,6 +3,13 @@ import click
from tqdm import tqdm
import logging
from itertools import islice
import json
import gzip
from io import BytesIO
import tempfile
import os
from scripts.helpers.misc import json_default
import zipfile
logger = logging.getLogger(__name__)
@@ -76,6 +83,36 @@ def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = Fals
    pbar.close()

def write_file_listing(s3_client, bucket: str, prefix: str, index_key: str):
    """Write a JSONL listing of all files under prefix to index_key."""
    # Create a temporary file for the zip archive
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.zip', delete=True) as tmp:
        with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
            # Create a temporary file for the JSONL content
            with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=True) as jsonl:
                paginator = s3_client.get_paginator('list_objects_v2')
                for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="indexing"):
                    if 'Contents' in page:
                        for obj in page['Contents']:
                            # Write each object as a JSON line using custom encoder
                            line = json.dumps(obj, default=json_default) + '\n'
                            jsonl.write(line)
                # Flush the JSONL file and add it to the zip
                jsonl.flush()
                zf.write(jsonl.name, arcname='file_listing.jsonl')
        # Upload the zip file
        tmp.flush()
        s3_client.upload_file(
            tmp.name,
            bucket,
            index_key,
            ExtraArgs={'ContentType': 'application/zip'}
        )
    logger.info(f"Wrote index to s3://{bucket}/{index_key}")
@click.group()
def cli():
    """S3 object management commands."""
@@ -113,6 +150,25 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
    delete_empty_files(s3_client, bucket, prefix, dry_run)

@cli.command()
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--output', '-o', help='Output path for index file', default=None)
@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO', help='Set logging level')
def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
    """Write a JSONL index of all files under the given prefix."""
    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)
    if output is None:
        output = prefix.rstrip('/') + '/file_listing.jsonl.zip'
    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')
    write_file_listing(s3_client, bucket, prefix, output)

if __name__ == '__main__':
    cli()
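
The new write_index command (exposed by click as "write-index") wires write_file_listing into the CLI. A minimal sketch of calling the helper directly, shown here for orientation only and not part of the commit; the bucket, prefix, and index key below are invented examples, and the 'sc-direct' profile name comes from the CLI option's default above:

# Hypothetical usage sketch (not part of this commit).
import boto3

session = boto3.Session(profile_name='sc-direct')
s3_client = session.client('s3')

# Roughly what `write-index my-bucket/collections/` does with the default --output:
write_file_listing(
    s3_client,
    bucket='my-bucket',                               # invented bucket name
    prefix='collections/',                            # invented prefix
    index_key='collections/file_listing.jsonl.zip',   # default: prefix + '/file_listing.jsonl.zip'
)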

View file

@@ -1,31 +0,0 @@
import boto3
import click
import json
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

@click.command()
@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
              default='collections/collections.json',
              help='Path to collections configuration file.')
def main(collections_file: Path):
    # Load collections config
    collections = json.loads(collections_file.read_text())
    collections_dir = collections_file.parent

    for collection in collections:
        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
        collection_path = collections_dir / collection['directory']
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
        for file_path in collection_path.rglob('*'):
            if file_path.is_file():
                relative_path = file_path.relative_to(collection_path)
                s3_key = f"{s3_prefix}/{relative_path}"
                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
                s3.upload_file(str(file_path), bucket_name, s3_key)

if __name__ == '__main__':
    main()

View file

@@ -0,0 +1,99 @@
import boto3
import click
import json
from pathlib import Path
import logging
import csv
import zipfile
from tqdm import tqdm

logger = logging.getLogger(__name__)

@click.group()
def cli():
    pass

@cli.command()
@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
              default='collections/collections.json',
              help='Path to collections configuration file.')
def write_readme(collections_file: Path):
    # Load collections config
    collections = json.loads(collections_file.read_text())
    collections_dir = collections_file.parent

    for collection in collections:
        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
        collection_path = collections_dir / collection['directory']
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
        for file_path in collection_path.rglob('*'):
            if file_path.is_file():
                relative_path = file_path.relative_to(collection_path)
                s3_key = f"{s3_prefix}/{relative_path}"
                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
                s3.upload_file(str(file_path), bucket_name, s3_key)
@cli.command()
@click.argument('metadata_file', type=click.Path(exists=True, path_type=Path))
@click.argument('output_file', type=click.Path(path_type=Path))
def write_csv(metadata_file: Path, output_file: Path):
    """
    Read a zipped JSONL file of metadata and write dataset info to CSV.

    metadata_file: Path to the zip file containing metadata JSONL
    output_file: Path where the CSV should be written
    """
    with zipfile.ZipFile(metadata_file, 'r') as zf, \
            open(output_file, 'w', newline='') as csvfile:
        jsonl_name = metadata_file.name.replace('.zip', '')
        writer = csv.writer(csvfile)
        writer.writerow(['name', 'title'])  # Write header
        with zf.open(jsonl_name) as f:
            for line in tqdm(f, desc="Writing CSV"):
                try:
                    metadata = json.loads(line)
                except json.JSONDecodeError:
                    # Log and skip lines that are not valid JSON
                    logger.warning(f"Skipping invalid JSON line: {line!r}")
                    continue
                    dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
                    if dataset_info:
                        writer.writerow([
                            dataset_info.get('name', ''),
                            dataset_info.get('title', '')
                        ])
@cli.command()
@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
@click.argument('output_file', type=click.Path(path_type=Path))
def write_jsonl(metadata_dir: Path, output_file: Path):
    """
    Read each .json file, recursively, in the metadata directory and write to a single zipped JSONL file.

    All records are written to a single JSONL file within the zip, named the same as output_file without .zip.
    """
    # Get the base filename without .zip extension for the internal file
    internal_filename = output_file.name.replace('.zip', '')
    output_dir = output_file.parent

    # Use force_zip64=True to handle files larger than 2GB
    with zipfile.ZipFile(output_file, 'w') as zf:
        # Create a single file in the zip archive
        with zf.open(internal_filename, 'w', force_zip64=True) as f:
            # Iterate through all JSON files
            for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
                with open(file_path, 'r') as json_file:
                    try:
                        metadata = json.load(json_file)
                    except json.JSONDecodeError:
                        print(file_path)
                        raise
                metadata['metadata_path'] = str(file_path.relative_to(output_dir))
                metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
                # Write each record to the same file, with newline
                f.write((json.dumps(metadata) + '\n').encode('utf-8'))

if __name__ == '__main__':
    cli()
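
For orientation (not part of the commit): write_csv only reads the nested signed_metadata.data_gov_metadata object from each JSONL record, so a record needs only that structure for a row to be emitted. A hedged sketch with invented field values:

# Hypothetical example of one JSONL record produced by write_jsonl (values invented).
import json

record = {
    "signed_metadata": {
        "data_gov_metadata": {
            "name": "example-dataset",     # invented
            "title": "Example Dataset",    # invented
        }
    },
    "metadata_path": "metadata/example.json",         # added by write_jsonl
    "collection_path": "collections/example.json",    # derived from metadata_path
}
line = json.dumps(record)

# Mirrors the extraction step inside write_csv:
dataset_info = json.loads(line).get('signed_metadata', {}).get('data_gov_metadata', {})
print([dataset_info.get('name', ''), dataset_info.get('title', '')])  # ['example-dataset', 'Example Dataset']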

View file

@@ -9,7 +9,7 @@ from gitspoke.cli import valid_include_items, get_token
import os
import json
import requests
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
logger = logging.getLogger(__name__)
stats_counter = {}

View file

@@ -10,4 +10,11 @@ def load_config():
        config = json.loads(CONFIG_PATH.read_text())
    else:
        config = {}
    return config
def json_default(obj):
    """Default JSON encoder for serializing datetime objects."""
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    # Mirror json.JSONEncoder.default for unsupported types
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
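
As used by write_file_listing in this commit, json_default lets json.dumps serialize the datetime values that boto3 returns (for example LastModified in S3 listings). A small usage sketch; the object below is an invented example:

# Hypothetical usage sketch (not part of this commit).
import json
from datetime import datetime, timezone

from scripts.helpers.misc import json_default

obj = {
    "Key": "collections/example.zip",    # invented key
    "LastModified": datetime(2025, 2, 5, 20, 47, 5, tzinfo=timezone.utc),
    "Size": 12345,
}

# Without default=json_default, json.dumps would raise TypeError on the datetime.
print(json.dumps(obj, default=json_default))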