mirror of https://github.com/harvard-lil/data-vault.git
synced 2025-03-22 10:32:19 +00:00

commit 9ccee0d422 ("metadata updates"), parent a1532df719
7 changed files with 165 additions and 34 deletions
Binary image file not shown (size: 45 KiB before, 36 KiB after).
@@ -3,7 +3,7 @@ from pathlib import Path
 import click
 from cloudflare import Cloudflare
 import os
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
 
 logger = logging.getLogger(__name__)
 
@@ -3,6 +3,13 @@ import click
 from tqdm import tqdm
 import logging
 from itertools import islice
+import json
+import gzip
+from io import BytesIO
+import tempfile
+import os
+from scripts.helpers.misc import json_default
+import zipfile
 
 logger = logging.getLogger(__name__)
 
@@ -76,6 +83,36 @@ def delete_empty_files(s3_client, bucket: str, prefix: str, dry_run: bool = Fals
 
     pbar.close()
 
+def write_file_listing(s3_client, bucket: str, prefix: str, index_key: str):
+    """Write a JSONL listing of all files under prefix to index_key."""
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(mode='wb', suffix='.zip', delete=True) as tmp:
+        with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
+            # Create a temporary file for the JSONL content
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=True) as jsonl:
+                paginator = s3_client.get_paginator('list_objects_v2')
+                for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=prefix), desc="indexing"):
+                    if 'Contents' in page:
+                        for obj in page['Contents']:
+                            # Write each object as a JSON line using custom encoder
+                            line = json.dumps(obj, default=json_default) + '\n'
+                            jsonl.write(line)
+
+                # Flush the JSONL file and add it to the zip
+                jsonl.flush()
+                zf.write(jsonl.name, arcname='file_listing.jsonl')
+
+        # Upload the zip file
+        tmp.flush()
+        s3_client.upload_file(
+            tmp.name,
+            bucket,
+            index_key,
+            ExtraArgs={'ContentType': 'application/zip'}
+        )
+
+    logger.info(f"Wrote index to s3://{bucket}/{index_key}")
+
 @click.group()
 def cli():
     """S3 object management commands."""
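
For reference, each line that write_file_listing serializes is one entry from a boto3 list_objects_v2 "Contents" page, which is why json.dumps is passed the json_default encoder: LastModified arrives as a datetime. A minimal sketch of one such record (values are illustrative, not taken from the repository):

import datetime

# One "Contents" entry as boto3 returns it; LastModified is a datetime,
# so json.dumps(obj, default=json_default) is needed to serialize it.
record = {
    "Key": "example-prefix/data/example-object.warc.gz",  # hypothetical key
    "LastModified": datetime.datetime(2025, 3, 22, 10, 32, 19, tzinfo=datetime.timezone.utc),
    "ETag": '"d41d8cd98f00b204e9800998ecf8427e"',
    "Size": 12345,
    "StorageClass": "STANDARD",
}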
@@ -113,6 +150,25 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
 
     delete_empty_files(s3_client, bucket, prefix, dry_run)
 
+@cli.command()
+@click.argument('s3_path')
+@click.option('--profile', help='AWS profile name', default='sc-direct')
+@click.option('--output', '-o', help='Output path for index file', default=None)
+@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
+              default='INFO', help='Set logging level')
+def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
+    """Write a JSONL index of all files under the given prefix."""
+    logging.basicConfig(level=log_level)
+    bucket, prefix = s3_path.split('/', 1)
+
+    if output is None:
+        output = prefix.rstrip('/') + '/file_listing.jsonl.zip'
+
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+
+    write_file_listing(s3_client, bucket, prefix, output)
+
 if __name__ == '__main__':
     cli()
 
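
The index that the new write_index command uploads is a zip containing a single file_listing.jsonl member. A minimal sketch of reading it back; the bucket and prefix are hypothetical, while the sc-direct profile and the file_listing.jsonl arcname come from the diff above:

import json
import zipfile

import boto3

# Download the index written by write_index (bucket/prefix are hypothetical).
s3 = boto3.Session(profile_name="sc-direct").client("s3")
s3.download_file("example-bucket", "example-prefix/file_listing.jsonl.zip", "file_listing.jsonl.zip")

# Read the single JSONL member and print key/size for each listed object.
with zipfile.ZipFile("file_listing.jsonl.zip") as zf:
    with zf.open("file_listing.jsonl") as f:
        for line in f:
            obj = json.loads(line)
            print(obj["Key"], obj["Size"])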
@@ -1,31 +0,0 @@
-import boto3
-import click
-import json
-from pathlib import Path
-import logging
-
-logger = logging.getLogger(__name__)
-
-@click.command()
-@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
-              default='collections/collections.json',
-              help='Path to collections configuration file.')
-def main(collections_file: Path):
-    # Load collections config
-    collections = json.loads(collections_file.read_text())
-    collections_dir = collections_file.parent
-
-    for collection in collections:
-        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
-        collection_path = collections_dir / collection['directory']
-        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
-
-        for file_path in collection_path.rglob('*'):
-            if file_path.is_file():
-                relative_path = file_path.relative_to(collection_path)
-                s3_key = f"{s3_prefix}/{relative_path}"
-                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
-                s3.upload_file(str(file_path), bucket_name, s3_key)
-
-if __name__ == '__main__':
-    main()
scripts/collection/write_metadata.py (new file, 99 lines)
@@ -0,0 +1,99 @@
+import boto3
+import click
+import json
+from pathlib import Path
+import logging
+import csv
+import zipfile
+from tqdm import tqdm
+logger = logging.getLogger(__name__)
+
+@click.group()
+def cli():
+    pass
+
+@cli.command()
+@click.option('--collections-file', '-c', type=click.Path(exists=True, path_type=Path),
+              default='collections/collections.json',
+              help='Path to collections configuration file.')
+def write_readme(collections_file: Path):
+    # Load collections config
+    collections = json.loads(collections_file.read_text())
+    collections_dir = collections_file.parent
+
+    for collection in collections:
+        s3 = boto3.Session(profile_name=collection['aws_profile']).client('s3')
+        collection_path = collections_dir / collection['directory']
+        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
+
+        for file_path in collection_path.rglob('*'):
+            if file_path.is_file():
+                relative_path = file_path.relative_to(collection_path)
+                s3_key = f"{s3_prefix}/{relative_path}"
+                print(f"Uploading {file_path} to s3://{bucket_name}/{s3_key}")
+                s3.upload_file(str(file_path), bucket_name, s3_key)
+
+@cli.command()
+@click.argument('metadata_file', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_csv(metadata_file: Path, output_file: Path):
+    """
+    Read a zipped JSONL file of metadata and write dataset info to CSV.
+
+    metadata_file: Path to the zip file containing metadata JSONL
+    output_file: Path where the CSV should be written
+    """
+    with zipfile.ZipFile(metadata_file, 'r') as zf, \
+         open(output_file, 'w', newline='') as csvfile:
+
+        jsonl_name = metadata_file.name.replace('.zip', '')
+        writer = csv.writer(csvfile)
+        writer.writerow(['name', 'title'])  # Write header
+
+        with zf.open(jsonl_name) as f:
+            for line in tqdm(f, desc="Writing CSV"):
+                try:
+                    metadata = json.loads(line)
+                except json.JSONDecodeError:
+                    print(line)
+                    breakpoint()
+                    print(line)
+                    continue
+                dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
+                if dataset_info:
+                    writer.writerow([
+                        dataset_info.get('name', ''),
+                        dataset_info.get('title', '')
+                    ])
+
+@cli.command()
+@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
+@click.argument('output_file', type=click.Path(path_type=Path))
+def write_jsonl(metadata_dir: Path, output_file: Path):
+    """
+    Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
+    All records are written to a single JSONL file within the zip, named same as output_file without .zip
+    """
+    # Get the base filename without .zip extension for the internal file
+    internal_filename = output_file.name.replace('.zip', '')
+    output_dir = output_file.parent
+
+    # Use force_zip64=True to handle files larger than 2GB
+    with zipfile.ZipFile(output_file, 'w') as zf:
+        # Create a single file in the zip archive
+        with zf.open(internal_filename, 'w', force_zip64=True) as f:
+            # Iterate through all JSON files
+            for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
+                with open(file_path, 'r') as json_file:
+                    try:
+                        metadata = json.load(json_file)
+                    except json.JSONDecodeError:
+                        print(file_path)
+                        raise
+                metadata['metadata_path'] = str(file_path.relative_to(output_dir))
+                metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
+                # Write each record to the same file, with newline
+                f.write((json.dumps(metadata) + '\n').encode('utf-8'))
+
+if __name__ == '__main__':
+    cli()
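
The new write_jsonl and write_csv commands appear designed to chain: write_jsonl bundles every .json record under a metadata directory into one zipped JSONL, and write_csv reads that zip and emits a name/title CSV. A minimal sketch of one record as write_csv consumes it; the dataset values are hypothetical, while the signed_metadata.data_gov_metadata nesting and the two path fields come from the code above:

import json

# Hypothetical record, shaped the way write_csv reads it.
record = {
    "signed_metadata": {
        "data_gov_metadata": {
            "name": "example-dataset",    # -> first CSV column
            "title": "Example Dataset",   # -> second CSV column
        }
    },
    "metadata_path": "metadata/example-dataset.json",        # added by write_jsonl
    "collection_path": "collections/example-dataset.json",   # derived from metadata_path
}
print(json.dumps(record))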
@@ -9,7 +9,7 @@ from gitspoke.cli import valid_include_items, get_token
 import os
 import json
 import requests
-from scripts.helpers.config import load_config
+from scripts.helpers.misc import load_config
 
 logger = logging.getLogger(__name__)
 stats_counter = {}
@@ -10,4 +10,11 @@ def load_config():
         config = json.loads(CONFIG_PATH.read_text())
     else:
         config = {}
     return config
+
+
+def json_default(obj):
+    """Default JSON encoder for serializing datetime objects."""
+    if hasattr(obj, 'isoformat'):
+        return obj.isoformat()
+    return super().default(obj)
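
A minimal usage sketch for the new json_default helper: anything with an isoformat() method (datetimes) is serialized as an ISO 8601 string. Only that branch is exercised here, since the final super().default(obj) fallback as written would only resolve inside a json.JSONEncoder subclass:

import json
from datetime import datetime, timezone

from scripts.helpers.misc import json_default  # as added in this commit

payload = {"Key": "example.json", "LastModified": datetime(2025, 3, 22, 10, 32, 19, tzinfo=timezone.utc)}
print(json.dumps(payload, default=json_default))
# {"Key": "example.json", "LastModified": "2025-03-22T10:32:19+00:00"}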