mirror of
https://github.com/harvard-lil/data-vault.git
synced 2025-07-04 13:46:56 -04:00
Cleanup to prep for diffing
This commit is contained in:
parent
7af7f9cf3e
commit
a7c99e264d
9 changed files with 290 additions and 122 deletions
|
@ -6,6 +6,8 @@ import logging
|
|||
import csv
|
||||
import zipfile
|
||||
from tqdm import tqdm
|
||||
import io
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@click.group()
|
||||
|
@ -27,6 +29,10 @@ def write_readme(collections_file: Path):
|
|||
bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
|
||||
|
||||
for file_path in collection_path.rglob('*'):
|
||||
# Skip dotfiles and files in dot directories
|
||||
if any(part.startswith('.') for part in file_path.parts):
|
||||
continue
|
||||
|
||||
if file_path.is_file():
|
||||
relative_path = file_path.relative_to(collection_path)
|
||||
s3_key = f"{s3_prefix}/{relative_path}"
|
||||
|
@ -38,58 +44,58 @@ def write_readme(collections_file: Path):
|
|||
@click.argument('output_file', type=click.Path(path_type=Path))
|
||||
def write_csv(metadata_file: Path, output_file: Path):
|
||||
"""
|
||||
Read a zipped JSONL file of metadata and write dataset info to CSV.
|
||||
Read a zipped JSONL file of metadata and write dataset info to a zipped CSV.
|
||||
|
||||
metadata_file: Path to the zip file containing metadata JSONL
|
||||
output_file: Path where the CSV should be written
|
||||
output_file: Path where the zipped CSV should be written
|
||||
"""
|
||||
with zipfile.ZipFile(metadata_file, 'r') as zf, \
|
||||
open(output_file, 'w', newline='') as csvfile:
|
||||
|
||||
jsonl_name = metadata_file.name.replace('.zip', '')
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(['name', 'title']) # Write header
|
||||
|
||||
with zf.open(jsonl_name) as f:
|
||||
for line in tqdm(f, desc="Writing CSV"):
|
||||
try:
|
||||
metadata = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
print(line)
|
||||
breakpoint()
|
||||
print(line)
|
||||
continue
|
||||
dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
|
||||
if dataset_info:
|
||||
writer.writerow([
|
||||
dataset_info.get('name', ''),
|
||||
dataset_info.get('title', '')
|
||||
])
|
||||
# Get the base filename without .zip extension for the internal CSV file
|
||||
internal_filename = output_file.name.replace('.zip', '')
|
||||
jsonl_name = metadata_file.name.replace('.zip', '')
|
||||
|
||||
with zipfile.ZipFile(metadata_file, 'r') as input_zf, \
|
||||
zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as output_zf, \
|
||||
output_zf.open(internal_filename, 'w', force_zip64=True) as csvfile, \
|
||||
input_zf.open(jsonl_name) as jsonlfile:
|
||||
|
||||
# Create a text wrapper around the binary file
|
||||
text_wrapper = io.TextIOWrapper(csvfile, write_through=True, newline='')
|
||||
writer = csv.writer(text_wrapper)
|
||||
writer.writerow(['name', 'organization', 'title', 'date', 'metadata_path', 'collection_path'])
|
||||
|
||||
# Read from input zip and write to output zip
|
||||
for line in tqdm(jsonlfile, desc="Writing CSV"):
|
||||
metadata = json.loads(line)
|
||||
dataset_info = metadata['signed_metadata']['data_gov_metadata']
|
||||
if dataset_info:
|
||||
writer.writerow([
|
||||
dataset_info['name'],
|
||||
dataset_info['organization']['title'],
|
||||
dataset_info['title'],
|
||||
dataset_info['metadata_modified'],
|
||||
metadata['metadata_path'],
|
||||
metadata['collection_path'],
|
||||
])
|
||||
|
||||
@cli.command()
|
||||
@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
|
||||
@click.argument('output_file', type=click.Path(path_type=Path))
|
||||
def write_jsonl(metadata_dir: Path, output_file: Path):
|
||||
"""
|
||||
Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
|
||||
Read each .json file, recursively, in metadata directory and write to a single compressed zipped JSONL file.
|
||||
All records are written to a single JSONL file within the zip, named same as output_file without .zip
|
||||
"""
|
||||
# Get the base filename without .zip extension for the internal file
|
||||
internal_filename = output_file.name.replace('.zip', '')
|
||||
output_dir = output_file.parent
|
||||
|
||||
# Use force_zip64=True to handle files larger than 2GB
|
||||
with zipfile.ZipFile(output_file, 'w') as zf:
|
||||
with zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
|
||||
# Create a single file in the zip archive
|
||||
with zf.open(internal_filename, 'w', force_zip64=True) as f:
|
||||
# Iterate through all JSON files
|
||||
for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
|
||||
with open(file_path, 'r') as json_file:
|
||||
try:
|
||||
metadata = json.load(json_file)
|
||||
except json.JSONDecodeError:
|
||||
print(file_path)
|
||||
raise
|
||||
metadata = json.load(json_file)
|
||||
metadata['metadata_path'] = str(file_path.relative_to(output_dir))
|
||||
metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
|
||||
# Write each record to the same file, with newline
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue