# mirror of https://github.com/harvard-lil/data-vault.git
# synced 2025-03-15 07:31:21 +00:00
import json
import logging
import tempfile
import zipfile
from pathlib import Path

import click
import requests

from nabit.bin.utils import cli_validate

logger = logging.getLogger(__name__)
|
|
|
|
def download_file(url: str, target_path: Path, timeout: float = 60.0):
    """Download a file from URL to target path.

    Streams the response in 1 MiB chunks so large files never need to fit
    in memory.

    Args:
        url: Source URL to fetch.
        target_path: Local filesystem path the downloaded bytes are written to.
        timeout: Seconds to wait for connect/read before aborting. The
            previous implementation passed no timeout, so a stalled server
            could hang the process forever.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the connection or a read exceeds `timeout`.
    """
    # Context-manage the streamed response so the underlying connection is
    # released even if writing to disk raises partway through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with target_path.open('wb') as f:
            for chunk in response.iter_content(chunk_size=2**20):
                f.write(chunk)
|
|
|
|
def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
    """
    Verify a dataset by downloading and checking its JSON metadata and ZIP contents.

    Downloads the metadata JSON and the ZIP archive, extracts each member
    listed in the metadata's 'zip_entries' using the recorded byte offsets
    (rather than trusting the ZIP's own central directory), then validates
    the extracted tree with nabit's cli_validate.

    If output_dir is provided, write the uncompressed contents there;
    otherwise extraction happens inside a temporary directory that is
    removed on return.

    Returns:
        The parsed metadata dict, for potential further use by callers.

    Raises:
        ValueError: If an entry filename would escape output_dir (zip-slip).
        requests.HTTPError: If either download fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Download files
        logger.info(f"Downloading metadata from {json_url}...")
        json_path = tmpdir / "metadata.json"
        download_file(json_url, json_path)

        logger.info(f"Downloading archive from {zip_url}...")
        zip_path = tmpdir / "data.zip"
        download_file(zip_url, zip_path)

        # Load metadata
        metadata = json.loads(json_path.read_text())

        # Create output directory
        if not output_dir:
            output_dir = tmpdir / "output"
        output_dir.mkdir(parents=True, exist_ok=True)
        resolved_output = output_dir.resolve()

        # Verify file contents: read each member directly from the offsets
        # recorded in the metadata.
        logger.info("Verifying file contents...")
        with zip_path.open('rb') as f:
            for entry in metadata['zip_entries']:
                logger.info(f"Checking {entry['filename']}...")
                f.seek(entry['data_offset'])
                zip_data = f.read(entry['compress_size'])

                if entry['compress_type'] == zipfile.ZIP_STORED:
                    uncompressed = zip_data
                else:
                    # NOTE(review): zipfile._get_decompressor is a private
                    # CPython API and may change between Python versions.
                    decompressor = zipfile._get_decompressor(entry['compress_type'])
                    uncompressed = decompressor.decompress(zip_data)

                # Write the file. The filename comes from downloaded
                # (untrusted) metadata, so guard against zip-slip: a name
                # like "../../x" must not escape output_dir.
                output_file = (output_dir / entry['filename']).resolve()
                if not output_file.is_relative_to(resolved_output):
                    raise ValueError(f"Unsafe filename in metadata: {entry['filename']}")
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_bytes(uncompressed)

        logger.info("All files extracted successfully")

        # verify dataset with nabit
        cli_validate(output_dir)

        # Return metadata for potential further use
        return metadata
|
|
|
|
@click.command()
@click.argument('json_url', type=str)
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Directory to write uncompressed files')
def main(json_url: str, zip_url: str, output: Path | None = None):
    """Verify dataset from JSON and ZIP URLs"""
    # Annotation fixed from `Path = None` to `Path | None = None`:
    # click passes None when --output is omitted.
    verify_dataset(json_url, zip_url, output)
|
|
|
|
# Script entry point: delegate to the click command.
if __name__ == '__main__':
    main()