Cleanup to prep for diffing

Jack Cushman 2025-02-24 16:45:50 -05:00
parent 7af7f9cf3e
commit a7c99e264d
9 changed files with 290 additions and 122 deletions

@@ -6,6 +6,8 @@ import logging
import csv
import zipfile
from tqdm import tqdm
import io
logger = logging.getLogger(__name__)
@click.group()
@@ -27,6 +29,10 @@ def write_readme(collections_file: Path):
        bucket_name, s3_prefix = collection['s3_path'].split('/', 1)
        for file_path in collection_path.rglob('*'):
            # Skip dotfiles and files in dot directories
            if any(part.startswith('.') for part in file_path.parts):
                continue
            if file_path.is_file():
                relative_path = file_path.relative_to(collection_path)
                s3_key = f"{s3_prefix}/{relative_path}"
@@ -38,58 +44,58 @@ def write_readme(collections_file: Path):
@click.argument('output_file', type=click.Path(path_type=Path))
def write_csv(metadata_file: Path, output_file: Path):
"""
Read a zipped JSONL file of metadata and write dataset info to CSV.
Read a zipped JSONL file of metadata and write dataset info to a zipped CSV.
metadata_file: Path to the zip file containing metadata JSONL
output_file: Path where the CSV should be written
output_file: Path where the zipped CSV should be written
"""
with zipfile.ZipFile(metadata_file, 'r') as zf, \
open(output_file, 'w', newline='') as csvfile:
jsonl_name = metadata_file.name.replace('.zip', '')
writer = csv.writer(csvfile)
writer.writerow(['name', 'title']) # Write header
with zf.open(jsonl_name) as f:
for line in tqdm(f, desc="Writing CSV"):
try:
metadata = json.loads(line)
except json.JSONDecodeError:
print(line)
breakpoint()
print(line)
continue
dataset_info = metadata.get('signed_metadata', {}).get('data_gov_metadata', {})
if dataset_info:
writer.writerow([
dataset_info.get('name', ''),
dataset_info.get('title', '')
])
    # Get the base filename without .zip extension for the internal CSV file
    internal_filename = output_file.name.replace('.zip', '')
    jsonl_name = metadata_file.name.replace('.zip', '')
    with zipfile.ZipFile(metadata_file, 'r') as input_zf, \
            zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as output_zf, \
            output_zf.open(internal_filename, 'w', force_zip64=True) as csvfile, \
            input_zf.open(jsonl_name) as jsonlfile:
        # Create a text wrapper around the binary file
        text_wrapper = io.TextIOWrapper(csvfile, write_through=True, newline='')
        writer = csv.writer(text_wrapper)
        writer.writerow(['name', 'organization', 'title', 'date', 'metadata_path', 'collection_path'])
        # Read from input zip and write to output zip
        for line in tqdm(jsonlfile, desc="Writing CSV"):
            metadata = json.loads(line)
            dataset_info = metadata['signed_metadata']['data_gov_metadata']
            if dataset_info:
                writer.writerow([
                    dataset_info['name'],
                    dataset_info['organization']['title'],
                    dataset_info['title'],
                    dataset_info['metadata_modified'],
                    metadata['metadata_path'],
                    metadata['collection_path'],
                ])
@cli.command()
@click.argument('metadata_dir', type=click.Path(exists=True, path_type=Path))
@click.argument('output_file', type=click.Path(path_type=Path))
def write_jsonl(metadata_dir: Path, output_file: Path):
"""
Read each .json file, recursively, in metadata directory and write to a single zipped JSONL file.
Read each .json file, recursively, in metadata directory and write to a single compressed zipped JSONL file.
All records are written to a single JSONL file within the zip, named same as output_file without .zip
"""
# Get the base filename without .zip extension for the internal file
internal_filename = output_file.name.replace('.zip', '')
output_dir = output_file.parent
# Use force_zip64=True to handle files larger than 2GB
with zipfile.ZipFile(output_file, 'w') as zf:
with zipfile.ZipFile(output_file, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
# Create a single file in the zip archive
with zf.open(internal_filename, 'w', force_zip64=True) as f:
# Iterate through all JSON files
for file_path in tqdm(metadata_dir.rglob('*.json'), desc="Writing JSONL"):
with open(file_path, 'r') as json_file:
try:
metadata = json.load(json_file)
except json.JSONDecodeError:
print(file_path)
raise
metadata = json.load(json_file)
metadata['metadata_path'] = str(file_path.relative_to(output_dir))
metadata['collection_path'] = metadata['metadata_path'].replace('metadata', 'collections', 1)
# Write each record to the same file, with newline
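
The pattern both rewritten commands rely on is streaming records straight into a member of a compressed zip archive: ZipFile.open(name, 'w', force_zip64=True) returns a writable binary stream, and io.TextIOWrapper(..., write_through=True, newline='') makes that stream usable with csv.writer. A minimal standalone sketch of that pattern, assuming Python 3.6+ for write mode on ZipFile.open; the helper name, columns, and sample rows below are illustrative placeholders, not code from this commit:

import csv
import io
import zipfile
from pathlib import Path

def write_rows_to_zipped_csv(rows, output_path: Path):
    # Illustrative helper (not part of this repo): stream CSV rows into a
    # zip member without first writing an uncompressed file to disk.
    internal_name = output_path.name.replace('.zip', '')
    with zipfile.ZipFile(output_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        # Mode 'w' gives a writable binary stream; force_zip64=True lets the
        # member grow past the 32-bit zip size limit.
        with zf.open(internal_name, 'w', force_zip64=True) as raw:
            # csv.writer needs a text stream opened with newline='';
            # write_through pushes each write straight to the zip member.
            text = io.TextIOWrapper(raw, write_through=True, newline='')
            writer = csv.writer(text)
            writer.writerow(['name', 'title'])
            for row in rows:
                writer.writerow(row)
            text.flush()

write_rows_to_zipped_csv([['dataset-a', 'Dataset A']], Path('datasets.csv.zip'))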