Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-07-04 13:46:56 -04:00
Refactoring, github pipeline, s3 creation
parent a7c99e264d
commit b245fd44eb

21 changed files with 718 additions and 281 deletions
@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
@click.option('--compare-by', '-c',
              default='id',
              help='Field to compare by.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
    """Compare records between CSV and JSONL files."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    old_data = load_jsonl_data(old_path, compare_by=compare_by)
    new_data = load_jsonl_data(new_path, compare_by=compare_by)
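`load_jsonl_data` is called in this hunk but not shown. As a rough, hypothetical sketch only (not the repository's implementation), it would read one JSON object per line and index the records by the field selected with `--compare-by`, assuming `json`, `Path`, and `Dict` are already imported at the top of the module:

def load_jsonl_data(path: Path, compare_by: str = 'id') -> Dict[str, dict]:
    # Hypothetical helper: parse one JSON object per line and key each
    # record by the field selected with --compare-by (default 'id').
    records = {}
    with open(path, 'r') as f:
        for line in f:
            record = json.loads(line)
            records[record[compare_by]] = record
    return records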
@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
from pathlib import Path


# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        org = json.loads(data.get('organization', '{}'))
        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
        titles_by_org[org['title']].append(data["title"])

if __name__ == "__main__":
    # Print the counts sorted by date
    for date, count in sorted(identified_counts.items()):
        print(f"{date}: {count}")

    # sort each list of titles by org
    for org, titles in titles_by_org.items():
        titles_by_org[org].sort()
    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))

    # print urls
    for path in Path('data/').glob('glass*'):
        print(path)
        with open(path, 'r') as f:

# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        print("* " + data['name'])
        resources = data.get('resources', [])
        if type(resources) == str:
            resources = json.loads(resources)
        for resource in resources:
            print(' * ' + resource['url'])
        org = json.loads(data.get('organization', '{}'))
        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
        titles_by_org[org['title']].append(data["title"])

# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
    print(f"{date}: {count}")

# sort each list of titles by org
for org, titles in titles_by_org.items():
    titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))


# print urls
for path in Path('data/').glob('glass*'):
    print(path)
    with open(path, 'r') as f:
        for line in f:
            data = json.loads(line)
            print("* " + data['name'])
            resources = data.get('resources', [])
            if type(resources) == str:
                resources = json.loads(resources)
            for resource in resources:
                print(' * ' + resource['url'])
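Both versions of this script call `json.loads` on the `organization` and `resources` fields, which suggests the dump stores nested objects as JSON-encoded strings. A minimal sketch of that pattern factored into a helper, purely illustrative and not part of the commit:

def parse_embedded_json(value, default):
    # Fields such as 'organization' and 'resources' may arrive either
    # already parsed or as JSON-encoded strings; normalize both cases.
    if value is None or value == '':
        return default
    if isinstance(value, str):
        return json.loads(value)
    return value

# Illustrative use inside the read loop:
#   org = parse_embedded_json(data.get('organization'), {})
#   resources = parse_embedded_json(data.get('resources'), [])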