import json
from collections import Counter, defaultdict
from pathlib import Path

if __name__ == "__main__":
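    # Assumed record shape, inferred from the fields accessed below (the
    # dump's exact schema is not documented here):
    #   {"title": "...", "crawler_identified_date": "2025-01-30T...",
    #    "organization": "{\"title\": \"...\"}"}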
    # Read the JSONL dump, counting datasets per crawler_identified_date
    # and collecting dataset titles grouped by owning organization.
    downloaded_counts = Counter()  # reserved for download-date tallies; currently unused
    identified_counts = Counter()
    titles_by_org = defaultdict(list)
    with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
        for line in f:
            data = json.loads(line)
            # 'organization' is itself a JSON-encoded string; fall back to an
            # empty object when the field is missing or null.
            org = json.loads(data.get('organization') or '{}')
            # Keep only the date part (YYYY-MM-DD) of the timestamp.
            identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
            titles_by_org[org['title']].append(data['title'])

    # Print the counts in chronological order; records with no
    # crawler_identified_date fall under the empty-string key, which sorts first.
    for date, count in sorted(identified_counts.items()):
        print(f"{date}: {count}")

    # Sort each organization's list of titles alphabetically before writing.
    for org, titles in titles_by_org.items():
        titles.sort()
    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))

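    # Each glass* record is assumed to carry a 'name' plus a 'resources'
    # field: usually a list of {'url': ...} dicts, but sometimes the list
    # arrives JSON-encoded as a string (handled below).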
    # Print each dataset name and its resource URLs for every file matching
    # data/glass*.
    for path in Path('data/').glob('glass*'):
        print(path)
        with open(path, 'r') as f:
            for line in f:
                data = json.loads(line)
                print("* " + data['name'])
                resources = data.get('resources', [])
                # Normalize: decode when 'resources' is a JSON string.
                if isinstance(resources, str):
                    resources = json.loads(resources)
                for resource in resources:
                    print(' * ' + resource['url'])
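
# A usage sketch, assuming the script is saved as analyze_dump.py (filename
# hypothetical) and run from the directory that contains data/:
#   python analyze_dump.py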