import json
from collections import Counter, defaultdict
from pathlib import Path

if __name__ == "__main__":
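    # Assumed record shape, inferred from the fields accessed below (the
    # dump's exact schema is not documented here):
    #   {"title": "...", "crawler_identified_date": "2025-01-30T...",
    #    "organization": "{\"title\": \"...\"}"}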
    # Read the JSONL dump, counting datasets per crawler_identified_date
    # and collecting dataset titles grouped by owning organization.
    downloaded_counts = Counter()  # reserved for download-date tallies; currently unused
    identified_counts = Counter()
    titles_by_org = defaultdict(list)
    with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
        for line in f:
            data = json.loads(line)
            # 'organization' is itself a JSON-encoded string; fall back to an
            # empty object when the field is missing or null.
            org = json.loads(data.get('organization') or '{}')
            # Keep only the date part (YYYY-MM-DD) of the timestamp.
            identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
            titles_by_org[org['title']].append(data['title'])

    # Print the counts in chronological order; records with no
    # crawler_identified_date fall under the empty-string key, which sorts first.
    for date, count in sorted(identified_counts.items()):
        print(f"{date}: {count}")

    # Sort each organization's list of titles alphabetically before writing.
    for org, titles in titles_by_org.items():
        titles.sort()
    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))

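    # Each glass* record is assumed to carry a 'name' plus a 'resources'
    # field: usually a list of {'url': ...} dicts, but sometimes the list
    # arrives JSON-encoded as a string (handled below).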
    # Print each dataset name and its resource URLs for every file matching
    # data/glass*.
    for path in Path('data/').glob('glass*'):
        print(path)
        with open(path, 'r') as f:
            for line in f:
                data = json.loads(line)
                print("* " + data['name'])
                resources = data.get('resources', [])
                # Normalize: decode when 'resources' is a JSON string.
                if isinstance(resources, str):
                    resources = json.loads(resources)
                for resource in resources:
                    print(' * ' + resource['url'])
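
# A usage sketch, assuming the script is saved as analyze_dump.py (filename
# hypothetical) and run from the directory that contains data/:
#   python analyze_dump.py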