Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-07-04 13:46:56 -04:00
Refactoring, github pipeline, s3 creation
parent a7c99e264d
commit b245fd44eb

21 changed files with 718 additions and 281 deletions
@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
@click.option('--compare-by', '-c',
              default='id',
              help='Field to compare by.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='INFO',
              help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
    """Compare records between CSV and JSONL files."""
    logging.basicConfig(
        level=getattr(logging, log_level),
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    old_data = load_jsonl_data(old_path, compare_by=compare_by)
    new_data = load_jsonl_data(new_path, compare_by=compare_by)
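`load_jsonl_data` is called in this hunk but not shown. As a rough, hypothetical sketch only (not the repository's implementation), it would read one JSON object per line and index the records by the field selected with `--compare-by`, assuming `json`, `Path`, and `Dict` are already imported at the top of the module:

def load_jsonl_data(path: Path, compare_by: str = 'id') -> Dict[str, dict]:
    # Hypothetical helper: parse one JSON object per line and key each
    # record by the field selected with --compare-by (default 'id').
    records = {}
    with open(path, 'r') as f:
        for line in f:
            record = json.loads(line)
            records[record[compare_by]] = record
    return records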
@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
from pathlib import Path


# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        org = json.loads(data.get('organization', '{}'))
        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
        titles_by_org[org['title']].append(data["title"])

if __name__ == "__main__":
    # Print the counts sorted by date
    for date, count in sorted(identified_counts.items()):
        print(f"{date}: {count}")

    # sort each list of titles by org
    for org, titles in titles_by_org.items():
        titles_by_org[org].sort()
    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))

    # print urls
    for path in Path('data/').glob('glass*'):
        print(path)
        with open(path, 'r') as f:

# Read the JSONL file and count crawler_identified_date values
downloaded_counts = Counter()
identified_counts = Counter()
titles_by_org = defaultdict(list)
with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        print("* " + data['name'])
        resources = data.get('resources', [])
        if type(resources) == str:
            resources = json.loads(resources)
        for resource in resources:
            print(' * ' + resource['url'])
        org = json.loads(data.get('organization', '{}'))
        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
        titles_by_org[org['title']].append(data["title"])

# Print the counts sorted by date
for date, count in sorted(identified_counts.items()):
    print(f"{date}: {count}")

# sort each list of titles by org
for org, titles in titles_by_org.items():
    titles_by_org[org].sort()
Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))


# print urls
for path in Path('data/').glob('glass*'):
    print(path)
    with open(path, 'r') as f:
        for line in f:
            data = json.loads(line)
            print("* " + data['name'])
            resources = data.get('resources', [])
            if type(resources) == str:
                resources = json.loads(resources)
            for resource in resources:
                print(' * ' + resource['url'])
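Both versions of this script call `json.loads` on the `organization` and `resources` fields, which suggests the dump stores nested objects as JSON-encoded strings. A minimal sketch of that pattern factored into a helper, purely illustrative and not part of the commit:

def parse_embedded_json(value, default):
    # Fields such as 'organization' and 'resources' may arrive either
    # already parsed or as JSON-encoded strings; normalize both cases.
    if value is None or value == '':
        return default
    if isinstance(value, str):
        return json.loads(value)
    return value

# Illustrative use inside the read loop:
#   org = parse_embedded_json(data.get('organization'), {})
#   resources = parse_embedded_json(data.get('resources'), [])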