Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-07-04 13:46:56 -04:00

Commit 404c3627f7: initial commit
26 changed files with 2534 additions and 0 deletions
scripts/github/download_git.py (new file, 141 lines)

@@ -0,0 +1,141 @@
import csv
import json
import logging
import os
from pathlib import Path

import click
import requests
from tqdm import tqdm

from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
from scripts.helpers.config import load_config
from scripts.helpers.parallel import run_parallel

logger = logging.getLogger(__name__)
stats_counter = {}

# Wrap in Path() so the expression also works when XDG_CONFIG_HOME is set
# (os.environ.get returns a str, and str does not support the / operator).
CONFIG_PATH = Path(os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config") / "data-mirror" / "config.json"
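The script only ever reads a "tokens" list out of the loaded config. Assuming load_config returns the parsed JSON at CONFIG_PATH (its implementation is not part of this commit), a minimal config file would look like the sketch below; any layout beyond the "tokens" key is an assumption:

    # Hypothetical ~/.config/data-mirror/config.json contents:
    #   {"tokens": ["ghp_token_one", "ghp_token_two"]}
    import json
    from pathlib import Path

    def read_tokens(config_path: Path) -> list:
        """Sketch of the part of load_config this script relies on."""
        with open(config_path) as f:
            return json.load(f).get("tokens", [])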
def check_repo_exists(org_name, repo_name, token, output_path=None):
    """Check if a repository still exists on GitHub."""
    exists = True
    try:
        # A HEAD request is enough to test existence without fetching a body.
        GitHubAPI(token).request(f"repos/{org_name}/{repo_name}", method="HEAD")
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            exists = False
        else:
            raise
    if not exists:
        repo_link = f"https://github.com/{org_name}/{repo_name}"
        print(repo_link)
        if output_path:
            with open(output_path, 'a') as output_file:
                output_file.write(f"{repo_link}\n")
    return exists
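A quick usage sketch of the existence check (the org/repo names and output file are placeholders; get_token(None) mirrors the fallback used in get_tasks below):

    # Hypothetical one-off check; appends the repo link to deleted_repos.txt if gone.
    token = get_token(None)
    if not check_repo_exists("harvard-lil", "data-vault", token, output_path="deleted_repos.txt"):
        print("repository was deleted or made private")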
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
    """Process a single repository."""
    if check_exists:
        return check_repo_exists(org_name, repo_name, token, output_path)

    logger.info(f"Processing repository: {org_name}/{repo_name}")
    Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
    logger.info("Processing complete")
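For a single repository the pipeline can be exercised directly; a minimal sketch, where the path mirrors the collection layout built in get_tasks below and the include list is the script's default:

    run_pipeline(
        "harvard-lil", "data-vault",
        Path("data/processed/collections/github_raw/harvard-lil/data-vault"),
        include=["repo_info"],
        token=get_token(None),
    )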
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0,
              skip_existing: bool = False, stop_after: int = None, include: str = None,
              check_exists: bool = False):
    """Get repositories from CSV that haven't been processed yet."""
    # Initialize progress bars
    if not check_exists:
        stats_counter['total'] = tqdm(desc="Total records", unit="repo")
        if skip_existing:
            stats_counter['skipped'] = tqdm(desc="Skipped", unit="repo")
        stats_counter['yielded'] = tqdm(desc="Processing", unit="repo")

    # handle --include: comma-separated string -> list, defaulting to repo_info only
    if include:
        include = include.split(',')
    else:
        include = ['repo_info']

    # import token or tokens from config, falling back to gitspoke's own lookup
    config = load_config()
    if config.get('tokens'):
        tokens = config['tokens']
    else:
        tokens = [get_token(None)]
    if tokens != [None]:
        logger.warning(f"Using {len(tokens)} tokens")
    else:
        logger.warning("Using unauthenticated rate limits")
    with open(csv_path, 'r') as file:
        reader = csv.DictReader(file)
        # Skip specified number of rows
        for _ in range(skip_rows):
            next(reader)

        processed = 0
        for row in reader:
            if not check_exists:
                stats_counter['total'].update(1)

            if not row['html_url']:  # Skip empty rows
                continue

            org_name, repo_name = row['html_url'].split('/')[-2:]
            collection_path = output_path / 'collections' / collection / org_name / repo_name

            # Skip repos already downloaded to the collection path. Counter
            # updates are guarded by key presence so that modes which never
            # created a given progress bar do not raise KeyError.
            if skip_existing and collection_path.exists():
                if 'skipped' in stats_counter:
                    stats_counter['skipped'].update(1)
                continue
            if 'yielded' in stats_counter:
                stats_counter['yielded'].update(1)

            # use tokens round robin
            token = tokens[processed % len(tokens)]

            yield org_name, repo_name, collection_path, include, token, check_exists, output_path

            processed += 1
            if stop_after and processed >= stop_after:
                break

    # Close progress bars
    for counter in stats_counter.values():
        counter.close()
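The reader relies only on an html_url column; a minimal sketch of building a compatible CSV fixture (the file name and any extra columns are assumptions):

    # Hypothetical fixture: the smallest CSV this script can consume.
    # Only html_url is read; rows with an empty value are skipped by the loop above.
    import csv

    with open("repos.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["html_url"])
        writer.writeheader()
        writer.writerow({"html_url": "https://github.com/harvard-lil/data-vault"})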
@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
              help='Output path.')
@click.option('--collection', '-c', type=str, default='github_raw',
              help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
              help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,
              help='Number of rows to skip in the CSV.')
@click.option('--include',
              help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
              help='Path to the CSV file.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default=None,
              help='Logging level.')
@click.option('--stop-after', type=int, help='Stop after processing this many repositories.')
@click.option('--skip-existing', is_flag=True, help='Skip repositories that already exist in the collection.')
@click.option('--check-exists', is_flag=True, help='Only check whether repositories still exist on GitHub.')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
         log_level=None, stop_after=None, skip_existing=False, check_exists=False):
    run_parallel(
        run_pipeline,
        get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
        workers,
        log_level=log_level
    )


if __name__ == "__main__":
    main()
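An invocation sketch using click's test runner (flag values are placeholders; in practice the script would be run from the repo root so its relative default paths resolve):

    from click.testing import CliRunner

    # Equivalent to: python scripts/github/download_git.py --check-exists --stop-after 10
    result = CliRunner().invoke(main, ["--check-exists", "--stop-after", "10"])
    print(result.output)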