Refactoring, GitHub pipeline, S3 creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions

@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
import uuid
import requests
from datetime import datetime
from scripts.helpers.misc import load_config
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload
logger = logging.getLogger(__name__)
stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
output_file.write(f"{repo_link}\n")
return exists
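The body of check_repo_exists sits mostly outside this hunk; only its tail is visible above. As a purely illustrative sketch (not the code from this commit), an existence check of this shape typically asks the GitHub API for the repository and records the link when it has gone missing. The endpoint, header format, and helper name below are assumptions:

def check_repo_exists_sketch(org_name, repo_name, token, output_path=None):
    # Hypothetical illustration only; the real helper is defined earlier in this file.
    response = requests.get(
        f"https://api.github.com/repos/{org_name}/{repo_name}",
        headers={"Authorization": f"token {token}"},
    )
    exists = response.status_code != 404
    if not exists and output_path:
        with open(output_path, "a") as output_file:
            output_file.write(f"https://github.com/{org_name}/{repo_name}\n")
    return exists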
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
def run_pipeline(
org_name,
repo_name,
collection_path,
include,
token,
metadata_path=None,
output_path=None,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
save_raw=False,
raw_dir=None,
check_exists=False,
output_deleted=None,
):
"""Process a single repository."""
# existence-checking mode
if check_exists:
return check_repo_exists(org_name, repo_name, token, output_path)
logger.info(f"Processing repository: {org_name}/{repo_name}")
Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
return check_repo_exists(org_name, repo_name, token, output_deleted)
# raw saving mode
if save_raw:
raw_path = raw_dir / org_name / repo_name
Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
logger.info("Processing complete")
return
def create_archive_callback(temp_dir):
if raw_dir:
raw_path = raw_dir / org_name / repo_name
if raw_path.exists():
out_dir = raw_path
else:
Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
out_dir = temp_dir
return {
'collect': [
PathCollectionTask(path=out_dir)
],
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': f'https://github.com/{org_name}/{repo_name}',
'description': f'Archive of GitHub repository {org_name}/{repo_name}',
'github_metadata': {
'org': org_name,
'repo': repo_name,
'archived_date': datetime.now().isoformat()
},
},
}
# Archive mode - use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive_callback,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("Processing complete")
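For reference, a direct call to the refactored run_pipeline in archive mode might look like the sketch below. The org, repo, token, and S3 location are placeholders, and the data/.../v1.zip and metadata/.../v1.json layout mirrors what get_tasks builds further down; fetch_and_upload (imported from scripts.helpers.bag) is assumed to handle bagging, signing, and upload.

from pathlib import Path

run_pipeline(
    org_name="example-org",                      # placeholder owner
    repo_name="example-repo",                    # placeholder repository
    collection_path=Path("data/processed/data/example-org/example-repo/v1.zip"),
    include="repo_info,issues",                  # illustrative subset of valid_include_items
    token="<github-token>",                      # placeholder token
    metadata_path=Path("data/processed/metadata/example-org/example-repo/v1.json"),
    output_path=Path("data/processed"),
    signatures=None,                             # or a parsed --signatures configuration
    session_args={"profile_name": "default"},    # forwarded to the upload session
    s3_path="example-bucket/github-archive",     # "<bucket_name>/<path>" per --s3-path
    no_delete=False,
)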
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
check_exists: bool = False):
def get_tasks(
csv_path: Path,
output_path: Path,
skip_rows: int = 0,
skip_existing: bool = False,
stop_after: int = None,
include: str = None,
archive_mode: bool = False,
signatures: list = None,
session_args: dict = None,
s3_path: str = None,
no_delete: bool = False,
save_raw: bool = False,
raw_dir: Path = None,
check_exists: bool = False,
output_deleted: Path = None,
):
"""Get repositories from CSV that haven't been processed yet."""
# Initialize progress bars
if not check_exists:
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
continue
org_name, repo_name = row['html_url'].split('/')[-2:]
collection_path = output_path / 'collections' / collection / org_name / repo_name
if skip_existing:
if collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
if skip_existing and collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
# use tokens round robin
token = tokens[processed % len(tokens)]
yield org_name, repo_name, collection_path, include, token, check_exists, output_path
yield (
org_name,
repo_name,
collection_path,
include,
token,
output_deleted,
archive_mode,
metadata_path,
output_path,
signatures,
session_args,
s3_path,
save_raw,
raw_dir,
check_exists,
no_delete,
)
processed += 1
if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
for counter in stats_counter.values():
counter.close()
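Because get_tasks is a generator, it can also be exercised on its own to preview the work plan before handing it to run_parallel. A minimal sketch, assuming the GitHub tokens referenced inside the loop are loaded from config elsewhere in this module (load_config is imported above) and using placeholder option values:

preview = get_tasks(
    csv_path=Path("data/repos_by_cumulative_popularity.csv"),
    output_path=Path("data/processed"),
    skip_existing=True,
    stop_after=5,                              # only look at the first five unprocessed repos
    include="repo_info",                       # illustrative
)
for task in preview:
    org_name, repo_name = task[0], task[1]     # first two items of each yielded tuple
    print(f"would archive {org_name}/{repo_name}")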
@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
help='Output path.')
@click.option('--collection', '-c', type=str, default='github_raw',
help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
help='Path to the CSV file.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
@click.option('--signatures', help='JSON string of signature configuration.')
# upload settings
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
# raw saving
# useful if doing multiple runs with the same csv and different --include values
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
log_level=None, stop_after=None, skip_existing=False, check_exists=False):
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
def main(profile, workers, **kwargs):
session_args = {}
if profile:
session_args['profile_name'] = profile
if signatures := kwargs.get('signatures'):
signatures = json.loads(signatures)
for signature in signatures:
if signature['action'] == 'sign':
if is_encrypted_key(signature['params']['key']):
signature['params']['password'] = click.prompt(
f"Enter password for {signature['params']['key']}: ",
hide_input=True
)
elif signature['action'] == 'timestamp':
if known_tsa := signature.pop('known_tsa', None):
signature['params'] = KNOWN_TSAS[known_tsa]
kwargs['signatures'] = signatures
run_parallel(
run_pipeline,
get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
get_tasks(**kwargs),
workers,
log_level=log_level
)
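Working backwards from the parsing in main, the --signatures option appears to expect a JSON list of action objects. A hedged example of the structure (the key path and TSA name are placeholders; known_tsa must match a key in nabit's KNOWN_TSAS):

signatures_config = [
    {"action": "sign", "params": {"key": "certs/archive-key.pem"}},   # prompts for a password if the key is encrypted
    {"action": "timestamp", "known_tsa": "<tsa-name>"},               # replaced by KNOWN_TSAS["<tsa-name>"] at startup
]
# passed on the command line as --signatures followed by json.dumps(signatures_config)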
if __name__ == "__main__":
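Putting the new options together, an end-to-end invocation might look roughly like the following; the script path, AWS profile, and bucket are placeholders rather than values taken from this commit:

python path/to/this_script.py \
  --csv-path data/repos_by_cumulative_popularity.csv \
  --output-path data/processed \
  --include repo_info,issues \
  --profile default \
  --s3-path example-bucket/github-archive \
  --skip-existing \
  --stop-after 100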