Mirror of https://github.com/harvard-lil/data-vault.git (synced 2025-07-04 13:46:56 -04:00)
Commit b245fd44eb (parent a7c99e264d): Refactoring, github pipeline, s3 creation
21 changed files with 718 additions and 281 deletions
scripts/github/__init__.py (new file, 0 lines)
@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
import uuid
import requests
from datetime import datetime
from scripts.helpers.misc import load_config
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload

logger = logging.getLogger(__name__)
stats_counter = {}
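The stats_counter dict is driven elsewhere with named counters via .update(1) and .close() calls, which matches the tqdm progress-bar API. A minimal sketch of how such counters might be set up, assuming tqdm is the implementation (the initialization itself is not part of this hunk):

from tqdm import tqdm

# hypothetical setup; the key names mirror those used in get_tasks()
stats_counter = {
    'skipped': tqdm(desc='skipped', unit='repo'),
    'yielded': tqdm(desc='yielded', unit='repo'),
}
stats_counter['yielded'].update(1)  # bump a counter as work is yielded
for counter in stats_counter.values():
    counter.close()  # close the bars when the run finishes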
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
            output_file.write(f"{repo_link}\n")
    return exists

def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
def run_pipeline(
    org_name,
    repo_name,
    collection_path,
    include,
    token,
    metadata_path=None,
    output_path=None,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
    save_raw=False,
    raw_dir=None,
    check_exists=False,
    output_deleted=None,
):
    """Process a single repository."""
    # existing checking mode
    if check_exists:
        return check_repo_exists(org_name, repo_name, token, output_path)

    logger.info(f"Processing repository: {org_name}/{repo_name}")
    Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
        return check_repo_exists(org_name, repo_name, token, output_deleted)

    # raw saving mode
    if save_raw:
        raw_path = raw_dir / org_name / repo_name
        Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
        logger.info("Processing complete")
        return

    def create_archive_callback(temp_dir):
        if raw_dir:
            raw_path = raw_dir / org_name / repo_name
            if raw_path.exists():
                out_dir = raw_path
        else:
            Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
            out_dir = temp_dir
        return {
            'collect': [
                PathCollectionTask(path=out_dir)
            ],
            'signed_metadata': {
                'id': str(uuid.uuid4()),
                'url': f'https://github.com/{org_name}/{repo_name}',
                'description': f'Archive of GitHub repository {org_name}/{repo_name}',
                'github_metadata': {
                    'org': org_name,
                    'repo': repo_name,
                    'archived_date': datetime.now().isoformat()
                },
            },
        }

    # Archive mode - use common pipeline
    fetch_and_upload(
        output_path=output_path,
        collection_path=collection_path,
        metadata_path=metadata_path,
        create_archive_callback=create_archive_callback,
        signatures=signatures,
        session_args=session_args,
        s3_path=s3_path,
        no_delete=no_delete
    )

    logger.info("Processing complete")


def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
              check_exists: bool = False):
def get_tasks(
    csv_path: Path,
    output_path: Path,
    skip_rows: int = 0,
    skip_existing: bool = False,
    stop_after: int = None,
    include: str = None,
    archive_mode: bool = False,
    signatures: list = None,
    session_args: dict = None,
    s3_path: str = None,
    no_delete: bool = False,
    save_raw: bool = False,
    raw_dir: Path = None,
    check_exists: bool = False,
    output_deleted: Path = None,
):
    """Get repositories from CSV that haven't been processed yet."""
    # Initialize progress bars
    if not check_exists:
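The refactored run_pipeline builds its bag through a callback: create_archive_callback receives a scratch directory, downloads the repository into it (or reuses a previously saved raw copy), and returns the 'collect' tasks plus 'signed_metadata' describing the archive. A rough sketch of how a caller such as fetch_and_upload might drive that callback, assuming it supplies a fresh temporary directory and consumes the returned dict (the real scripts.helpers.bag implementation may differ):

import tempfile
from pathlib import Path

def drive_callback(create_archive_callback):
    # hypothetical consumer: hand the callback a temp dir, then read back
    # the collection tasks and signed metadata it produced
    with tempfile.TemporaryDirectory() as tmp:
        spec = create_archive_callback(Path(tmp))
        collect_tasks = spec['collect']        # e.g. [PathCollectionTask(path=...)]
        metadata = spec['signed_metadata']     # id, url, description, github_metadata
        return collect_tasks, metadata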
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
            continue

        org_name, repo_name = row['html_url'].split('/')[-2:]
        collection_path = output_path / 'collections' / collection / org_name / repo_name

        if skip_existing:
            if collection_path.exists():
                stats_counter['skipped'].update(1)
                continue
            else:
                stats_counter['yielded'].update(1)
        collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
        metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'

        if skip_existing and collection_path.exists():
            stats_counter['skipped'].update(1)
            continue
        else:
            stats_counter['yielded'].update(1)

        # use tokens round robin
        token = tokens[processed % len(tokens)]

        yield org_name, repo_name, collection_path, include, token, check_exists, output_path
        yield (
            org_name,
            repo_name,
            collection_path,
            include,
            token,
            output_deleted,
            archive_mode,
            metadata_path,
            output_path,
            signatures,
            session_args,
            s3_path,
            save_raw,
            raw_dir,
            check_exists,
            no_delete,
        )

        processed += 1
        if stop_after and processed >= stop_after:
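Tokens are handed out round robin: task number N uses tokens[N % len(tokens)], so API calls (and rate limits) are spread evenly across all configured GitHub tokens. An illustrative sketch of the pattern with placeholder token values:

tokens = ['ghp_tokenA', 'ghp_tokenB', 'ghp_tokenC']  # placeholder values
for processed in range(6):
    token = tokens[processed % len(tokens)]
    # selects A, B, C, A, B, C as processed increases
    print(processed, token)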
@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
|
|||
for counter in stats_counter.values():
|
||||
counter.close()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
|
||||
help='Output path.')
|
||||
@click.option('--collection', '-c', type=str, default='github_raw',
|
||||
help='Collection name.')
|
||||
@click.option('--workers', '-w', type=int, default=None,
|
||||
help='Number of worker processes. Defaults to CPU count.')
|
||||
@click.option('--skip-rows', type=int, default=0,
|
||||
|
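The options declared here reach main() as keyword arguments; in the next hunk main() names only profile and workers explicitly and forwards the rest to get_tasks via **kwargs. A minimal sketch of that click pattern, using placeholder options:

import click

@click.command()
@click.option('--workers', '-w', type=int, default=None)
@click.option('--skip-rows', type=int, default=0)
def demo(workers, **kwargs):
    # workers is bound by name; the remaining options land in kwargs,
    # e.g. {'skip_rows': 0}
    print(workers, kwargs)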
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
              help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
              help='Path to the CSV file.')
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default=None,
              help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
@click.option('--signatures', help='JSON string of signature configuration.')
# upload settings
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
# raw saving
# useful if doing multiple runs with the same csv and different --include values
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
         log_level=None, stop_after=None, skip_existing=False, check_exists=False):
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
def main(profile, workers, **kwargs):

    session_args = {}
    if profile:
        session_args['profile_name'] = profile

    if signatures := kwargs.get('signatures'):
        signatures = json.loads(signatures)
        for signature in signatures:
            if signature['action'] == 'sign':
                if is_encrypted_key(signature['params']['key']):
                    signature['params']['password'] = click.prompt(
                        f"Enter password for {signature['params']['key']}: ",
                        hide_input=True
                    )
            elif signature['action'] == 'timestamp':
                if known_tsa := signature.pop('known_tsa', None):
                    signature['params'] = KNOWN_TSAS[known_tsa]
        kwargs['signatures'] = signatures

    run_parallel(
        run_pipeline,
        get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
        get_tasks(**kwargs),
        workers,
        log_level=log_level
    )

if __name__ == "__main__":
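The signature-handling block above implies the JSON shape expected by --signatures: a list of actions where a 'sign' entry carries a key path under params (with a password prompt if the key is encrypted) and a 'timestamp' entry may name a preset TSA via known_tsa, which is swapped for the matching KNOWN_TSAS configuration. An illustrative value, with a placeholder key path and TSA name rather than ones taken from the repo:

import json

signatures_arg = json.dumps([
    {"action": "sign", "params": {"key": "certs/signing-key.pem"}},  # placeholder path
    {"action": "timestamp", "known_tsa": "example_tsa"},             # placeholder TSA name
])
# main() receives this string via --signatures, json.loads() it, and prompts
# for the key password if is_encrypted_key() reports the key is encrypted.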