Refactoring, GitHub pipeline, S3 creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions

@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
import uuid
import requests
from datetime import datetime
from scripts.helpers.misc import load_config
from nabit.lib.archive import package
from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload
logger = logging.getLogger(__name__)
stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
output_file.write(f"{repo_link}\n")
return exists
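The body of check_repo_exists sits mostly outside this hunk; only its tail is visible above. As a purely illustrative sketch (not the code from this commit), an existence check of this shape typically asks the GitHub API for the repository and records the link when it has gone missing. The endpoint, header format, and helper name below are assumptions:

def check_repo_exists_sketch(org_name, repo_name, token, output_path=None):
    # Hypothetical illustration only; the real helper is defined earlier in this file.
    response = requests.get(
        f"https://api.github.com/repos/{org_name}/{repo_name}",
        headers={"Authorization": f"token {token}"},
    )
    exists = response.status_code != 404
    if not exists and output_path:
        with open(output_path, "a") as output_file:
            output_file.write(f"https://github.com/{org_name}/{repo_name}\n")
    return exists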
def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
def run_pipeline(
org_name,
repo_name,
collection_path,
include,
token,
metadata_path=None,
output_path=None,
signatures=None,
session_args=None,
s3_path=None,
no_delete=False,
save_raw=False,
raw_dir=None,
check_exists=False,
output_deleted=None,
):
"""Process a single repository."""
# existence-checking mode
if check_exists:
return check_repo_exists(org_name, repo_name, token, output_path)
logger.info(f"Processing repository: {org_name}/{repo_name}")
Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
return check_repo_exists(org_name, repo_name, token, output_deleted)
# raw saving mode
if save_raw:
raw_path = raw_dir / org_name / repo_name
Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
logger.info("Processing complete")
return
def create_archive_callback(temp_dir):
if raw_dir:
raw_path = raw_dir / org_name / repo_name
if raw_path.exists():
out_dir = raw_path
else:
Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
out_dir = temp_dir
return {
'collect': [
PathCollectionTask(path=out_dir)
],
'signed_metadata': {
'id': str(uuid.uuid4()),
'url': f'https://github.com/{org_name}/{repo_name}',
'description': f'Archive of GitHub repository {org_name}/{repo_name}',
'github_metadata': {
'org': org_name,
'repo': repo_name,
'archived_date': datetime.now().isoformat()
},
},
}
# Archive mode - use common pipeline
fetch_and_upload(
output_path=output_path,
collection_path=collection_path,
metadata_path=metadata_path,
create_archive_callback=create_archive_callback,
signatures=signatures,
session_args=session_args,
s3_path=s3_path,
no_delete=no_delete
)
logger.info("Processing complete")
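For reference, a direct call to the refactored run_pipeline in archive mode might look like the sketch below. The org, repo, token, and S3 location are placeholders, and the data/.../v1.zip and metadata/.../v1.json layout mirrors what get_tasks builds further down; fetch_and_upload (imported from scripts.helpers.bag) is assumed to handle bagging, signing, and upload.

from pathlib import Path

run_pipeline(
    org_name="example-org",                      # placeholder owner
    repo_name="example-repo",                    # placeholder repository
    collection_path=Path("data/processed/data/example-org/example-repo/v1.zip"),
    include="repo_info,issues",                  # illustrative subset of valid_include_items
    token="<github-token>",                      # placeholder token
    metadata_path=Path("data/processed/metadata/example-org/example-repo/v1.json"),
    output_path=Path("data/processed"),
    signatures=None,                             # or a parsed --signatures configuration
    session_args={"profile_name": "default"},    # forwarded to the upload session
    s3_path="example-bucket/github-archive",     # "<bucket_name>/<path>" per --s3-path
    no_delete=False,
)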
def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
check_exists: bool = False):
def get_tasks(
csv_path: Path,
output_path: Path,
skip_rows: int = 0,
skip_existing: bool = False,
stop_after: int = None,
include: str = None,
archive_mode: bool = False,
signatures: list = None,
session_args: dict = None,
s3_path: str = None,
no_delete: bool = False,
save_raw: bool = False,
raw_dir: Path = None,
check_exists: bool = False,
output_deleted: Path = None,
):
"""Get repositories from CSV that haven't been processed yet."""
# Initialize progress bars
if not check_exists:
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
continue
org_name, repo_name = row['html_url'].split('/')[-2:]
collection_path = output_path / 'collections' / collection / org_name / repo_name
if skip_existing:
if collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
if skip_existing and collection_path.exists():
stats_counter['skipped'].update(1)
continue
else:
stats_counter['yielded'].update(1)
# use tokens round robin
token = tokens[processed % len(tokens)]
yield org_name, repo_name, collection_path, include, token, check_exists, output_path
yield (
org_name,
repo_name,
collection_path,
include,
token,
output_deleted,
archive_mode,
metadata_path,
output_path,
signatures,
session_args,
s3_path,
save_raw,
raw_dir,
check_exists,
no_delete,
)
processed += 1
if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
for counter in stats_counter.values():
counter.close()
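Because get_tasks is a generator, it can also be exercised on its own to preview the work plan before handing it to run_parallel. A minimal sketch, assuming the GitHub tokens referenced inside the loop are loaded from config elsewhere in this module (load_config is imported above) and using placeholder option values:

preview = get_tasks(
    csv_path=Path("data/repos_by_cumulative_popularity.csv"),
    output_path=Path("data/processed"),
    skip_existing=True,
    stop_after=5,                              # only look at the first five unprocessed repos
    include="repo_info",                       # illustrative
)
for task in preview:
    org_name, repo_name = task[0], task[1]     # first two items of each yielded tuple
    print(f"would archive {org_name}/{repo_name}")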
@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
help='Output path.')
@click.option('--collection', '-c', type=str, default='github_raw',
help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
help='Path to the CSV file.')
@click.option('--log-level', '-l',
type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
default=None,
help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
@click.option('--signatures', help='JSON string of signature configuration.')
# upload settings
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
# raw saving
# useful if doing multiple runs with the same csv and different --include values
@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
log_level=None, stop_after=None, skip_existing=False, check_exists=False):
@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
def main(profile, workers, **kwargs):
session_args = {}
if profile:
session_args['profile_name'] = profile
if signatures := kwargs.get('signatures'):
signatures = json.loads(signatures)
for signature in signatures:
if signature['action'] == 'sign':
if is_encrypted_key(signature['params']['key']):
signature['params']['password'] = click.prompt(
f"Enter password for {signature['params']['key']}: ",
hide_input=True
)
elif signature['action'] == 'timestamp':
if known_tsa := signature.pop('known_tsa', None):
signature['params'] = KNOWN_TSAS[known_tsa]
kwargs['signatures'] = signatures
run_parallel(
run_pipeline,
get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
get_tasks(**kwargs),
workers,
log_level=log_level
)
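Working backwards from the parsing in main, the --signatures option appears to expect a JSON list of action objects. A hedged example of the structure (the key path and TSA name are placeholders; known_tsa must match a key in nabit's KNOWN_TSAS):

signatures_config = [
    {"action": "sign", "params": {"key": "certs/archive-key.pem"}},   # prompts for a password if the key is encrypted
    {"action": "timestamp", "known_tsa": "<tsa-name>"},               # replaced by KNOWN_TSAS["<tsa-name>"] at startup
]
# passed on the command line as --signatures followed by json.dumps(signatures_config)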
if __name__ == "__main__":
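Putting the new options together, an end-to-end invocation might look roughly like the following; the script path, AWS profile, and bucket are placeholders rather than values taken from this commit:

python path/to/this_script.py \
  --csv-path data/repos_by_cumulative_popularity.csv \
  --output-path data/processed \
  --include repo_info,issues \
  --profile default \
  --s3-path example-bucket/github-archive \
  --skip-existing \
  --stop-after 100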