Mirror of https://github.com/harvard-lil/data-vault.git, synced 2025-03-15 07:31:21 +00:00

Commit b245fd44eb (parent a7c99e264d): Refactoring, github pipeline, s3 creation

21 changed files with 718 additions and 281 deletions
@@ -18,6 +18,7 @@ dependencies = [
     "cloudflare>=4.0.0",
     "deepdiff>=8.2.0",
     "orjson>=3.10.15",
+    "onepassword-sdk>=0.1.7",
 ]

 [build-system]
@@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }

 [tool.hatch.build.targets.wheel]
 packages = ["scripts"]
+
+[project.scripts]
+vault = "scripts.cli:cli"
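Together with the new scripts/cli.py added below, this [project.scripts] entry point means an installed copy of the package exposes a `vault` console command that dispatches to `scripts.cli:cli`.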
@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from data-mirror!"
scripts/cli.py (new file, 118 lines added)
@@ -0,0 +1,118 @@
import click
import importlib
from pathlib import Path
import logging
import os
import sys

"""
Top level CLI that registers commands with this logic:

- Find all subdirectories WITH __init__.py
- Fetch a 'cli' click.group() from __init__.py if it exists, or create one
- For all python files in the subdirectory:
    - If the file defines a 'cli' click.group(), add it to the subdirectory's group
    - If the file defines a 'main' click.command(), add it to the subdirectory's group
- If any commands are added to the subdirectory's group, add the group to the main CLI
"""

def register_commands(cli: click.Group) -> None:
    """Find all command groups in the scripts directory."""
    scripts_dir = Path(__file__).parent

    # for each subdirectory, try to import subdir.__init__
    for subdir in scripts_dir.glob('**'):
        if not subdir.is_dir():
            continue

        # get group from __init__.py or create a new one
        subdir_import = f"scripts.{subdir.name}"
        try:
            init_module = importlib.import_module(subdir_import+'.__init__')
        except ImportError:
            continue
        if hasattr(init_module, 'cli'):
            group = init_module.cli
        else:
            @click.group()
            def group():
                pass

        # add commands from the subdirectory
        for item in subdir.glob('*.py'):
            if item.name == '__init__.py':
                continue
            file_import = f"{subdir_import}.{item.stem}"
            module = importlib.import_module(file_import)
            if type(getattr(module, 'cli', None)) == click.Group:
                group.add_command(module.cli, name=item.stem)
            elif type(getattr(module, 'main', None)) == click.Command:
                group.add_command(module.main, name=item.stem)

        # add the group to the main cli
        if group.commands:
            cli.add_command(group, name=subdir.name)

@click.group()
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='WARNING',
              help='Logging level.')
def cli(log_level):
    """Main CLI entry point for the vault tool."""
    logging.basicConfig(level=log_level)

@cli.command()
@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']),
              help='Shell to generate completion script for. Defaults to auto-detect.')
def completion(shell):
    """Install tab completion for the CLI.

    Examples:
        # Auto-detect shell and print instructions
        $ vault completion

        # Generate completion script for bash
        $ vault completion --shell=bash > ~/.vault-complete.bash

        # Generate completion script for zsh
        $ vault completion --shell=zsh > ~/.vault-complete.zsh

        # Generate completion script for fish
        $ vault completion --shell=fish > ~/.config/fish/completions/vault.fish
    """
    # Auto-detect shell if not specified
    if not shell:
        shell = os.environ.get('SHELL', '')
        if 'bash' in shell:
            shell = 'bash'
        elif 'zsh' in shell:
            shell = 'zsh'
        elif 'fish' in shell:
            shell = 'fish'
        else:
            click.echo("Could not auto-detect shell. Please specify with --shell option.")
            return

    # Get the script name (executable name)
    script_name = os.path.basename(sys.argv[0])
    env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE"

    if shell == 'bash':
        click.echo("# Save the completion script:")
        click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash")
        click.echo("\n# Then add this line to your ~/.bashrc:")
        click.echo(f". ~/.{script_name}-complete.bash")
    elif shell == 'zsh':
        click.echo("# Save the completion script:")
        click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh")
        click.echo("\n# Then add this line to your ~/.zshrc:")
        click.echo(f". ~/.{script_name}-complete.zsh")
    elif shell == 'fish':
        click.echo("# Save the completion script to the fish completions directory:")
        click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish")

register_commands(cli)

if __name__ == "__main__":
    cli()
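A quick illustration of the registration logic above (a minimal sketch; the module scripts/example/report.py is hypothetical and not part of this commit): any subdirectory of scripts/ with an importable __init__.py becomes a command group, and a module defining a `main` click.command() becomes a subcommand named after the file.

# scripts/example/report.py -- hypothetical module, shown only to illustrate how
# register_commands() would pick it up (assumes scripts/example/__init__.py exists).
import click

@click.command()
@click.option('--name', default='world', help='Who to greet.')
def main(name):
    """Print a greeting."""
    click.echo(f"Hello, {name}!")

# register_commands() would wrap scripts/example in a click group named "example"
# and attach this command as "report", so it would run as: vault example report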
@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from data-mirror!"
@@ -54,15 +54,9 @@ def cli():
               help='Key prefixes to restrict access to. Can be specified multiple times.')
 @click.option('--objects', '-o', multiple=True,
               help='Specific object keys to restrict access to. Can be specified multiple times.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
 def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
                  objects: tuple[str, ...], log_level: str):
     """Generate temporary Cloudflare R2 access credentials."""
-    # Setup logging
-    logging.basicConfig(level=log_level)

     # Load config
     config = load_config().get("temp_tokens", {})
@@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None:
 @click.command()
 @click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
 @click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
 @click.option('--limit', '-n', type=int, default=None,
               help='Maximum number of rows to display. Default: all rows.')
-def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
+def main(db_path: Path, output_path: Path, limit: int | None):
     """Render the Dataset table to an HTML file."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )

     logger.info(f"Connecting to database at {db_path}")
     db.init(db_path)
@@ -2,14 +2,12 @@ import boto3
 import click
 from tqdm import tqdm
 import logging
-from itertools import islice
 import json
-import gzip
-from io import BytesIO
 import tempfile
 import os
 from scripts.helpers.misc import json_default
 import zipfile
+from scripts.helpers.onepassword import save_item, share_item

 logger = logging.getLogger(__name__)
@@ -122,11 +120,8 @@ def cli():
 @click.argument('s3_path')
 @click.option('--profile', help='AWS profile name', default='sc-direct')
 @click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+def undelete(s3_path: str, profile: str = None, dry_run: bool = False):
     """Remove delete markers from versioned S3 objects, effectively undeleting them."""
-    logging.basicConfig(level=log_level)
     bucket, prefix = s3_path.split('/', 1)

     session = boto3.Session(profile_name=profile)
@@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level
 @click.argument('s3_path')
 @click.option('--profile', help='AWS profile name', default='sc-direct')
 @click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False):
     """Delete all zero-size objects under the given prefix."""
-    logging.basicConfig(level=log_level)
     bucket, prefix = s3_path.split('/', 1)

     session = boto3.Session(profile_name=profile)
@@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
 @click.argument('s3_path')
 @click.option('--profile', help='AWS profile name', default='sc-direct')
 @click.option('--output', '-o', help='Output path for index file', default=None)
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
+def write_index(s3_path: str, profile: str = None, output: str | None = None):
     """Write a JSONL index of all files under the given prefix."""
-    logging.basicConfig(level=log_level)
     bucket, prefix = s3_path.split('/', 1)

     if output is None:
@@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo

     write_file_listing(s3_client, bucket, prefix, output)

+@cli.command()
+@click.argument('bucket_name')
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--region', '-r', help='AWS region', default='us-east-1')
+@click.option('--tag', '-t', help='Tag the bucket with a name', default=None)
+def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None):
+    """Create a new S3 bucket with versioning enabled by default."""
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+
+    # Ensure bucket exists
+    try:
+        if region == 'us-east-1':
+            s3_client.create_bucket(Bucket=bucket_name)
+        else:
+            s3_client.create_bucket(
+                Bucket=bucket_name,
+                CreateBucketConfiguration={'LocationConstraint': region},
+            )
+    except s3_client.exceptions.BucketAlreadyExists:
+        logger.warning(f"Bucket {bucket_name} already exists. Updating settings.")
+
+    # Configure bucket
+    s3_client.put_bucket_versioning(
+        Bucket=bucket_name,
+        VersioningConfiguration={'Status': 'Enabled'}
+    )
+
+    logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled")
+
+@cli.command()
+@click.argument('bucket_name')
+@click.argument('username')
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy')
+@click.option('--op-vault', help='1Password vault to store credentials in', default='Private')
+@click.option('--op-share', help='Share the credentials with the given email', default=None)
+def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None):
+    """Generate temporary S3 credentials with read/write/list access for a specific bucket."""
+    session = boto3.Session(profile_name=profile)
+    iam_client = session.client('iam')
+
+    # Define inline policy for bucket access
+    bucket_policy = {
+        "Version": "2012-10-17",
+        "Statement": [{
+            "Effect": "Allow",
+            "Action": [
+                "s3:GetObject",
+                "s3:PutObject",
+                "s3:DeleteObject",
+                "s3:ListBucket"
+            ],
+            "Resource": [
+                f"arn:aws:s3:::{bucket_name}",
+                f"arn:aws:s3:::{bucket_name}/*"
+            ]
+        }]
+    }
+
+    # Create the IAM user with permissions boundary
+    try:
+        iam_client.create_user(
+            UserName=username,
+            PermissionsBoundary=permissions_boundary
+        )
+        logger.info(f"Created IAM user: {username}")
+    except iam_client.exceptions.EntityAlreadyExistsException:
+        logger.warning(f"User {username} already exists")
+
+    # Attach inline policy directly to user
+    iam_client.put_user_policy(
+        UserName=username,
+        PolicyName=f"{bucket_name}-access",
+        PolicyDocument=json.dumps(bucket_policy)
+    )
+    logger.info(f"Attached bucket access policy to user {username}")
+
+    # Create access key for the user
+    response = iam_client.create_access_key(UserName=username)
+    credentials = response['AccessKey']
+
+    # Output the credentials
+    click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}")
+    click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}")
+
+    # Save credentials to 1Password if requested
+    if op_vault:
+        item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [
+            {
+                'title': 'Access Key ID',
+                'value': credentials['AccessKeyId'],
+                'section_id': 's3_details'
+            },
+            {
+                'title': 'Secret Access Key',
+                'value': credentials['SecretAccessKey'],
+                'section_id': 's3_details'
+            },
+            {
+                'title': 'S3 Bucket',
+                'value': bucket_name,
+                'section_id': 's3_details'
+            },
+        ])
+        if op_share:
+            share_link = share_item(item, [op_share])
+            click.echo(f"To share credentials with {op_share}, use the following link: {share_link}")
+
+
 if __name__ == '__main__':
     cli()
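As a usage note for the credentials echoed above (a hedged sketch; the bucket name and object key are placeholders, not values from this commit): exporting the two printed variables lets any boto3 client act on the bucket, within the limits of the inline policy.

# Hypothetical consumer of the printed AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY pair.
import os
import boto3

s3 = boto3.client(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
)
# The inline policy grants Get/Put/Delete/List on this one bucket only.
s3.put_object(Bucket='example-bucket', Key='hello.txt', Body=b'hello')
print(s3.get_object(Bucket='example-bucket', Key='hello.txt')['Body'].read())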
@@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
 @click.argument('zip_url', type=str)
 @click.option('--output', '-o', type=click.Path(path_type=Path),
               help='Directory to write uncompressed files')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
-def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
+def main(json_url: str, zip_url: str, output: Path = None):
     """Verify dataset from JSON and ZIP URLs"""
-    # Set up logging
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
     verify_dataset(json_url, zip_url, output)

 if __name__ == '__main__':
scripts/data_gov/__init__.py (new empty file)
@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
 @click.option('--compare-by', '-c',
               default='id',
               help='Field to compare by.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
 def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
     """Compare records between CSV and JSONL files."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
     old_data = load_jsonl_data(old_path, compare_by=compare_by)
     new_data = load_jsonl_data(new_path, compare_by=compare_by)
@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
 from pathlib import Path


-# Read the JSONL file and count crawler_identified_date values
-downloaded_counts = Counter()
-identified_counts = Counter()
-titles_by_org = defaultdict(list)
-with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
-    for line in f:
-        data = json.loads(line)
-        org = json.loads(data.get('organization', '{}'))
-        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
-        titles_by_org[org['title']].append(data["title"])
-
-# Print the counts sorted by date
-for date, count in sorted(identified_counts.items()):
-    print(f"{date}: {count}")
-
-# sort each list of titles by org
-for org, titles in titles_by_org.items():
-    titles_by_org[org].sort()
-Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
-
-
-# print urls
-for path in Path('data/').glob('glass*'):
-    print(path)
-    with open(path, 'r') as f:
-        for line in f:
-            data = json.loads(line)
-            print("* " + data['name'])
-            resources = data.get('resources', [])
-            if type(resources) == str:
-                resources = json.loads(resources)
-            for resource in resources:
-                print(' * ' + resource['url'])
+if __name__ == "__main__":
+    # Read the JSONL file and count crawler_identified_date values
+    downloaded_counts = Counter()
+    identified_counts = Counter()
+    titles_by_org = defaultdict(list)
+    with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            org = json.loads(data.get('organization', '{}'))
+            identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
+            titles_by_org[org['title']].append(data["title"])
+
+    # Print the counts sorted by date
+    for date, count in sorted(identified_counts.items()):
+        print(f"{date}: {count}")
+
+    # sort each list of titles by org
+    for org, titles in titles_by_org.items():
+        titles_by_org[org].sort()
+    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
+
+
+    # print urls
+    for path in Path('data/').glob('glass*'):
+        print(path)
+        with open(path, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                print("* " + data['name'])
+                resources = data.get('resources', [])
+                if type(resources) == str:
+                    resources = json.loads(resources)
+                for resource in resources:
+                    print(' * ' + resource['url'])
@@ -10,31 +10,17 @@ import os
 from urllib.parse import urlparse
 import re
 from scripts.helpers.parallel import run_parallel
-import zipfile
-import struct
-import boto3
 import logging
 from scripts.data_gov.models import db, Dataset
 from playhouse.shortcuts import model_to_dict
 from tqdm import tqdm
 from datetime import datetime
+from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload

 logger = logging.getLogger(__name__)

 ## download data.gov datasets, create nabit archives, and upload to S3

-# File extensions that are already compressed or wouldn't benefit from additional compression
-UNCOMPRESSED_EXTENSIONS = {
-    # Already compressed archives
-    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
-    # Compressed images
-    'jpg', 'jpeg', 'png', 'gif', 'webp',
-    # Compressed video/audio
-    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
-    # Other compressed/binary formats
-    'pdf', 'docx', 'xlsx', 'pptx',
-}
-
 stats_counter = {}

 def is_valid_url(url):
@@ -55,95 +41,6 @@ def extract_urls(data, urls = None):
             extract_urls(item, urls)
     return urls

-def create_archive(bag_dir, dataset: Dataset, signatures):
-    data_dict = model_to_dict(dataset)
-    for key, value in data_dict.items():
-        if isinstance(value, datetime):
-            data_dict[key] = value.isoformat()
-    data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
-    collect = [
-        *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
-    ]
-    logger.info(f" - Downloading {len(collect)} files")
-
-    # sort fields from dataset
-    data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
-    crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
-
-    # Create the archive
-    package(
-        output_path=bag_dir,
-        collect=collect,
-        collect_errors='ignore',
-        signed_metadata={
-            'id': str(uuid.uuid4()),
-            'url': data_gov_url,
-            'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
-            'data_gov_metadata': data_gov_metadata,
-            'crawler_metadata': crawler_metadata,
-        },
-        signatures=signatures,
-    )
-
-def zip_archive(bag_dir, archive_path):
-    # Create zip archive
-    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
-        for file_path in bag_dir.rglob('*'):
-            if file_path.is_file():
-                arc_path = file_path.relative_to(bag_dir)
-                compression = (zipfile.ZIP_STORED
-                    if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
-                    else zipfile.ZIP_DEFLATED)
-                zf.write(file_path, arc_path, compress_type=compression)
-
-    # Create metadata file
-    zip_info = []
-    with zipfile.ZipFile(archive_path, 'r') as zf:
-        for info in zf.filelist:
-            header_offset = info.header_offset
-
-            # Read header to calculate data offset
-            zf.fp.seek(header_offset)
-            header = zf.fp.read(zipfile.sizeFileHeader)
-            fheader = struct.unpack(zipfile.structFileHeader, header)
-            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
-            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
-            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
-
-            zip_info.append({
-                'filename': info.filename,
-                'file_size': info.file_size,
-                'compress_size': info.compress_size,
-                'compress_type': info.compress_type,
-                'header_offset': header_offset,
-                'data_offset': data_offset,
-            })
-
-    # Read the bag-info.txt and signed-metadata.json
-    bag_info = (bag_dir / 'bag-info.txt').read_text()
-    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
-
-    return {
-        'bag_info': bag_info,
-        'signed_metadata': signed_metadata,
-        'zip_entries': zip_info
-    }
-
-def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
-    s3 = boto3.Session(**session_args).client('s3')
-    bucket_name, s3_path = s3_path.split('/', 1)
-
-    # Upload zip file
-    s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
-    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
-    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
-
-    # Upload metadata file
-    s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
-    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
-    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
-
-
 def run_pipeline(
     dataset: Dataset,
     output_path: Path,
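The zip_archive helper removed here reappears unchanged in scripts/helpers/bag.py below. Because it records a header_offset and data_offset for every zip member, the JSON metadata written next to each archive can be used to pull a single member's bytes without opening the whole zip. A rough sketch under that assumption (the paths and member name are placeholders, not values from this commit):

# Read one member's raw bytes straight from the archive using the recorded offsets.
import json
import zlib
import zipfile

metadata = json.loads(open('data/metadata/example/v1.json').read())
entry = next(e for e in metadata['zip_entries'] if e['filename'] == 'bag-info.txt')

with open('data/data/example/v1.zip', 'rb') as f:
    f.seek(entry['data_offset'])
    raw = f.read(entry['compress_size'])

# Stored entries are the bytes themselves; deflated entries need a raw inflate.
if entry['compress_type'] == zipfile.ZIP_DEFLATED:
    raw = zlib.decompress(raw, -15)
print(raw[:80])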
@@ -162,36 +59,43 @@ def run_pipeline(
     # set this here so it makes it into the metadata
     dataset.crawler_downloaded_date = datetime.now()

-    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
-        logger.info("- Creating archive...")
-        # set up paths
-        temp_dir = Path(temp_dir)
-        bag_dir = temp_dir / 'bag'
-        archive_path = temp_dir / 'archive.zip'
-
-        # download data with nabit
-        create_archive(bag_dir, dataset, signatures)
-
-        logger.info("- Zipping archive...")
-        # zip up data and create metadata
-        output_metadata = zip_archive(bag_dir, archive_path)
-
-        logger.info("- Moving files to final location...")
-        # Move files to final location
-        collection_path.parent.mkdir(parents=True, exist_ok=True)
-        metadata_path.parent.mkdir(parents=True, exist_ok=True)
-        os.rename(str(archive_path), collection_path)
-        metadata_path.write_text(json.dumps(output_metadata) + '\n')
-
-        if s3_path:
-            logger.info("Uploading to S3...")
-            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
-
-            if not no_delete:
-                logger.info("- Deleting zip archive...")
-                os.remove(collection_path)
-                if collection_path.parent.exists() and not os.listdir(collection_path.parent):
-                    os.rmdir(collection_path.parent)
+    def create_archive(temp_dir):
+        data_dict = model_to_dict(dataset)
+        for key, value in data_dict.items():
+            if isinstance(value, datetime):
+                data_dict[key] = value.isoformat()
+        data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
+        collect = [
+            *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
+        ]
+        logger.info(f" - Downloading {len(collect)} files")
+
+        # sort fields from dataset
+        data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
+        crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
+
+        return {
+            'collect': collect,
+            'signed_metadata': {
+                'id': str(uuid.uuid4()),
+                'url': data_gov_url,
+                'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
+                'data_gov_metadata': data_gov_metadata,
+                'crawler_metadata': crawler_metadata,
+            },
+        }
+
+    # Use common pipeline
+    fetch_and_upload(
+        output_path=output_path,
+        collection_path=collection_path,
+        metadata_path=metadata_path,
+        create_archive_callback=create_archive,
+        signatures=signatures,
+        session_args=session_args,
+        s3_path=s3_path,
+        no_delete=no_delete
+    )

     logger.info("- Setting crawler_downloaded_date...")
     db.connect()
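To make the refactor above easier to follow: each pipeline now supplies a callback that receives a scratch directory and returns the keyword arguments for nabit's package() call ('collect' tasks plus 'signed_metadata'), while scripts/helpers/bag.py handles bagging, zipping, moving, uploading, and cleanup. A minimal sketch of a conforming caller (the paths, URL, and file contents are placeholders, not values from this commit):

import uuid
from pathlib import Path
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload

def create_archive(temp_dir):
    # Write or download source files into temp_dir, then describe the bag.
    (Path(temp_dir) / 'example.txt').write_text('hello')
    return {
        'collect': [PathCollectionTask(path=temp_dir)],
        'signed_metadata': {'id': str(uuid.uuid4()), 'url': 'https://example.org'},
    }

# output_path must already exist; without s3_path the upload and cleanup steps are skipped.
fetch_and_upload(
    output_path=Path('data/processed'),
    collection_path=Path('data/processed/data/example/v1.zip'),
    metadata_path=Path('data/processed/metadata/example/v1.json'),
    create_archive_callback=create_archive,
)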
@@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int =
 @click.option('--signatures', help='JSON string of signature configuration.')
 @click.option('--profile', '-p', help='AWS profile name')
 @click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
-@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
-              help='Logging level.')
 @click.option('--stop-after', help='Stop after processing this many collections', type=int)
 @click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
 def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
-         if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
+         if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False):

     if dataset_name:
         workers = 1
@@ -151,17 +151,8 @@ def cli():
               help='Number of results to fetch per page.')
 @click.option('--start-date', '-s', type=str, default=None,
               help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='WARNING',
-              help='Logging level.')
-def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
+def fetch(output_path: Path, rows_per_page: int, start_date: str):
     """Fetch package data from data.gov API and save to database."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
     save_packages_to_database(output_path, rows_per_page, start_date)

 @cli.command()
@@ -22,19 +22,10 @@ def cli():
 @click.argument('output_path', type=click.Path(path_type=Path))
 @click.option('--rows-per-page', '-r', type=int, default=1000,
               help='Number of results to fetch per page.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
 @click.option('--start-date', '-s', type=str, default=None,
               help='Start date for fetching packages in YYYY-MM-DD format.')
 def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
     """Fetch all package data from data.gov API and save to gzipped JSONL file."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
     if output_path.is_dir():
         current_date = datetime.now().strftime('%Y%m%d')
         output_path = output_path / f'data_{current_date}.jsonl.gz'
@@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str
 @cli.command()
 @click.argument('file1', type=click.Path(exists=True, path_type=Path))
 @click.argument('file2', type=click.Path(exists=True, path_type=Path))
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
 def compare(file1: Path, file2: Path, log_level: str):
     """Compare two gzipped JSONL files by indexing on the 'name' key."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
-
     def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
         # Check for pickle file
         pickle_path = file_path.with_suffix('.pickle')
@@ -60,7 +60,7 @@ class Dataset(BaseModel):
     # fields starting with crawler_ are added by our crawler
     crawler_identified_date = DateTimeField(null=True, default=datetime.now)
     crawler_downloaded_date = DateTimeField(null=True)
-    crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)
+    crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True)


 class DatasetHistory(Dataset):
scripts/github/__init__.py (new empty file)
@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
 from gitspoke.cli import valid_include_items, get_token
 import os
 import json
+import uuid
 import requests
+from datetime import datetime
 from scripts.helpers.misc import load_config
+from nabit.lib.archive import package
+from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
+from nabit.lib.backends.path import PathCollectionTask
+from scripts.helpers.bag import fetch_and_upload

 logger = logging.getLogger(__name__)
 stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
             output_file.write(f"{repo_link}\n")
     return exists

-def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
+def run_pipeline(
+    org_name,
+    repo_name,
+    collection_path,
+    include,
+    token,
+    metadata_path=None,
+    output_path=None,
+    signatures=None,
+    session_args=None,
+    s3_path=None,
+    no_delete=False,
+    save_raw=False,
+    raw_dir=None,
+    check_exists=False,
+    output_deleted=None,
+):
     """Process a single repository."""
+    # existing checking mode
     if check_exists:
-        return check_repo_exists(org_name, repo_name, token, output_path)
+        return check_repo_exists(org_name, repo_name, token, output_deleted)

-    logger.info(f"Processing repository: {org_name}/{repo_name}")
-    Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
+    # raw saving mode
+    if save_raw:
+        raw_path = raw_dir / org_name / repo_name
+        Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
+        logger.info("Processing complete")
+        return
+
+    def create_archive_callback(temp_dir):
+        if raw_dir:
+            raw_path = raw_dir / org_name / repo_name
+            if raw_path.exists():
+                out_dir = raw_path
+        else:
+            Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
+            out_dir = temp_dir
+        return {
+            'collect': [
+                PathCollectionTask(path=out_dir)
+            ],
+            'signed_metadata': {
+                'id': str(uuid.uuid4()),
+                'url': f'https://github.com/{org_name}/{repo_name}',
+                'description': f'Archive of GitHub repository {org_name}/{repo_name}',
+                'github_metadata': {
+                    'org': org_name,
+                    'repo': repo_name,
+                    'archived_date': datetime.now().isoformat()
+                },
+            },
+        }
+
+    # Archive mode - use common pipeline
+    fetch_and_upload(
+        output_path=output_path,
+        collection_path=collection_path,
+        metadata_path=metadata_path,
+        create_archive_callback=create_archive_callback,
+        signatures=signatures,
+        session_args=session_args,
+        s3_path=s3_path,
+        no_delete=no_delete
+    )
+
     logger.info("Processing complete")

-def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
-              check_exists: bool = False):
+def get_tasks(
+    csv_path: Path,
+    output_path: Path,
+    skip_rows: int = 0,
+    skip_existing: bool = False,
+    stop_after: int = None,
+    include: str = None,
+    archive_mode: bool = False,
+    signatures: list = None,
+    session_args: dict = None,
+    s3_path: str = None,
+    no_delete: bool = False,
+    save_raw: bool = False,
+    raw_dir: Path = None,
+    check_exists: bool = False,
+    output_deleted: Path = None,
+):
     """Get repositories from CSV that haven't been processed yet."""
     # Initialize progress bars
     if not check_exists:
@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
             continue

         org_name, repo_name = row['html_url'].split('/')[-2:]
-        collection_path = output_path / 'collections' / collection / org_name / repo_name
-
-        if skip_existing:
-            if collection_path.exists():
-                stats_counter['skipped'].update(1)
-                continue
-            else:
-                stats_counter['yielded'].update(1)
+        collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
+        metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
+
+        if skip_existing and collection_path.exists():
+            stats_counter['skipped'].update(1)
+            continue
+        else:
+            stats_counter['yielded'].update(1)

         # use tokens round robin
         token = tokens[processed % len(tokens)]

-        yield org_name, repo_name, collection_path, include, token, check_exists, output_path
+        yield (
+            org_name,
+            repo_name,
+            collection_path,
+            include,
+            token,
+            output_deleted,
+            archive_mode,
+            metadata_path,
+            output_path,
+            signatures,
+            session_args,
+            s3_path,
+            save_raw,
+            raw_dir,
+            check_exists,
+            no_delete,
+        )

         processed += 1
         if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
     for counter in stats_counter.values():
         counter.close()


 @click.command()
 @click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
               help='Output path.')
-@click.option('--collection', '-c', type=str, default='github_raw',
-              help='Collection name.')
 @click.option('--workers', '-w', type=int, default=None,
               help='Number of worker processes. Defaults to CPU count.')
 @click.option('--skip-rows', type=int, default=0,
@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
               help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
 @click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
               help='Path to the CSV file.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default=None,
-              help='Logging level.')
 @click.option('--stop-after', help='Stop after processing this many repositories', type=int)
 @click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
+@click.option('--signatures', help='JSON string of signature configuration.')
+# upload settings
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
+@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
+# raw saving
+# useful if doing multiple runs with the same csv and different --include values
+@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
+@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
+# deletion checking
 @click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
-def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
-         log_level=None, stop_after=None, skip_existing=False, check_exists=False):
+@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
+def main(profile, workers, **kwargs):
+
+    session_args = {}
+    if profile:
+        session_args['profile_name'] = profile
+
+    if signatures := kwargs.get('signatures'):
+        signatures = json.loads(signatures)
+        for signature in signatures:
+            if signature['action'] == 'sign':
+                if is_encrypted_key(signature['params']['key']):
+                    signature['params']['password'] = click.prompt(
+                        f"Enter password for {signature['params']['key']}: ",
+                        hide_input=True
+                    )
+            elif signature['action'] == 'timestamp':
+                if known_tsa := signature.pop('known_tsa', None):
+                    signature['params'] = KNOWN_TSAS[known_tsa]
+        kwargs['signatures'] = signatures
+
     run_parallel(
         run_pipeline,
-        get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
+        get_tasks(**kwargs),
         workers,
-        log_level=log_level
     )

 if __name__ == "__main__":
scripts/helpers/bag.py (new file, 156 lines added)
@ -0,0 +1,156 @@
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import zipfile
|
||||||
|
import struct
|
||||||
|
import boto3
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
from nabit.lib.archive import package
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# File extensions that are already compressed or wouldn't benefit from additional compression
|
||||||
|
UNCOMPRESSED_EXTENSIONS = {
|
||||||
|
# Already compressed archives
|
||||||
|
'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
|
||||||
|
# Compressed images
|
||||||
|
'jpg', 'jpeg', 'png', 'gif', 'webp',
|
||||||
|
# Compressed video/audio
|
||||||
|
'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
|
||||||
|
# Other compressed/binary formats
|
||||||
|
'pdf', 'docx', 'xlsx', 'pptx',
|
||||||
|
}
|
||||||
|
|
||||||
|
def zip_archive(bag_dir, archive_path):
|
||||||
|
"""Zip up a nabit archive and create metadata."""
|
||||||
|
# Create zip archive
|
||||||
|
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||||
|
for file_path in bag_dir.rglob('*'):
|
||||||
|
if file_path.is_file():
|
||||||
|
arc_path = file_path.relative_to(bag_dir)
|
||||||
|
compression = (zipfile.ZIP_STORED
|
||||||
|
if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
|
||||||
|
else zipfile.ZIP_DEFLATED)
|
||||||
|
zf.write(file_path, arc_path, compress_type=compression)
|
||||||
|
|
||||||
|
# Create metadata file
|
||||||
|
zip_info = []
|
||||||
|
with zipfile.ZipFile(archive_path, 'r') as zf:
|
||||||
|
for info in zf.filelist:
|
||||||
|
header_offset = info.header_offset
|
||||||
|
|
||||||
|
# Read header to calculate data offset
|
||||||
|
zf.fp.seek(header_offset)
|
||||||
|
header = zf.fp.read(zipfile.sizeFileHeader)
|
||||||
|
fheader = struct.unpack(zipfile.structFileHeader, header)
|
||||||
|
fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
|
||||||
|
extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
|
||||||
|
data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
|
||||||
|
|
||||||
|
zip_info.append({
|
||||||
|
'filename': info.filename,
|
||||||
|
'file_size': info.file_size,
|
||||||
|
'compress_size': info.compress_size,
|
||||||
|
'compress_type': info.compress_type,
|
||||||
|
'header_offset': header_offset,
|
||||||
|
'data_offset': data_offset,
|
||||||
|
})
|
||||||
|
|
||||||
|
# Read the bag-info.txt and signed-metadata.json
|
||||||
|
bag_info = (bag_dir / 'bag-info.txt').read_text()
|
||||||
|
signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
|
||||||
|
|
||||||
|
return {
|
||||||
|
'bag_info': bag_info,
|
||||||
|
'signed_metadata': signed_metadata,
|
||||||
|
'zip_entries': zip_info
|
||||||
|
}
|
||||||
|
|
||||||
|
def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
|
||||||
|
"""Upload archive and metadata to S3."""
|
||||||
|
s3 = boto3.Session(**session_args).client('s3')
|
||||||
|
bucket_name, s3_key_prefix = s3_path.split('/', 1)
|
||||||
|
|
||||||
|
# Upload zip file
|
||||||
|
s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
|
||||||
|
s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
|
||||||
|
logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
|
||||||
|
|
||||||
|
# Upload metadata file
|
||||||
|
s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
|
||||||
|
s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
|
||||||
|
logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
|
||||||
|
|
||||||
|
def cleanup_files(collection_path, no_delete=False, s3_path=None):
|
||||||
|
"""Clean up local files after upload if needed."""
|
||||||
|
if not no_delete and s3_path:
|
||||||
|
logger.info("- Deleting local zip archive...")
|
||||||
|
if os.path.exists(collection_path):
|
||||||
|
os.remove(collection_path)
|
||||||
|
if collection_path.parent.exists() and not os.listdir(collection_path.parent):
|
||||||
|
os.rmdir(collection_path.parent)
|
||||||
|
|
||||||
|
def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")
        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")
        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")
        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

    if s3_path:
        logger.info("Uploading to S3...")
        upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

    cleanup_files(collection_path, no_delete, s3_path)
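A minimal usage sketch of this pipeline follows. The callback, paths, bucket, and session arguments are placeholders, and the keyword arguments a callback should return depend on what nabit's package() accepts in this codebase:

    from pathlib import Path

    def make_example_archive(source_files_dir: Path) -> dict:
        # Write the files to be bagged into source_files_dir, then return
        # any extra keyword arguments for package() (none in this sketch).
        (source_files_dir / 'example.txt').write_text('hello')
        return {}

    fetch_and_upload(
        output_path=Path('data/out'),
        collection_path=Path('data/out/example/archive.zip'),
        metadata_path=Path('data/out/example/archive.json'),
        create_archive_callback=make_example_archive,
        signatures=None,
        session_args={'profile_name': 'default'},  # hypothetical boto3 session args
        s3_path='my-bucket/collections/example',
        no_delete=True,
    )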

85  scripts/helpers/onepassword.py  Normal file
@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio

logger = logging.getLogger(__name__)


async def get_client():
    """Authenticate against 1Password with the service account token from config."""
    op_config = load_config().get("1password", {})
    if not op_config.get('token'):
        raise Exception("1Password token not found in config")

    return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")


def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    """Synchronous wrapper around save_item_async()."""
    return asyncio.run(save_item_async(vault_name, title, fields, notes, category))


async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    """Create an item with the given fields in the named vault and return it."""
    client = await get_client()
    vault_id = None
    vaults = await client.vaults.list_all()
    async for vault in vaults:
        if vault.title == vault_name:
            vault_id = vault.id
            break
    else:
        raise Exception(f"Vault {vault_name} not found")

    field_objs = []
    sections = []
    for field in fields:
        if field.get('concealed', False):
            field['field_type'] = ItemFieldType.CONCEALED
        else:
            field['field_type'] = ItemFieldType.TEXT
        field['id'] = field['title'].lower().replace(' ', '_')
        field_objs.append(ItemField(**field))
        if section_id := field.get('section_id'):
            if section_id not in sections:
                sections.append(section_id)
    sections = [ItemSection(id=section, title=section) for section in sections]

    # Create item parameters with sections
    create_params = ItemCreateParams(
        title=title,
        category=category,
        vault_id=vault_id,
        fields=field_objs,
        sections=sections,
    )

    if notes:
        create_params.notes = notes

    item = await client.items.create(create_params)
    logger.info(f"Stored credentials in 1Password vault '{vault_name}' with title '{title}'")
    return item


def share_item(
    item: Item,
    recipients: list[str] | None = None,
    expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
    one_time_only: bool = False
):
    """Synchronous wrapper around share_item_async()."""
    return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))


async def share_item_async(
    item: Item,
    recipients: list[str] | None,
    expire_after: ItemShareDuration | None,
    one_time_only: bool,
):
    """Create a share link for an existing item."""
    client = await get_client()
    policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
    valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
    share_params = ItemShareParams(
        recipients=valid_recipients,
        expire_after=expire_after,
        one_time_only=one_time_only
    )
    share_link = await client.items.shares.create(item, policy, share_params)
    logger.info(f"Created share link for '{item.title}'")
    return share_link
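A brief usage sketch of these 1Password helpers; the vault title, field values, and recipient address are placeholders, and it assumes load_config() provides the 1password.token entry that get_client() reads:

    from scripts.helpers.onepassword import save_item, share_item

    item = save_item(
        vault_name='Data Vault',  # placeholder vault title
        title='example-bucket credentials',
        fields=[
            {'title': 'Access Key ID', 'value': 'AKIAEXAMPLE', 'section_id': 's3'},
            {'title': 'Secret Access Key', 'value': 'example-secret', 'concealed': True, 'section_id': 's3'},
        ],
        notes='Created by the vault CLI.',
    )

    # Optionally create an expiring, one-time share link for a collaborator.
    share_link = share_item(item, recipients=['someone@example.org'], one_time_only=True)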
@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
             raise e


-def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
+def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
     workers = workers or os.cpu_count() or 4

-    # Configure logging based on whether we're running in parallel or not
-    if log_level is None:
-        log_level = 'INFO' if workers == 1 else 'WARNING'
-    logging.basicConfig(
-        level=log_level,
-        format='[%(process)d] %(message)s'
-    )
-
     logger.debug(f"Starting processing with {workers} workers")

     if workers > 1:

23  uv.lock  generated
@@ -169,6 +169,7 @@ dependencies = [
     { name = "httpx" },
     { name = "jsondiff" },
     { name = "nabit" },
+    { name = "onepassword-sdk" },
     { name = "orjson" },
     { name = "peewee" },
     { name = "publicsuffixlist" },
@@ -192,6 +193,7 @@ requires-dist = [
     { name = "httpx", specifier = ">=0.27.2" },
     { name = "jsondiff", specifier = ">=2.2.1" },
     { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
+    { name = "onepassword-sdk", specifier = ">=0.1.7" },
     { name = "orjson", specifier = ">=3.10.15" },
     { name = "peewee", specifier = ">=3.17.8" },
     { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
@@ -466,6 +468,27 @@ dependencies = [
     { name = "warcio" },
 ]

+[[package]]
+name = "onepassword-sdk"
+version = "0.1.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 },
+    { url = "https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 },
+    { url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 },
+    { url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 },
+    { url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 },
+    { url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 },
+    { url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 },
+    { url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 },
+    { url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 },
+    { url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 },
+]
+
 [[package]]
 name = "orderly-set"
 version = "5.3.0"