Refactoring, GitHub pipeline, S3 creation

Jack Cushman 2025-02-26 14:49:24 -05:00
parent a7c99e264d
commit b245fd44eb
21 changed files with 718 additions and 281 deletions


@@ -18,6 +18,7 @@ dependencies = [
     "cloudflare>=4.0.0",
     "deepdiff>=8.2.0",
     "orjson>=3.10.15",
+    "onepassword-sdk>=0.1.7",
 ]

 [build-system]

@@ -35,3 +36,6 @@ gitspoke = { git = "https://github.com/harvard-lil/gitspoke" }

 [tool.hatch.build.targets.wheel]
 packages = ["scripts"]
+
+[project.scripts]
+vault = "scripts.cli:cli"
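For context, the added [project.scripts] entry exposes a `vault` executable that resolves to the `cli` click group in scripts/cli.py. A minimal sketch of what that console script effectively runs (module path taken from the hunk above):

# sketch: equivalent of the generated `vault` console script
from scripts.cli import cli   # entry point "vault = scripts.cli:cli"

if __name__ == "__main__":
    cli()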


@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from data-mirror!"

scripts/cli.py (new file, 118 lines)

@@ -0,0 +1,118 @@
import click
import importlib
from pathlib import Path
import logging
import os
import sys

"""
Top level CLI that registers commands with this logic:
- Find all subdirectories WITH __init__.py
- Fetch a 'cli' click.group() from __init__.py if it exists, or create one
- For all python files in the subdirectory:
    - If the file defines a 'cli' click.group(), add it to the subdirectory's group
    - If the file defines a 'main' click.command(), add it to the subdirectory's group
- If any commands are added to the subdirectory's group, add the group to the main CLI
"""

def register_commands(cli: click.Group) -> None:
    """Find all command groups in the scripts directory."""
    scripts_dir = Path(__file__).parent

    # for each subdirectory, try to import subdir.__init__
    for subdir in scripts_dir.glob('**'):
        if not subdir.is_dir():
            continue

        # get group from __init__.py or create a new one
        subdir_import = f"scripts.{subdir.name}"
        try:
            init_module = importlib.import_module(subdir_import+'.__init__')
        except ImportError:
            continue
        if hasattr(init_module, 'cli'):
            group = init_module.cli
        else:
            @click.group()
            def group():
                pass

        # add commands from the subdirectory
        for item in subdir.glob('*.py'):
            if item.name == '__init__.py':
                continue
            file_import = f"{subdir_import}.{item.stem}"
            module = importlib.import_module(file_import)
            if type(getattr(module, 'cli', None)) == click.Group:
                group.add_command(module.cli, name=item.stem)
            elif type(getattr(module, 'main', None)) == click.Command:
                group.add_command(module.main, name=item.stem)

        # add the group to the main cli
        if group.commands:
            cli.add_command(group, name=subdir.name)

@click.group()
@click.option('--log-level', '-l',
              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
              default='WARNING',
              help='Logging level.')
def cli(log_level):
    """Main CLI entry point for the vault tool."""
    logging.basicConfig(level=log_level)

@cli.command()
@click.option('--shell', '-s', type=click.Choice(['bash', 'zsh', 'fish']),
              help='Shell to generate completion script for. Defaults to auto-detect.')
def completion(shell):
    """Install tab completion for the CLI.

    Examples:
        # Auto-detect shell and print instructions
        $ vault completion

        # Generate completion script for bash
        $ vault completion --shell=bash > ~/.vault-complete.bash

        # Generate completion script for zsh
        $ vault completion --shell=zsh > ~/.vault-complete.zsh

        # Generate completion script for fish
        $ vault completion --shell=fish > ~/.config/fish/completions/vault.fish
    """
    # Auto-detect shell if not specified
    if not shell:
        shell = os.environ.get('SHELL', '')
        if 'bash' in shell:
            shell = 'bash'
        elif 'zsh' in shell:
            shell = 'zsh'
        elif 'fish' in shell:
            shell = 'fish'
        else:
            click.echo("Could not auto-detect shell. Please specify with --shell option.")
            return

    # Get the script name (executable name)
    script_name = os.path.basename(sys.argv[0])
    env_var = f"_{script_name.replace('-', '_').upper()}_COMPLETE"

    if shell == 'bash':
        click.echo(f"# Save the completion script:")
        click.echo(f"{env_var}=bash_source {script_name} > ~/.{script_name}-complete.bash")
        click.echo("\n# Then add this line to your ~/.bashrc:")
        click.echo(f". ~/.{script_name}-complete.bash")
    elif shell == 'zsh':
        click.echo(f"# Save the completion script:")
        click.echo(f"{env_var}=zsh_source {script_name} > ~/.{script_name}-complete.zsh")
        click.echo("\n# Then add this line to your ~/.zshrc:")
        click.echo(f". ~/.{script_name}-complete.zsh")
    elif shell == 'fish':
        click.echo(f"# Save the completion script to the fish completions directory:")
        click.echo(f"{env_var}=fish_source {script_name} > ~/.config/fish/completions/{script_name}.fish")

register_commands(cli)

if __name__ == "__main__":
    cli()
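To illustrate the registration convention above, here is a hypothetical subpackage layout (the `example` and `task` names are invented for illustration) showing how a `main` command would surface as `vault example task`:

# hypothetical scripts/example/__init__.py -- optional; register_commands creates a group if no 'cli' is defined
import click

@click.group()
def cli():
    """Example command group."""

# hypothetical scripts/example/task.py -- picked up because it defines a 'main' click.Command
import click

@click.command()
@click.argument('name')
def main(name):
    """Runs as `vault example task NAME`."""
    click.echo(f"Hello, {name}!")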


@@ -1,2 +0,0 @@
-def hello() -> str:
-    return "Hello from data-mirror!"


@@ -54,15 +54,9 @@ def cli():
              help='Key prefixes to restrict access to. Can be specified multiple times.')
@click.option('--objects', '-o', multiple=True,
              help='Specific object keys to restrict access to. Can be specified multiple times.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
def generate_key(bucket: str, permission: str, ttl: int, prefixes: tuple[str, ...],
                 objects: tuple[str, ...], log_level: str):
    """Generate temporary Cloudflare R2 access credentials."""
-    # Setup logging
-    logging.basicConfig(level=log_level)
-
    # Load config
    config = load_config().get("temp_tokens", {})


@@ -74,18 +74,10 @@ def render_html(datasets_query, output_path: Path) -> None:
@click.command()
@click.argument('db_path', type=click.Path(path_type=Path), default='data/data.db')
@click.argument('output_path', type=click.Path(path_type=Path), default='data/processed/web')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
@click.option('--limit', '-n', type=int, default=None,
              help='Maximum number of rows to display. Default: all rows.')
-def main(db_path: Path, output_path: Path, log_level: str, limit: int | None):
+def main(db_path: Path, output_path: Path, limit: int | None):
    """Render the Dataset table to an HTML file."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    logger.info(f"Connecting to database at {db_path}")
    db.init(db_path)


@@ -2,14 +2,12 @@ import boto3
import click
from tqdm import tqdm
import logging
-from itertools import islice
import json
-import gzip
-from io import BytesIO
import tempfile
import os
from scripts.helpers.misc import json_default
import zipfile
+from scripts.helpers.onepassword import save_item, share_item

logger = logging.getLogger(__name__)
@@ -122,11 +120,8 @@ def cli():
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+def undelete(s3_path: str, profile: str = None, dry_run: bool = False):
    """Remove delete markers from versioned S3 objects, effectively undeleting them."""
-    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)
    session = boto3.Session(profile_name=profile)
@@ -138,11 +133,8 @@ def undelete(s3_path: str, profile: str = None, dry_run: bool = False, log_level
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--dry-run', is_flag=True, help='Show what would be done without actually doing it')
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_level: str = 'INFO'):
+def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False):
    """Delete all zero-size objects under the given prefix."""
-    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)
    session = boto3.Session(profile_name=profile)
@@ -154,11 +146,8 @@ def delete_empty(s3_path: str, profile: str = None, dry_run: bool = False, log_l
@click.argument('s3_path')
@click.option('--profile', help='AWS profile name', default='sc-direct')
@click.option('--output', '-o', help='Output path for index file', default=None)
-@click.option('--log-level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO', help='Set logging level')
-def write_index(s3_path: str, profile: str = None, output: str | None = None, log_level: str = 'INFO'):
+def write_index(s3_path: str, profile: str = None, output: str | None = None):
    """Write a JSONL index of all files under the given prefix."""
-    logging.basicConfig(level=log_level)
    bucket, prefix = s3_path.split('/', 1)

    if output is None:
@@ -169,6 +158,117 @@ def write_index(s3_path: str, profile: str = None, output: str | None = None, lo
    write_file_listing(s3_client, bucket, prefix, output)

+@cli.command()
+@click.argument('bucket_name')
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--region', '-r', help='AWS region', default='us-east-1')
+@click.option('--tag', '-t', help='Tag the bucket with a name', default=None)
+def create_bucket(bucket_name: str, profile: str = None, region: str = 'us-east-1', tag: str | None = None):
+    """Create a new S3 bucket with versioning enabled by default."""
+    session = boto3.Session(profile_name=profile)
+    s3_client = session.client('s3')
+
+    # Ensure bucket exists
+    try:
+        if region == 'us-east-1':
+            s3_client.create_bucket(Bucket=bucket_name)
+        else:
+            s3_client.create_bucket(
+                Bucket=bucket_name,
+                CreateBucketConfiguration={'LocationConstraint': region},
+            )
+    except s3_client.exceptions.BucketAlreadyExists:
+        logger.warning(f"Bucket {bucket_name} already exists. Updating settings.")
+
+    # Configure bucket
+    s3_client.put_bucket_versioning(
+        Bucket=bucket_name,
+        VersioningConfiguration={'Status': 'Enabled'}
+    )
+
+    logger.info(f"Ensured bucket {bucket_name} exists with versioning enabled")
+
+@cli.command()
+@click.argument('bucket_name')
+@click.argument('username')
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--permissions-boundary', '-b', help='ARN of the permissions boundary policy')
+@click.option('--op-vault', help='1Password vault to store credentials in', default='Private')
+@click.option('--op-share', help='Share the credentials with the given email', default=None)
+def create_user(bucket_name: str, username: str, profile: str, permissions_boundary: str, op_vault: str, op_share: str | None):
+    """Generate temporary S3 credentials with read/write/list access for a specific bucket."""
+    session = boto3.Session(profile_name=profile)
+    iam_client = session.client('iam')
+
+    # Define inline policy for bucket access
+    bucket_policy = {
+        "Version": "2012-10-17",
+        "Statement": [{
+            "Effect": "Allow",
+            "Action": [
+                "s3:GetObject",
+                "s3:PutObject",
+                "s3:DeleteObject",
+                "s3:ListBucket"
+            ],
+            "Resource": [
+                f"arn:aws:s3:::{bucket_name}",
+                f"arn:aws:s3:::{bucket_name}/*"
+            ]
+        }]
+    }
+
+    # Create the IAM user with permissions boundary
+    try:
+        iam_client.create_user(
+            UserName=username,
+            PermissionsBoundary=permissions_boundary
+        )
+        logger.info(f"Created IAM user: {username}")
+    except iam_client.exceptions.EntityAlreadyExistsException:
+        logger.warning(f"User {username} already exists")
+
+    # Attach inline policy directly to user
+    iam_client.put_user_policy(
+        UserName=username,
+        PolicyName=f"{bucket_name}-access",
+        PolicyDocument=json.dumps(bucket_policy)
+    )
+    logger.info(f"Attached bucket access policy to user {username}")
+
+    # Create access key for the user
+    response = iam_client.create_access_key(UserName=username)
+    credentials = response['AccessKey']
+
+    # Output the credentials
+    click.echo(f"AWS_ACCESS_KEY_ID={credentials['AccessKeyId']}")
+    click.echo(f"AWS_SECRET_ACCESS_KEY={credentials['SecretAccessKey']}")
+
+    # Save credentials to 1Password if requested
+    if op_vault:
+        item = save_item(op_vault, f"{username} S3 Credentials for {bucket_name}", [
+            {
+                'title': 'Access Key ID',
+                'value': credentials['AccessKeyId'],
+                'section_id': 's3_details'
+            },
+            {
+                'title': 'Secret Access Key',
+                'value': credentials['SecretAccessKey'],
+                'section_id': 's3_details'
+            },
+            {
+                'title': 'S3 Bucket',
+                'value': bucket_name,
+                'section_id': 's3_details'
+            },
+        ])
+        if op_share:
+            share_link = share_item(item, [op_share])
+            click.echo(f"To share credentials with {op_share}, use the following link: {share_link}")

if __name__ == '__main__':
    cli()
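A hedged sanity-check sketch for the new create_user flow: after the command prints the access key pair, the credentials could be verified with a standalone boto3 session. The key values and bucket name below are placeholders, not values from this commit.

import boto3

# placeholders -- substitute the values printed by `create_user`
access_key_id = "AKIA..."
secret_access_key = "..."
bucket = "example-vault-bucket"

s3 = boto3.Session(
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
).client('s3')

# the inline policy above grants GetObject/PutObject/DeleteObject/ListBucket on this bucket only
s3.put_object(Bucket=bucket, Key="healthcheck.txt", Body=b"ok")
print([obj['Key'] for obj in s3.list_objects_v2(Bucket=bucket).get('Contents', [])])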


@@ -73,18 +73,8 @@ def verify_dataset(json_url: str, zip_url: str, output_dir: Path | None = None):
@click.argument('zip_url', type=str)
@click.option('--output', '-o', type=click.Path(path_type=Path),
              help='Directory to write uncompressed files')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
-def main(json_url: str, zip_url: str, output: Path = None, log_level: str = 'INFO'):
+def main(json_url: str, zip_url: str, output: Path = None):
    """Verify dataset from JSON and ZIP URLs"""
-    # Set up logging
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    verify_dataset(json_url, zip_url, output)

if __name__ == '__main__':


@@ -61,17 +61,8 @@ def find_differences(csv_data: Dict[str, dict],
@click.option('--compare-by', '-c',
              default='id',
              help='Field to compare by.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
def main(old_path: Path, new_path: Path, compare_by: str, log_level: str):
    """Compare records between CSV and JSONL files."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    old_data = load_jsonl_data(old_path, compare_by=compare_by)
    new_data = load_jsonl_data(new_path, compare_by=compare_by)


@@ -3,36 +3,38 @@ from collections import Counter, defaultdict
from pathlib import Path

-# Read the JSONL file and count crawler_identified_date values
-downloaded_counts = Counter()
-identified_counts = Counter()
-titles_by_org = defaultdict(list)
-with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
-    for line in f:
-        data = json.loads(line)
-        org = json.loads(data.get('organization', '{}'))
-        identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
-        titles_by_org[org['title']].append(data["title"])
-
-# Print the counts sorted by date
-for date, count in sorted(identified_counts.items()):
-    print(f"{date}: {count}")
-
-# sort each list of titles by org
-for org, titles in titles_by_org.items():
-    titles_by_org[org].sort()
-Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
-
-# print urls
-for path in Path('data/').glob('glass*'):
-    print(path)
-    with open(path, 'r') as f:
-        for line in f:
-            data = json.loads(line)
-            print("* " + data['name'])
-            resources = data.get('resources', [])
-            if type(resources) == str:
-                resources = json.loads(resources)
-            for resource in resources:
-                print(' * ' + resource['url'])
+if __name__ == "__main__":
+    # Read the JSONL file and count crawler_identified_date values
+    downloaded_counts = Counter()
+    identified_counts = Counter()
+    titles_by_org = defaultdict(list)
+    with open('data/data_db_dump_20250130.only_name.jsonl', 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            org = json.loads(data.get('organization', '{}'))
+            identified_counts[(data.get('crawler_identified_date') or '')[:10]] += 1
+            titles_by_org[org['title']].append(data["title"])
+
+    # Print the counts sorted by date
+    for date, count in sorted(identified_counts.items()):
+        print(f"{date}: {count}")
+
+    # sort each list of titles by org
+    for org, titles in titles_by_org.items():
+        titles_by_org[org].sort()
+    Path('data/titles_by_org.json').write_text(json.dumps(titles_by_org, indent=2))
+
+    # print urls
+    for path in Path('data/').glob('glass*'):
+        print(path)
+        with open(path, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                print("* " + data['name'])
+                resources = data.get('resources', [])
+                if type(resources) == str:
+                    resources = json.loads(resources)
+                for resource in resources:
+                    print(' * ' + resource['url'])


@@ -10,31 +10,17 @@ import os
from urllib.parse import urlparse
import re
from scripts.helpers.parallel import run_parallel
-import zipfile
-import struct
-import boto3
import logging
from scripts.data_gov.models import db, Dataset
from playhouse.shortcuts import model_to_dict
from tqdm import tqdm
from datetime import datetime
+from scripts.helpers.bag import zip_archive, upload_archive, cleanup_files, fetch_and_upload

logger = logging.getLogger(__name__)

## download data.gov datasets, create nabit archives, and upload to S3

-# File extensions that are already compressed or wouldn't benefit from additional compression
-UNCOMPRESSED_EXTENSIONS = {
-    # Already compressed archives
-    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
-    # Compressed images
-    'jpg', 'jpeg', 'png', 'gif', 'webp',
-    # Compressed video/audio
-    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
-    # Other compressed/binary formats
-    'pdf', 'docx', 'xlsx', 'pptx',
-}
-
stats_counter = {}

def is_valid_url(url):
@@ -55,95 +41,6 @@ def extract_urls(data, urls = None):
            extract_urls(item, urls)
    return urls

-def create_archive(bag_dir, dataset: Dataset, signatures):
-    data_dict = model_to_dict(dataset)
-    for key, value in data_dict.items():
-        if isinstance(value, datetime):
-            data_dict[key] = value.isoformat()
-    data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
-    collect = [
-        *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
-    ]
-    logger.info(f" - Downloading {len(collect)} files")
-
-    # sort fields from dataset
-    data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
-    crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
-
-    # Create the archive
-    package(
-        output_path=bag_dir,
-        collect=collect,
-        collect_errors='ignore',
-        signed_metadata={
-            'id': str(uuid.uuid4()),
-            'url': data_gov_url,
-            'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
-            'data_gov_metadata': data_gov_metadata,
-            'crawler_metadata': crawler_metadata,
-        },
-        signatures=signatures,
-    )
-
-def zip_archive(bag_dir, archive_path):
-    # Create zip archive
-    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
-        for file_path in bag_dir.rglob('*'):
-            if file_path.is_file():
-                arc_path = file_path.relative_to(bag_dir)
-                compression = (zipfile.ZIP_STORED
-                    if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
-                    else zipfile.ZIP_DEFLATED)
-                zf.write(file_path, arc_path, compress_type=compression)
-
-    # Create metadata file
-    zip_info = []
-    with zipfile.ZipFile(archive_path, 'r') as zf:
-        for info in zf.filelist:
-            header_offset = info.header_offset
-
-            # Read header to calculate data offset
-            zf.fp.seek(header_offset)
-            header = zf.fp.read(zipfile.sizeFileHeader)
-            fheader = struct.unpack(zipfile.structFileHeader, header)
-            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
-            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
-            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length
-
-            zip_info.append({
-                'filename': info.filename,
-                'file_size': info.file_size,
-                'compress_size': info.compress_size,
-                'compress_type': info.compress_type,
-                'header_offset': header_offset,
-                'data_offset': data_offset,
-            })
-
-    # Read the bag-info.txt and signed-metadata.json
-    bag_info = (bag_dir / 'bag-info.txt').read_text()
-    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())
-
-    return {
-        'bag_info': bag_info,
-        'signed_metadata': signed_metadata,
-        'zip_entries': zip_info
-    }
-
-def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
-    s3 = boto3.Session(**session_args).client('s3')
-    bucket_name, s3_path = s3_path.split('/', 1)
-
-    # Upload zip file
-    s3_collection_key = os.path.join(s3_path, str(collection_path.relative_to(output_path)))
-    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
-    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")
-
-    # Upload metadata file
-    s3_metadata_key = os.path.join(s3_path, str(metadata_path.relative_to(output_path)))
-    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
-    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")
-
def run_pipeline(
    dataset: Dataset,
    output_path: Path,
@@ -162,36 +59,43 @@ def run_pipeline(
    # set this here so it makes it into the metadata
    dataset.crawler_downloaded_date = datetime.now()

-    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
-        logger.info("- Creating archive...")
-        # set up paths
-        temp_dir = Path(temp_dir)
-        bag_dir = temp_dir / 'bag'
-        archive_path = temp_dir / 'archive.zip'
-
-        # download data with nabit
-        create_archive(bag_dir, dataset, signatures)
-
-        logger.info("- Zipping archive...")
-        # zip up data and create metadata
-        output_metadata = zip_archive(bag_dir, archive_path)
-
-        logger.info("- Moving files to final location...")
-        # Move files to final location
-        collection_path.parent.mkdir(parents=True, exist_ok=True)
-        metadata_path.parent.mkdir(parents=True, exist_ok=True)
-        os.rename(str(archive_path), collection_path)
-        metadata_path.write_text(json.dumps(output_metadata) + '\n')
-
-        if s3_path:
-            logger.info("Uploading to S3...")
-            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)
-
-            if not no_delete:
-                logger.info("- Deleting zip archive...")
-                os.remove(collection_path)
-                if collection_path.parent.exists() and not os.listdir(collection_path.parent):
-                    os.rmdir(collection_path.parent)
+    def create_archive(temp_dir):
+        data_dict = model_to_dict(dataset)
+        for key, value in data_dict.items():
+            if isinstance(value, datetime):
+                data_dict[key] = value.isoformat()
+        data_gov_url = f'https://catalog.data.gov/dataset/{dataset.name}'
+        collect = [
+            *[UrlCollectionTask(url=url) for url in extract_urls(data_dict)],
+        ]
+        logger.info(f" - Downloading {len(collect)} files")
+
+        # sort fields from dataset
+        data_gov_metadata = {k: v for k, v in data_dict.items() if not k.startswith('crawler_')}
+        crawler_metadata = {k: v for k, v in data_dict.items() if k.startswith('crawler_')}
+
+        return {
+            'collect': collect,
+            'signed_metadata': {
+                'id': str(uuid.uuid4()),
+                'url': data_gov_url,
+                'description': f'Archive of data.gov dataset "{dataset.title}" created by {dataset.organization["title"]}. Full metadata stored in data_gov_metadata key.',
+                'data_gov_metadata': data_gov_metadata,
+                'crawler_metadata': crawler_metadata,
+            },
+        }
+
+    # Use common pipeline
+    fetch_and_upload(
+        output_path=output_path,
+        collection_path=collection_path,
+        metadata_path=metadata_path,
+        create_archive_callback=create_archive,
+        signatures=signatures,
+        session_args=session_args,
+        s3_path=s3_path,
+        no_delete=no_delete
+    )

    logger.info("- Setting crawler_downloaded_date...")
    db.connect()
@@ -244,12 +148,10 @@ def get_unprocessed_datasets(output_path: Path, collection: str, min_size: int =
@click.option('--signatures', help='JSON string of signature configuration.')
@click.option('--profile', '-p', help='AWS profile name')
@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
-@click.option('--log-level', '-l', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']), default=None,
-              help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many collections', type=int)
@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
def main(db_path: Path, output_path: Path, collection: str, workers=None, min_size=0, dataset_name=None,
-         if_exists='skip', signatures=None, profile=None, s3_path=None, log_level=None, stop_after=None, no_delete=False):
+         if_exists='skip', signatures=None, profile=None, s3_path=None, stop_after=None, no_delete=False):

    if dataset_name:
        workers = 1


@@ -151,17 +151,8 @@ def cli():
              help='Number of results to fetch per page.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Date to start fetching from (format: YYYY-MM-DDTHH:MM:SS.mmmmmm)')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='WARNING',
-              help='Logging level.')
-def fetch(output_path: Path, rows_per_page: int, start_date: str, log_level: str):
+def fetch(output_path: Path, rows_per_page: int, start_date: str):
    """Fetch package data from data.gov API and save to database."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    save_packages_to_database(output_path, rows_per_page, start_date)

@cli.command()


@@ -22,19 +22,10 @@ def cli():
@click.argument('output_path', type=click.Path(path_type=Path))
@click.option('--rows-per-page', '-r', type=int, default=1000,
              help='Number of results to fetch per page.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
@click.option('--start-date', '-s', type=str, default=None,
              help='Start date for fetching packages in YYYY-MM-DD format.')
def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str):
    """Fetch all package data from data.gov API and save to gzipped JSONL file."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    if output_path.is_dir():
        current_date = datetime.now().strftime('%Y%m%d')
        output_path = output_path / f'data_{current_date}.jsonl.gz'
@@ -49,17 +40,8 @@ def fetch(output_path: Path, rows_per_page: int, log_level: str, start_date: str
@cli.command()
@click.argument('file1', type=click.Path(exists=True, path_type=Path))
@click.argument('file2', type=click.Path(exists=True, path_type=Path))
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default='INFO',
-              help='Logging level.')
def compare(file1: Path, file2: Path, log_level: str):
    """Compare two gzipped JSONL files by indexing on the 'name' key."""
-    logging.basicConfig(
-        level=getattr(logging, log_level),
-        format='%(asctime)s - %(levelname)s - %(message)s'
-    )
    def load_jsonl_index(file_path: Path) -> Dict[str, Any]:
        # Check for pickle file
        pickle_path = file_path.with_suffix('.pickle')


@@ -60,7 +60,7 @@ class Dataset(BaseModel):
    # fields starting with crawler_ are added by our crawler
    crawler_identified_date = DateTimeField(null=True, default=datetime.now)
    crawler_downloaded_date = DateTimeField(null=True)
-    crawler_last_crawl_id = ForeignKeyField('Crawl', backref='datasets', null=True)
+    crawler_last_crawl_id = ForeignKeyField(Crawl, backref='datasets', null=True)

class DatasetHistory(Dataset):


@@ -8,8 +8,14 @@ from gitspoke import Downloader, GitHubAPI
from gitspoke.cli import valid_include_items, get_token
import os
import json
+import uuid
import requests
+from datetime import datetime
from scripts.helpers.misc import load_config
+from nabit.lib.archive import package
+from nabit.lib.sign import KNOWN_TSAS, is_encrypted_key
+from nabit.lib.backends.path import PathCollectionTask
+from scripts.helpers.bag import fetch_and_upload

logger = logging.getLogger(__name__)

stats_counter = {}
@@ -34,17 +40,90 @@ def check_repo_exists(org_name, repo_name, token, output_path=None):
            output_file.write(f"{repo_link}\n")
    return exists

-def run_pipeline(org_name, repo_name, collection_path, include, token, check_exists=False, output_path=None):
+def run_pipeline(
+    org_name,
+    repo_name,
+    collection_path,
+    include,
+    token,
+    metadata_path=None,
+    output_path=None,
+    signatures=None,
+    session_args=None,
+    s3_path=None,
+    no_delete=False,
+    save_raw=False,
+    raw_dir=None,
+    check_exists=False,
+    output_deleted=None,
+):
    """Process a single repository."""
+    # existing checking mode
    if check_exists:
-        return check_repo_exists(org_name, repo_name, token, output_path)
+        return check_repo_exists(org_name, repo_name, token, output_deleted)

-    logger.info(f"Processing repository: {org_name}/{repo_name}")
-    Downloader(org_name, repo_name, token, max_retries=20).download_repo(collection_path, include=include)
+    # raw saving mode
+    if save_raw:
+        raw_path = raw_dir / org_name / repo_name
+        Downloader(org_name, repo_name, token, max_retries=20).download_repo(raw_path, include=include)
+        logger.info("Processing complete")
+        return
+
+    def create_archive_callback(temp_dir):
+        if raw_dir:
+            raw_path = raw_dir / org_name / repo_name
+            if raw_path.exists():
+                out_dir = raw_path
+        else:
+            Downloader(org_name, repo_name, token, max_retries=20).download_repo(temp_dir, include=include)
+            out_dir = temp_dir
+        return {
+            'collect': [
+                PathCollectionTask(path=out_dir)
+            ],
+            'signed_metadata': {
+                'id': str(uuid.uuid4()),
+                'url': f'https://github.com/{org_name}/{repo_name}',
+                'description': f'Archive of GitHub repository {org_name}/{repo_name}',
+                'github_metadata': {
+                    'org': org_name,
+                    'repo': repo_name,
+                    'archived_date': datetime.now().isoformat()
+                },
+            },
+        }
+
+    # Archive mode - use common pipeline
+    fetch_and_upload(
+        output_path=output_path,
+        collection_path=collection_path,
+        metadata_path=metadata_path,
+        create_archive_callback=create_archive_callback,
+        signatures=signatures,
+        session_args=session_args,
+        s3_path=s3_path,
+        no_delete=no_delete
+    )

    logger.info("Processing complete")
-def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int = 0, skip_existing: bool = False, stop_after: int = None, include: str = None,
-              check_exists: bool = False):
+def get_tasks(
+    csv_path: Path,
+    output_path: Path,
+    skip_rows: int = 0,
+    skip_existing: bool = False,
+    stop_after: int = None,
+    include: str = None,
+    archive_mode: bool = False,
+    signatures: list = None,
+    session_args: dict = None,
+    s3_path: str = None,
+    no_delete: bool = False,
+    save_raw: bool = False,
+    raw_dir: Path = None,
+    check_exists: bool = False,
+    output_deleted: Path = None,
+):
    """Get repositories from CSV that haven't been processed yet."""
    # Initialize progress bars
    if not check_exists:

@@ -85,19 +164,37 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
                continue

            org_name, repo_name = row['html_url'].split('/')[-2:]
-            collection_path = output_path / 'collections' / collection / org_name / repo_name
-            if skip_existing:
-                if collection_path.exists():
-                    stats_counter['skipped'].update(1)
-                    continue
-                else:
-                    stats_counter['yielded'].update(1)
-            else:
-                stats_counter['yielded'].update(1)
+
+            collection_path = output_path / 'data' / org_name / repo_name / 'v1.zip'
+            metadata_path = output_path / 'metadata' / org_name / repo_name / 'v1.json'
+
+            if skip_existing and collection_path.exists():
+                stats_counter['skipped'].update(1)
+                continue
+            else:
+                stats_counter['yielded'].update(1)

            # use tokens round robin
            token = tokens[processed % len(tokens)]
-            yield org_name, repo_name, collection_path, include, token, check_exists, output_path
+            yield (
+                org_name,
+                repo_name,
+                collection_path,
+                include,
+                token,
+                output_deleted,
+                archive_mode,
+                metadata_path,
+                output_path,
+                signatures,
+                session_args,
+                s3_path,
+                save_raw,
+                raw_dir,
+                check_exists,
+                no_delete,
+            )

            processed += 1

            if stop_after and processed >= stop_after:
@@ -107,11 +204,10 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
    for counter in stats_counter.values():
        counter.close()

@click.command()
@click.option('--output-path', '-o', type=click.Path(path_type=Path), default='data/processed',
              help='Output path.')
-@click.option('--collection', '-c', type=str, default='github_raw',
-              help='Collection name.')
@click.option('--workers', '-w', type=int, default=None,
              help='Number of worker processes. Defaults to CPU count.')
@click.option('--skip-rows', type=int, default=0,

@@ -120,21 +216,44 @@ def get_tasks(csv_path: Path, output_path: Path, collection: str, skip_rows: int
              help='Comma-separated list of elements to include: ' + ', '.join(valid_include_items))
@click.option('--csv-path', '-csv', type=click.Path(path_type=Path), default='data/repos_by_cumulative_popularity.csv',
              help='Path to the CSV file.')
-@click.option('--log-level', '-l',
-              type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']),
-              default=None,
-              help='Logging level.')
@click.option('--stop-after', help='Stop after processing this many repositories', type=int)
@click.option('--skip-existing', is_flag=True, help='Set to skip existing repositories')
+@click.option('--signatures', help='JSON string of signature configuration.')
+# upload settings
+@click.option('--profile', '-p', help='AWS profile name')
+@click.option('--s3-path', '-s', help='S3 path for uploads, e.g. "<bucket_name>/<path>"')
+@click.option('--no-delete', is_flag=True, help='Set to preserve zipped data on disk as well as metadata')
+# raw saving
+# useful if doing multiple runs with the same csv and different --include values
+@click.option('--save-raw', is_flag=True, help='Save raw repositories to disk rather than bagging and uploading')
+@click.option('--raw-dir', type=click.Path(path_type=Path), help='Directory to save raw repositories to')
+# deletion checking
@click.option('--check-exists', is_flag=True, help='Only check if repositories still exist on GitHub')
-def main(csv_path: Path, output_path: Path, collection: str, workers=None, skip_rows=0, include=None,
-         log_level=None, stop_after=None, skip_existing=False, check_exists=False):
+@click.option('--output-deleted', type=click.Path(path_type=Path), help='File to output deleted repositories to')
+def main(profile, workers, **kwargs):
+    session_args = {}
+    if profile:
+        session_args['profile_name'] = profile
+    if signatures := kwargs.get('signatures'):
+        signatures = json.loads(signatures)
+        for signature in signatures:
+            if signature['action'] == 'sign':
+                if is_encrypted_key(signature['params']['key']):
+                    signature['params']['password'] = click.prompt(
+                        f"Enter password for {signature['params']['key']}: ",
+                        hide_input=True
+                    )
+            elif signature['action'] == 'timestamp':
+                if known_tsa := signature.pop('known_tsa', None):
+                    signature['params'] = KNOWN_TSAS[known_tsa]
+        kwargs['signatures'] = signatures
+
    run_parallel(
        run_pipeline,
-        get_tasks(csv_path, output_path, collection, skip_rows, skip_existing, stop_after, include, check_exists),
+        get_tasks(**kwargs),
        workers,
-        log_level=log_level
    )

if __name__ == "__main__":
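For reference, the signature handling in main() above implies a --signatures payload shaped roughly like the sketch below; the key path and TSA name are illustrative guesses, not values from this commit. Encrypted signing keys trigger a password prompt via is_encrypted_key, and a timestamp entry with known_tsa is expanded from KNOWN_TSAS.

# illustrative --signatures payload (path and TSA name are hypothetical)
signatures = [
    {"action": "sign", "params": {"key": "certs/archive-signing-key.pem"}},
    {"action": "timestamp", "known_tsa": "digicert"},  # params looked up in KNOWN_TSAS
]
# passed on the command line as a JSON string:
#   --signatures '[{"action": "sign", "params": {"key": "certs/archive-signing-key.pem"}},
#                  {"action": "timestamp", "known_tsa": "digicert"}]'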

scripts/helpers/bag.py (new file, 156 lines)

@@ -0,0 +1,156 @@
import os
import json
import zipfile
import struct
import boto3
import logging
from pathlib import Path
from datetime import datetime
import tempfile
import shutil
from nabit.lib.archive import package

logger = logging.getLogger(__name__)

# File extensions that are already compressed or wouldn't benefit from additional compression
UNCOMPRESSED_EXTENSIONS = {
    # Already compressed archives
    'zip', 'gz', 'tgz', 'bz2', '7z', 'rar', 'xz',
    # Compressed images
    'jpg', 'jpeg', 'png', 'gif', 'webp',
    # Compressed video/audio
    'mp4', 'mov', 'avi', 'wmv', 'ogv', 'mp3', 'm4a',
    # Other compressed/binary formats
    'pdf', 'docx', 'xlsx', 'pptx',
}

def zip_archive(bag_dir, archive_path):
    """Zip up a nabit archive and create metadata."""
    # Create zip archive
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_path in bag_dir.rglob('*'):
            if file_path.is_file():
                arc_path = file_path.relative_to(bag_dir)
                compression = (zipfile.ZIP_STORED
                    if file_path.suffix.lower().lstrip('.') in UNCOMPRESSED_EXTENSIONS
                    else zipfile.ZIP_DEFLATED)
                zf.write(file_path, arc_path, compress_type=compression)

    # Create metadata file
    zip_info = []
    with zipfile.ZipFile(archive_path, 'r') as zf:
        for info in zf.filelist:
            header_offset = info.header_offset

            # Read header to calculate data offset
            zf.fp.seek(header_offset)
            header = zf.fp.read(zipfile.sizeFileHeader)
            fheader = struct.unpack(zipfile.structFileHeader, header)
            fname_length = fheader[zipfile._FH_FILENAME_LENGTH]
            extra_length = fheader[zipfile._FH_EXTRA_FIELD_LENGTH]
            data_offset = header_offset + zipfile.sizeFileHeader + fname_length + extra_length

            zip_info.append({
                'filename': info.filename,
                'file_size': info.file_size,
                'compress_size': info.compress_size,
                'compress_type': info.compress_type,
                'header_offset': header_offset,
                'data_offset': data_offset,
            })

    # Read the bag-info.txt and signed-metadata.json
    bag_info = (bag_dir / 'bag-info.txt').read_text()
    signed_metadata = json.loads((bag_dir / 'data/signed-metadata.json').read_text())

    return {
        'bag_info': bag_info,
        'signed_metadata': signed_metadata,
        'zip_entries': zip_info
    }

def upload_archive(output_path, collection_path, metadata_path, s3_path, session_args):
    """Upload archive and metadata to S3."""
    s3 = boto3.Session(**session_args).client('s3')
    bucket_name, s3_key_prefix = s3_path.split('/', 1)

    # Upload zip file
    s3_collection_key = os.path.join(s3_key_prefix, str(collection_path.relative_to(output_path)))
    s3.upload_file(str(collection_path), bucket_name, s3_collection_key)
    logger.info(f" - Uploaded {collection_path.relative_to(output_path)} to {s3_collection_key}")

    # Upload metadata file
    s3_metadata_key = os.path.join(s3_key_prefix, str(metadata_path.relative_to(output_path)))
    s3.upload_file(str(metadata_path), bucket_name, s3_metadata_key)
    logger.info(f" - Uploaded {metadata_path.relative_to(output_path)} to {s3_metadata_key}")

def cleanup_files(collection_path, no_delete=False, s3_path=None):
    """Clean up local files after upload if needed."""
    if not no_delete and s3_path:
        logger.info("- Deleting local zip archive...")
        if os.path.exists(collection_path):
            os.remove(collection_path)
        if collection_path.parent.exists() and not os.listdir(collection_path.parent):
            os.rmdir(collection_path.parent)

def fetch_and_upload(
    output_path,
    collection_path,
    metadata_path,
    create_archive_callback,
    signatures=None,
    session_args=None,
    s3_path=None,
    no_delete=False,
):
    """
    Common pipeline for creating and processing archives.

    Args:
        output_path: Base output directory
        collection_path: Path where the final zip will be stored
        metadata_path: Path where the metadata will be stored
        create_archive_callback: Function that will create the archive
        signatures: Signature configuration for nabit
        session_args: AWS session arguments
        s3_path: S3 path for uploads
        no_delete: Whether to preserve local files
    """
    with tempfile.TemporaryDirectory(dir=str(output_path)) as temp_dir:
        logger.info("- Creating archive...")
        # set up paths
        temp_dir = Path(temp_dir)
        bag_dir = temp_dir / 'bag'
        archive_path = temp_dir / 'archive.zip'
        source_files_dir = temp_dir / 'source_files'
        source_files_dir.mkdir(parents=True, exist_ok=True)

        # Call the callback to create the archive
        package_kwargs = create_archive_callback(source_files_dir)

        # create bag
        package(
            output_path=bag_dir,
            collect_errors='ignore',
            signatures=signatures,
            **package_kwargs,
        )

        logger.info("- Zipping archive...")
        # zip up data and create metadata
        output_metadata = zip_archive(bag_dir, archive_path)

        logger.info("- Moving files to final location...")
        # Move files to final location
        collection_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(archive_path), collection_path)
        with open(metadata_path, 'w') as f:
            json.dump(output_metadata, f)
            f.write('\n')

        if s3_path:
            logger.info("Uploading to S3...")
            upload_archive(output_path, collection_path, metadata_path, s3_path, session_args)

        cleanup_files(collection_path, no_delete, s3_path)
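A minimal usage sketch of fetch_and_upload, assuming a callback that stages one local file via nabit's PathCollectionTask (used the same way elsewhere in this commit). The paths and metadata below are placeholders, and the nabit package() arguments are assumed to behave as they do in the callers above.

from pathlib import Path
from nabit.lib.backends.path import PathCollectionTask
from scripts.helpers.bag import fetch_and_upload

def create_archive(source_files_dir):
    # stage whatever should be bagged; here a single placeholder file
    sample = Path(source_files_dir) / "notes.txt"
    sample.write_text("example payload\n")
    return {
        'collect': [PathCollectionTask(path=source_files_dir)],
        'signed_metadata': {'description': 'example archive'},
    }

output_path = Path('data/processed')
output_path.mkdir(parents=True, exist_ok=True)

fetch_and_upload(
    output_path=output_path,
    collection_path=output_path / 'data' / 'example' / 'v1.zip',
    metadata_path=output_path / 'metadata' / 'example' / 'v1.json',
    create_archive_callback=create_archive,
    s3_path=None,       # skip upload; pass "<bucket>/<prefix>" to upload
    session_args={},
)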

scripts/helpers/onepassword.py (new file, 85 lines; filename inferred from the import in scripts/s3.py above)

@@ -0,0 +1,85 @@
from onepassword import Client, ItemCreateParams, ItemField, ItemFieldType, ItemCategory, ItemSection, Item
from onepassword import ItemShareParams, ItemShareDuration, ValidRecipient
from .misc import load_config
import logging
import asyncio

logger = logging.getLogger(__name__)

async def get_client():
    op_config = load_config().get("1password", {})
    if not op_config.get('token'):
        raise Exception("1Password token not found in config")
    return await Client.authenticate(auth=op_config['token'], integration_name="data-vault", integration_version="v1.0.0")

def save_item(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    return asyncio.run(save_item_async(vault_name, title, fields, notes, category))

async def save_item_async(vault_name: str, title, fields, notes=None, category=ItemCategory.APICREDENTIALS):
    client = await get_client()

    vault_id = None
    vaults = await client.vaults.list_all()
    async for vault in vaults:
        if vault.title == vault_name:
            vault_id = vault.id
            break
    else:
        raise Exception(f"Vault {vault} not found")

    field_objs = []
    sections = []
    for field in fields:
        if field.get('concealed', False):
            field['field_type'] = ItemFieldType.CONCEALED
        else:
            field['field_type'] = ItemFieldType.TEXT
        field['id'] = field['title'].lower().replace(' ', '_')
        field_objs.append(ItemField(**field))
        if section_id := field.get('section_id'):
            if section_id not in sections:
                sections.append(section_id)
    sections = [ItemSection(id=section, title=section) for section in sections]

    # Create item parameters with sections
    create_params = ItemCreateParams(
        title=title,
        category=category,
        vault_id=vault_id,
        fields=field_objs,
        sections=sections,
    )
    if notes:
        create_params.notes = notes

    item = await client.items.create(create_params)

    logger.info(f"Stored credentials in 1Password vault '{vault}' with title '{title}'")

    return item

def share_item(
    item: Item,
    recipients: list[str] | None = None,
    expire_after: ItemShareDuration | None = ItemShareDuration.SEVENDAYS,
    one_time_only: bool = False
):
    return asyncio.run(share_item_async(item, recipients, expire_after, one_time_only))

async def share_item_async(
    item: Item,
    recipients: list[str] | None,
    expire_after: ItemShareDuration | None,
    one_time_only: bool,
):
    client = await get_client()
    policy = await client.items.shares.get_account_policy(item.vault_id, item.id)
    valid_recipients = await client.items.shares.validate_recipients(policy, recipients)
    share_params = ItemShareParams(
        recipients=valid_recipients,
        expire_after=expire_after,
        one_time_only=one_time_only
    )
    share_link = await client.items.shares.create(item, policy, share_params)
    logger.info(f"Created share link for '{item.title}'")
    return share_link
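A short usage sketch of this helper, mirroring how scripts/s3.py calls it above. The vault name, field values, and email are placeholders, and the helper assumes a 1password token is present in the config read by load_config().

from scripts.helpers.onepassword import save_item, share_item

# store a two-field credential item in the "Private" vault (placeholder values)
item = save_item("Private", "Example S3 Credentials", [
    {'title': 'Access Key ID', 'value': 'AKIA...', 'section_id': 's3_details'},
    {'title': 'Secret Access Key', 'value': '...', 'section_id': 's3_details'},
])

# optionally create a share link that expires after the default seven days
link = share_item(item, ["colleague@example.org"])
print(link)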


@@ -25,17 +25,9 @@ def worker(task_queue, task, catch_errors: bool = True):
            raise e

-def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
+def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, task_count: int | None = None):
    workers = workers or os.cpu_count() or 4

-    # Configure logging based on whether we're running in parallel or not
-    if log_level is None:
-        log_level = 'INFO' if workers == 1 else 'WARNING'
-    logging.basicConfig(
-        level=log_level,
-        format='[%(process)d] %(message)s'
-    )
-
    logger.debug(f"Starting processing with {workers} workers")

    if workers > 1:
uv.lock (generated, 23 lines)

@@ -169,6 +169,7 @@ dependencies = [
    { name = "httpx" },
    { name = "jsondiff" },
    { name = "nabit" },
    { name = "onepassword-sdk" },
    { name = "orjson" },
    { name = "peewee" },
    { name = "publicsuffixlist" },
@@ -192,6 +193,7 @@ requires-dist = [
    { name = "httpx", specifier = ">=0.27.2" },
    { name = "jsondiff", specifier = ">=2.2.1" },
    { name = "nabit", git = "https://github.com/harvard-lil/bag-nabit" },
    { name = "onepassword-sdk", specifier = ">=0.1.7" },
    { name = "orjson", specifier = ">=3.10.15" },
    { name = "peewee", specifier = ">=3.17.8" },
    { name = "publicsuffixlist", specifier = ">=1.0.2.20241121" },
@@ -466,6 +468,27 @@ dependencies = [
    { name = "warcio" },
]
[[package]]
name = "onepassword-sdk"
version = "0.1.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
]
sdist = { url = "https://files.pythonhosted.org/packages/aa/64/75462b6a21cbf98bff8ccae24848034ab280292d5b65346b77c720f46f93/onepassword_sdk-0.1.7.tar.gz", hash = "sha256:27979eb1b4fe0476a123336e3b432ce549feb4a6f87e975581497befcac985e9", size = 25312950 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/1a/016c165b87107cf771e3ac1f5a6b813003de534631ff78171b334980bef0/onepassword_sdk-0.1.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fd399e48833f5c916831c22be40f76cf7e85d0180fcb818dbd76cc5d45659cb8", size = 5261542 },
{ url = "https://files.pythonhosted.org/packages/9a/d9/3960534ef2faced9cdecbe72bc2060f718de97b8d95c15210f50d1c514bb/onepassword_sdk-0.1.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48a6dfb85c0138896ecb8d6aa814c5985acbb4262ed5e3a18e99725454bb41d5", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/04/50/b0a34b7f1aa7e3b747bdca586896d3f2d452cdf87249f3dae56f649b1c83/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_aarch64.whl", hash = "sha256:7c6bbb202c83ad1566f27c3354ce88fa8dcceae0f90b0514b42d40aeb0b8ad10", size = 5262840 },
{ url = "https://files.pythonhosted.org/packages/52/2d/73a11ee89fb69d6d6e388524560e53574eb5fdd20f65d57f25212087fc9f/onepassword_sdk-0.1.7-cp312-cp312-manylinux_2_32_x86_64.whl", hash = "sha256:299cd1d33f83e0acca01f0b93474397b471e5470b7f58f542f6866f8ea4de134", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/02/fe/7aa733bfdadc930e4a02115189652524edce425ae13859d10a79f3cd24fa/onepassword_sdk-0.1.7-cp312-cp312-win_amd64.whl", hash = "sha256:c6d7ff7a15e56b2f6198b3681f6d18678fa56f6d9d3b0639bc70ae9b18b8acd2", size = 4627027 },
{ url = "https://files.pythonhosted.org/packages/6a/97/ef38f90c2b8038529c059d5e6e1d5fb447f75300258ce2c5e5761bbe7283/onepassword_sdk-0.1.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a6c0e2b8383e2428e020abe41a82376fe24c1a1d9634bfb31c1db4692ef8a25", size = 5261540 },
{ url = "https://files.pythonhosted.org/packages/22/c2/6b9eafd9a1549bae1c7e04e344374e560842b6451279be268d1f45cada08/onepassword_sdk-0.1.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d570ebe068ba6ce4116a71d16a180353b864e5975be4574ede623b1ea794209", size = 4827963 },
{ url = "https://files.pythonhosted.org/packages/57/0e/d924f5b92176fb86ff74b432d2dd4558048c971f902f287557e89795edc0/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_aarch64.whl", hash = "sha256:50f332f0f0f77913abe461c4d0913f7714ba2e07d9ef39446e0d564b5b440879", size = 5262838 },
{ url = "https://files.pythonhosted.org/packages/38/13/9a79eccdb8f65b6e2676e72f0f77a2c626e00210960e74b0ff72f7a419a8/onepassword_sdk-0.1.7-cp313-cp313-manylinux_2_32_x86_64.whl", hash = "sha256:8a773da4770e887b124ec671781678e1b1fd57f20dcc40f89a610728f55138ed", size = 5559126 },
{ url = "https://files.pythonhosted.org/packages/24/2c/15549d06fa6874d520ab123963172d18c45de34d0ac26e02465a56488f7d/onepassword_sdk-0.1.7-cp313-cp313-win_amd64.whl", hash = "sha256:9541175ed6283736745bf673f8c74696e53a164192c34c44de4400e8fdf01e38", size = 4627026 },
]
[[package]]
name = "orderly-set"
version = "5.3.0"