mirror of
https://github.com/harvard-lil/data-vault.git
synced 2025-07-04 13:46:56 -04:00
initial commit
This commit is contained in:
commit
404c3627f7
26 changed files with 2534 additions and 0 deletions
13
scripts/helpers/config.py
Normal file
13
scripts/helpers/config.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
import json
import os
from pathlib import Path

# Location of the data-mirror config file, honoring XDG_CONFIG_HOME when set
# and falling back to ~/.config otherwise.
# NOTE: os.environ.get() returns a plain str, so it must be wrapped in Path()
# before the / operator can be used — otherwise "str / str" raises TypeError
# at import time whenever XDG_CONFIG_HOME is set.
CONFIG_PATH = Path(os.environ.get("XDG_CONFIG_HOME") or (Path.home() / ".config")) / "data-mirror" / "config.json"
|
||||
|
||||
def load_config():
    """Return the parsed JSON config as a dict, or {} if no config file exists."""
    if not CONFIG_PATH.exists():
        return {}
    return json.loads(CONFIG_PATH.read_text())
|
65
scripts/helpers/parallel.py
Normal file
65
scripts/helpers/parallel.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
from multiprocessing import Queue, Process
|
||||
from queue import Empty
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
from typing import Callable, Iterable
|
||||
import logging
|
||||
|
||||
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
|
||||
|
||||
def worker(task_queue, task, catch_errors: bool = True):
    """Consume argument tuples from task_queue and apply task to each.

    Loops until a None sentinel is received from the queue. Each queue item
    is a tuple of positional arguments for task.

    Args:
        task_queue: queue of argument tuples; a None item signals shutdown.
        task: callable invoked as task(*args) for each item.
        catch_errors: when True, log task exceptions and keep consuming;
            when False, re-raise and let this worker die.
    """
    while True:
        try:
            args = task_queue.get(timeout=1)
            if args is None:
                # Sentinel from the parent process: no more work.
                break
            # Lazy %-style args avoid formatting when DEBUG is disabled.
            logger.debug("[PID %s] Processing task", os.getpid())
            task(*args)
        except Empty:
            # Queue momentarily empty; keep polling until work or a sentinel arrives.
            continue
        except Exception as e:
            if catch_errors:
                logger.error("[PID %s] Worker error: %s", os.getpid(), e)
            else:
                # Bare raise preserves the original traceback; `raise e`
                # would rebind it to this line.
                raise
|
||||
|
||||
|
||||
def run_parallel(processor: Callable, tasks: Iterable, workers = None, catch_errors: bool = True, log_level: str | None = None, task_count: int | None = None):
    """Run processor over tasks, optionally fanned out across processes.

    Args:
        processor: callable invoked as processor(*args) for each task tuple.
        tasks: iterable of argument tuples.
        workers: number of worker processes; defaults to os.cpu_count() (or 4
            if that is unavailable). With workers == 1, tasks run serially in
            this process and no subprocesses are spawned.
        catch_errors: when True, log per-task errors and continue; when False,
            let them propagate.
        log_level: logging level name; defaults to INFO when serial and
            WARNING when parallel.
        task_count: total number of tasks, for the progress bar when tasks
            has no len() (e.g. a generator).
    """
    workers = workers or os.cpu_count() or 4

    # Configure logging based on whether we're running in parallel or not
    if log_level is None:
        log_level = 'INFO' if workers == 1 else 'WARNING'
    logging.basicConfig(
        level=log_level,
        format='[%(process)d] %(message)s'
    )

    logger.debug(f"Starting processing with {workers} workers")

    if workers > 1:
        # Bounded queue applies backpressure so a fast producer cannot
        # buffer the entire task list in memory at once.
        task_queue = Queue(maxsize=100)

        # Start worker processes
        processes = []
        for _ in range(workers):
            p = Process(target=worker, args=(task_queue, processor, catch_errors))
            p.start()
            processes.append(p)

    # Feed tasks to the workers (or run them inline when serial)
    for task_item in tqdm(tasks, total=task_count):
        if workers > 1:
            task_queue.put(task_item)
        else:
            try:
                processor(*task_item)
            except Exception as e:
                # Mirror worker(): honor catch_errors in the serial path too,
                # instead of always crashing on the first failed task.
                if catch_errors:
                    logger.error(f"Task error: {e}")
                else:
                    raise

    if workers > 1:
        # Signal workers to exit
        for _ in range(workers):
            task_queue.put(None)

        # Wait for all processes to complete
        for p in processes:
            p.join()
|
Loading…
Add table
Add a link
Reference in a new issue