web/scripts/sync-ebooks

270 lines
7.2 KiB
Bash
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#! /usr/bin/env bash
set -e
set -o pipefail
# Print the help text to stdout and terminate the script.
# The heredoc delimiter is unquoted so that ${0##*/} expands to the name the
# script was invoked with; its lines are deliberately not indented because
# a plain <<EOF heredoc preserves leading whitespace verbatim.
usage(){
  cat <<EOF
DESCRIPTION
Syncs books from standardebooks.org GitHub org to specified folder.
USAGE
${0##*/} [-v,-vv,--verbosity INTEGER] [-u,--update-only] [-b,--bare] [--token TOKEN] DIRECTORY
With -v or --verbosity 1, display general progress updates.
With -vv or --verbosity 2, display general progress updates and verbose git output.
With --update-only, only sync existing repositories, do not download new repositories.
With -b or --bare, clone bare repositories (for a server) instead of working directories.
With --token TOKEN, specify a GitHub access token to use for request. Useful for when you hit the rate limit.
DIRECTORY should be where the repositories should go.
NOTE: This script requires GNU versions of grep and sed. If you are on a Mac, you will need to
install GNU versions (via Homebrew, MacPorts, etc.) and make sure they are first in your path,
or modify the script to use the GNU versions if they're named differently.
EXAMPLE
${0##*/} /standardebooks.org/ebooks
EOF
  exit
}
# functions used by the script
# Report a fatal error and stop the script.
# Arguments:
#   $1 - message to show after the highlighted "Error:" prefix
# Outputs:
#   the message on stderr
# Exits with status 1; never returns.
die() {
  local message="${1}"

  printf "\033[0;7;31mError:\033[0m %s\n" "${message}" >&2
  exit 1
}
# Make sure an external command is available before the script relies on it.
# Arguments:
#   $1 - name of the command to look up
#   $2 - optional hint appended to the error message (e.g. how to install it)
# Returns 0 when the command is found; otherwise hands off to die.
require() {
  if command -v "$1" > /dev/null 2>&1; then
    return 0
  fi

  # ${2:+ $2} expands to a space plus the hint only when a non-empty hint was given
  die "$1 is required but not installed.${2:+ $2}"
}
# Validate the value supplied for a command line option.
# Arguments:
#   $1 - glob pattern describing a *bad* value
#   $2 - the value to validate
#   $3 - message handed to die when the value is empty or matches $1
# Returns 0 when the value is acceptable; otherwise hands off to die.
check_arg() {
  local rejectPattern="$1"
  local value="$2"
  local message="$3"

  case "${value}" in
    '')
      die "${message}"
      ;;
    # intentionally unquoted so the text is matched as a glob pattern, not literally
    ${rejectPattern})
      die "${message}"
      ;;
  esac
}
# end functions
# Terminate on CTRL-C
# A bare `exit` reuses the status of the last command that ran, which can be 0
# and make an interrupted run look successful to whatever invoked the script.
# Exit with 130 (128 + SIGINT) instead, the conventional status for an interrupt.
ctrl_c() {
  exit 130
}
trap ctrl_c INT
# git does all of the real work, so check for it before anything else
require "git"

# called with no arguments at all: show the help text
if (( $# == 0 )); then
  usage
fi

# defaults for everything the command line can set
target=""
bare=""
githubToken=""
updateOnly="false"
verbosity=0

# consume the leading options one at a time; the first word that is not a
# recognised option ends option parsing and is left in "$@" for the check below
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      usage
      ;;
    -v)
      verbosity=1
      shift
      ;;
    -vv)
      verbosity=2
      shift
      ;;
    --verbosity)
      # the pattern matches any value containing a non-digit
      check_arg '*[!0-9]*' "$2" "Verbosity is not a positive integer."
      verbosity="$2"
      shift 2
      ;;
    -u | --update-only)
      updateOnly="true"
      shift
      ;;
    -b | --bare)
      # stored as the literal option later passed to git clone
      bare="--mirror"
      shift
      ;;
    --token)
      # the pattern matches any value containing something other than letters, digits or underscores
      check_arg '*[!0-9a-zA-Z_]*' "$2" "Token is empty or contains illegal characters."
      githubToken="$2"
      shift 2
      ;;
    *)
      break
      ;;
  esac
done

# exactly one non-empty argument (the target directory) must remain
if [[ $# -ne 1 || -z "$1" ]]; then
  usage
fi

target="$1"

[[ -d "${target}" ]] || die "${target} is not a directory."

# everything below works with paths relative to the target directory
cd "${target}" || die "Couldnt cd into ${target}"
# update any existing repositories
if [ "${verbosity}" -gt 0 ]; then
  printf "Updating local repositories ... \n"
fi

for item in ./*/; do
  # when nothing matches, the glob is left as the literal pattern, which doesn't exist
  [ -e "${item}" ] || break

  # if it's not a repository directory, skip it; this is checked before the progress
  # message so that skipped directories don't leave an unfinished "Updating ..." line
  git -C "${item}" rev-parse > /dev/null 2>&1 || continue

  if [ "${verbosity}" -gt 0 ]; then
    printf "Updating %s ... " "${item}"
  fi

  # this works whether the repository is bare or a working directory
  # (the script runs under `set -e`, so a failed fetch stops the whole run)
  if [ "${verbosity}" -lt 2 ]; then
    git -C "${item}" fetch -q
  else
    git -C "${item}" fetch -v
  fi

  if [ "${verbosity}" -gt 0 ]; then
    printf "Done.\n"
  fi
done

# with --update-only there is nothing left to do
if [ "${updateOnly}" = "true" ]; then
  exit
fi
# clone the remaining repositories
if [ "${verbosity}" -gt 0 ]; then
  printf "Cloning remote repositories ... \n"
  printf "Fetching repository urls ..."
fi

# get all of the repository names from the GitHub API, one "page" at a time
url="https://api.github.com/orgs/standardebooks/repos?per_page=100"
repoUrls=""

# build the curl arguments once; the Authorization header is only sent when a token was given
curlArgs=(-si)
if [ -n "${githubToken}" ]; then
  curlArgs+=(-H "Authorization: token ${githubToken}")
fi

while true; do
  # get a "page" worth of repository URLs; -i puts the response headers in front of the body
  response=$(curl "${curlArgs[@]}" "${url}") || die "Curl request failed."

  # Detect an exhausted rate limit. Header lines printed by curl end in CRLF and
  # header names may arrive in lowercase, so match case-insensitively and allow
  # trailing whitespace. Only a 403/429 response counts: a successful response
  # that merely used up the last request still carries valid data.
  # Here-strings are used instead of `printf | grep -q` because, under pipefail,
  # grep -q exiting early can make the pipeline fail with SIGPIPE.
  if grep -qP '^HTTP/\S+\s+(403|429)\b' <<< "${response}" &&
    grep -qiP '^x-ratelimit-remaining:\s*0\s*$' <<< "${response}"; then
    resetTime="the rate limit resets"
    if limitReset=$(grep -oiP -m 1 '^x-ratelimit-reset:\s*\K[0-9]+(?=\s*$)' <<< "${response}"); then
      # the header holds a Unix timestamp; fall back to the generic wording if date can't convert it
      resetTime=$(date -d "@${limitReset}" 2> /dev/null) || resetTime="the rate limit resets"
    fi
    {
      printf "You have reached your daily allowance for unauthenticated GitHub API requests.\n"
      printf "Either wait until %s or use an OAuth token.\n" "${resetTime}"
      printf "You can create a new token at https://github.com/settings/tokens/new and "
      printf "pass it to this script with the --token option.\n"
      printf "The token does not need any permissions.\n"
    } 1>&2
    # the sync did not complete, so report failure to the caller
    exit 1
  fi

  # parse the response to get the current page's URLs: split records on commas and
  # fields on double quotes, then print the value that follows a "clone_url" key
  currentRepoUrls=$(printf "%s" "${response}" | awk 'BEGIN { FS="\""; RS="," }; { if ($2 == "clone_url") {print $4} }')

  # add them to the full list in repoUrls
  repoUrls=$(printf "%s\n%s" "${repoUrls}" "${currentRepoUrls}")

  # set the variable to get the next "page"; when the Link header has no rel="next"
  # entry, grep fails and the loop ends
  url=$(grep -oP '<\Khttps://api\.github\.com/[^>]*(?=>; rel="next")' <<< "${response}") || break

  if [ "${verbosity}" -gt 0 ]; then
    printf "."
  fi
done

if [ "${verbosity}" -gt 0 ]; then
  printf " Done.\n"
fi
# skip the non-ebook repositories by removing their names from the list
# (grep exits with 1 when it selects no lines; that is not an error here, so it is
# tolerated instead of letting `set -e`/pipefail abort the script; real grep errors still fail)
repoUrls=$(
  printf "%s" "${repoUrls}" |
    { grep -v -e "/tools.git\$" -e "/web.git\$" -e "/manual.git\$" -e "/standard-blackletter.git\$" -e "/sublime-text-se-plugin.git\$" || [ "$?" -eq 1 ]; } |
    awk 'NF'
)

# Reads an OPF metadata document on stdin and prints the directory name derived from
# its <dc:identifier id="uid">: the part of the identifier after
# "url:https://standardebooks.org/ebooks/", with every "/" replaced by "_".
# Prints nothing and returns non-zero when no such identifier is found.
proper_name_from_opf() {
  local identifier

  identifier=$(grep -oE "<dc:identifier id=\"uid\">url:https://standardebooks.org/ebooks/[^<]+</dc:identifier>") || return 1
  # strip the opening tag together with the URL prefix, then the closing tag
  identifier="${identifier#*url:https://standardebooks.org/ebooks/}"
  identifier="${identifier%</dc:identifier>}"
  printf "%s\n" "${identifier//\//_}"
}

# process the list, reading one repository at a time
printf "%s\n" "${repoUrls}" | while IFS= read -r repoUrl; do
  # make sure it's not an empty string
  [ -n "${repoUrl}" ] || continue

  # strip everything prior to the last segment of the name
  repoName="${repoUrl##*/}"
  if [ "${bare}" = "" ]; then
    repoName="${repoName%.git}"
  fi

  # if the repo already exists, skip it (handled in the update above)
  if [ -d "${repoName}" ]; then
    continue
  fi

  # if the repository name has been truncated due to GitHub's name length limits,
  # but a local clone with the full name exists, don't attempt to clone it again
  baseName="${repoName%.git}"
  if [ "${#baseName}" -ge 100 ]; then
    if dirs=( "${baseName}"*/ ) && [[ -d ${dirs[0]} ]]; then
      continue
    fi
  fi

  if [ "${verbosity}" -gt 0 ]; then
    printf "Cloning %s ... \n" "${repoUrl}"
  fi

  # clone the repository, creating either a bare or working directory based on the option
  cloneArgs=()
  if [ "${verbosity}" -lt 2 ]; then
    cloneArgs+=(-q)
  else
    cloneArgs+=(-v)
  fi
  if [ -n "${bare}" ]; then
    cloneArgs+=("${bare}")
  fi

  # test the clone explicitly: under `set -e` an unchecked failure would end this loop
  # (it runs in the pipeline's subshell) before the failure could be reported.
  # if a directory with the repository name doesn't exist, the clone did not complete either.
  if ! git clone "${cloneArgs[@]}" "${repoUrl}" || ! [ -d "${repoName}" ]; then
    printf "Failed to clone %s.\n" "${repoName}" 1>&2
    continue
  fi
  if [ "${verbosity}" -gt 0 ]; then
    printf "Done.\n"
  fi

  # if the repository doesn't have a metadata file, skip to the next repository
  opf=$(git -C "${repoName}" show HEAD:src/epub/content.opf 2> /dev/null) || continue

  # get the directory name from the dc:identifier in the metadata;
  # without an identifier there is nothing to compare against, so keep the clone as it is
  properName=$(proper_name_from_opf <<< "${opf}") || continue
  if [ "${bare}" != "" ]; then
    properName="${properName}.git"
  fi

  # if for some reason the repository name isn't the same as the identifier (they are identical
  # 99% of the time), rename the directory to be the identifier name
  if [ "${repoName}" != "${properName}" ]; then
    if [ -d "${properName}" ]; then
      if [ "${verbosity}" -gt 0 ]; then
        printf "Not moving %s to %s: directory exists\n" "${repoName}" "${properName}"
      fi
    else
      if [ "${verbosity}" -gt 0 ]; then
        printf "Moving %s to %s\n" "${repoName}" "${properName}"
      fi
      mv "${repoName}" "${properName}"
    fi
  fi
done