Flesh out OPDS generation backend to be more robust and to support generic Atom feeds

This commit is contained in:
Alex Cabal 2022-06-20 21:55:34 -05:00
parent 35188195f1
commit f6df03cfca
23 changed files with 1549 additions and 267 deletions

View file

@ -121,12 +121,8 @@ if ! [ -x "${scriptsDir}"/reset-php-fpm-opcache ]; then
die "\"${scriptsDir}\"/reset-php-fpm-opcache is not an executable file."
fi
if ! [ -f "${scriptsDir}"/generate-opds ]; then
die "\"${scriptsDir}\"/generate-opds\" is not a file or could not be found."
fi
if ! [ -f "${scriptsDir}"/generate-rss ]; then
die "\"${scriptsDir}\"/generate-rss\" is not a file or could not be found."
# Abort early if the feed generation script is not present as a regular file.
# The message quotes only the scripts directory, matching the reset-php-fpm-opcache check above;
# the previous text had a stray trailing escaped quote after the script name.
if ! [ -f "${scriptsDir}"/generate-feeds ]; then
	die "\"${scriptsDir}\"/generate-feeds is not a file or could not be found."
fi
mkdir -p "${webRoot}"/www/images/covers/
@ -382,7 +378,7 @@ if [ "${verbose}" = "true" ]; then
printf "Rebuilding OPDS catalog ... "
fi
"${scriptsDir}/generate-opds" --webroot "${webRoot}" --weburl "${webUrl}"
"${scriptsDir}/generate-feeds" --webroot "${webRoot}" --weburl "${webUrl}"
sudo chown --recursive se:committers "${webRoot}/www/opds/"*
sudo chmod --recursive 664 "${webRoot}/www/opds/"*.xml
@ -400,13 +396,6 @@ if [ "${verbose}" = "true" ]; then
printf "Rebuilding new releases RSS feed ... "
fi
output=$("${scriptsDir}/generate-rss" --webroot "${webRoot}" --weburl "${webUrl}")
# Check the return code; if the script failed (for example invalid XML in content.opf), don't overwrite the existing feed with a blank file
if [ $? = 0 ]; then
echo "${output}" > "${webRoot}/www/rss/new-releases.xml"
fi
if [ "${verbose}" = "true" ]; then
printf "Done.\n"
fi

View file

@ -17,6 +17,7 @@ $allEbooks = [];
$newestEbooks = [];
$subjects = [];
$ebooksBySubject = [];
$ebooksPerNewestEbooksFeed = 30;
// Iterate over all ebooks to build the various feeds
foreach($contentFiles as $path){
@ -49,6 +50,36 @@ foreach($contentFiles as $path){
}
}
// A single timestamp is shared by every entry of the root navigation feed
$now = new DateTime();

// Create OPDS feeds

// Human-readable size of the 'newest' feed, reused in that entry's title and summary
$newestCountLabel = number_format($ebooksPerNewestEbooksFeed);

// Entry pointing at the acquisition feed of the newest ebooks
$newReleasesEntry = new OpdsNavigationEntry('/opds/new-releases', 'http://opds-spec.org/sort/new', 'acquisition', $now, 'Newest ' . $newestCountLabel . ' Standard Ebooks', 'A list of the ' . $newestCountLabel . ' newest Standard Ebooks, most-recently-released first.');

// Entry pointing at the navigation feed of subjects
$subjectsEntry = new OpdsNavigationEntry('/opds/subjects', 'subsection', 'navigation', $now, 'Standard Ebooks by Subject', 'Browse Standard Ebooks by subject.');

// Entry pointing at the complete acquisition feed of all ebooks
$allEbooksEntry = new OpdsNavigationEntry('/opds/all', 'http://opds-spec.org/crawlable', 'acquisition', $now, 'All Standard Ebooks', 'A list of all Standard Ebooks, most-recently-updated first. This is a Complete Acquisition Feed as defined in OPDS 1.2 §2.5.');

$opdsRootEntries = [$newReleasesEntry, $subjectsEntry, $allEbooksEntry];

// NOTE(review): the final argument is null only for this root feed; it is presumably the parent feed - confirm against OpdsNavigationFeed
$opdsRootPath = WEB_ROOT . '/opds/index.xml';
$opdsRoot = new OpdsNavigationFeed('/opds', 'Standard Ebooks', $opdsRootPath, $opdsRootEntries, null);
$opdsRoot->Save();
// Create the subjects navigation document
sort($subjects);
$subjectNavigationEntries = [];
@ -60,27 +91,32 @@ foreach($subjects as $subject){
$summary .= ' tagged with “' . strtolower($subject) . ',” most-recently-released first.';
// We leave the updated timestamp blank, as it will be filled in when we generate the individual feeds
$subjectNavigationEntries[] = new OpdsNavigationEntry('/opds/subjects/' . Formatter::MakeUrlSafe($subject), 'subsection', 'navigation', null, $subject, $summary);
$subjectNavigationEntries[] = new OpdsNavigationEntry('/opds/subjects/' . Formatter::MakeUrlSafe($subject), 'subsection', 'navigation', $now, $subject, $summary);
}
$subjectsFeed = new OpdsNavigationFeed('/opds/subjects', 'Standard Ebooks by Subject', '/opds', $subjectNavigationEntries);
$subjectsFeed->Save(WEB_ROOT . '/opds/subjects/index.xml');
$subjectsFeed = new OpdsNavigationFeed('/opds/subjects', 'Standard Ebooks by Subject', WEB_ROOT . '/opds/subjects/index.xml', $subjectNavigationEntries, $opdsRoot);
$subjectsFeed->Save();
// Now generate each individual subject feed
foreach($ebooksBySubject as $subject => $ebooks){
krsort($ebooks);
$subjectFeed = new OpdsAcquisitionFeed('/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject), (string)$subject, '/opds/subjects', $ebooks);
$subjectFeed->Save(WEB_ROOT . '/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject) . '.xml');
$subjectFeed = new OpdsAcquisitionFeed('/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject), (string)$subject, WEB_ROOT . '/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject) . '.xml', $ebooks, $subjectsFeed);
$subjectFeed->Save();
}
// Create the 'all' feed
krsort($allEbooks);
$allFeed = new OpdsAcquisitionFeed('/opds/all', 'All Standard Ebooks', '/opds', $allEbooks, true);
$allFeed->Save(WEB_ROOT . '/opds/all.xml');
$allFeed = new OpdsAcquisitionFeed('/opds/all', 'All Standard Ebooks', WEB_ROOT . '/opds/all.xml', $allEbooks, $opdsRoot, true);
$allFeed->Save();
// Create the 'newest' feed
krsort($newestEbooks);
$newestEbooks = array_slice($newestEbooks, 0, 30);
$newestFeed = new OpdsAcquisitionFeed('/opds/new-releases', 'Newest 30 Standard Ebooks', '/opds', $newestEbooks);
$newestFeed->Save(WEB_ROOT . '/opds/new-releases.xml');
$newestEbooks = array_slice($newestEbooks, 0, $ebooksPerNewestEbooksFeed);
$newestFeed = new OpdsAcquisitionFeed('/opds/new-releases', 'Newest ' . number_format($ebooksPerNewestEbooksFeed) . ' Standard Ebooks', WEB_ROOT . '/opds/new-releases.xml', $newestEbooks, $opdsRoot);
$newestFeed->Save();
// Now create RSS feeds

// Create the 'newest' feed from the same (already sliced) list of newest ebooks
$newestRssCount = number_format($ebooksPerNewestEbooksFeed);
$newestRssTitle = 'Newest ' . $newestRssCount . ' Standard Ebooks';
$newestRssDescription = 'A list of the ' . $newestRssCount . ' latest Standard Ebooks ebook releases, most-recently-released first.';

$newestFeed = new RssFeed('/rss/new-releases', $newestRssTitle, WEB_ROOT . '/rss/new-releases.xml', $newestRssDescription, $newestEbooks);
$newestFeed->Save();
?>

View file

@ -1,90 +0,0 @@
#!/usr/bin/php
<?
// Generates the "new releases" RSS 2.0 feed and prints it to standard output.
//
// Options:
//   --webroot  Directory whose `www/ebooks/` subtree is searched for `content.opf` files. Defaults to `/standardebooks.org/web`.
//   --weburl   Base URL used in the feed and substituted for `https://standardebooks.org` in item links. Defaults to `https://standardebooks.org`.
//
// Any exception (for example unparseable XML, or a missing or invalid publication date) is left uncaught so that the script aborts;
// callers should check the exit status before using the output.
require_once('/standardebooks.org/web/lib/Core.php');
use function Safe\file_get_contents;
use function Safe\getopt;
use function Safe\gmdate;
use function Safe\krsort;
use function Safe\preg_replace;
use function Safe\strtotime;

$longopts = ["webroot:", "weburl:"];
$options = getopt("", $longopts);
$webRoot = $options["webroot"] ?? "/standardebooks.org/web";
$webUrl = $options["weburl"] ?? "https://standardebooks.org";

// Maximum number of items in the feed
$rssLength = 30;

$contentFiles = explode("\n", trim(shell_exec('find ' . escapeshellarg($webRoot . '/www/ebooks/') . ' -name "content.opf" | sort') ?? ''));

// Ebooks are grouped by publication timestamp so that two ebooks published at the exact same second
// are both kept, instead of the later one overwriting the earlier one under the same array key.
$contentFilesByTimestamp = [];

foreach($contentFiles as $path){
	if($path == ''){
		continue;
	}

	// Renaming the default namespace declaration lets the unprefixed XPath queries below (`/package/metadata/...`) match
	$xml = new SimpleXMLElement(str_replace('xmlns=', 'ns=', file_get_contents("$path") ?: ''));
	$xml->registerXPathNamespace('dc', 'http://purl.org/dc/elements/1.1/');

	$temp = $xml->xpath('/package/metadata/dc:date') ?: [];
	// A missing date becomes the empty string, which makes strtotime() throw and aborts the run
	$publishedTimestamp = strtotime((string)array_shift($temp));

	$contentFilesByTimestamp[$publishedTimestamp][] = $xml;
}

// Newest first
krsort($contentFilesByTimestamp);

// Flatten the groups back into a single list, then keep only the newest entries
$sortedContentFiles = [];
foreach($contentFilesByTimestamp as $xmlFiles){
	foreach($xmlFiles as $xml){
		$sortedContentFiles[] = $xml;
	}
}

$sortedContentFiles = array_slice($sortedContentFiles, 0, $rssLength);

// XSL stylesheet mime type must be `text/xsl` otherwise Chrome doesn't read it
print("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<?xml-stylesheet href=\"/rss/style\" type=\"text/xsl\"?>\n");
?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Standard Ebooks - New Releases</title>
<link><?= $webUrl ?></link>
<description>A list of the <?= number_format($rssLength) ?> latest Standard Ebooks ebook releases, most-recently-released first.</description>
<language>en-US</language>
<copyright>https://creativecommons.org/publicdomain/zero/1.0/</copyright>
<lastBuildDate><?= gmdate('D, d M Y H:i:s +0000') ?></lastBuildDate>
<docs>http://blogs.law.harvard.edu/tech/rss</docs>
<atom:link href="<?= $webUrl ?>/rss/new-releases" rel="self" type="application/rss+xml"/>
<image>
<url><?= $webUrl ?>/images/logo-rss.png</url>
<title>Standard Ebooks - New Releases</title>
<description>The Standard Ebooks logo</description>
<link><?= $webUrl ?></link>
<height>144</height>
<width>144</width>
</image>
<? foreach($sortedContentFiles as $xml){
	// The identifier is stored as `url:https://standardebooks.org/...`; strip the prefix and point it at the configured web URL
	$temp = $xml->xpath('/package/metadata/dc:identifier') ?: [];
	$url = preg_replace('/^url:/ius', '', (string)array_shift($temp));
	$url = preg_replace('/^https:\/\/standardebooks\.org/ius', $webUrl, $url);

	$temp = $xml->xpath('/package/metadata/dc:title') ?: [];
	$title = (string)(array_shift($temp) ?? '');

	$temp = $xml->xpath('/package/metadata/dc:creator') ?: [];
	$title .= ', by ' . (string)(array_shift($temp) ?? '');

	$temp = $xml->xpath('/package/metadata/dc:description') ?: [];
	$description = (string)(array_shift($temp) ?? '');

	$temp = $xml->xpath('/package/metadata/dc:date') ?: [];
	$published = gmdate('D, d M Y H:i:s +0000', strtotime((string)(array_shift($temp) ?? '')) ?: 0);

	$seSubjects = $xml->xpath('/package/metadata/meta[@property="se:subject"]') ?: [];

	// SimpleXML hands back decoded text, so every value must be re-escaped before being written into the feed
?><item>
<title><?= htmlspecialchars($title, ENT_QUOTES, 'UTF-8') ?></title>
<link><?= htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?></link>
<description><?= htmlspecialchars($description, ENT_QUOTES, 'UTF-8') ?></description>
<pubDate><?= $published ?></pubDate>
<guid><?= htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?></guid>
<? foreach($seSubjects as $seSubject){ ?>
<category domain="standardebooks.org"><?= htmlspecialchars((string)$seSubject, ENT_QUOTES, 'UTF-8') ?></category>
<? } ?>
</item>
<? } ?>
</channel>
</rss>