Flesh out OPDS generation backend to be more robust and to support generic Atom feeds

2025-07-12 09:32:24 -04:00 · 2022-06-20 21:55:34 -05:00 · 2022-06-20 21:55:34 -05:00 · f6df03cfca
commit f6df03cfca
parent 35188195f1
23 changed files with 1549 additions and 267 deletions
--- a/scripts/deploy-ebook-to-www
+++ b/scripts/deploy-ebook-to-www
@@ -121,12 +121,8 @@ if ! [ -x "${scriptsDir}"/reset-php-fpm-opcache ]; then
 	die "\"${scriptsDir}\"/reset-php-fpm-opcache is not an executable file."
 fi

-if ! [ -f "${scriptsDir}"/generate-opds ]; then
-	die "\"${scriptsDir}\"/generate-opds\" is not a file or could not be found."
-fi
-
-if ! [ -f "${scriptsDir}"/generate-rss ]; then
-	die "\"${scriptsDir}\"/generate-rss\" is not a file or could not be found."
+if ! [ -f "${scriptsDir}"/generate-feeds ]; then # generate-feeds builds the OPDS and RSS feeds
+	die "\"${scriptsDir}\"/generate-feeds is not a file or could not be found."
 fi

 mkdir -p "${webRoot}"/www/images/covers/
@@ -382,7 +378,7 @@ if [ "${verbose}" = "true" ]; then
 	printf "Rebuilding OPDS catalog ... "
 fi

-"${scriptsDir}/generate-opds" --webroot "${webRoot}" --weburl "${webUrl}"
+"${scriptsDir}/generate-feeds" --webroot "${webRoot}" --weburl "${webUrl}"

 sudo chown --recursive se:committers "${webRoot}/www/opds/"*
 sudo chmod --recursive 664 "${webRoot}/www/opds/"*.xml
@@ -400,13 +396,6 @@ if [ "${verbose}" = "true" ]; then
 	printf "Rebuilding new releases RSS feed ... "
 fi

-output=$("${scriptsDir}/generate-rss" --webroot "${webRoot}" --weburl "${webUrl}")
-
-# Check the return code; if the script failed (for example invalid XML in content.opf), don't overwrite the existing feed with a blank file
-if [ $? = 0 ]; then
-	echo "${output}" > "${webRoot}/www/rss/new-releases.xml"
-fi
-
 if [ "${verbose}" = "true" ]; then
 	printf "Done.\n"
 fi
--- a/scripts/generate-feeds
+++ b/scripts/generate-feeds
@@ -17,6 +17,7 @@ $allEbooks = [];
 $newestEbooks = [];
 $subjects = [];
 $ebooksBySubject = [];
+$ebooksPerNewestEbooksFeed = 30;

 // Iterate over all ebooks to build the various feeds
 foreach($contentFiles as $path){
@@ -49,6 +50,36 @@ foreach($contentFiles as $path){
 	}
 }

+$now = new DateTime();
+
+// Create OPDS feeds
+$opdsRootEntries = [
+	new OpdsNavigationEntry(
+		'/opds/new-releases',
+		'http://opds-spec.org/sort/new',
+		'acquisition',
+		$now,
+		'Newest ' . number_format($ebooksPerNewestEbooksFeed) . ' Standard Ebooks',
+		'A list of the ' . number_format($ebooksPerNewestEbooksFeed) . ' newest Standard Ebooks, most-recently-released first.'),
+	new OpdsNavigationEntry(
+		'/opds/subjects',
+		'subsection',
+		'navigation',
+		$now,
+		'Standard Ebooks by Subject',
+		'Browse Standard Ebooks by subject.'),
+	new OpdsNavigationEntry(
+		'/opds/all',
+		'http://opds-spec.org/crawlable',
+		'acquisition',
+		$now,
+		'All Standard Ebooks',
+		'A list of all Standard Ebooks, most-recently-updated first. This is a Complete Acquisition Feed as defined in OPDS 1.2 §2.5.')
+];
+
+$opdsRoot = new OpdsNavigationFeed('/opds', 'Standard Ebooks', WEB_ROOT . '/opds/index.xml', $opdsRootEntries, null);
+$opdsRoot->Save();
+
 // Create the subjects navigation document
 sort($subjects);
 $subjectNavigationEntries = [];
@@ -60,27 +91,32 @@ foreach($subjects as $subject){
 	$summary .= ' tagged with “' . strtolower($subject) . ',” most-recently-released first.';

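-	// We leave the updated timestamp blank, as it will be filled in when we generate the individual feeds
+	// Each entry's updated timestamp starts out as the time of this run ($now) rather than being left blank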
-	$subjectNavigationEntries[] = new OpdsNavigationEntry('/opds/subjects/' . Formatter::MakeUrlSafe($subject), 'subsection', 'navigation', null, $subject, $summary);
+	$subjectNavigationEntries[] = new OpdsNavigationEntry('/opds/subjects/' . Formatter::MakeUrlSafe($subject), 'subsection', 'navigation', $now, $subject, $summary);
 }
-$subjectsFeed = new OpdsNavigationFeed('/opds/subjects', 'Standard Ebooks by Subject', '/opds', $subjectNavigationEntries);
-$subjectsFeed->Save(WEB_ROOT . '/opds/subjects/index.xml');
+$subjectsFeed = new OpdsNavigationFeed('/opds/subjects', 'Standard Ebooks by Subject', WEB_ROOT . '/opds/subjects/index.xml', $subjectNavigationEntries, $opdsRoot);
+$subjectsFeed->Save();

 // Now generate each individual subject feed
 foreach($ebooksBySubject as $subject => $ebooks){
 	krsort($ebooks);
-	$subjectFeed = new OpdsAcquisitionFeed('/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject), (string)$subject, '/opds/subjects', $ebooks);
-	$subjectFeed->Save(WEB_ROOT . '/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject) . '.xml');
+	$subjectFeed = new OpdsAcquisitionFeed('/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject), (string)$subject, WEB_ROOT . '/opds/subjects/' . Formatter::MakeUrlSafe((string)$subject) . '.xml', $ebooks, $subjectsFeed);
+	$subjectFeed->Save();
 }

 // Create the 'all' feed
 krsort($allEbooks);
-$allFeed = new OpdsAcquisitionFeed('/opds/all', 'All Standard Ebooks', '/opds', $allEbooks, true);
-$allFeed->Save(WEB_ROOT . '/opds/all.xml');
+$allFeed = new OpdsAcquisitionFeed('/opds/all', 'All Standard Ebooks', WEB_ROOT . '/opds/all.xml', $allEbooks, $opdsRoot, true);
+$allFeed->Save();

 // Create the 'newest' feed
 krsort($newestEbooks);
-$newestEbooks = array_slice($newestEbooks, 0, 30);
-$newestFeed = new OpdsAcquisitionFeed('/opds/new-releases', 'Newest 30 Standard Ebooks', '/opds', $newestEbooks);
-$newestFeed->Save(WEB_ROOT . '/opds/new-releases.xml');
+$newestEbooks = array_slice($newestEbooks, 0, $ebooksPerNewestEbooksFeed);
+$newestFeed = new OpdsAcquisitionFeed('/opds/new-releases', 'Newest ' . number_format($ebooksPerNewestEbooksFeed) . ' Standard Ebooks', WEB_ROOT . '/opds/new-releases.xml', $newestEbooks, $opdsRoot);
+$newestFeed->Save();

+// Now create RSS feeds
+
+// Create the 'newest' feed
+$newestFeed = new RssFeed('/rss/new-releases', 'Newest ' . number_format($ebooksPerNewestEbooksFeed) . ' Standard Ebooks', WEB_ROOT . '/rss/new-releases.xml', 'A list of the ' . number_format($ebooksPerNewestEbooksFeed) . ' latest Standard Ebooks ebook releases, most-recently-released first.', $newestEbooks);
+$newestFeed->Save();
 ?>
--- a/scripts/generate-rss
+++ b/scripts/generate-rss
@@ -1,90 +0,0 @@
-#!/usr/bin/php
-<?
-require_once('/standardebooks.org/web/lib/Core.php');
-
-use function Safe\file_get_contents;
-use function Safe\getopt;
-use function Safe\gmdate;
-use function Safe\krsort;
-use function Safe\preg_replace;
-use function Safe\strtotime;
-
-$longopts = ["webroot:", "weburl:"];
-$options = getopt("", $longopts);
-$webRoot = $options["webroot"] ?? "/standardebooks.org/web";
-$webUrl = $options["weburl"] ?? "https://standardebooks.org";
-
-$rssLength = 30;
-$contentFiles = explode("\n", trim(shell_exec('find ' . escapeshellarg($webRoot . '/www/ebooks/') . ' -name "content.opf" | sort') ?? ''));
-
-$sortedContentFiles = array();
-
-foreach($contentFiles as $path){
-	if($path == '')
-		continue;
-
-	$xml = new SimpleXMLElement(str_replace('xmlns=', 'ns=', file_get_contents("$path") ?: ''));
-	$xml->registerXPathNamespace('dc', 'http://purl.org/dc/elements/1.1/');
-
-	$temp = $xml->xpath('/package/metadata/dc:date') ?: [];
-	$publishedTimestamp = strtotime(array_shift($temp));
-
-	$sortedContentFiles[$publishedTimestamp] = $xml;
-}
-
-krsort($sortedContentFiles);
-
-$sortedContentFiles = array_slice($sortedContentFiles, 0, $rssLength);
-
-// XSL stylesheet mime type must be `text/xsl` otherwise Chrome doesn't read it
-print("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<?xml-stylesheet href=\"/rss/style\" type=\"text/xsl\"?>\n");
-?>
-<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
-	<channel>
-		<title>Standard Ebooks - New Releases</title>
-		<link><?= $webUrl ?></link>
-		<description>A list of the <?= number_format($rssLength) ?> latest Standard Ebooks ebook releases, most-recently-released first.</description>
-		<language>en-US</language>
-		<copyright>https://creativecommons.org/publicdomain/zero/1.0/</copyright>
-		<lastBuildDate><?= gmdate('D, d M Y H:i:s +0000') ?></lastBuildDate>
-		<docs>http://blogs.law.harvard.edu/tech/rss</docs>
-		<atom:link href="<?= $webUrl ?>/rss/new-releases" rel="self" type="application/rss+xml"/>
-		<image>
-			<url><?= $webUrl ?>/images/logo-rss.png</url>
-			<title>Standard Ebooks - New Releases</title>
-			<description>The Standard Ebooks logo</description>
-			<link><?= $webUrl ?></link>
-			<height>144</height>
-			<width>144</width>
-		</image>
-		<? foreach($sortedContentFiles as $xml){
-			$temp = $xml->xpath('/package/metadata/dc:identifier') ?: [];
-			$url = preg_replace('/^url:/ius', '', (string)array_shift($temp));
-			$url = preg_replace('/^https:\/\/standardebooks.org/ius', $webUrl, $url);
-
-			$temp = $xml->xpath('/package/metadata/dc:title') ?: [];
-			$title = array_shift($temp) ?? '';
-
-			$temp = $xml->xpath('/package/metadata/dc:creator') ?: [];
-			$title .= ', by ' . (array_shift($temp) ?? '');
-
-			$temp = $xml->xpath('/package/metadata/dc:description') ?: [];
-			$description = array_shift($temp) ?? '';
-
-			$temp = $xml->xpath('/package/metadata/dc:date') ?: [];
-			$published = gmdate('D, d M Y H:i:s +0000', strtotime(array_shift($temp) ?? '') ?: 0);
-
-			$seSubjects = $xml->xpath('/package/metadata/meta[@property="se:subject"]') ?: [];
-		?><item>
-			<title><?= $title ?></title>
-			<link><?= $url ?></link>
-			<description><?= htmlspecialchars($description, ENT_QUOTES, 'UTF-8') ?></description>
-			<pubDate><?= $published ?></pubDate>
-			<guid><?= $url ?></guid>
-			<? foreach($seSubjects as $seSubject){ ?>
-			<category domain="standardebooks.org"><?= htmlspecialchars($seSubject, ENT_QUOTES, 'UTF-8') ?></category>
-			<? } ?>
-		</item>
-		<? } ?>
-	</channel>
-</rss>