web/scripts/generate-bulk-downloads

249 lines
8.3 KiB
PHP
Executable file

#!/usr/bin/php
<?
require_once('/standardebooks.org/web/lib/Core.php');
use function Safe\copy;
use function Safe\exec;
use function Safe\file_get_contents;
use function Safe\filemtime;
use function Safe\getopt;
use function Safe\glob;
use function Safe\mkdir;
use function Safe\opendir;
use function Safe\preg_replace;
use function Safe\rmdir;
use function Safe\tempnam;
use function Safe\unlink;
// Read the command line. `--webroot <path>` overrides the `WEB_ROOT` constant.
$options = getopt('', ['webroot:']);
$webRoot = $options['webroot'] ?? WEB_ROOT;

// The file formats for which a zip file is generated.
$types = ['epub', 'epub-advanced', 'azw3', 'kepub', 'xhtml'];

// The ways in which ebooks are grouped; every group starts out with no entries.
$groups = ['collections', 'subjects', 'authors', 'months'];
$ebooksByGroup = array_fill_keys($groups, []);
/**
 * Recursively delete a directory and everything inside it.
 *
 * Symbolic links found inside the directory are removed as links and are never followed, so a link that points at a directory elsewhere can't cause that directory's contents to be deleted.
 *
 * @param string $src The path of the directory to delete.
 *
 * @see https://www.php.net/manual/en/function.rmdir.php#117354
 */
function rrmdir(string $src): void{
	$dir = opendir($src);

	try{
		while(false !== ($file = readdir($dir))){
			if($file === '.' || $file === '..'){
				continue;
			}

			$full = $src . '/' . $file;

			// `is_dir()` follows symlinks, so also check `is_link()`; a symlink to a directory must be unlinked, not descended into.
			if(is_dir($full) && !is_link($full)){
				rrmdir($full);
			}
			else{
				unlink($full);
			}
		}
	}
	finally{
		// Release the directory handle even if deleting an entry failed.
		closedir($dir);
	}

	rmdir($src);
}
/**
 * Build a zip file containing one file format of each of the passed ebooks, and write it to `$filePath`.
 *
 * The archive is assembled in a temporary file and then copied to `$filePath`. If the archive can't be opened or written, or if it ends up with no entries, a message is printed and nothing is written to `$filePath`. The temporary file is always removed.
 *
 * @param string $filePath The destination path of the finished zip file.
 * @param array<Ebook> $ebooks The ebooks to include.
 * @param string $type One of `epub`, `epub-advanced`, `azw3`, `kepub`, or `xhtml`. Any other value adds no files.
 * @param string $webRoot The directory that ebook URLs are appended to in order to get file paths.
 */
function CreateZip(string $filePath, array $ebooks, string $type, string $webRoot): void{
	$tempFilename = tempnam(sys_get_temp_dir(), "se-ebooks");
	$zip = new ZipArchive();

	if($zip->open($tempFilename, ZipArchive::OVERWRITE) !== true){
		print('Can\'t open file: ' . $tempFilename . "\n");

		// Nothing can be added to an archive that failed to open, so clean up and stop here.
		if(file_exists($tempFilename)){
			unlink($tempFilename);
		}

		return;
	}

	$isZipOpen = true;

	try{
		foreach($ebooks as $ebook){
			if($type == 'epub' && $ebook->EpubUrl !== null){
				$ebookFilePath = $webRoot . '/' . $ebook->EpubUrl;
				$zip->addFile($ebookFilePath, basename($ebookFilePath));
			}
			elseif($type == 'azw3' && $ebook->Azw3Url !== null){
				// Each azw3 file goes in its own folder, next to its Kindle cover thumbnail if there is one.
				$ebookFilePath = $webRoot . '/' . $ebook->Azw3Url;
				$folderName = basename($ebookFilePath, '.azw3');
				$zip->addFile($ebookFilePath, $folderName . '/' . basename($ebookFilePath));

				if($ebook->KindleCoverUrl !== null){
					$ebookThumbnailPath = $webRoot . '/' . $ebook->KindleCoverUrl;
					$zip->addFile($ebookThumbnailPath, $folderName . '/' . basename($ebookThumbnailPath));
				}
			}
			elseif($type == 'kepub' && $ebook->KepubUrl !== null){
				$ebookFilePath = $webRoot . '/' . $ebook->KepubUrl;
				$zip->addFile($ebookFilePath, basename($ebookFilePath));
			}
			elseif($type == 'epub-advanced' && $ebook->AdvancedEpubUrl !== null){
				$ebookFilePath = $webRoot . '/' . $ebook->AdvancedEpubUrl;
				$zip->addFile($ebookFilePath, basename($ebookFilePath));
			}
			elseif($type == 'xhtml' && $ebook->TextSinglePageUrl !== null){
				$ebookFilePath = $webRoot . '/' . $ebook->TextSinglePageUrl . '.xhtml';

				// Strip the navigation header that was added as part of the deploy process
				$xhtml = file_get_contents($ebookFilePath);
				$xhtml = preg_replace('|<body><header><nav>.+?</nav></header>|ius', '<body>', $xhtml);

				// Name the entry after the ebook instead of the generic `single-page` file name.
				$zip->addFromString(str_replace('single-page', $ebook->UrlSafeIdentifier, basename($ebookFilePath)), $xhtml);
			}
		}

		// The queued files are only read when the archive is closed, so a missing source file surfaces here.
		$isZipOpen = false;
		if(!$zip->close()){
			print('Can\'t write file: ' . $tempFilename . "\n");
			return;
		}

		// libzip removes the file instead of writing an archive with no entries, in which case there's nothing to copy.
		if(!is_file($tempFilename)){
			print('No files to add to ' . $filePath . "\n");
			return;
		}

		// We have to do a copy, then unlink because `rename()` can't rename across file systems.
		// If the bulk downloads are symlinked to a storage volume, then `rename()` won't work.
		copy($tempFilename, $filePath);

		exec('attr -q -s se-ebook-type -V ' . escapeshellarg($type) . ' ' . escapeshellarg($filePath));
	}
	finally{
		if($isZipOpen){
			// We were interrupted before closing; drop the pending entries so that closing doesn't try to write them.
			$zip->unchangeAll();
			$zip->close();
		}

		if(file_exists($tempFilename)){
			unlink($tempFilename);
		}
	}
}
// Iterate over all ebooks and file each one under its month, its subjects, its collections, and its author.
// Each entry is a `stdClass` holding `Label`, `LabelSort`, `UrlLabel`, `Updated` (the newest update timestamp among its ebooks), and `Ebooks`.
foreach(Ebook::GetAll() as $ebook){
	// Skip placeholders, and ebooks that are missing either of their dates.
	if($ebook->IsPlaceholder() || $ebook->EbookCreated === null || $ebook->EbookUpdated === null){
		continue;
	}

	$monthKey = $ebook->EbookCreated->format('Y-m');
	$lastUpdated = $ebook->EbookUpdated->getTimestamp();

	// Ebooks by month.
	if(isset($ebooksByGroup['months'][$monthKey])){
		$entry = $ebooksByGroup['months'][$monthKey];
		$entry->Ebooks[] = $ebook;
		$entry->Updated = max($entry->Updated, $lastUpdated);
	}
	else{
		$ebooksByGroup['months'][$monthKey] = (object)[
			'Label' => $monthKey,
			'LabelSort' => $monthKey,
			'UrlLabel' => Formatter::MakeUrlSafe($monthKey),
			'Updated' => $lastUpdated,
			'Ebooks' => [$ebook],
		];
	}

	// Ebooks by subject; an ebook is filed under each of its tags.
	foreach($ebook->Tags as $tag){
		$subjectKey = $tag->Name;

		if(isset($ebooksByGroup['subjects'][$subjectKey])){
			$entry = $ebooksByGroup['subjects'][$subjectKey];
			$entry->Ebooks[] = $ebook;
			$entry->Updated = max($entry->Updated, $lastUpdated);
		}
		else{
			$ebooksByGroup['subjects'][$subjectKey] = (object)[
				'Label' => $subjectKey,
				'LabelSort' => $subjectKey,
				'UrlLabel' => Formatter::MakeUrlSafe($subjectKey),
				'Updated' => $lastUpdated,
				'Ebooks' => [$ebook],
			];
		}
	}

	// Ebooks by collection; an ebook is filed under each collection it's a member of.
	foreach($ebook->CollectionMemberships as $membership){
		$memberOf = $membership->Collection;
		$collectionKey = $memberOf->Name;

		if(isset($ebooksByGroup['collections'][$collectionKey])){
			$entry = $ebooksByGroup['collections'][$collectionKey];
			$entry->Ebooks[] = $ebook;
			$entry->Updated = max($entry->Updated, $lastUpdated);
		}
		else{
			$ebooksByGroup['collections'][$collectionKey] = (object)[
				'Label' => $collectionKey,
				'LabelSort' => $memberOf->GetSortedName(),
				'UrlLabel' => Formatter::MakeUrlSafe($collectionKey),
				'Updated' => $lastUpdated,
				'Ebooks' => [$ebook],
			];
		}
	}

	// Ebooks by author.
	// We have to index by `UrlName` for cases like `Samuel Butler` whose `UrlName` is `samuel-butler-1612-1680`.
	$authorKey = preg_replace('|^/ebooks/|', '', $ebook->AuthorsUrl);

	if(isset($ebooksByGroup['authors'][$authorKey])){
		$entry = $ebooksByGroup['authors'][$authorKey];
		$entry->Ebooks[] = $ebook;
		$entry->Updated = max($entry->Updated, $lastUpdated);
	}
	else{
		$ebooksByGroup['authors'][$authorKey] = (object)[
			'Label' => strip_tags($ebook->AuthorsHtml),
			'LabelSort' => $ebook->Authors[0]->SortName,
			'UrlLabel' => $authorKey,
			'Updated' => $lastUpdated,
			'Ebooks' => [$ebook],
		];
	}
}
foreach($groups as $group){
	$groupDir = $webRoot . '/bulk-downloads/' . $group;

	// Gather the directory names we expect to find for this group.
	$expectedLabels = [];
	foreach($ebooksByGroup[$group] as $entry){
		$expectedLabels[] = $entry->UrlLabel;
	}

	// First delete any orphan directories that we don't expect to be here, for example a collection that was later renamed.
	foreach(glob($groupDir . '/*/') as $existingDir){
		// The comparison is deliberately loose (`==`).
		if(in_array(basename($existingDir), $expectedLabels)){
			continue;
		}

		print('Removing ' . $existingDir . "\n");
		rrmdir($existingDir);
	}

	// Now create the zip files!
	foreach($ebooksByGroup[$group] as $entry){
		$parentDir = $groupDir . '/' . $entry->UrlLabel;

		if(!is_dir($parentDir)){
			mkdir($parentDir, 0775, true);
		}

		// Record the group's metadata on its directory as extended attributes.
		// We also need to save the URL label for author edge cases like `Samuel Butler` -> `samuel-butler-1612-1680` or `Karl Marx and Freidrich Engels` -> `karl-marx_friedrich-engels`.
		$attributes = [
			'se-ebook-count' => (string)sizeof($entry->Ebooks),
			'se-label' => $entry->Label,
			'se-label-sort' => $entry->LabelSort,
			'se-url-label' => $entry->UrlLabel
		];

		foreach($attributes as $attributeName => $attributeValue){
			exec('attr -q -s ' . $attributeName . ' -V ' . escapeshellarg($attributeValue) . ' ' . escapeshellarg($parentDir));
		}

		foreach($types as $type){
			$zipPath = $parentDir . '/se-ebooks-' . $entry->UrlLabel . '-' . $type . '.zip';

			// Only rebuild if the file doesn't exist, or if it's older than the entry's `Updated` timestamp.
			$isUpToDate = file_exists($zipPath) && filemtime($zipPath) >= $entry->Updated;
			if($isUpToDate){
				continue;
			}

			print('Creating ' . $zipPath . "\n");
			CreateZip($zipPath, $entry->Ebooks, $type, $webRoot);
		}
	}
}
// Fix up ownership and permissions on everything under the bulk downloads directories.
// This shells out instead of using PHP's built in `chown()`/`chmod()` because `chmod()` can't accept strings.
// The trailing `/*/` is left unquoted so that the shell expands it.
$bulkDownloadDirs = escapeshellarg($webRoot) . '/bulk-downloads/*/';

exec('sudo chown --preserve-root --recursive se:committers ' . $bulkDownloadDirs);

// The `chmod +X` command, with a capital `X`, makes only matched directories executable.
exec('sudo chmod --preserve-root --recursive a+r,ug+w,a+X ' . $bulkDownloadDirs);