Add a Relevance sort order and improve search

Here's what's in `IndexableText` right now:

1. Title
2. Collections
3. Authors
4. Tags
5. LocSubjects
6. TocEntries

Here is the proposed new ranking:

```
10 * Title +
8 * Authors +
3 * Collections +
IndexableText
```

New columns and indices for existing DBs:

```
ALTER TABLE `Ebooks` ADD COLUMN `IndexableAuthors` text NOT NULL;
ALTER TABLE `Ebooks` ADD COLUMN `IndexableCollections` text NULL;
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchTitle` (`Title`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchAuthors` (`IndexableAuthors`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchCollections` (`IndexableCollections`);
```
This commit is contained in:
Mike Colagrosso 2025-01-24 20:42:18 -07:00 committed by Alex Cabal
parent b2df8a7018
commit 1a71913794
8 changed files with 120 additions and 15 deletions

View file

@ -25,8 +25,13 @@ CREATE TABLE IF NOT EXISTS `Ebooks` (
`EbookUpdated` datetime NULL,
`TextSinglePageByteCount` bigint unsigned NULL,
`IndexableText` text NOT NULL,
`IndexableAuthors` text NOT NULL,
`IndexableCollections` text NULL,
PRIMARY KEY (`EbookId`),
UNIQUE KEY `index1` (`Identifier`),
KEY `index2` (`EbookCreated`),
FULLTEXT `idxSearch` (`IndexableText`)
FULLTEXT `idxSearch` (`IndexableText`),
FULLTEXT `idxSearchTitle` (`Title`),
FULLTEXT `idxSearchAuthors` (`IndexableAuthors`),
FULLTEXT `idxSearchCollections` (`IndexableCollections`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

View file

@ -44,6 +44,10 @@ const EBOOKS_MAX_STRING_LENGTH = 250;
const EBOOKS_MAX_LONG_STRING_LENGTH = 500;
const EBOOK_SINGLE_PAGE_SIZE_WARNING = 3 * 1024 * 1024; // 3145728 bytes.
// Multipliers applied to each field's full-text match score when computing the search `relevance_score`; the `IndexableText` match is added with no multiplier.
const EBOOK_SEARCH_WEIGHT_TITLE = 10;
const EBOOK_SEARCH_WEIGHT_AUTHORS = 8;
const EBOOK_SEARCH_WEIGHT_COLLECTIONS = 3;
const ARTWORK_THUMBNAIL_HEIGHT = 350;
const ARTWORK_THUMBNAIL_WIDTH = 350;
const ARTWORK_PER_PAGE = 20;

View file

@ -44,6 +44,8 @@ use function Safe\shell_exec;
* @property string $TextSinglePageUrl
* @property string $TextSinglePageSizeFormatted
* @property string $IndexableText
* @property string $IndexableAuthors
* @property ?string $IndexableCollections
* @property ?EbookPlaceholder $EbookPlaceholder
* @property array<Project> $Projects
* @property array<Project> $PastProjects
@ -128,6 +130,8 @@ final class Ebook{
protected string $_TextSinglePageUrl;
protected string $_TextSinglePageSizeFormatted;
protected string $_IndexableText;
protected string $_IndexableAuthors;
protected ?string $_IndexableCollections = null;
protected ?EbookPlaceholder $_EbookPlaceholder = null;
/** @var array<Project> $_Projects */
protected array $_Projects;
@ -732,13 +736,40 @@ final class Ebook{
}
}
// Remove diacritics and non-alphanumeric characters.
$this->_IndexableText = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($this->_IndexableText)));
$this->_IndexableText = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableText);
}
return $this->_IndexableText;
}
/**
 * Lazily build the searchable author string for this ebook.
 *
 * On first access, the name of every author is appended (each preceded by a space), the result is passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`, and the value is cached for later calls.
 */
protected function GetIndexableAuthors(): string{
	// Already computed; reuse the cached value.
	if(isset($this->_IndexableAuthors)){
		return $this->_IndexableAuthors;
	}

	$this->_IndexableAuthors = '';

	foreach($this->Authors as $contributor){
		$this->_IndexableAuthors = $this->_IndexableAuthors . ' ' . $contributor->Name;
	}

	$this->_IndexableAuthors = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableAuthors);

	return $this->_IndexableAuthors;
}
/**
 * Lazily build the searchable collection string for this ebook.
 *
 * On first access, the name of every collection this ebook belongs to is appended (each preceded by a space) and the result is passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`.
 *
 * @return ?string The normalized collection names, or `null` if the ebook has no collection memberships.
 */
protected function GetIndexableCollections(): ?string{
	// Already computed; reuse the cached value.
	if(isset($this->_IndexableCollections)){
		return $this->_IndexableCollections;
	}

	foreach($this->CollectionMemberships as $membership){
		$this->_IndexableCollections .= ' ' . $membership->Collection->Name;
	}

	// Only normalize if at least one collection name was appended; otherwise the value stays `null`.
	if(isset($this->_IndexableCollections)){
		$this->_IndexableCollections = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableCollections);
	}

	return $this->_IndexableCollections;
}
protected function GetEbookPlaceholder(): ?EbookPlaceholder{
if(!isset($this->_EbookPlaceholder)){
if(!isset($this->EbookId)){
@ -1561,6 +1592,22 @@ final class Ebook{
$error->Add(new Exceptions\EbookIndexableTextRequiredException());
}
if(isset($this->IndexableAuthors)){
$this->IndexableAuthors = trim($this->IndexableAuthors ?? '');
if($this->IndexableAuthors == ''){
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
}
}
else{
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
}
$this->IndexableCollections = trim($this->IndexableCollections ?? '');
if($this->IndexableCollections == ''){
$this->IndexableCollections = null;
}
if(isset($this->EbookPlaceholder)){
try{
$this->EbookPlaceholder->Validate();
@ -1897,7 +1944,8 @@ final class Ebook{
INSERT into Ebooks (Identifier, WwwFilesystemPath, RepoFilesystemPath, KindleCoverUrl, EpubUrl,
AdvancedEpubUrl, KepubUrl, Azw3Url, DistCoverUrl, Title, FullTitle, AlternateTitle,
Description, LongDescription, Language, WordCount, ReadingEase, GitHubUrl, WikipediaUrl,
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText)
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText, IndexableAuthors,
IndexableCollections)
values (?,
?,
?,
@ -1920,12 +1968,15 @@ final class Ebook{
?,
?,
?,
?,
?,
?)
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
$this->AdvancedEpubUrl, $this->KepubUrl, $this->Azw3Url, $this->DistCoverUrl, $this->Title,
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText]);
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
$this->IndexableAuthors, $this->IndexableCollections]);
$this->EbookId = Db::GetLastInsertedId();
@ -1990,7 +2041,9 @@ final class Ebook{
EbookCreated = ?,
EbookUpdated = ?,
TextSinglePageByteCount = ?,
IndexableText = ?
IndexableText = ?,
IndexableAuthors = ?,
IndexableCollections = ?
where
EbookId = ?
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
@ -1998,6 +2051,7 @@ final class Ebook{
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
$this->IndexableAuthors, $this->IndexableCollections,
$this->EbookId]);
}
catch(Exceptions\DuplicateDatabaseKeyException){
@ -2380,6 +2434,7 @@ final class Ebook{
public static function GetAllByFilter(string $query = null, array $tags = [], Enums\EbookSortType $sort = null, int $page = 1, int $perPage = EBOOKS_PER_PAGE, Enums\EbookReleaseStatusFilter $releaseStatusFilter = Enums\EbookReleaseStatusFilter::All): array{
$limit = $perPage;
$offset = (($page - 1) * $perPage);
$relevanceScoreField = '';
$joinContributors = '';
$joinTags = '';
$params = [];
@ -2424,10 +2479,22 @@ final class Ebook{
}
if($query !== null && $query != ''){
$query = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($query)));
$query = sprintf('"%s"', $query); // Require an exact match via double quotes.
$whereCondition .= ' and match(e.IndexableText) against(? in boolean mode) ';
// Preserve quotes in the query so the user can enter, e.g., "war and peace" for an exact match.
$query = trim(preg_replace('|[^a-zA-Z0-9" ]|ius', ' ', Formatter::RemoveDiacritics($query)));
$relevanceScoreField = ', (
match(e.Title) against (?) * ' . EBOOK_SEARCH_WEIGHT_TITLE . ' +
match(e.IndexableAuthors) against (?) * ' . EBOOK_SEARCH_WEIGHT_AUTHORS . ' +
match(e.IndexableCollections) against (?) * ' . EBOOK_SEARCH_WEIGHT_COLLECTIONS . ' +
match(e.IndexableText) against (?)
) as relevance_score ';
$whereCondition .= ' and match(e.IndexableText) against(?) ';
$params[] = $query;
if($sort == null || $sort == Enums\EbookSortType::Relevance || $sort == Enums\EbookSortType::Newest){
$orderBy = 'relevance_score desc, e.EbookCreated desc';
}
}
try{
@ -2439,11 +2506,17 @@ final class Ebook{
' . $whereCondition . '
', $params);
if($relevanceScoreField != ''){
// `relevance_score` is at the beginning of the query, so these params must go at the start of the array.
array_unshift($params, $query, $query, $query, $query);
}
$params[] = $limit;
$params[] = $offset;
$ebooks = Db::Query('
SELECT distinct e.*
' . $relevanceScoreField . '
from Ebooks e
' . $joinContributors . '
' . $joinTags . '

View file

@ -6,4 +6,5 @@ enum EbookSortType: string{
case AuthorAlpha = 'author-alpha';
case ReadingEase = 'reading-ease';
case Length = 'length';
case Relevance = 'relevance';
}

View file

@ -0,0 +1,7 @@
<?
namespace Exceptions;
/**
 * Signals that an ebook's `IndexableAuthors` value is missing or empty.
 */
class EbookIndexableAuthorsRequiredException extends AppException{
/** @var string $message The fixed error message carried by this exception. */
protected $message = 'Ebook IndexableAuthors required.';
}

View file

@ -31,6 +31,13 @@ class Formatter{
}
}
/**
 * Normalize text for search indexing and querying.
 *
 * The text is first passed through `Formatter::RemoveDiacritics()`; then every character that is not an ASCII letter, digit, or space is replaced with a space, and leading and trailing whitespace is trimmed.
 */
public static function RemoveDiacriticsAndNonalphanumerics(string $text): string{
	$withoutDiacritics = Formatter::RemoveDiacritics($text);
	$alphanumericsOnly = preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', $withoutDiacritics);

	return trim($alphanumericsOnly);
}
/**
* Escape a string so that it's appropriate to use in a URL slug.
*

View file

@ -18,13 +18,17 @@ $isAllSelected = sizeof($tags) == 0 || in_array('all', $tags);
</select>
</label>
<label>Keywords
<input type="search" name="query" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
<input type="search" name="query" placeholder="Quotes allowed, e.g., &quot;war and peace&quot;" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
</label>
<label class="sort">
<span>Sort</span>
<span>
<select name="sort">
<? if(isset($query) && $query != ''){ ?>
<option value="<?= Enums\EbookSortType::Relevance->value ?>"<? if($sort == Enums\EbookSortType::Relevance){ ?> selected="selected"<? } ?>>Relevance</option>
<? }else{ ?>
<option value="<?= Enums\EbookSortType::Newest->value ?>"<? if($sort == Enums\EbookSortType::Newest){ ?> selected="selected"<? } ?>>S.E. release date (new &#x2192; old)</option>
<? } ?>
<option value="<?= Enums\EbookSortType::AuthorAlpha->value ?>"<? if($sort == Enums\EbookSortType::AuthorAlpha){ ?> selected="selected"<? } ?>>Author name (a &#x2192; z)</option>
<option value="<?= Enums\EbookSortType::ReadingEase->value ?>"<? if($sort == Enums\EbookSortType::ReadingEase){ ?> selected="selected"<? } ?>>Reading ease (easy &#x2192; hard)</option>
<option value="<?= Enums\EbookSortType::Length->value ?>"<? if($sort == Enums\EbookSortType::Length){ ?> selected="selected"<? } ?>>Length (short &#x2192; long)</option>

View file

@ -28,6 +28,14 @@ try{
$view = null;
}
if($query != ''){
$queryStringParams['query'] = $query;
// If the user entered a query with the default sort order, change it to relevance sort.
if($sort == Enums\EbookSortType::Newest){
$sort = Enums\EbookSortType::Relevance;
}
}
if($sort == Enums\EbookSortType::Newest){
$sort = null;
}
@ -38,10 +46,6 @@ try{
$pageDescription = 'Page ' . $page . ' of the Standard Ebooks free ebook library';
if($query != ''){
$queryStringParams['query'] = $query;
}
if(sizeof($tags) > 0){
$queryStringParams['tags'] = $tags;
}