mirror of
https://github.com/standardebooks/web.git
synced 2025-07-09 16:20:27 -04:00
Add a Relevance
sort order and improve search
Here's what's in `IndexableText` right now:

1. Title
2. Collections
3. Authors
4. Tags
5. LocSubjects
6. TocEntries

Here is the proposed new ranking:

```
10 * Title + 8 * Authors + 3 * Collections + IndexableText
```

New columns and indices for existing DBs:

```
ALTER TABLE `Ebooks` ADD COLUMN `IndexableAuthors` text NOT NULL;
ALTER TABLE `Ebooks` ADD COLUMN `IndexableCollections` text NULL;
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchTitle` (`Title`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchAuthors` (`IndexableAuthors`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchCollections` (`IndexableCollections`);
```
This commit is contained in:
parent
b2df8a7018
commit
1a71913794
8 changed files with 120 additions and 15 deletions
|
@ -25,8 +25,13 @@ CREATE TABLE IF NOT EXISTS `Ebooks` (
|
|||
`EbookUpdated` datetime NULL,
|
||||
`TextSinglePageByteCount` bigint unsigned NULL,
|
||||
`IndexableText` text NOT NULL,
|
||||
`IndexableAuthors` text NOT NULL,
|
||||
`IndexableCollections` text NULL,
|
||||
PRIMARY KEY (`EbookId`),
|
||||
UNIQUE KEY `index1` (`Identifier`),
|
||||
KEY `index2` (`EbookCreated`),
|
||||
FULLTEXT `idxSearch` (`IndexableText`)
|
||||
FULLTEXT `idxSearch` (`IndexableText`),
|
||||
FULLTEXT `idxSearchTitle` (`Title`),
|
||||
FULLTEXT `idxSearchAuthors` (`IndexableAuthors`),
|
||||
FULLTEXT `idxSearchCollections` (`IndexableCollections`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
|
|
@ -44,6 +44,10 @@ const EBOOKS_MAX_STRING_LENGTH = 250;
|
|||
const EBOOKS_MAX_LONG_STRING_LENGTH = 500;
|
||||
const EBOOK_SINGLE_PAGE_SIZE_WARNING = 3 * 1024 * 1024; // 3145728 bytes.
|
||||
|
||||
const EBOOK_SEARCH_WEIGHT_TITLE = 10;
|
||||
const EBOOK_SEARCH_WEIGHT_AUTHORS = 8;
|
||||
const EBOOK_SEARCH_WEIGHT_COLLECTIONS = 3;
|
||||
|
||||
const ARTWORK_THUMBNAIL_HEIGHT = 350;
|
||||
const ARTWORK_THUMBNAIL_WIDTH = 350;
|
||||
const ARTWORK_PER_PAGE = 20;
|
||||
|
|
|
@ -44,6 +44,8 @@ use function Safe\shell_exec;
|
|||
* @property string $TextSinglePageUrl
|
||||
* @property string $TextSinglePageSizeFormatted
|
||||
* @property string $IndexableText
|
||||
* @property string $IndexableAuthors
|
||||
* @property ?string $IndexableCollections
|
||||
* @property ?EbookPlaceholder $EbookPlaceholder
|
||||
* @property array<Project> $Projects
|
||||
* @property array<Project> $PastProjects
|
||||
|
@ -128,6 +130,8 @@ final class Ebook{
|
|||
protected string $_TextSinglePageUrl;
|
||||
protected string $_TextSinglePageSizeFormatted;
|
||||
protected string $_IndexableText;
|
||||
protected string $_IndexableAuthors;
|
||||
protected ?string $_IndexableCollections = null;
|
||||
protected ?EbookPlaceholder $_EbookPlaceholder = null;
|
||||
/** @var array<Project> $_Projects */
|
||||
protected array $_Projects;
|
||||
|
@ -732,13 +736,40 @@ final class Ebook{
|
|||
}
|
||||
}
|
||||
|
||||
// Remove diacritics and non-alphanumeric characters.
|
||||
$this->_IndexableText = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($this->_IndexableText)));
|
||||
$this->_IndexableText = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableText);
|
||||
}
|
||||
|
||||
return $this->_IndexableText;
|
||||
}
|
||||
|
||||
/**
 * Lazily build and cache the searchable author text for this ebook.
 *
 * The value is every author's `Name` joined with spaces and passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`. An ebook with no authors yields an empty string.
 *
 * @return string The normalized, space-separated author names.
 */
protected function GetIndexableAuthors(): string{
	// Already computed (or loaded); reuse the cached value.
	if(isset($this->_IndexableAuthors)){
		return $this->_IndexableAuthors;
	}

	$authorNames = [];
	foreach($this->Authors as $author){
		$authorNames[] = $author->Name;
	}

	// The formatter trims its result, so joining with single spaces gives the same text as prefixing each name with a space.
	$this->_IndexableAuthors = Formatter::RemoveDiacriticsAndNonalphanumerics(implode(' ', $authorNames));

	return $this->_IndexableAuthors;
}
|
||||
|
||||
/**
 * Lazily build and cache the searchable collection text for this ebook.
 *
 * The value is the `Name` of the collection behind each collection membership, joined with spaces and passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`.
 *
 * @return ?string The normalized, space-separated collection names, or `null` if the ebook has no collection memberships.
 */
protected function GetIndexableCollections(): ?string{
	// Already computed (or loaded); reuse the cached value.
	if(isset($this->_IndexableCollections)){
		return $this->_IndexableCollections;
	}

	$collectionNames = [];
	foreach($this->CollectionMemberships as $collectionMembership){
		$collectionNames[] = $collectionMembership->Collection->Name;
	}

	// Only assign when there is at least one membership, so that an ebook without collections keeps `null`.
	if(sizeof($collectionNames) > 0){
		$this->_IndexableCollections = Formatter::RemoveDiacriticsAndNonalphanumerics(implode(' ', $collectionNames));
	}

	return $this->_IndexableCollections;
}
|
||||
|
||||
protected function GetEbookPlaceholder(): ?EbookPlaceholder{
|
||||
if(!isset($this->_EbookPlaceholder)){
|
||||
if(!isset($this->EbookId)){
|
||||
|
@ -1561,6 +1592,22 @@ final class Ebook{
|
|||
$error->Add(new Exceptions\EbookIndexableTextRequiredException());
|
||||
}
|
||||
|
||||
if(isset($this->IndexableAuthors)){
|
||||
$this->IndexableAuthors = trim($this->IndexableAuthors ?? '');
|
||||
|
||||
if($this->IndexableAuthors == ''){
|
||||
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
|
||||
}
|
||||
}
|
||||
else{
|
||||
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
|
||||
}
|
||||
|
||||
$this->IndexableCollections = trim($this->IndexableCollections ?? '');
|
||||
if($this->IndexableCollections == ''){
|
||||
$this->IndexableCollections = null;
|
||||
}
|
||||
|
||||
if(isset($this->EbookPlaceholder)){
|
||||
try{
|
||||
$this->EbookPlaceholder->Validate();
|
||||
|
@ -1897,7 +1944,8 @@ final class Ebook{
|
|||
INSERT into Ebooks (Identifier, WwwFilesystemPath, RepoFilesystemPath, KindleCoverUrl, EpubUrl,
|
||||
AdvancedEpubUrl, KepubUrl, Azw3Url, DistCoverUrl, Title, FullTitle, AlternateTitle,
|
||||
Description, LongDescription, Language, WordCount, ReadingEase, GitHubUrl, WikipediaUrl,
|
||||
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText)
|
||||
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText, IndexableAuthors,
|
||||
IndexableCollections)
|
||||
values (?,
|
||||
?,
|
||||
?,
|
||||
|
@ -1920,12 +1968,15 @@ final class Ebook{
|
|||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?,
|
||||
?)
|
||||
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
||||
$this->AdvancedEpubUrl, $this->KepubUrl, $this->Azw3Url, $this->DistCoverUrl, $this->Title,
|
||||
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
||||
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
||||
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText]);
|
||||
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
|
||||
$this->IndexableAuthors, $this->IndexableCollections]);
|
||||
|
||||
$this->EbookId = Db::GetLastInsertedId();
|
||||
|
||||
|
@ -1990,7 +2041,9 @@ final class Ebook{
|
|||
EbookCreated = ?,
|
||||
EbookUpdated = ?,
|
||||
TextSinglePageByteCount = ?,
|
||||
IndexableText = ?
|
||||
IndexableText = ?,
|
||||
IndexableAuthors = ?,
|
||||
IndexableCollections = ?
|
||||
where
|
||||
EbookId = ?
|
||||
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
||||
|
@ -1998,6 +2051,7 @@ final class Ebook{
|
|||
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
||||
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
||||
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
|
||||
$this->IndexableAuthors, $this->IndexableCollections,
|
||||
$this->EbookId]);
|
||||
}
|
||||
catch(Exceptions\DuplicateDatabaseKeyException){
|
||||
|
@ -2380,6 +2434,7 @@ final class Ebook{
|
|||
public static function GetAllByFilter(string $query = null, array $tags = [], Enums\EbookSortType $sort = null, int $page = 1, int $perPage = EBOOKS_PER_PAGE, Enums\EbookReleaseStatusFilter $releaseStatusFilter = Enums\EbookReleaseStatusFilter::All): array{
|
||||
$limit = $perPage;
|
||||
$offset = (($page - 1) * $perPage);
|
||||
$relevanceScoreField = '';
|
||||
$joinContributors = '';
|
||||
$joinTags = '';
|
||||
$params = [];
|
||||
|
@ -2424,10 +2479,22 @@ final class Ebook{
|
|||
}
|
||||
|
||||
if($query !== null && $query != ''){
|
||||
$query = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($query)));
|
||||
$query = sprintf('"%s"', $query); // Require an exact match via double quotes.
|
||||
$whereCondition .= ' and match(e.IndexableText) against(? in boolean mode) ';
|
||||
// Preserve quotes in the query so the user can enter, e.g., "war and peace" for an exact match.
|
||||
$query = trim(preg_replace('|[^a-zA-Z0-9" ]|ius', ' ', Formatter::RemoveDiacritics($query)));
|
||||
|
||||
$relevanceScoreField = ', (
|
||||
match(e.Title) against (?) * ' . EBOOK_SEARCH_WEIGHT_TITLE . ' +
|
||||
match(e.IndexableAuthors) against (?) * ' . EBOOK_SEARCH_WEIGHT_AUTHORS . ' +
|
||||
match(e.IndexableCollections) against (?) * ' . EBOOK_SEARCH_WEIGHT_COLLECTIONS . ' +
|
||||
match(e.IndexableText) against (?)
|
||||
) as relevance_score ';
|
||||
|
||||
$whereCondition .= ' and match(e.IndexableText) against(?) ';
|
||||
$params[] = $query;
|
||||
|
||||
if($sort == null || $sort == Enums\EbookSortType::Relevance || $sort == Enums\EbookSortType::Newest){
|
||||
$orderBy = 'relevance_score desc, e.EbookCreated desc';
|
||||
}
|
||||
}
|
||||
|
||||
try{
|
||||
|
@ -2439,11 +2506,17 @@ final class Ebook{
|
|||
' . $whereCondition . '
|
||||
', $params);
|
||||
|
||||
if($relevanceScoreField != ''){
|
||||
// `relevance_score` is at the beginning of the query, so these params must go at the start of the array.
|
||||
array_unshift($params, $query, $query, $query, $query);
|
||||
}
|
||||
|
||||
$params[] = $limit;
|
||||
$params[] = $offset;
|
||||
|
||||
$ebooks = Db::Query('
|
||||
SELECT distinct e.*
|
||||
' . $relevanceScoreField . '
|
||||
from Ebooks e
|
||||
' . $joinContributors . '
|
||||
' . $joinTags . '
|
||||
|
|
|
@ -6,4 +6,5 @@ enum EbookSortType: string{
|
|||
case AuthorAlpha = 'author-alpha';
|
||||
case ReadingEase = 'reading-ease';
|
||||
case Length = 'length';
|
||||
case Relevance = 'relevance';
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
/**
 * Validation error reported when an ebook's `IndexableAuthors` value is missing, or is an empty string after trimming.
 */
class EbookIndexableAuthorsRequiredException extends AppException{
|
||||
/** @var string $message */
|
||||
protected $message = 'Ebook IndexableAuthors required.';
|
||||
}
|
|
@ -31,6 +31,13 @@ class Formatter{
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Normalize text for full-text indexing and searching.
 *
 * Diacritics are removed first via `Formatter::RemoveDiacritics()`. Each remaining character that does not match `[a-zA-Z0-9 ]` is then replaced with a single space, and leading and trailing whitespace is trimmed. Runs of spaces inside the text are not collapsed.
 *
 * @param string $text The text to normalize.
 *
 * @return string The normalized text.
 */
public static function RemoveDiacriticsAndNonalphanumerics(string $text): string{
	$withoutDiacritics = Formatter::RemoveDiacritics($text);
	$alphanumericsOnly = preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', $withoutDiacritics);

	return trim($alphanumericsOnly);
}
|
||||
|
||||
/**
|
||||
* Escape a string so that it's appropriate to use in a URL slug.
|
||||
*
|
||||
|
|
|
@ -18,13 +18,17 @@ $isAllSelected = sizeof($tags) == 0 || in_array('all', $tags);
|
|||
</select>
|
||||
</label>
|
||||
<label>Keywords
|
||||
<input type="search" name="query" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
|
||||
<input type="search" name="query" placeholder="Quotes allowed, e.g., &quot;war and peace&quot;" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
|
||||
</label>
|
||||
<label class="sort">
|
||||
<span>Sort</span>
|
||||
<span>
|
||||
<select name="sort">
|
||||
<? if(isset($query) && $query != ''){ ?>
|
||||
<option value="<?= Enums\EbookSortType::Relevance->value ?>"<? if($sort == Enums\EbookSortType::Relevance){ ?> selected="selected"<? } ?>>Relevance</option>
|
||||
<? }else{ ?>
|
||||
<option value="<?= Enums\EbookSortType::Newest->value ?>"<? if($sort == Enums\EbookSortType::Newest){ ?> selected="selected"<? } ?>>S.E. release date (new → old)</option>
|
||||
<? } ?>
|
||||
<option value="<?= Enums\EbookSortType::AuthorAlpha->value ?>"<? if($sort == Enums\EbookSortType::AuthorAlpha){ ?> selected="selected"<? } ?>>Author name (a → z)</option>
|
||||
<option value="<?= Enums\EbookSortType::ReadingEase->value ?>"<? if($sort == Enums\EbookSortType::ReadingEase){ ?> selected="selected"<? } ?>>Reading ease (easy → hard)</option>
|
||||
<option value="<?= Enums\EbookSortType::Length->value ?>"<? if($sort == Enums\EbookSortType::Length){ ?> selected="selected"<? } ?>>Length (short → long)</option>
|
||||
|
|
|
@ -28,6 +28,14 @@ try{
|
|||
$view = null;
|
||||
}
|
||||
|
||||
if($query != ''){
|
||||
$queryStringParams['query'] = $query;
|
||||
// If the user entered a query with the default sort order, change it to relevance sort.
|
||||
if($sort == Enums\EbookSortType::Newest){
|
||||
$sort = Enums\EbookSortType::Relevance;
|
||||
}
|
||||
}
|
||||
|
||||
if($sort == Enums\EbookSortType::Newest){
|
||||
$sort = null;
|
||||
}
|
||||
|
@ -38,10 +46,6 @@ try{
|
|||
|
||||
$pageDescription = 'Page ' . $page . ' of the Standard Ebooks free ebook library';
|
||||
|
||||
if($query != ''){
|
||||
$queryStringParams['query'] = $query;
|
||||
}
|
||||
|
||||
if(sizeof($tags) > 0){
|
||||
$queryStringParams['tags'] = $tags;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue