mirror of
https://github.com/standardebooks/web.git
synced 2025-07-09 16:20:27 -04:00
Add a Relevance sort order and improve search
Here's what's in `IndexableText` right now:

1. Title
2. Collections
3. Authors
4. Tags
5. LocSubjects
6. TocEntries

Here is the proposed new ranking:

```
10 * Title + 8 * Authors + 3 * Collections + IndexableText
```

New columns and indices for existing DBs:

```
ALTER TABLE `Ebooks` ADD COLUMN `IndexableAuthors` text NOT NULL;
ALTER TABLE `Ebooks` ADD COLUMN `IndexableCollections` text NULL;
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchTitle` (`Title`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchAuthors` (`IndexableAuthors`);
ALTER TABLE `Ebooks` ADD FULLTEXT `idxSearchCollections` (`IndexableCollections`);
```
This commit is contained in:
parent
b2df8a7018
commit
1a71913794
8 changed files with 120 additions and 15 deletions
|
@ -25,8 +25,13 @@ CREATE TABLE IF NOT EXISTS `Ebooks` (
|
||||||
`EbookUpdated` datetime NULL,
|
`EbookUpdated` datetime NULL,
|
||||||
`TextSinglePageByteCount` bigint unsigned NULL,
|
`TextSinglePageByteCount` bigint unsigned NULL,
|
||||||
`IndexableText` text NOT NULL,
|
`IndexableText` text NOT NULL,
|
||||||
|
`IndexableAuthors` text NOT NULL,
|
||||||
|
`IndexableCollections` text NULL,
|
||||||
PRIMARY KEY (`EbookId`),
|
PRIMARY KEY (`EbookId`),
|
||||||
UNIQUE KEY `index1` (`Identifier`),
|
UNIQUE KEY `index1` (`Identifier`),
|
||||||
KEY `index2` (`EbookCreated`),
|
KEY `index2` (`EbookCreated`),
|
||||||
FULLTEXT `idxSearch` (`IndexableText`)
|
FULLTEXT `idxSearch` (`IndexableText`),
|
||||||
|
FULLTEXT `idxSearchTitle` (`Title`),
|
||||||
|
FULLTEXT `idxSearchAuthors` (`IndexableAuthors`),
|
||||||
|
FULLTEXT `idxSearchCollections` (`IndexableCollections`)
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||||
|
|
|
@ -44,6 +44,10 @@ const EBOOKS_MAX_STRING_LENGTH = 250;
|
||||||
const EBOOKS_MAX_LONG_STRING_LENGTH = 500;
|
const EBOOKS_MAX_LONG_STRING_LENGTH = 500;
|
||||||
const EBOOK_SINGLE_PAGE_SIZE_WARNING = 3 * 1024 * 1024; // 3145728 bytes.
|
const EBOOK_SINGLE_PAGE_SIZE_WARNING = 3 * 1024 * 1024; // 3145728 bytes.
|
||||||
|
|
||||||
|
const EBOOK_SEARCH_WEIGHT_TITLE = 10;
|
||||||
|
const EBOOK_SEARCH_WEIGHT_AUTHORS = 8;
|
||||||
|
const EBOOK_SEARCH_WEIGHT_COLLECTIONS = 3;
|
||||||
|
|
||||||
const ARTWORK_THUMBNAIL_HEIGHT = 350;
|
const ARTWORK_THUMBNAIL_HEIGHT = 350;
|
||||||
const ARTWORK_THUMBNAIL_WIDTH = 350;
|
const ARTWORK_THUMBNAIL_WIDTH = 350;
|
||||||
const ARTWORK_PER_PAGE = 20;
|
const ARTWORK_PER_PAGE = 20;
|
||||||
|
|
|
@ -44,6 +44,8 @@ use function Safe\shell_exec;
|
||||||
* @property string $TextSinglePageUrl
|
* @property string $TextSinglePageUrl
|
||||||
* @property string $TextSinglePageSizeFormatted
|
* @property string $TextSinglePageSizeFormatted
|
||||||
* @property string $IndexableText
|
* @property string $IndexableText
|
||||||
|
* @property string $IndexableAuthors
|
||||||
|
* @property ?string $IndexableCollections
|
||||||
* @property ?EbookPlaceholder $EbookPlaceholder
|
* @property ?EbookPlaceholder $EbookPlaceholder
|
||||||
* @property array<Project> $Projects
|
* @property array<Project> $Projects
|
||||||
* @property array<Project> $PastProjects
|
* @property array<Project> $PastProjects
|
||||||
|
@ -128,6 +130,8 @@ final class Ebook{
|
||||||
protected string $_TextSinglePageUrl;
|
protected string $_TextSinglePageUrl;
|
||||||
protected string $_TextSinglePageSizeFormatted;
|
protected string $_TextSinglePageSizeFormatted;
|
||||||
protected string $_IndexableText;
|
protected string $_IndexableText;
|
||||||
|
protected string $_IndexableAuthors;
|
||||||
|
protected ?string $_IndexableCollections = null;
|
||||||
protected ?EbookPlaceholder $_EbookPlaceholder = null;
|
protected ?EbookPlaceholder $_EbookPlaceholder = null;
|
||||||
/** @var array<Project> $_Projects */
|
/** @var array<Project> $_Projects */
|
||||||
protected array $_Projects;
|
protected array $_Projects;
|
||||||
|
@ -732,13 +736,40 @@ final class Ebook{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove diacritics and non-alphanumeric characters.
|
$this->_IndexableText = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableText);
|
||||||
$this->_IndexableText = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($this->_IndexableText)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->_IndexableText;
|
return $this->_IndexableText;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lazily build and cache the author text used for search indexing.
 *
 * The `Name` of every entry in `$this->Authors` is concatenated, each preceded by a space, and the result is passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`.
 */
protected function GetIndexableAuthors(): string{
	// Already computed (or loaded), so reuse the cached value.
	if(isset($this->_IndexableAuthors)){
		return $this->_IndexableAuthors;
	}

	$authorNames = '';

	foreach($this->Authors as $contributor){
		$authorNames = $authorNames . ' ' . $contributor->Name;
	}

	$this->_IndexableAuthors = Formatter::RemoveDiacriticsAndNonalphanumerics($authorNames);

	return $this->_IndexableAuthors;
}
|
||||||
|
|
||||||
|
/**
 * Lazily build and cache the collection text used for search indexing.
 *
 * The `Name` of the `Collection` of every entry in `$this->CollectionMemberships` is concatenated, each preceded by a space, and the result is passed through `Formatter::RemoveDiacriticsAndNonalphanumerics()`.
 *
 * @return ?string The normalized collection names, or `null` if there are no collection memberships.
 */
protected function GetIndexableCollections(): ?string{
	// A non-null value means the text was already computed (or loaded).
	if($this->_IndexableCollections !== null){
		return $this->_IndexableCollections;
	}

	foreach($this->CollectionMemberships as $membership){
		$this->_IndexableCollections = $this->_IndexableCollections . ' ' . $membership->Collection->Name;
	}

	// Only normalize if at least one membership contributed text; otherwise the value stays `null`.
	if($this->_IndexableCollections !== null){
		$this->_IndexableCollections = Formatter::RemoveDiacriticsAndNonalphanumerics($this->_IndexableCollections);
	}

	return $this->_IndexableCollections;
}
|
||||||
|
|
||||||
protected function GetEbookPlaceholder(): ?EbookPlaceholder{
|
protected function GetEbookPlaceholder(): ?EbookPlaceholder{
|
||||||
if(!isset($this->_EbookPlaceholder)){
|
if(!isset($this->_EbookPlaceholder)){
|
||||||
if(!isset($this->EbookId)){
|
if(!isset($this->EbookId)){
|
||||||
|
@ -1561,6 +1592,22 @@ final class Ebook{
|
||||||
$error->Add(new Exceptions\EbookIndexableTextRequiredException());
|
$error->Add(new Exceptions\EbookIndexableTextRequiredException());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(isset($this->IndexableAuthors)){
|
||||||
|
$this->IndexableAuthors = trim($this->IndexableAuthors ?? '');
|
||||||
|
|
||||||
|
if($this->IndexableAuthors == ''){
|
||||||
|
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
$error->Add(new Exceptions\EbookIndexableAuthorsRequiredException());
|
||||||
|
}
|
||||||
|
|
||||||
|
$this->IndexableCollections = trim($this->IndexableCollections ?? '');
|
||||||
|
if($this->IndexableCollections == ''){
|
||||||
|
$this->IndexableCollections = null;
|
||||||
|
}
|
||||||
|
|
||||||
if(isset($this->EbookPlaceholder)){
|
if(isset($this->EbookPlaceholder)){
|
||||||
try{
|
try{
|
||||||
$this->EbookPlaceholder->Validate();
|
$this->EbookPlaceholder->Validate();
|
||||||
|
@ -1897,7 +1944,8 @@ final class Ebook{
|
||||||
INSERT into Ebooks (Identifier, WwwFilesystemPath, RepoFilesystemPath, KindleCoverUrl, EpubUrl,
|
INSERT into Ebooks (Identifier, WwwFilesystemPath, RepoFilesystemPath, KindleCoverUrl, EpubUrl,
|
||||||
AdvancedEpubUrl, KepubUrl, Azw3Url, DistCoverUrl, Title, FullTitle, AlternateTitle,
|
AdvancedEpubUrl, KepubUrl, Azw3Url, DistCoverUrl, Title, FullTitle, AlternateTitle,
|
||||||
Description, LongDescription, Language, WordCount, ReadingEase, GitHubUrl, WikipediaUrl,
|
Description, LongDescription, Language, WordCount, ReadingEase, GitHubUrl, WikipediaUrl,
|
||||||
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText)
|
EbookCreated, EbookUpdated, TextSinglePageByteCount, IndexableText, IndexableAuthors,
|
||||||
|
IndexableCollections)
|
||||||
values (?,
|
values (?,
|
||||||
?,
|
?,
|
||||||
?,
|
?,
|
||||||
|
@ -1920,12 +1968,15 @@ final class Ebook{
|
||||||
?,
|
?,
|
||||||
?,
|
?,
|
||||||
?,
|
?,
|
||||||
|
?,
|
||||||
|
?,
|
||||||
?)
|
?)
|
||||||
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
||||||
$this->AdvancedEpubUrl, $this->KepubUrl, $this->Azw3Url, $this->DistCoverUrl, $this->Title,
|
$this->AdvancedEpubUrl, $this->KepubUrl, $this->Azw3Url, $this->DistCoverUrl, $this->Title,
|
||||||
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
||||||
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
||||||
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText]);
|
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
|
||||||
|
$this->IndexableAuthors, $this->IndexableCollections]);
|
||||||
|
|
||||||
$this->EbookId = Db::GetLastInsertedId();
|
$this->EbookId = Db::GetLastInsertedId();
|
||||||
|
|
||||||
|
@ -1990,7 +2041,9 @@ final class Ebook{
|
||||||
EbookCreated = ?,
|
EbookCreated = ?,
|
||||||
EbookUpdated = ?,
|
EbookUpdated = ?,
|
||||||
TextSinglePageByteCount = ?,
|
TextSinglePageByteCount = ?,
|
||||||
IndexableText = ?
|
IndexableText = ?,
|
||||||
|
IndexableAuthors = ?,
|
||||||
|
IndexableCollections = ?
|
||||||
where
|
where
|
||||||
EbookId = ?
|
EbookId = ?
|
||||||
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
', [$this->Identifier, $this->WwwFilesystemPath, $this->RepoFilesystemPath, $this->KindleCoverUrl, $this->EpubUrl,
|
||||||
|
@ -1998,6 +2051,7 @@ final class Ebook{
|
||||||
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
$this->FullTitle, $this->AlternateTitle, $this->Description, $this->LongDescription,
|
||||||
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
$this->Language, $this->WordCount, $this->ReadingEase, $this->GitHubUrl, $this->WikipediaUrl,
|
||||||
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
|
$this->EbookCreated, $this->EbookUpdated, $this->TextSinglePageByteCount, $this->IndexableText,
|
||||||
|
$this->IndexableAuthors, $this->IndexableCollections,
|
||||||
$this->EbookId]);
|
$this->EbookId]);
|
||||||
}
|
}
|
||||||
catch(Exceptions\DuplicateDatabaseKeyException){
|
catch(Exceptions\DuplicateDatabaseKeyException){
|
||||||
|
@ -2380,6 +2434,7 @@ final class Ebook{
|
||||||
public static function GetAllByFilter(string $query = null, array $tags = [], Enums\EbookSortType $sort = null, int $page = 1, int $perPage = EBOOKS_PER_PAGE, Enums\EbookReleaseStatusFilter $releaseStatusFilter = Enums\EbookReleaseStatusFilter::All): array{
|
public static function GetAllByFilter(string $query = null, array $tags = [], Enums\EbookSortType $sort = null, int $page = 1, int $perPage = EBOOKS_PER_PAGE, Enums\EbookReleaseStatusFilter $releaseStatusFilter = Enums\EbookReleaseStatusFilter::All): array{
|
||||||
$limit = $perPage;
|
$limit = $perPage;
|
||||||
$offset = (($page - 1) * $perPage);
|
$offset = (($page - 1) * $perPage);
|
||||||
|
$relevanceScoreField = '';
|
||||||
$joinContributors = '';
|
$joinContributors = '';
|
||||||
$joinTags = '';
|
$joinTags = '';
|
||||||
$params = [];
|
$params = [];
|
||||||
|
@ -2424,10 +2479,22 @@ final class Ebook{
|
||||||
}
|
}
|
||||||
|
|
||||||
if($query !== null && $query != ''){
|
if($query !== null && $query != ''){
|
||||||
$query = trim(preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', Formatter::RemoveDiacritics($query)));
|
// Preserve quotes in the query so the user can enter, e.g., "war and peace" for an exact match.
|
||||||
$query = sprintf('"%s"', $query); // Require an exact match via double quotes.
|
$query = trim(preg_replace('|[^a-zA-Z0-9" ]|ius', ' ', Formatter::RemoveDiacritics($query)));
|
||||||
$whereCondition .= ' and match(e.IndexableText) against(? in boolean mode) ';
|
|
||||||
|
$relevanceScoreField = ', (
|
||||||
|
match(e.Title) against (?) * ' . EBOOK_SEARCH_WEIGHT_TITLE . ' +
|
||||||
|
match(e.IndexableAuthors) against (?) * ' . EBOOK_SEARCH_WEIGHT_AUTHORS . ' +
|
||||||
|
match(e.IndexableCollections) against (?) * ' . EBOOK_SEARCH_WEIGHT_COLLECTIONS . ' +
|
||||||
|
match(e.IndexableText) against (?)
|
||||||
|
) as relevance_score ';
|
||||||
|
|
||||||
|
$whereCondition .= ' and match(e.IndexableText) against(?) ';
|
||||||
$params[] = $query;
|
$params[] = $query;
|
||||||
|
|
||||||
|
if($sort == null || $sort == Enums\EbookSortType::Relevance || $sort == Enums\EbookSortType::Newest){
|
||||||
|
$orderBy = 'relevance_score desc, e.EbookCreated desc';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try{
|
try{
|
||||||
|
@ -2439,11 +2506,17 @@ final class Ebook{
|
||||||
' . $whereCondition . '
|
' . $whereCondition . '
|
||||||
', $params);
|
', $params);
|
||||||
|
|
||||||
|
if($relevanceScoreField != ''){
|
||||||
|
// `relevance_score` is at the beginning of the query, so these params must go at the start of the array.
|
||||||
|
array_unshift($params, $query, $query, $query, $query);
|
||||||
|
}
|
||||||
|
|
||||||
$params[] = $limit;
|
$params[] = $limit;
|
||||||
$params[] = $offset;
|
$params[] = $offset;
|
||||||
|
|
||||||
$ebooks = Db::Query('
|
$ebooks = Db::Query('
|
||||||
SELECT distinct e.*
|
SELECT distinct e.*
|
||||||
|
' . $relevanceScoreField . '
|
||||||
from Ebooks e
|
from Ebooks e
|
||||||
' . $joinContributors . '
|
' . $joinContributors . '
|
||||||
' . $joinTags . '
|
' . $joinTags . '
|
||||||
|
|
|
@ -6,4 +6,5 @@ enum EbookSortType: string{
|
||||||
case AuthorAlpha = 'author-alpha';
|
case AuthorAlpha = 'author-alpha';
|
||||||
case ReadingEase = 'reading-ease';
|
case ReadingEase = 'reading-ease';
|
||||||
case Length = 'length';
|
case Length = 'length';
|
||||||
|
case Relevance = 'relevance';
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
<?
|
||||||
|
namespace Exceptions;
|
||||||
|
|
||||||
|
/**
 * Added to the validation error collection by `Ebook` when `IndexableAuthors` is unset or empty after trimming.
 */
class EbookIndexableAuthorsRequiredException extends AppException{
	/** @var string $message */
	protected $message = 'Ebook IndexableAuthors required.';
}
|
|
@ -31,6 +31,13 @@ class Formatter{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize text for search indexing and querying.
 *
 * The text is first passed through `Formatter::RemoveDiacritics()`; then every character that is not an ASCII letter, digit, or space is replaced with a space, and leading and trailing whitespace is trimmed.
 */
public static function RemoveDiacriticsAndNonalphanumerics(string $text): string{
	$withoutDiacritics = Formatter::RemoveDiacritics($text);
	$alphanumericOnly = preg_replace('|[^a-zA-Z0-9 ]|ius', ' ', $withoutDiacritics);

	return trim($alphanumericOnly);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Escape a string so that it's appropriate to use in a URL slug.
|
* Escape a string so that it's appropriate to use in a URL slug.
|
||||||
*
|
*
|
||||||
|
|
|
@ -18,13 +18,17 @@ $isAllSelected = sizeof($tags) == 0 || in_array('all', $tags);
|
||||||
</select>
|
</select>
|
||||||
</label>
|
</label>
|
||||||
<label>Keywords
|
<label>Keywords
|
||||||
<input type="search" name="query" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
|
<input type="search" name="query" placeholder="Quotes allowed, e.g., &quot;war and peace&quot;" value="<?= Formatter::EscapeHtml($query ?? '') ?>"/>
|
||||||
</label>
|
</label>
|
||||||
<label class="sort">
|
<label class="sort">
|
||||||
<span>Sort</span>
|
<span>Sort</span>
|
||||||
<span>
|
<span>
|
||||||
<select name="sort">
|
<select name="sort">
|
||||||
<option value="<?= Enums\EbookSortType::Newest->value ?>"<? if($sort == Enums\EbookSortType::Newest){ ?> selected="selected"<? } ?>>S.E. release date (new → old)</option>
|
<? if(isset($query) && $query != ''){ ?>
|
||||||
|
<option value="<?= Enums\EbookSortType::Relevance->value ?>"<? if($sort == Enums\EbookSortType::Relevance){ ?> selected="selected"<? } ?>>Relevance</option>
|
||||||
|
<? }else{ ?>
|
||||||
|
<option value="<?= Enums\EbookSortType::Newest->value ?>"<? if($sort == Enums\EbookSortType::Newest){ ?> selected="selected"<? } ?>>S.E. release date (new → old)</option>
|
||||||
|
<? } ?>
|
||||||
<option value="<?= Enums\EbookSortType::AuthorAlpha->value ?>"<? if($sort == Enums\EbookSortType::AuthorAlpha){ ?> selected="selected"<? } ?>>Author name (a → z)</option>
|
<option value="<?= Enums\EbookSortType::AuthorAlpha->value ?>"<? if($sort == Enums\EbookSortType::AuthorAlpha){ ?> selected="selected"<? } ?>>Author name (a → z)</option>
|
||||||
<option value="<?= Enums\EbookSortType::ReadingEase->value ?>"<? if($sort == Enums\EbookSortType::ReadingEase){ ?> selected="selected"<? } ?>>Reading ease (easy → hard)</option>
|
<option value="<?= Enums\EbookSortType::ReadingEase->value ?>"<? if($sort == Enums\EbookSortType::ReadingEase){ ?> selected="selected"<? } ?>>Reading ease (easy → hard)</option>
|
||||||
<option value="<?= Enums\EbookSortType::Length->value ?>"<? if($sort == Enums\EbookSortType::Length){ ?> selected="selected"<? } ?>>Length (short → long)</option>
|
<option value="<?= Enums\EbookSortType::Length->value ?>"<? if($sort == Enums\EbookSortType::Length){ ?> selected="selected"<? } ?>>Length (short → long)</option>
|
||||||
|
|
|
@ -28,6 +28,14 @@ try{
|
||||||
$view = null;
|
$view = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if($query != ''){
|
||||||
|
$queryStringParams['query'] = $query;
|
||||||
|
// If the user entered a query with the default sort order, change it to relevance sort.
|
||||||
|
if($sort == Enums\EbookSortType::Newest){
|
||||||
|
$sort = Enums\EbookSortType::Relevance;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if($sort == Enums\EbookSortType::Newest){
|
if($sort == Enums\EbookSortType::Newest){
|
||||||
$sort = null;
|
$sort = null;
|
||||||
}
|
}
|
||||||
|
@ -38,10 +46,6 @@ try{
|
||||||
|
|
||||||
$pageDescription = 'Page ' . $page . ' of the Standard Ebooks free ebook library';
|
$pageDescription = 'Page ' . $page . ' of the Standard Ebooks free ebook library';
|
||||||
|
|
||||||
if($query != ''){
|
|
||||||
$queryStringParams['query'] = $query;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(sizeof($tags) > 0){
|
if(sizeof($tags) > 0){
|
||||||
$queryStringParams['tags'] = $tags;
|
$queryStringParams['tags'] = $tags;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue