mirror of
https://github.com/standardebooks/web.git
synced 2025-07-05 14:20:29 -04:00
Normalize URLs when submitting artwork to database
This commit is contained in:
parent
f9c873003e
commit
e17a4bcc65
10 changed files with 182 additions and 29 deletions
|
@ -140,8 +140,6 @@ Before submitting design contributions, please discuss them with the Standard Eb
|
|||
|
||||
- Write responsive CSS to make artwork list at `/artworks` mobile-friendly.
|
||||
|
||||
- Normalize page scan/museum URLs to remove unnecessary query string parameters and hash anchors, resulting in a minimum viable URL. For example, normalizing `https://books.google.com/books?id=k9qgAAAAMAAJ&newbks=1&newbks_redir=0&pg=PA11#v=onepage&q&f=false` to `https://books.google.com/books?id=k9qgAAAAMAAJ&pg=PA11`
|
||||
|
||||
## PHP code style
|
||||
|
||||
- Indent with tabs.
|
||||
|
|
171
lib/Artwork.php
171
lib/Artwork.php
|
@ -1,10 +1,14 @@
|
|||
<?
|
||||
|
||||
use Exceptions\InvalidUrlException;
|
||||
use Safe\DateTime;
|
||||
use function Safe\copy;
|
||||
use function Safe\date;
|
||||
use function Safe\exec;
|
||||
use function Safe\getimagesize;
|
||||
use function Safe\ini_get;
|
||||
use function Safe\parse_url;
|
||||
use function Safe\preg_match;
|
||||
use function Safe\preg_replace;
|
||||
|
||||
/**
|
||||
|
@ -309,36 +313,68 @@ class Artwork extends PropertiesBase{
|
|||
}
|
||||
}
|
||||
|
||||
if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > 0 && filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidMuseumUrlException());
|
||||
if($this->MuseumUrl !== null){
|
||||
if(strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
|
||||
}
|
||||
|
||||
if($this->MuseumUrl == '' || filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidMuseumUrlException());
|
||||
}
|
||||
}
|
||||
|
||||
if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
|
||||
if($this->PublicationYearPageUrl !== null){
|
||||
if(strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
|
||||
}
|
||||
|
||||
if($this->PublicationYearPageUrl == '' || filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
|
||||
}
|
||||
else{
|
||||
try{
|
||||
$this->PublicationYearPageUrl = $this->NormalizePageScanUrl($this->PublicationYearPageUrl);
|
||||
}
|
||||
catch(Exceptions\InvalidUrlException $ex){
|
||||
$error->Add($ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > 0 && filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
|
||||
if($this->CopyrightPageUrl !== null){
|
||||
if(strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
|
||||
}
|
||||
|
||||
if($this->CopyrightPageUrl == '' || filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
|
||||
}
|
||||
else{
|
||||
try{
|
||||
$this->CopyrightPageUrl = $this->NormalizePageScanUrl($this->CopyrightPageUrl);
|
||||
}
|
||||
catch(Exceptions\InvalidUrlException $ex){
|
||||
$error->Add($ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
|
||||
}
|
||||
if($this->ArtworkPageUrl !== null){
|
||||
if(strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
|
||||
}
|
||||
|
||||
if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > 0 && filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
|
||||
}
|
||||
|
||||
if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
|
||||
}
|
||||
|
||||
if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > 0 && filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
|
||||
}
|
||||
|
||||
if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
|
||||
if($this->ArtworkPageUrl == '' || filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
|
||||
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
|
||||
}
|
||||
else{
|
||||
try{
|
||||
$this->ArtworkPageUrl = $this->NormalizePageScanUrl($this->ArtworkPageUrl);
|
||||
}
|
||||
catch(Exceptions\InvalidUrlException $ex){
|
||||
$error->Add($ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$hasMuseumProof = $this->MuseumUrl !== null && $this->MuseumUrl != '';
|
||||
|
@ -406,6 +442,95 @@ class Artwork extends PropertiesBase{
|
|||
}
|
||||
}
|
||||
|
||||
private function NormalizePageScanUrl(string $url): string{
|
||||
$outputUrl = $url;
|
||||
|
||||
try{
|
||||
$parsedUrl = parse_url($url);
|
||||
}
|
||||
catch(Exception){
|
||||
throw new InvalidUrlException($url);
|
||||
}
|
||||
|
||||
if(!is_array($parsedUrl)){
|
||||
throw new InvalidUrlException($url);
|
||||
}
|
||||
|
||||
if(stripos($parsedUrl['host'], 'hathitrust.org') !== false){
|
||||
// https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13
|
||||
if($parsedUrl['host'] != 'babel.hathitrust.org'){
|
||||
throw new Exceptions\InvalidHathiTrustUrlException();
|
||||
}
|
||||
|
||||
if($parsedUrl['path'] != '/cgi/pt'){
|
||||
throw new Exceptions\InvalidHathiTrustUrlException();
|
||||
}
|
||||
|
||||
parse_str($parsedUrl['query'] ?? '', $vars);
|
||||
|
||||
if(!isset($vars['id']) || !isset($vars['seq']) || is_array($vars['id']) || is_array($vars['seq'])){
|
||||
throw new Exceptions\InvalidHathiTrustUrlException();
|
||||
}
|
||||
|
||||
$outputUrl = 'https://' . $parsedUrl['host'] . $parsedUrl['path'] . '?id=' . $vars['id'] . '&seq=' . $vars['seq'];
|
||||
}
|
||||
|
||||
if(stripos($parsedUrl['host'], 'archive.org') !== false){
|
||||
// https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up
|
||||
|
||||
if($parsedUrl['host'] != 'archive.org'){
|
||||
throw new Exceptions\InvalidInternetArchiveUrlException();
|
||||
}
|
||||
|
||||
if(!preg_match('|^/details/[^/]+?/page/[^/]+/mode/1up$|ius', $parsedUrl['path'])){
|
||||
throw new Exceptions\InvalidInternetArchiveUrlException();
|
||||
}
|
||||
|
||||
$outputUrl = 'https://' . $parsedUrl['host'] . $parsedUrl['path'];
|
||||
}
|
||||
|
||||
if(stripos($parsedUrl['host'], 'google.com') !== false){
|
||||
// Old style: https://books.google.com/books?id=mZpAAAAAYAAJ&pg=PA70-IA2
|
||||
// New style: https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2
|
||||
|
||||
if($parsedUrl['host'] == 'books.google.com'){
|
||||
// Old style, convert to new style
|
||||
|
||||
if($parsedUrl['path'] != '/books'){
|
||||
throw new Exceptions\InvalidGoogleBooksUrlException();
|
||||
}
|
||||
|
||||
parse_str($parsedUrl['query'] ?? '', $vars);
|
||||
|
||||
if(!isset($vars['id']) || !isset($vars['pg']) || is_array($vars['id']) || is_array($vars['pg'])){
|
||||
throw new Exceptions\InvalidGoogleBooksUrlException();
|
||||
}
|
||||
|
||||
$outputUrl = 'https://www.google.com/books/edition/_/' . $vars['id'] . '?gbpv=1&pg=' . $vars['pg'];
|
||||
}
|
||||
elseif($parsedUrl['host'] == 'www.google.com'){
|
||||
// New style
|
||||
|
||||
if(!preg_match('|^/books/edition/_/[^/]+$|ius', $parsedUrl['path'])){
|
||||
throw new Exceptions\InvalidGoogleBooksUrlException();
|
||||
}
|
||||
|
||||
parse_str($parsedUrl['query'] ?? '', $vars);
|
||||
|
||||
if(!isset($vars['gbpv']) || $vars['gbpv'] !== '1' || !isset($vars['pg']) || is_array($vars['pg'])){
|
||||
throw new Exceptions\InvalidGoogleBooksUrlException();
|
||||
}
|
||||
|
||||
$outputUrl = 'https://' . $parsedUrl['host'] . $parsedUrl['path'] . '?gbpv=' . $vars['gbpv'] . '&pg=' . $vars['pg'];
|
||||
}
|
||||
else{
|
||||
throw new Exceptions\InvalidGoogleBooksUrlException();
|
||||
}
|
||||
}
|
||||
|
||||
return $outputUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<mixed> $uploadedFile
|
||||
* @throws \Exceptions\ValidationException
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidArtworkPageUrlException extends AppException{
|
||||
class InvalidArtworkPageUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid link to page with artwork.';
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidCopyrightPageUrlException extends AppException{
|
||||
class InvalidCopyrightPageUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid link to page with copyright details.';
|
||||
}
|
||||
|
|
6
lib/Exceptions/InvalidGoogleBooksUrlException.php
Normal file
6
lib/Exceptions/InvalidGoogleBooksUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidGoogleBooksUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid Google Books URL. Google Books URLs begin with “https://www.google.com/books/edition/_/” and must be in single-page view. An example of a valid Google Books URL is “https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2”.';
|
||||
}
|
6
lib/Exceptions/InvalidHathiTrustUrlException.php
Normal file
6
lib/Exceptions/InvalidHathiTrustUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidHathiTrustUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid HathiTrust URL. HathiTrust URLs begin with “https://babel.hathitrust.org/cgi/pt”. An example of a valid HathiTrust URL is “https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13”.';
|
||||
}
|
6
lib/Exceptions/InvalidInternetArchiveUrlException.php
Normal file
6
lib/Exceptions/InvalidInternetArchiveUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidInternetArchiveUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid Internet Archive URL. Internet Archive URLs begin with “https://archive.org/details/” and must be in single-page view. An example of a valid Internet Archive URL is “https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up”.';
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidMuseumUrlException extends AppException{
|
||||
class InvalidMuseumUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid link to an approved museum page.';
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidPublicationYearPageUrlException extends AppException{
|
||||
class InvalidPublicationYearPageUrlException extends InvalidUrlException{
|
||||
protected $message = 'Invalid link to page with year of publication.';
|
||||
}
|
||||
|
|
12
lib/Exceptions/InvalidUrlException.php
Normal file
12
lib/Exceptions/InvalidUrlException.php
Normal file
|
@ -0,0 +1,12 @@
|
|||
<?
|
||||
namespace Exceptions;
|
||||
|
||||
class InvalidUrlException extends AppException{
|
||||
protected $message = 'Invalid URL.';
|
||||
|
||||
public function __construct(?string $url = null){
|
||||
if($url !== null){
|
||||
parent::__construct('Invalid URL: “' . $url . '”.');
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue