mirror of
https://github.com/standardebooks/web.git
synced 2025-07-05 22:30:30 -04:00
Normalize URLs when submitting artwork to database
This commit is contained in:
parent
f9c873003e
commit
e17a4bcc65
10 changed files with 182 additions and 29 deletions
|
@ -140,8 +140,6 @@ Before submitting design contributions, please discuss them with the Standard Eb
|
||||||
|
|
||||||
- Write responsive CSS to make artwork list at `/artworks` mobile-friendly.
|
- Write responsive CSS to make artwork list at `/artworks` mobile-friendly.
|
||||||
|
|
||||||
- Normalize page scan/museum URLs to remove unnecessary query string parameters and hash anchors, resulting in a minimum viable URL. For example, normalizing `https://books.google.com/books?id=k9qgAAAAMAAJ&newbks=1&newbks_redir=0&pg=PA11#v=onepage&q&f=false` to `https://books.google.com/books?id=k9qgAAAAMAAJ&pg=PA11`
|
|
||||||
|
|
||||||
## PHP code style
|
## PHP code style
|
||||||
|
|
||||||
- Indent with tabs.
|
- Indent with tabs.
|
||||||
|
|
171
lib/Artwork.php
171
lib/Artwork.php
|
@ -1,10 +1,14 @@
|
||||||
<?
|
<?
|
||||||
|
|
||||||
|
use Exceptions\InvalidUrlException;
|
||||||
use Safe\DateTime;
|
use Safe\DateTime;
|
||||||
use function Safe\copy;
|
use function Safe\copy;
|
||||||
use function Safe\date;
|
use function Safe\date;
|
||||||
use function Safe\exec;
|
use function Safe\exec;
|
||||||
use function Safe\getimagesize;
|
use function Safe\getimagesize;
|
||||||
use function Safe\ini_get;
|
use function Safe\ini_get;
|
||||||
|
use function Safe\parse_url;
|
||||||
|
use function Safe\preg_match;
|
||||||
use function Safe\preg_replace;
|
use function Safe\preg_replace;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -309,36 +313,68 @@ class Artwork extends PropertiesBase{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > 0 && filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
|
if($this->MuseumUrl !== null){
|
||||||
$error->Add(new Exceptions\InvalidMuseumUrlException());
|
if(strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||||
|
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
|
||||||
|
}
|
||||||
|
|
||||||
|
if($this->MuseumUrl == '' || filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
|
||||||
|
$error->Add(new Exceptions\InvalidMuseumUrlException());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
if($this->PublicationYearPageUrl !== null){
|
||||||
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
|
if(strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||||
|
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
|
||||||
|
}
|
||||||
|
|
||||||
|
if($this->PublicationYearPageUrl == '' || filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
|
||||||
|
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
try{
|
||||||
|
$this->PublicationYearPageUrl = $this->NormalizePageScanUrl($this->PublicationYearPageUrl);
|
||||||
|
}
|
||||||
|
catch(Exceptions\InvalidUrlException $ex){
|
||||||
|
$error->Add($ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > 0 && filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
|
if($this->CopyrightPageUrl !== null){
|
||||||
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
|
if(strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||||
|
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
|
||||||
|
}
|
||||||
|
|
||||||
|
if($this->CopyrightPageUrl == '' || filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
|
||||||
|
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
try{
|
||||||
|
$this->CopyrightPageUrl = $this->NormalizePageScanUrl($this->CopyrightPageUrl);
|
||||||
|
}
|
||||||
|
catch(Exceptions\InvalidUrlException $ex){
|
||||||
|
$error->Add($ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
if($this->ArtworkPageUrl !== null){
|
||||||
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
|
if(strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
||||||
}
|
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
|
||||||
|
}
|
||||||
|
|
||||||
if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > 0 && filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
|
if($this->ArtworkPageUrl == '' || filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
|
||||||
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
|
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
|
||||||
}
|
}
|
||||||
|
else{
|
||||||
if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
try{
|
||||||
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
|
$this->ArtworkPageUrl = $this->NormalizePageScanUrl($this->ArtworkPageUrl);
|
||||||
}
|
}
|
||||||
|
catch(Exceptions\InvalidUrlException $ex){
|
||||||
if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > 0 && filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
|
$error->Add($ex);
|
||||||
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
|
|
||||||
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$hasMuseumProof = $this->MuseumUrl !== null && $this->MuseumUrl != '';
|
$hasMuseumProof = $this->MuseumUrl !== null && $this->MuseumUrl != '';
|
||||||
|
@ -406,6 +442,95 @@ class Artwork extends PropertiesBase{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reduce a page scan URL from a recognized book-scan site to a minimal canonical form.
 *
 * HathiTrust, Internet Archive, and Google Books URLs are checked against the expected
 * single-page form and rebuilt from only the parts that are required. Old-style Google Books
 * URLs (`books.google.com/books?id=…&pg=…`) are converted to the new style. A URL whose host
 * does not belong to one of these sites is returned unchanged.
 *
 * @param string $url The URL to normalize.
 *
 * @return string The normalized URL, or `$url` itself if the host is not a recognized site.
 *
 * @throws InvalidUrlException If the URL cannot be parsed.
 * @throws \Exceptions\InvalidHathiTrustUrlException If a HathiTrust URL is not of the form `https://babel.hathitrust.org/cgi/pt?id=…&seq=…`.
 * @throws \Exceptions\InvalidInternetArchiveUrlException If an Internet Archive URL is not of the form `https://archive.org/details/…/page/…/mode/1up`.
 * @throws \Exceptions\InvalidGoogleBooksUrlException If a Google URL is neither a valid old-style nor a valid new-style Google Books page URL.
 */
private function NormalizePageScanUrl(string $url): string{
	try{
		$parsedUrl = parse_url($url);
	}
	catch(Exception){
		throw new InvalidUrlException($url);
	}

	if(!is_array($parsedUrl)){
		throw new InvalidUrlException($url);
	}

	// `parse_url()` leaves out components that are absent from the URL, so default them
	// instead of reading possibly-undefined keys. Hostnames are case-insensitive, so compare
	// and emit them in lowercase.
	$host = strtolower($parsedUrl['host'] ?? '');
	$path = $parsedUrl['path'] ?? '';
	$query = $parsedUrl['query'] ?? '';

	// True if `$host` is `$domain` itself or a subdomain of it. A plain substring test would
	// also match unrelated hosts like `notarchive.org`.
	$isHostOf = fn(string $domain): bool => $host === $domain || str_ends_with($host, '.' . $domain);

	if($isHostOf('hathitrust.org')){
		// https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13
		if($host != 'babel.hathitrust.org' || $path != '/cgi/pt'){
			throw new Exceptions\InvalidHathiTrustUrlException();
		}

		// NOTE(review): `parse_str()` URL-decodes values and they are re-emitted as-is below.
		parse_str($query, $vars);

		if(!isset($vars['id']) || !isset($vars['seq']) || is_array($vars['id']) || is_array($vars['seq'])){
			throw new Exceptions\InvalidHathiTrustUrlException();
		}

		return 'https://' . $host . $path . '?id=' . $vars['id'] . '&seq=' . $vars['seq'];
	}

	if($isHostOf('archive.org')){
		// https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up
		if($host != 'archive.org'){
			throw new Exceptions\InvalidInternetArchiveUrlException();
		}

		if(!preg_match('|^/details/[^/]+?/page/[^/]+/mode/1up$|ius', $path)){
			throw new Exceptions\InvalidInternetArchiveUrlException();
		}

		// Query string and fragment are dropped.
		return 'https://' . $host . $path;
	}

	if($isHostOf('google.com')){
		// Old style: https://books.google.com/books?id=mZpAAAAAYAAJ&pg=PA70-IA2
		// New style: https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2
		if($host == 'books.google.com'){
			// Old style, convert to new style
			if($path != '/books'){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			parse_str($query, $vars);

			if(!isset($vars['id']) || !isset($vars['pg']) || is_array($vars['id']) || is_array($vars['pg'])){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			return 'https://www.google.com/books/edition/_/' . $vars['id'] . '?gbpv=1&pg=' . $vars['pg'];
		}

		if($host == 'www.google.com'){
			// New style
			if(!preg_match('|^/books/edition/_/[^/]+$|ius', $path)){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			parse_str($query, $vars);

			// `gbpv` must be exactly the string `1`; this also rejects an array-valued `gbpv`.
			if(!isset($vars['gbpv']) || $vars['gbpv'] !== '1' || !isset($vars['pg']) || is_array($vars['pg'])){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			return 'https://' . $host . $path . '?gbpv=' . $vars['gbpv'] . '&pg=' . $vars['pg'];
		}

		// Any other Google host is not an accepted Google Books page URL.
		throw new Exceptions\InvalidGoogleBooksUrlException();
	}

	// Not a recognized page scan site; leave the URL untouched.
	return $url;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param array<mixed> $uploadedFile
|
* @param array<mixed> $uploadedFile
|
||||||
* @throws \Exceptions\ValidationException
|
* @throws \Exceptions\ValidationException
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?
|
<?
|
||||||
namespace Exceptions;
|
namespace Exceptions;
|
||||||
|
|
||||||
class InvalidArtworkPageUrlException extends AppException{
|
class InvalidArtworkPageUrlException extends InvalidUrlException{
|
||||||
protected $message = 'Invalid link to page with artwork.';
|
protected $message = 'Invalid link to page with artwork.';
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?
|
<?
|
||||||
namespace Exceptions;
|
namespace Exceptions;
|
||||||
|
|
||||||
class InvalidCopyrightPageUrlException extends AppException{
|
class InvalidCopyrightPageUrlException extends InvalidUrlException{
|
||||||
protected $message = 'Invalid link to page with copyright details.';
|
protected $message = 'Invalid link to page with copyright details.';
|
||||||
}
|
}
|
||||||
|
|
6
lib/Exceptions/InvalidGoogleBooksUrlException.php
Normal file
6
lib/Exceptions/InvalidGoogleBooksUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
<?
|
||||||
|
namespace Exceptions;
|
||||||
|
|
||||||
|
/**
 * A URL on a Google host that is not an accepted Google Books page URL.
 *
 * The default message tells the submitter what a valid URL looks like.
 */
class InvalidGoogleBooksUrlException extends InvalidUrlException{
	protected $message = 'Invalid Google Books URL. Google Books URLs begin with “https://www.google.com/books/edition/_/” and must be in single-page view. An example of a valid Google Books URL is “https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2”.';
}
|
6
lib/Exceptions/InvalidHathiTrustUrlException.php
Normal file
6
lib/Exceptions/InvalidHathiTrustUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
<?
|
||||||
|
namespace Exceptions;
|
||||||
|
|
||||||
|
/**
 * A URL on a HathiTrust host that is not an accepted HathiTrust page URL.
 *
 * The default message tells the submitter what a valid URL looks like.
 */
class InvalidHathiTrustUrlException extends InvalidUrlException{
	protected $message = 'Invalid HathiTrust URL. HathiTrust URLs begin with “https://babel.hathitrust.org/cgi/pt”. An example of a valid HathiTrust URL is “https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13”.';
}
|
6
lib/Exceptions/InvalidInternetArchiveUrlException.php
Normal file
6
lib/Exceptions/InvalidInternetArchiveUrlException.php
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
<?
|
||||||
|
namespace Exceptions;
|
||||||
|
|
||||||
|
/**
 * A URL on an Internet Archive host that is not an accepted Internet Archive page URL.
 *
 * The default message tells the submitter what a valid URL looks like.
 */
class InvalidInternetArchiveUrlException extends InvalidUrlException{
	protected $message = 'Invalid Internet Archive URL. Internet Archive URLs begin with “https://archive.org/details/” and must be in single-page view. An example of a valid Internet Archive URL is “https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up”.';
}
|
|
@ -1,6 +1,6 @@
|
||||||
<?
|
<?
|
||||||
namespace Exceptions;
|
namespace Exceptions;
|
||||||
|
|
||||||
class InvalidMuseumUrlException extends AppException{
|
class InvalidMuseumUrlException extends InvalidUrlException{
|
||||||
protected $message = 'Invalid link to an approved museum page.';
|
protected $message = 'Invalid link to an approved museum page.';
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?
|
<?
|
||||||
namespace Exceptions;
|
namespace Exceptions;
|
||||||
|
|
||||||
class InvalidPublicationYearPageUrlException extends AppException{
|
class InvalidPublicationYearPageUrlException extends InvalidUrlException{
|
||||||
protected $message = 'Invalid link to page with year of publication.';
|
protected $message = 'Invalid link to page with year of publication.';
|
||||||
}
|
}
|
||||||
|
|
12
lib/Exceptions/InvalidUrlException.php
Normal file
12
lib/Exceptions/InvalidUrlException.php
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
<?
|
||||||
|
namespace Exceptions;
|
||||||
|
|
||||||
|
/**
 * Base exception for invalid URLs.
 *
 * With no argument, the message is the class's default `$message`, which subclasses may
 * override. With a URL, the message quotes that URL.
 */
class InvalidUrlException extends AppException{
	protected $message = 'Invalid URL.';

	/**
	 * @param ?string $url The offending URL to quote in the message, or `null` to keep the default message.
	 */
	public function __construct(?string $url = null){
		if($url === null){
			// No URL given: the parent constructor is not called and `$message` stays at its default.
			return;
		}

		parent::__construct('Invalid URL: “' . $url . '”.');
	}
}
|
Loading…
Add table
Add a link
Reference in a new issue