Skip to content

Commit

Permalink
Normalize URLs when submitting artwork to database
Browse files Browse the repository at this point in the history
  • Loading branch information
acabal committed Jan 9, 2024
1 parent f9c8730 commit e17a4bc
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 26 deletions.
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,6 @@ Before submitting design contributions, please discuss them with the Standard Eb
- Write responsive CSS to make artwork list at `/artworks` mobile-friendly.
- Normalize page scan/museum URLs to remove unnecessary query string parameters and hash anchors, resulting in a minimum viable URL. For example, normalizing `https://books.google.com/books?id=k9qgAAAAMAAJ&newbks=1&newbks_redir=0&pg=PA11#v=onepage&q&f=false` to `https://books.google.com/books?id=k9qgAAAAMAAJ&pg=PA11`
## PHP code style
- Indent with tabs.
Expand Down
165 changes: 145 additions & 20 deletions lib/Artwork.php
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
<?

use Exceptions\InvalidUrlException;
use Safe\DateTime;
use function Safe\copy;
use function Safe\date;
use function Safe\exec;
use function Safe\getimagesize;
use function Safe\ini_get;
use function Safe\parse_url;
use function Safe\preg_match;
use function Safe\preg_replace;

/**
Expand Down Expand Up @@ -309,36 +313,68 @@ protected function Validate(array &$uploadedFile = []): void{
}
}

if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > 0 && filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidMuseumUrlException());
}
if($this->MuseumUrl !== null){
if(strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
}

if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to an approved museum page'));
if($this->MuseumUrl == '' || filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidMuseumUrlException());
}
}

if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > 0 && filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
}
if($this->PublicationYearPageUrl !== null){
if(strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
}

if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with year of publication'));
if($this->PublicationYearPageUrl == '' || filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidPublicationYearPageUrlException());
}
else{
try{
$this->PublicationYearPageUrl = $this->NormalizePageScanUrl($this->PublicationYearPageUrl);
}
catch(Exceptions\InvalidUrlException $ex){
$error->Add($ex);
}
}
}

if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > 0 && filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
}
if($this->CopyrightPageUrl !== null){
if(strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
}

if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with copyright details'));
if($this->CopyrightPageUrl == '' || filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidCopyrightPageUrlException());
}
else{
try{
$this->CopyrightPageUrl = $this->NormalizePageScanUrl($this->CopyrightPageUrl);
}
catch(Exceptions\InvalidUrlException $ex){
$error->Add($ex);
}
}
}

if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > 0 && filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
}
if($this->ArtworkPageUrl !== null){
if(strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
}

if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){
$error->Add(new Exceptions\StringTooLongException('Link to page with artwork'));
if($this->ArtworkPageUrl == '' || filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){
$error->Add(new Exceptions\InvalidArtworkPageUrlException());
}
else{
try{
$this->ArtworkPageUrl = $this->NormalizePageScanUrl($this->ArtworkPageUrl);
}
catch(Exceptions\InvalidUrlException $ex){
$error->Add($ex);
}
}
}

$hasMuseumProof = $this->MuseumUrl !== null && $this->MuseumUrl != '';
Expand Down Expand Up @@ -406,6 +442,95 @@ protected function Validate(array &$uploadedFile = []): void{
}
}

/**
 * Reduce a page scan URL on a recognised host to its minimal canonical form.
 *
 * URLs whose host contains `hathitrust.org`, `archive.org`, or `google.com` are checked against the shape expected for that site and rebuilt from only the parts that identify the page. Old-style `books.google.com` URLs are converted to the new `www.google.com/books/edition/_/` style. URLs on any other host are returned unchanged.
 *
 * @param string $url The URL to normalize.
 * @return string The normalized URL, or `$url` unchanged if its host isn't one of the recognised sites.
 * @throws InvalidUrlException If the URL can't be parsed, or (as a host-specific subclass) if it's on a recognised host but not in that host's accepted format.
 */
private function NormalizePageScanUrl(string $url): string{
	$outputUrl = $url;

	try{
		$parsedUrl = parse_url($url);
	}
	catch(Exception){
		throw new InvalidUrlException($url);
	}

	if(!is_array($parsedUrl)){
		throw new InvalidUrlException($url);
	}

	// `parse_url()` leaves out the keys of components that aren't present in the URL (for example `mailto:` URLs have no host, and `https://archive.org` has no path).
	// Default them to an empty string so that the checks below see a string instead of an undefined array key.
	$host = $parsedUrl['host'] ?? '';
	$path = $parsedUrl['path'] ?? '';

	// NOTE(review): `parse_str()` URL-decodes query values, and the values are concatenated back into the output below without re-encoding.
	// Confirm that IDs and page numbers on these sites never contain characters that must be escaped in a URL.

	if(stripos($host, 'hathitrust.org') !== false){
		// https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13
		if($host !== 'babel.hathitrust.org'){
			throw new Exceptions\InvalidHathiTrustUrlException();
		}

		if($path !== '/cgi/pt'){
			throw new Exceptions\InvalidHathiTrustUrlException();
		}

		parse_str($parsedUrl['query'] ?? '', $vars);

		// Both parameters are required, and neither may be an array (like `id[]=...`).
		if(!isset($vars['id']) || !isset($vars['seq']) || is_array($vars['id']) || is_array($vars['seq'])){
			throw new Exceptions\InvalidHathiTrustUrlException();
		}

		$outputUrl = 'https://' . $host . $path . '?id=' . $vars['id'] . '&seq=' . $vars['seq'];
	}

	if(stripos($host, 'archive.org') !== false){
		// https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up

		if($host !== 'archive.org'){
			throw new Exceptions\InvalidInternetArchiveUrlException();
		}

		if(!preg_match('|^/details/[^/]+?/page/[^/]+/mode/1up$|ius', $path)){
			throw new Exceptions\InvalidInternetArchiveUrlException();
		}

		// Everything needed is in the path, so the query string and fragment are dropped.
		$outputUrl = 'https://' . $host . $path;
	}

	if(stripos($host, 'google.com') !== false){
		// Old style: https://books.google.com/books?id=mZpAAAAAYAAJ&pg=PA70-IA2
		// New style: https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2

		if($host === 'books.google.com'){
			// Old style, convert to new style

			if($path !== '/books'){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			parse_str($parsedUrl['query'] ?? '', $vars);

			if(!isset($vars['id']) || !isset($vars['pg']) || is_array($vars['id']) || is_array($vars['pg'])){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			$outputUrl = 'https://www.google.com/books/edition/_/' . $vars['id'] . '?gbpv=1&pg=' . $vars['pg'];
		}
		elseif($host === 'www.google.com'){
			// New style

			if(!preg_match('|^/books/edition/_/[^/]+$|ius', $path)){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			parse_str($parsedUrl['query'] ?? '', $vars);

			// `gbpv` must be exactly `1`; the strict comparison also rejects an array value.
			if(!isset($vars['gbpv']) || $vars['gbpv'] !== '1' || !isset($vars['pg']) || is_array($vars['pg'])){
				throw new Exceptions\InvalidGoogleBooksUrlException();
			}

			$outputUrl = 'https://' . $host . $path . '?gbpv=' . $vars['gbpv'] . '&pg=' . $vars['pg'];
		}
		else{
			// Any other Google host isn't accepted as a page scan source.
			throw new Exceptions\InvalidGoogleBooksUrlException();
		}
	}

	return $outputUrl;
}

/**
* @param array<mixed> $uploadedFile
* @throws \Exceptions\ValidationException
Expand Down
2 changes: 1 addition & 1 deletion lib/Exceptions/InvalidArtworkPageUrlException.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?
namespace Exceptions;

class InvalidArtworkPageUrlException extends AppException{
class InvalidArtworkPageUrlException extends InvalidUrlException{
protected $message = 'Invalid link to page with artwork.';
}
2 changes: 1 addition & 1 deletion lib/Exceptions/InvalidCopyrightPageUrlException.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?
namespace Exceptions;

class InvalidCopyrightPageUrlException extends AppException{
class InvalidCopyrightPageUrlException extends InvalidUrlException{
protected $message = 'Invalid link to page with copyright details.';
}
6 changes: 6 additions & 0 deletions lib/Exceptions/InvalidGoogleBooksUrlException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?
namespace Exceptions;

/**
 * Signals a URL that isn't in the accepted Google Books format.
 *
 * The message describes the accepted format and gives an example of a valid URL.
 */
class InvalidGoogleBooksUrlException extends InvalidUrlException{
protected $message = 'Invalid Google Books URL. Google Books URLs begin with “https://www.google.com/books/edition/_/” and must be in single-page view. An example of a valid Google Books URL is “https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2”.';
}
6 changes: 6 additions & 0 deletions lib/Exceptions/InvalidHathiTrustUrlException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?
namespace Exceptions;

/**
 * Signals a URL that isn't in the accepted HathiTrust format.
 *
 * The message describes the accepted format and gives an example of a valid URL.
 */
class InvalidHathiTrustUrlException extends InvalidUrlException{
protected $message = 'Invalid HathiTrust URL. HathiTrust URLs begin with “https://babel.hathitrust.org/cgi/pt”. An example of a valid HathiTrust URL is “https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13”.';
}
6 changes: 6 additions & 0 deletions lib/Exceptions/InvalidInternetArchiveUrlException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?
namespace Exceptions;

/**
 * Signals a URL that isn't in the accepted Internet Archive format.
 *
 * The message describes the accepted format and gives an example of a valid URL.
 */
class InvalidInternetArchiveUrlException extends InvalidUrlException{
protected $message = 'Invalid Internet Archive URL. Internet Archive URLs begin with “https://archive.org/details/” and must be in single-page view. An example of a valid Internet Archive URL is “https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up”.';
}
2 changes: 1 addition & 1 deletion lib/Exceptions/InvalidMuseumUrlException.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?
namespace Exceptions;

class InvalidMuseumUrlException extends AppException{
class InvalidMuseumUrlException extends InvalidUrlException{
protected $message = 'Invalid link to an approved museum page.';
}
2 changes: 1 addition & 1 deletion lib/Exceptions/InvalidPublicationYearPageUrlException.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?
namespace Exceptions;

class InvalidPublicationYearPageUrlException extends AppException{
class InvalidPublicationYearPageUrlException extends InvalidUrlException{
protected $message = 'Invalid link to page with year of publication.';
}
12 changes: 12 additions & 0 deletions lib/Exceptions/InvalidUrlException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?
namespace Exceptions;

/**
 * Base exception for URLs that fail validation.
 *
 * Constructed without a URL, the exception keeps the `$message` declared on the class; since the constructor is inherited, this is also how subclasses keep their own `$message`.
 */
class InvalidUrlException extends AppException{
	protected $message = 'Invalid URL.';

	/**
	 * @param ?string $url The offending URL. If given, it's quoted in the exception message; if `null`, the class's default message is left in place.
	 */
	public function __construct(?string $url = null){
		if($url === null){
			// Nothing to report beyond the default message, so the parent constructor is deliberately not called.
			return;
		}

		parent::__construct('Invalid URL: “' . $url . '”.');
	}
}

0 comments on commit e17a4bc

Please sign in to comment.