From e17a4bcc6537fa3f67ce9a1024c10c00da01522b Mon Sep 17 00:00:00 2001 From: Alex Cabal Date: Tue, 9 Jan 2024 13:59:00 -0600 Subject: [PATCH] Normalize URLs when submitting artwork to database --- README.md | 2 - lib/Artwork.php | 165 +++++++++++++++--- .../InvalidArtworkPageUrlException.php | 2 +- .../InvalidCopyrightPageUrlException.php | 2 +- .../InvalidGoogleBooksUrlException.php | 6 + .../InvalidHathiTrustUrlException.php | 6 + .../InvalidInternetArchiveUrlException.php | 6 + lib/Exceptions/InvalidMuseumUrlException.php | 2 +- ...InvalidPublicationYearPageUrlException.php | 2 +- lib/Exceptions/InvalidUrlException.php | 12 ++ 10 files changed, 179 insertions(+), 26 deletions(-) create mode 100644 lib/Exceptions/InvalidGoogleBooksUrlException.php create mode 100644 lib/Exceptions/InvalidHathiTrustUrlException.php create mode 100644 lib/Exceptions/InvalidInternetArchiveUrlException.php create mode 100644 lib/Exceptions/InvalidUrlException.php diff --git a/README.md b/README.md index 1390edab..d8cc6a6e 100644 --- a/README.md +++ b/README.md @@ -140,8 +140,6 @@ Before submitting design contributions, please discuss them with the Standard Eb - Write responsive CSS to make artwork list at `/artworks` mobile-friendly. -- Normalize page scan/museum URLs to remove unnecessary query string parameters and hash anchors, resulting in a minimum viable URL. For example, normalizing `https://books.google.com/books?id=k9qgAAAAMAAJ&newbks=1&newbks_redir=0&pg=PA11#v=onepage&q&f=false` to `https://books.google.com/books?id=k9qgAAAAMAAJ&pg=PA11` - ## PHP code style - Indent with tabs. 
diff --git a/lib/Artwork.php b/lib/Artwork.php index 30121cbf..271e2425 100644 --- a/lib/Artwork.php +++ b/lib/Artwork.php @@ -1,10 +1,14 @@ MuseumUrl !== null && strlen($this->MuseumUrl) > 0 && filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){ - $error->Add(new Exceptions\InvalidMuseumUrlException()); - } + if($this->MuseumUrl !== null){ + if(strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ + $error->Add(new Exceptions\StringTooLongException('Link to an approved museum page')); + } - if($this->MuseumUrl !== null && strlen($this->MuseumUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ - $error->Add(new Exceptions\StringTooLongException('Link to an approved museum page')); + if($this->MuseumUrl == '' || filter_var($this->MuseumUrl, FILTER_VALIDATE_URL) === false){ + $error->Add(new Exceptions\InvalidMuseumUrlException()); + } } - if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > 0 && filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){ - $error->Add(new Exceptions\InvalidPublicationYearPageUrlException()); - } + if($this->PublicationYearPageUrl !== null){ + if(strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ + $error->Add(new Exceptions\StringTooLongException('Link to page with year of publication')); + } - if($this->PublicationYearPageUrl !== null && strlen($this->PublicationYearPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ - $error->Add(new Exceptions\StringTooLongException('Link to page with year of publication')); + if($this->PublicationYearPageUrl == '' || filter_var($this->PublicationYearPageUrl, FILTER_VALIDATE_URL) === false){ + $error->Add(new Exceptions\InvalidPublicationYearPageUrlException()); + } + else{ + try{ + $this->PublicationYearPageUrl = $this->NormalizePageScanUrl($this->PublicationYearPageUrl); + } + catch(Exceptions\InvalidUrlException $ex){ + $error->Add($ex); + } + } } - if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > 0 && 
filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){ - $error->Add(new Exceptions\InvalidCopyrightPageUrlException()); - } + if($this->CopyrightPageUrl !== null){ + if(strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ + $error->Add(new Exceptions\StringTooLongException('Link to page with copyright details')); + } - if($this->CopyrightPageUrl !== null && strlen($this->CopyrightPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ - $error->Add(new Exceptions\StringTooLongException('Link to page with copyright details')); + if($this->CopyrightPageUrl == '' || filter_var($this->CopyrightPageUrl, FILTER_VALIDATE_URL) === false){ + $error->Add(new Exceptions\InvalidCopyrightPageUrlException()); + } + else{ + try{ + $this->CopyrightPageUrl = $this->NormalizePageScanUrl($this->CopyrightPageUrl); + } + catch(Exceptions\InvalidUrlException $ex){ + $error->Add($ex); + } + } } - if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > 0 && filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){ - $error->Add(new Exceptions\InvalidArtworkPageUrlException()); - } + if($this->ArtworkPageUrl !== null){ + if(strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ + $error->Add(new Exceptions\StringTooLongException('Link to page with artwork')); + } - if($this->ArtworkPageUrl !== null && strlen($this->ArtworkPageUrl) > COVER_ARTWORK_MAX_STRING_LENGTH){ - $error->Add(new Exceptions\StringTooLongException('Link to page with artwork')); + if($this->ArtworkPageUrl == '' || filter_var($this->ArtworkPageUrl, FILTER_VALIDATE_URL) === false){ + $error->Add(new Exceptions\InvalidArtworkPageUrlException()); + } + else{ + try{ + $this->ArtworkPageUrl = $this->NormalizePageScanUrl($this->ArtworkPageUrl); + } + catch(Exceptions\InvalidUrlException $ex){ + $error->Add($ex); + } + } } $hasMuseumProof = $this->MuseumUrl !== null && $this->MuseumUrl != ''; @@ -406,6 +442,95 @@ protected function Validate(array &$uploadedFile = []): void{ } } 
+	/**
+	 * Reduce a page scan URL from a recognized provider to its minimal form.
+	 *
+	 * HathiTrust, Internet Archive, and Google Books URLs are checked against the
+	 * expected shape and rebuilt from only the parts that are required. Old-style
+	 * Google Books URLs are converted to the new style. A URL whose host matches
+	 * none of these providers is returned unchanged.
+	 *
+	 * @param string $url The URL to normalize.
+	 * @return string The normalized URL, or `$url` itself if the host is not recognized.
+	 * @throws \Exceptions\InvalidUrlException If the URL cannot be parsed into components.
+	 * @throws \Exceptions\InvalidHathiTrustUrlException If a HathiTrust URL doesn't have the expected shape.
+	 * @throws \Exceptions\InvalidInternetArchiveUrlException If an Internet Archive URL doesn't have the expected shape.
+	 * @throws \Exceptions\InvalidGoogleBooksUrlException If a Google Books URL doesn't have the expected shape.
+	 */
+	private function NormalizePageScanUrl(string $url): string{
+		$outputUrl = $url;
+
+		try{
+			// NOTE(review): native `parse_url()` returns `false` instead of throwing; the `catch` presumably
+			// covers a throwing wrapper imported at the top of the file, which isn't visible here - confirm.
+			$parsedUrl = parse_url($url);
+		}
+		catch(Exception){
+			// Fully qualified like every other exception thrown here, so the caller's `catch(Exceptions\InvalidUrlException)` sees it.
+			throw new Exceptions\InvalidUrlException($url);
+		}
+
+		if(!is_array($parsedUrl)){
+			throw new Exceptions\InvalidUrlException($url);
+		}
+
+		// `parse_url()` leaves out components that aren't in the URL (for example `mailto:` URLs have no
+		// host, and `https://archive.org` has no path), so default them instead of reading missing keys.
+		$host = $parsedUrl['host'] ?? '';
+		$path = $parsedUrl['path'] ?? '';
+
+		// `parse_str()` URL-decodes values, so they must be encoded again before going back into a URL.
+		// `:` and `/` are legal inside a query string and are left literal so values containing them aren't rewritten.
+		$encode = static fn(string $value): string => str_replace(['%3A', '%2F'], [':', '/'], rawurlencode($value));
+
+		if(stripos($host, 'hathitrust.org') !== false){
+			// https://babel.hathitrust.org/cgi/pt?id=hvd.32044034383265&seq=13
+			if($host !== 'babel.hathitrust.org'){
+				throw new Exceptions\InvalidHathiTrustUrlException();
+			}
+
+			if($path !== '/cgi/pt'){
+				throw new Exceptions\InvalidHathiTrustUrlException();
+			}
+
+			parse_str($parsedUrl['query'] ?? '', $vars);
+
+			if(!isset($vars['id']) || !isset($vars['seq']) || is_array($vars['id']) || is_array($vars['seq'])){
+				throw new Exceptions\InvalidHathiTrustUrlException();
+			}
+
+			$outputUrl = 'https://' . $host . $path . '?id=' . $encode($vars['id']) . '&seq=' . $encode($vars['seq']);
+		}
+
+		if(stripos($host, 'archive.org') !== false){
+			// https://archive.org/details/royalacademypict1902roya/page/n9/mode/1up
+
+			if($host !== 'archive.org'){
+				throw new Exceptions\InvalidInternetArchiveUrlException();
+			}
+
+			if(!preg_match('|^/details/[^/]+?/page/[^/]+/mode/1up$|ius', $path)){
+				throw new Exceptions\InvalidInternetArchiveUrlException();
+			}
+
+			// Query string and fragment are dropped on purpose.
+			$outputUrl = 'https://' . $host . $path;
+		}
+
+		if(stripos($host, 'google.com') !== false){
+			// Old style: https://books.google.com/books?id=mZpAAAAAYAAJ&pg=PA70-IA2
+			// New style: https://www.google.com/books/edition/_/mZpAAAAAYAAJ?gbpv=1&pg=PA70-IA2
+
+			if($host === 'books.google.com'){
+				// Old style, convert to new style
+
+				if($path !== '/books'){
+					throw new Exceptions\InvalidGoogleBooksUrlException();
+				}
+
+				parse_str($parsedUrl['query'] ?? '', $vars);
+
+				if(!isset($vars['id']) || !isset($vars['pg']) || is_array($vars['id']) || is_array($vars['pg'])){
+					throw new Exceptions\InvalidGoogleBooksUrlException();
+				}
+
+				$outputUrl = 'https://www.google.com/books/edition/_/' . $encode($vars['id']) . '?gbpv=1&pg=' . $encode($vars['pg']);
+			}
+			elseif($host === 'www.google.com'){
+				// New style
+
+				if(!preg_match('|^/books/edition/_/[^/]+$|ius', $path)){
+					throw new Exceptions\InvalidGoogleBooksUrlException();
+				}
+
+				parse_str($parsedUrl['query'] ?? '', $vars);
+
+				if(!isset($vars['gbpv']) || $vars['gbpv'] !== '1' || !isset($vars['pg']) || is_array($vars['pg'])){
+					throw new Exceptions\InvalidGoogleBooksUrlException();
+				}
+
+				// `gbpv` was just verified to be exactly '1', so it's written literally.
+				$outputUrl = 'https://' . $host . $path . '?gbpv=1&pg=' . $encode($vars['pg']);
+			}
+			else{
+				throw new Exceptions\InvalidGoogleBooksUrlException();
+			}
+		}
+
+		return $outputUrl;
+	}
+
 	/**
 	 * @param array $uploadedFile
 	 * @throws \Exceptions\ValidationException
diff --git a/lib/Exceptions/InvalidArtworkPageUrlException.php b/lib/Exceptions/InvalidArtworkPageUrlException.php
index 780ef934..72ffcb43 100644
--- a/lib/Exceptions/InvalidArtworkPageUrlException.php
+++ b/lib/Exceptions/InvalidArtworkPageUrlException.php
@@ -1,6 +1,6 @@