From 45631f1191205bea4a539ca275882726bc5a9b23 Mon Sep 17 00:00:00 2001
From: John Erik Halse
Date: Wed, 16 Dec 2020 20:28:25 +0100
Subject: [PATCH] Switched to java.net.URL for URL parsing

---
 veidemann-robotsevaluator-service/pom.xml     | 14 +++-----
 .../nna/veidemann/robotsparser/RobotsTxt.java |  6 ++--
 .../veidemann/robotsservice/RobotsCache.java  | 14 ++++----
 .../robotsservice/RobotsService.java          |  5 ++-
 .../robotsparser/RobotsTxtParserTest.java     | 11 +++----
 .../veidemann/robotsparser/RobotsTxtTest.java | 32 ++++++++++---------
 6 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/veidemann-robotsevaluator-service/pom.xml b/veidemann-robotsevaluator-service/pom.xml
index 95d2ce2..3d785e6 100644
--- a/veidemann-robotsevaluator-service/pom.xml
+++ b/veidemann-robotsevaluator-service/pom.xml
@@ -15,14 +15,14 @@
         ${env.DOCKER_PASSWORD}
         ${project.version}
-        2.13.0
+        2.13.3
         1.7.30
         1.2.4.Final
         4.4.0
-        0.4.5
-        0.4.1
+        0.5.3
+        0.4.9
@@ -82,7 +82,7 @@
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
-            <version>4.13</version>
+            <version>4.13.1</version>
             <scope>test</scope>
         </dependency>
@@ -98,12 +98,6 @@
             <version>${project.version}</version>
         </dependency>
 
-        <dependency>
-            <groupId>org.netpreserve.commons</groupId>
-            <artifactId>webarchive-commons-uri</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
-        </dependency>
-
         <dependency>
             <groupId>com.github.nlnwa</groupId>
             <artifactId>veidemann-commons</artifactId>
diff --git a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsparser/RobotsTxt.java b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsparser/RobotsTxt.java
index 7d73bba..2a3d92c 100644
--- a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsparser/RobotsTxt.java
+++ b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsparser/RobotsTxt.java
@@ -17,8 +17,8 @@
 
 import no.nb.nna.veidemann.api.robotsevaluator.v1.IsAllowedReply;
 import no.nb.nna.veidemann.api.robotsevaluator.v1.IsAllowedReply.OtherField;
-import org.netpreserve.commons.uri.Uri;
 
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
@@ -51,7 +51,7 @@ void addOtherField(String name, String value) {
         otherFields.add(OtherField.newBuilder().setName(name).setValue(value).build());
     }
 
-    public IsAllowedReply isAllowed(String userAgent, Uri uri) {
+    public IsAllowedReply isAllowed(String userAgent, URL uri) {
         String ua = USER_AGENT_PARSER.parse(userAgent);
 
         IsAllowedReply reply = findMatchingDirectives(ua)
@@ -145,7 +145,7 @@ MatchedDirectiveGroup compareUA(final String ua, final String parsedUserAgent) {
             return new MatchedDirectiveGroup(i, this);
         }
 
-        boolean isAllowed(Uri uri) {
+        boolean isAllowed(URL uri) {
             final String path = uri.getPath();
             Optional match = directives.stream()
                     .map(d -> d.comparePath(path))
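
A note on the RobotsTxt change above: directive matching now operates on java.net.URL.getPath(), which returns the path exactly as it appeared in the input, while the removed WHATWG parser normalizes paths (dot segments and the like) during parsing. A minimal sketch, not part of the patch, showing what the matcher now receives:

import java.net.MalformedURLException;
import java.net.URL;

// Standalone check of java.net.URL path behavior: dot segments are kept
// verbatim, and a URL without a path yields the empty string, not "/".
public class UrlPathCheck {
    public static void main(String[] args) throws MalformedURLException {
        System.out.println(new URL("http://example.com/allowed").getPath());      // /allowed
        System.out.println(new URL("http://example.com/a/../denied").getPath());  // /a/../denied
        System.out.println("[" + new URL("http://example.com").getPath() + "]");  // [] (empty)
    }
}
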
diff --git a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsCache.java b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsCache.java
index e8d7f25..4f97679 100644
--- a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsCache.java
+++ b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsCache.java
@@ -25,7 +25,6 @@
 import org.cache2k.Cache2kBuilder;
 import org.cache2k.expiry.ExpiryTimeValues;
 import org.cache2k.integration.CacheLoader;
-import org.netpreserve.commons.uri.Uri;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -37,12 +36,11 @@
 import javax.net.ssl.X509TrustManager;
 import java.net.InetSocketAddress;
 import java.net.Proxy;
+import java.net.URL;
 import java.security.cert.CertificateException;
 import java.util.Objects;
 
-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.COLLECTION_ID;
-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.EXECUTION_ID;
-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.JOB_EXECUTION_ID;
+import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.*;
 
 /**
  *
@@ -107,7 +105,7 @@ public RobotsTxt load(CacheKey key) throws Exception {
                 .build();
     }
 
-    public RobotsTxt get(final Uri uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
+    public RobotsTxt get(final URL uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
         return cache.get(new CacheKey(uri, ttlSeconds, executionId, jobExecutionId, collectionId));
     }
 
@@ -132,10 +130,10 @@ public static final class CacheKey {
         private final String collectionId;
 
-        public CacheKey(final Uri uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
-            this.protocol = uri.getScheme();
+        public CacheKey(final URL uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
+            this.protocol = uri.getProtocol();
             this.domain = uri.getHost();
-            this.port = uri.getDecodedPort();
+            this.port = uri.getPort() == -1 ? uri.getDefaultPort() : uri.getPort();
             this.ttlSeconds = ttlSeconds;
             this.executionId = executionId;
             this.jobExecutionId = jobExecutionId;
             this.collectionId = collectionId;
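
The port handling in CacheKey is worth a closer look: URL.getPort() returns -1 when no port is spelled out, so the constructor falls back to getDefaultPort() to keep cache keys stable across equivalent URLs. The same expression, runnable standalone:

import java.net.MalformedURLException;
import java.net.URL;

// Port normalization as used by the new CacheKey: an explicit port wins,
// otherwise the protocol's default (80 for http, 443 for https) is used.
public class PortNormalization {
    static int effectivePort(URL url) {
        return url.getPort() == -1 ? url.getDefaultPort() : url.getPort();
    }

    public static void main(String[] args) throws MalformedURLException {
        System.out.println(effectivePort(new URL("http://example.com/robots.txt")));      // 80
        System.out.println(effectivePort(new URL("https://example.com/robots.txt")));     // 443
        System.out.println(effectivePort(new URL("http://example.com:8080/robots.txt"))); // 8080
    }
}

Without the fallback, http://example.com/robots.txt and http://example.com:80/robots.txt would produce different cache keys for the same resource.
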
diff --git a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsService.java b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsService.java
index 31c1440..4e44efa 100644
--- a/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsService.java
+++ b/veidemann-robotsevaluator-service/src/main/java/no/nb/nna/veidemann/robotsservice/RobotsService.java
@@ -22,11 +22,10 @@
 import no.nb.nna.veidemann.api.robotsevaluator.v1.RobotsEvaluatorGrpc;
 import no.nb.nna.veidemann.robotsparser.RobotsTxt;
 import no.nb.nna.veidemann.robotsparser.RobotsTxtParser;
-import org.netpreserve.commons.uri.Uri;
-import org.netpreserve.commons.uri.UriConfigs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.net.URL;
 import java.util.Objects;
 
 import static no.nb.nna.veidemann.robotsparser.RobotsTxt.EMPTY_ALLOWED_REPLY;
@@ -56,7 +55,7 @@ public void isAllowed(IsAllowedRequest request, StreamObserver<IsAllowedReply> r
         Objects.requireNonNull(request.getUserAgent());
         Objects.requireNonNull(request.getCollectionRef());
         try {
-            Uri uri = UriConfigs.WHATWG.buildUri(request.getUri());
+            URL uri = new URL(request.getUri());
             int ttlSeconds = request.getPoliteness().getPolitenessConfig().getMinimumRobotsValidityDurationS();
             if (ttlSeconds == 0) {
                 ttlSeconds = 300;
diff --git a/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtParserTest.java b/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtParserTest.java
index bd9bb9a..6b0c086 100644
--- a/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtParserTest.java
+++ b/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtParserTest.java
@@ -18,10 +18,9 @@
 
 import org.antlr.v4.runtime.CharStreams;
 import org.junit.Test;
-import org.netpreserve.commons.uri.Uri;
-import org.netpreserve.commons.uri.UriConfigs;
 
 import java.io.IOException;
+import java.net.URL;
 
 import static org.assertj.core.api.Assertions.assertThat;
@@ -37,8 +36,8 @@ public void checkIsAllowed() throws IOException {
         RobotsTxtParser parser = new RobotsTxtParser();
         RobotsTxt robots = parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots1.txt"), "robots1.txt");
 
-        Uri denied = UriConfigs.WHATWG.buildUri("http://example.com/denied");
-        Uri allowed = UriConfigs.WHATWG.buildUri("http://example.com/allowed");
+        URL denied = new URL("http://example.com/denied");
+        URL allowed = new URL("http://example.com/allowed");
 
         assertThat(robots.isAllowed(BOT1, denied).getIsAllowed()).isFalse();
         assertThat(robots.isAllowed(BOT1, allowed).getIsAllowed()).isTrue();
@@ -52,8 +51,8 @@ public void checkGrammar() throws IOException {
         parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots3.txt"), "robots3.txt");
         RobotsTxt robots = parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots4.txt"), "robots4.txt");
 
-        Uri denied = UriConfigs.WHATWG.buildUri("http://example.com/test6");
-        Uri allowed = UriConfigs.WHATWG.buildUri("http://example.com/test9");
+        URL denied = new URL("http://example.com/test6");
+        URL allowed = new URL("http://example.com/test9");
 
         assertThat(robots.isAllowed(BOT2, denied).getIsAllowed()).isFalse();
         assertThat(robots.isAllowed(BOT2, denied).getCrawlDelay()).isEqualTo(7.0f);
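
One behavioral difference hiding in the RobotsService hunk: the WHATWG-style parser was lenient, whereas the java.net.URL constructor throws the checked MalformedURLException, for example when the scheme has no registered protocol handler. Since the new URL(...) call sits inside the existing try block, parse failures now flow through the same error path as fetch failures. A small sketch of that failure mode (the input URLs are made up):

import java.net.MalformedURLException;
import java.net.URL;

// java.net.URL only accepts protocols with a registered stream handler;
// anything else fails at construction time rather than at fetch time.
public class UrlParseFailure {
    public static void main(String[] args) {
        for (String candidate : new String[]{"http://example.com/robots.txt", "foo://example.com/"}) {
            try {
                System.out.println("parsed:   " + new URL(candidate));
            } catch (MalformedURLException e) {
                System.out.println("rejected: " + candidate + " (" + e.getMessage() + ")");
            }
        }
    }
}
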
diff --git a/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtTest.java b/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtTest.java
index fe567cc..7c99df4 100644
--- a/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtTest.java
+++ b/veidemann-robotsevaluator-service/src/test/java/no/nb/nna/veidemann/robotsparser/RobotsTxtTest.java
@@ -17,7 +17,9 @@
 package no.nb.nna.veidemann.robotsparser;
 
 import org.junit.Test;
-import org.netpreserve.commons.uri.UriConfigs;
+
+import java.net.MalformedURLException;
+import java.net.URL;
 
 import static org.assertj.core.api.Assertions.assertThat;
 
@@ -42,33 +44,33 @@ public void testDirectiveGroup_matchUserAgent() {
     }
 
     @Test
-    public void testDirectiveGroup_isAllowed() {
+    public void testDirectiveGroup_isAllowed() throws MalformedURLException {
         RobotsTxt.DirectiveGroup directiveGroup;
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/p"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-        assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page"))).isTrue();
+        assertThat(directiveGroup.isAllowed(new URL("http://example.com/page"))).isTrue();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/folder/"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/folder"));
-        assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/folder/page"))).isTrue();
+        assertThat(directiveGroup.isAllowed(new URL("http://example.com/folder/page"))).isTrue();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/page"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/*.htm"));
-        assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page.htm"))).isFalse();
+        assertThat(directiveGroup.isAllowed(new URL("http://example.com/page.htm"))).isFalse();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/$"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-        assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/"))).isTrue();
+        assertThat(directiveGroup.isAllowed(new URL("http://example.com/"))).isTrue();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/$"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-        assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page.htm"))).isFalse();
+        assertThat(directiveGroup.isAllowed(new URL("http://example.com/page.htm"))).isFalse();
     }
 
     @Test
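
For readers skimming the assertions above: a directive path is a prefix pattern in which '*' matches any run of characters and a trailing '$' anchors the pattern to the end of the path. That is why "/$" allows exactly the root while "/*.htm" still catches /page.htm. A toy matcher illustrating the rule the tests encode (deliberately not the project's Directive.comparePath):

import java.util.regex.Pattern;

// Robots.txt style path matching: '*' becomes ".*", a trailing '$' anchors
// the match, and everything else is compared literally.
public class RobotsPattern {
    static boolean matches(String directivePath, String path) {
        boolean anchored = directivePath.endsWith("$");
        String p = anchored ? directivePath.substring(0, directivePath.length() - 1) : directivePath;
        StringBuilder re = new StringBuilder();
        for (char c : p.toCharArray()) {
            re.append(c == '*' ? ".*" : Pattern.quote(String.valueOf(c)));
        }
        if (!anchored) {
            re.append(".*"); // an unanchored directive is a prefix match
        }
        return Pattern.matches(re.toString(), path);
    }

    public static void main(String[] args) {
        System.out.println(matches("/*.htm", "/page.htm")); // true, disallowed above
        System.out.println(matches("/$", "/"));             // true, only the root matches
        System.out.println(matches("/$", "/page.htm"));     // false
    }
}
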
@@ -140,11 +142,11 @@ public void testDirective_comparePath() {
      * Test of isAllowed method, of class RobotsTxt.
      */
     @Test
-    public void testIsAllowed() {
+    public void testIsAllowed() throws MalformedURLException {
         RobotsTxt robots = new RobotsTxt("test");
 
         // Test that empty robots.txt allows all
-        assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/page")).getIsAllowed()).isTrue();
+        assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/page")).getIsAllowed()).isTrue();
 
         RobotsTxt.DirectiveGroup directiveGroup;
         directiveGroup = new RobotsTxt.DirectiveGroup();
@@ -152,7 +154,7 @@ public void testIsAllowed() {
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/p"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
         robots.directives.add(directiveGroup);
-        assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/page")).getIsAllowed()).isTrue();
+        assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/page")).getIsAllowed()).isTrue();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
         directiveGroup.userAgents.add("googlebot");
@@ -160,12 +162,12 @@ public void testIsAllowed() {
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/folder"));
         robots.directives.add(directiveGroup);
         assertThat(robots.isAllowed("Googlebot/2.1 (+http://www.google.com/bot.html)",
-                UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed()).isTrue();
-        assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+                new URL("http://example.com/folder/page")).getIsAllowed()).isTrue();
+        assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/folder/page")).getIsAllowed())
                 .isFalse();
-        assertThat(robots.isAllowed("googlebo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+        assertThat(robots.isAllowed("googlebo", new URL("http://example.com/folder/page")).getIsAllowed())
                 .isTrue();
-        assertThat(robots.isAllowed("foo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+        assertThat(robots.isAllowed("foo", new URL("http://example.com/folder/page")).getIsAllowed())
                 .isTrue();
 
         directiveGroup = new RobotsTxt.DirectiveGroup();
@@ -173,7 +175,7 @@ public void testIsAllowed() {
         directiveGroup.userAgents.add("googlebot-news");
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/page"));
         directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/*.htm"));
         robots.directives.add(directiveGroup);
-        assertThat(robots.isAllowed("foo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+        assertThat(robots.isAllowed("foo", new URL("http://example.com/folder/page")).getIsAllowed())
                 .isTrue();
     }
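
The final test also pins down user-agent group selection: the "googlebot" group captures both "googlebot-news" and the full "Googlebot/2.1 (+http://www.google.com/bot.html)" product string, while "googlebo" and "foo" fall through to the default allow. That is consistent with case-insensitive prefix matching on the group's user-agent token. The sketch below (selectGroup is illustrative, not the project's API) additionally breaks ties by taking the longest token, an assumption these tests do not assert:

import java.util.Comparator;
import java.util.List;
import java.util.Optional;

// A group applies when its user-agent token is a (case-insensitive) prefix
// of the crawler's user agent; the most specific matching token wins.
public class UserAgentMatch {
    static Optional<String> selectGroup(List<String> groupTokens, String userAgent) {
        String ua = userAgent.toLowerCase();
        return groupTokens.stream()
                .filter(token -> ua.startsWith(token.toLowerCase()))
                .max(Comparator.comparingInt(String::length));
    }

    public static void main(String[] args) {
        List<String> groups = List.of("googlebot", "googlebot-news");
        System.out.println(selectGroup(groups, "googlebot-news"));                                   // googlebot-news
        System.out.println(selectGroup(groups, "Googlebot/2.1 (+http://www.google.com/bot.html)")); // googlebot
        System.out.println(selectGroup(groups, "googlebo"));                                         // empty
    }
}
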