
Commit

Merge pull request #9 from nlnwa/javaURL
Switched to java.net.URL for URL parsing
maeb authored Dec 17, 2020
2 parents cfdb36a + 45631f1 commit 43f2d10
Showing 6 changed files with 37 additions and 45 deletions.
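
The change replaces org.netpreserve.commons.uri.Uri (from the webarchive-commons-uri dependency dropped below) with the JDK's java.net.URL everywhere a robots.txt request URI is parsed or inspected. A minimal sketch of the java.net.URL accessors the new code relies on; the example URL and class name are illustrative, not taken from the diff:

import java.net.MalformedURLException;
import java.net.URL;

// Illustration of the java.net.URL accessors this commit switches to.
// Unlike the old Uri type, the URL constructor throws a checked MalformedURLException.
public class UrlAccessorsSketch {
    public static void main(String[] args) throws MalformedURLException {
        URL uri = new URL("http://example.com/some/path");

        System.out.println(uri.getProtocol()); // "http" (the old code called Uri#getScheme())
        System.out.println(uri.getHost());     // "example.com"
        System.out.println(uri.getPath());     // "/some/path"

        // getPort() is -1 when the URL has no explicit port; getDefaultPort()
        // supplies the scheme default (80 for http, 443 for https).
        int port = uri.getPort() == -1 ? uri.getDefaultPort() : uri.getPort();
        System.out.println(port);              // 80
    }
}
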
14 changes: 4 additions & 10 deletions veidemann-robotsevaluator-service/pom.xml
@@ -15,14 +15,14 @@
 <docker.password>${env.DOCKER_PASSWORD}</docker.password>
 <docker.tag>${project.version}</docker.tag>

-<log4j.version>2.13.0</log4j.version>
+<log4j.version>2.13.3</log4j.version>
 <slf4j.version>1.7.30</slf4j.version>

 <cache2k-version>1.2.4.Final</cache2k-version>
 <okhttp3.version>4.4.0</okhttp3.version>

-<veidemann.rethinkdbadapter.version>0.4.5</veidemann.rethinkdbadapter.version>
-<veidemann.commons.version>0.4.1</veidemann.commons.version>
+<veidemann.rethinkdbadapter.version>0.5.3</veidemann.rethinkdbadapter.version>
+<veidemann.commons.version>0.4.9</veidemann.commons.version>
 </properties>

 <repositories>
@@ -82,7 +82,7 @@
 <dependency>
 <groupId>junit</groupId>
 <artifactId>junit</artifactId>
-<version>4.13</version>
+<version>4.13.1</version>
 <scope>test</scope>
 </dependency>
 <dependency>
@@ -98,12 +98,6 @@
 <version>${project.version}</version>
 </dependency>

-<dependency>
-<groupId>org.netpreserve.commons</groupId>
-<artifactId>webarchive-commons-uri</artifactId>
-<version>2.0.0-SNAPSHOT</version>
-</dependency>
-
 <dependency>
 <groupId>com.github.nlnwa</groupId>
 <artifactId>veidemann-commons</artifactId>
@@ -17,8 +17,8 @@

 import no.nb.nna.veidemann.api.robotsevaluator.v1.IsAllowedReply;
 import no.nb.nna.veidemann.api.robotsevaluator.v1.IsAllowedReply.OtherField;
-import org.netpreserve.commons.uri.Uri;

+import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
@@ -51,7 +51,7 @@ void addOtherField(String name, String value) {
 otherFields.add(OtherField.newBuilder().setName(name).setValue(value).build());
 }

-public IsAllowedReply isAllowed(String userAgent, Uri uri) {
+public IsAllowedReply isAllowed(String userAgent, URL uri) {
 String ua = USER_AGENT_PARSER.parse(userAgent);

 IsAllowedReply reply = findMatchingDirectives(ua)
@@ -145,7 +145,7 @@ MatchedDirectiveGroup compareUA(final String ua, final String parsedUserAgent) {
 return new MatchedDirectiveGroup(i, this);
 }

-boolean isAllowed(Uri uri) {
+boolean isAllowed(URL uri) {
 final String path = uri.getPath();
 Optional<MatchedDirective> match = directives.stream()
 .map(d -> d.comparePath(path))
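
RobotsTxt.isAllowed (changed above) now accepts a java.net.URL and matches directives against URL#getPath(). A small sketch, separate from the diff, of what getPath() yields for inputs like the ones used in the tests further down; note that query strings are not part of getPath():

import java.net.MalformedURLException;
import java.net.URL;

// Illustrative values fed into directive matching after this change.
public class PathForMatchingSketch {
    public static void main(String[] args) throws MalformedURLException {
        System.out.println(new URL("http://example.com/folder/page").getPath());   // "/folder/page"
        System.out.println(new URL("http://example.com/").getPath());              // "/"
        System.out.println(new URL("http://example.com").getPath());               // "" (empty when no path is given)
        System.out.println(new URL("http://example.com/page.htm?id=1").getPath()); // "/page.htm" (query excluded)
    }
}
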
@@ -25,7 +25,6 @@
 import org.cache2k.Cache2kBuilder;
 import org.cache2k.expiry.ExpiryTimeValues;
 import org.cache2k.integration.CacheLoader;
-import org.netpreserve.commons.uri.Uri;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -37,12 +36,11 @@
 import javax.net.ssl.X509TrustManager;
 import java.net.InetSocketAddress;
 import java.net.Proxy;
+import java.net.URL;
 import java.security.cert.CertificateException;
 import java.util.Objects;

-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.COLLECTION_ID;
-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.EXECUTION_ID;
-import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.JOB_EXECUTION_ID;
+import static no.nb.nna.veidemann.commons.VeidemannHeaderConstants.*;

 /**
 *
@@ -107,7 +105,7 @@ public RobotsTxt load(CacheKey key) throws Exception {
 .build();
 }

-public RobotsTxt get(final Uri uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
+public RobotsTxt get(final URL uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
 return cache.get(new CacheKey(uri, ttlSeconds, executionId, jobExecutionId, collectionId));
 }

@@ -132,10 +130,10 @@ public static final class CacheKey {

 private final String collectionId;

-public CacheKey(final Uri uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
-this.protocol = uri.getScheme();
+public CacheKey(final URL uri, final int ttlSeconds, final String executionId, final String jobExecutionId, final String collectionId) {
+this.protocol = uri.getProtocol();
 this.domain = uri.getHost();
-this.port = uri.getDecodedPort();
+this.port = uri.getPort() == -1 ? uri.getDefaultPort() : uri.getPort();
 this.ttlSeconds = ttlSeconds;
 this.executionId = executionId;
 this.jobExecutionId = jobExecutionId;
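
The CacheKey above now derives protocol, host and port from java.net.URL. Because URL#getPort() returns -1 when the port is implicit, the constructor falls back to the scheme default so that equivalent URLs map to the same cache entry. A sketch of that normalisation, with the helper name assumed for illustration:

import java.net.MalformedURLException;
import java.net.URL;

// Same expression as in the new CacheKey constructor, extracted into a helper for illustration.
public class PortNormalisationSketch {
    static int normalisedPort(URL uri) {
        // -1 means "no explicit port in the URL"; use the scheme default instead.
        return uri.getPort() == -1 ? uri.getDefaultPort() : uri.getPort();
    }

    public static void main(String[] args) throws MalformedURLException {
        System.out.println(normalisedPort(new URL("http://example.com/robots.txt")));    // 80
        System.out.println(normalisedPort(new URL("http://example.com:80/robots.txt"))); // 80 (same key as above)
        System.out.println(normalisedPort(new URL("https://example.com:8443/")));        // 8443
    }
}
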
@@ -22,11 +22,10 @@
 import no.nb.nna.veidemann.api.robotsevaluator.v1.RobotsEvaluatorGrpc;
 import no.nb.nna.veidemann.robotsparser.RobotsTxt;
 import no.nb.nna.veidemann.robotsparser.RobotsTxtParser;
-import org.netpreserve.commons.uri.Uri;
-import org.netpreserve.commons.uri.UriConfigs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.net.URL;
 import java.util.Objects;

 import static no.nb.nna.veidemann.robotsparser.RobotsTxt.EMPTY_ALLOWED_REPLY;
@@ -56,7 +55,7 @@ public void isAllowed(IsAllowedRequest request, StreamObserver<IsAllowedReply> r
 Objects.requireNonNull(request.getUserAgent());
 Objects.requireNonNull(request.getCollectionRef());
 try {
-Uri uri = UriConfigs.WHATWG.buildUri(request.getUri());
+URL uri = new URL(request.getUri());
 int ttlSeconds = request.getPoliteness().getPolitenessConfig().getMinimumRobotsValidityDurationS();
 if (ttlSeconds == 0) {
 ttlSeconds = 300;
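
In the service handler above, UriConfigs.WHATWG.buildUri(request.getUri()) is replaced by the URL constructor, which throws a checked MalformedURLException; the call already sits inside the handler's try block, so a malformed request URI can be handled by whatever catch clause follows (not shown in this hunk). A minimal sketch of that parse step, with class and method names assumed for illustration:

import java.net.MalformedURLException;
import java.net.URL;

// Not the actual service code: just the parsing step and its checked exception.
public class ParseRequestUriSketch {
    static URL parse(String requestUri) throws MalformedURLException {
        return new URL(requestUri);
    }

    public static void main(String[] args) {
        try {
            URL uri = parse("http://example.com/robots.txt");
            System.out.println(uri.getHost()); // "example.com"
        } catch (MalformedURLException e) {
            // e.g. an unknown protocol or an otherwise unparsable URI string
            System.err.println("Could not parse request URI: " + e.getMessage());
        }
    }
}
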
@@ -18,10 +18,9 @@

 import org.antlr.v4.runtime.CharStreams;
 import org.junit.Test;
-import org.netpreserve.commons.uri.Uri;
-import org.netpreserve.commons.uri.UriConfigs;

 import java.io.IOException;
+import java.net.URL;

 import static org.assertj.core.api.Assertions.assertThat;

@@ -37,8 +36,8 @@ public void checkIsAllowed() throws IOException {
 RobotsTxtParser parser = new RobotsTxtParser();
 RobotsTxt robots = parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots1.txt"), "robots1.txt");

-Uri denied = UriConfigs.WHATWG.buildUri("http://example.com/denied");
-Uri allowed = UriConfigs.WHATWG.buildUri("http://example.com/allowed");
+URL denied = new URL("http://example.com/denied");
+URL allowed = new URL("http://example.com/allowed");

 assertThat(robots.isAllowed(BOT1, denied).getIsAllowed()).isFalse();
 assertThat(robots.isAllowed(BOT1, allowed).getIsAllowed()).isTrue();
@@ -52,8 +51,8 @@ public void checkGrammar() throws IOException {
 parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots3.txt"), "robots3.txt");

 RobotsTxt robots = parser.parse(CharStreams.fromFileName("src/test/resources/examples/robotstxt/robots4.txt"), "robots4.txt");
-Uri denied = UriConfigs.WHATWG.buildUri("http://example.com/test6");
-Uri allowed = UriConfigs.WHATWG.buildUri("http://example.com/test9");
+URL denied = new URL("http://example.com/test6");
+URL allowed = new URL("http://example.com/test9");

 assertThat(robots.isAllowed(BOT2, denied).getIsAllowed()).isFalse();
 assertThat(robots.isAllowed(BOT2, denied).getCrawlDelay()).isEqualTo(7.0f);
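
Note that checkIsAllowed() and checkGrammar() above keep their existing throws IOException clauses: MalformedURLException is a subclass of IOException, so the new URL constructor is already covered there, whereas the test methods in the following file add throws MalformedURLException explicitly. A tiny sketch of that relationship (class name assumed):

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

// MalformedURLException extends IOException, so a `throws IOException`
// signature already covers `new URL(...)`.
public class ThrowsClauseSketch {
    static URL parse() throws IOException {
        return new URL("http://example.com/allowed");
    }

    public static void main(String[] args) throws IOException {
        System.out.println(parse());
        System.out.println(MalformedURLException.class.getSuperclass()); // class java.io.IOException
    }
}
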
@@ -17,7 +17,9 @@
 package no.nb.nna.veidemann.robotsparser;

 import org.junit.Test;
-import org.netpreserve.commons.uri.UriConfigs;

+import java.net.MalformedURLException;
+import java.net.URL;
+
 import static org.assertj.core.api.Assertions.assertThat;

@@ -42,33 +44,33 @@ public void testDirectiveGroup_matchUserAgent() {
 }

 @Test
-public void testDirectiveGroup_isAllowed() {
+public void testDirectiveGroup_isAllowed() throws MalformedURLException {
 RobotsTxt.DirectiveGroup directiveGroup;

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/p"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page"))).isTrue();
+assertThat(directiveGroup.isAllowed(new URL("http://example.com/page"))).isTrue();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/folder/"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/folder"));
-assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/folder/page"))).isTrue();
+assertThat(directiveGroup.isAllowed(new URL("http://example.com/folder/page"))).isTrue();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/page"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/*.htm"));
-assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page.htm"))).isFalse();
+assertThat(directiveGroup.isAllowed(new URL("http://example.com/page.htm"))).isFalse();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/$"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/"))).isTrue();
+assertThat(directiveGroup.isAllowed(new URL("http://example.com/"))).isTrue();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/$"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
-assertThat(directiveGroup.isAllowed(UriConfigs.WHATWG.buildUri("http://example.com/page.htm"))).isFalse();
+assertThat(directiveGroup.isAllowed(new URL("http://example.com/page.htm"))).isFalse();
 }

 @Test
@@ -140,40 +142,40 @@ public void testDirective_comparePath() {
 * Test of isAllowed method, of class RobotsTxt.
 */
 @Test
-public void testIsAllowed() {
+public void testIsAllowed() throws MalformedURLException {
 RobotsTxt robots = new RobotsTxt("test");

 // Test that empty robots.txt allows all
-assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/page")).getIsAllowed()).isTrue();
+assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/page")).getIsAllowed()).isTrue();

 RobotsTxt.DirectiveGroup directiveGroup;
 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.userAgents.add("googlebot-news");
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/p"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/"));
 robots.directives.add(directiveGroup);
-assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/page")).getIsAllowed()).isTrue();
+assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/page")).getIsAllowed()).isTrue();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.userAgents.add("googlebot");
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/folder/"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/folder"));
 robots.directives.add(directiveGroup);
 assertThat(robots.isAllowed("Googlebot/2.1 (+http://www.google.com/bot.html)",
-UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed()).isTrue();
-assertThat(robots.isAllowed("googlebot-news", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+new URL("http://example.com/folder/page")).getIsAllowed()).isTrue();
+assertThat(robots.isAllowed("googlebot-news", new URL("http://example.com/folder/page")).getIsAllowed())
 .isFalse();
-assertThat(robots.isAllowed("googlebo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+assertThat(robots.isAllowed("googlebo", new URL("http://example.com/folder/page")).getIsAllowed())
 .isTrue();
-assertThat(robots.isAllowed("foo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+assertThat(robots.isAllowed("foo", new URL("http://example.com/folder/page")).getIsAllowed())
 .isTrue();

 directiveGroup = new RobotsTxt.DirectiveGroup();
 directiveGroup.userAgents.add("*");
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.ALLOW, "/page"));
 directiveGroup.addDirective(new RobotsTxt.Directive(RobotsTxt.DirectiveType.DISALLOW, "/*.htm"));
 robots.directives.add(directiveGroup);
-assertThat(robots.isAllowed("foo", UriConfigs.WHATWG.buildUri("http://example.com/folder/page")).getIsAllowed())
+assertThat(robots.isAllowed("foo", new URL("http://example.com/folder/page")).getIsAllowed())
 .isTrue();
 }

