Enhanced documentation, improved sitemap handling, and optimized object equality in request and sitemap entities.
marevol committed Feb 16, 2025
1 parent 0089343 commit 3efbb1d
Showing 6 changed files with 158 additions and 11 deletions.
RequestDataBuilder.java
@@ -20,9 +20,20 @@
import org.codelibs.fess.crawler.entity.RequestData.Method;

/**
* Builder class to create a request.
* Builder class for creating RequestData objects using a fluent interface.
* This class provides a simple way to construct RequestData objects with method chaining.
*
* @author shinsuke
* <p>Usage example:</p>
* <pre>
* RequestData request = RequestDataBuilder.newRequestData()
* .method("GET")
* .url("https://example.com")
* .weight(1.0f)
* .build();
* </pre>
*
* <p>The builder is implemented using an inner class {@link RequestDataContext} that handles
* the actual construction of the RequestData object.</p>
*
*/
public final class RequestDataBuilder {
@@ -70,16 +81,33 @@ public RequestDataContext post() {
return method(Method.POST);
}

/**
* Sets the URL for this request data.
*
* @param url the URL string to be set
* @return the current RequestDataContext instance for method chaining
*/
public RequestDataContext url(final String url) {
data.setUrl(url);
return this;
}

/**
* Sets the weight for the request data.
*
* @param weight the weight to set
* @return the current RequestDataContext instance
*/
public RequestDataContext weight(final float weight) {
data.setWeight(weight);
return this;
}

/**
* Builds and returns the constructed RequestData object.
*
* @return the constructed RequestData object
*/
public RequestData build() {
return data;
}
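As a companion to the new javadoc, a minimal sketch of the fluent API. Everything shown is taken from this diff except the get() shortcut, assumed to mirror the post() method above, and the builder's org.codelibs.fess.crawler.util package, which is an assumption (only the entity package is visible in this hunk):

import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.util.RequestDataBuilder; // package assumed

public class RequestDataBuilderExample {
    public static void main(final String[] args) {
        // Chain the context methods and finish with build().
        final RequestData request = RequestDataBuilder.newRequestData()
                .get()                       // assumed GET counterpart of post()
                .url("https://example.com/") // sets the target URL
                .weight(1.0f)                // sets the crawl weight
                .build();                    // returns the accumulated RequestData
        System.out.println(request.getUrl());
    }
}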
RobotsTxt.java
@@ -23,13 +23,42 @@

import org.codelibs.core.lang.StringUtil;

/**
* Represents a robots.txt file parser and handler.
* This class manages the rules defined in a robots.txt file, including user agent directives,
* allowed/disallowed paths, crawl delays, and sitemap URLs.
*
* <p>The robots.txt protocol is implemented according to the standard specification,
* supporting pattern matching for user agents, path-based access control, and crawl delay settings.</p>
*
* <p>Key features:</p>
* <ul>
* <li>Supports multiple user-agent directives with pattern matching</li>
* <li>Handles Allow and Disallow rules for path-based access control</li>
* <li>Manages crawl delay settings per user agent</li>
* <li>Stores sitemap URLs listed in robots.txt</li>
* </ul>
*
* <p>The class uses case-insensitive pattern matching for user agents and supports
* wildcard characters (*) in user agent strings. When multiple directives match a user agent,
* the most specific (longest) match is used.</p>
*
*/
public class RobotsTxt {
private static final String ALL_BOTS = "*";

protected final Map<Pattern, Directive> directiveMap = new LinkedHashMap<>();

private final List<String> sitemapList = new ArrayList<>();

/**
* Checks if access to a given path is allowed for a specific user agent according to robots.txt rules.
*
* @param path The path to check for access permission
* @param userAgent The user agent string to check against robots.txt directives
* @return true if access is allowed, false if access is disallowed by robots.txt rules.
* Returns true if no matching directive is found for the user agent.
*/
public boolean allows(final String path, final String userAgent) {
final Directive directive = getMatchedDirective(userAgent);
if (directive == null) {
@@ -38,6 +67,14 @@ public boolean allows(final String path, final String userAgent) {
return directive.allows(path);
}

/**
* Gets the crawl delay value for the specified user agent from robots.txt.
* The crawl delay specifies the time (in seconds) to wait between successive requests.
*
* @param userAgent The user agent string to match against robots.txt directives
* @return The crawl delay value in seconds. Returns 0 if no matching directive is found
* or no crawl delay is specified for the matching directive.
*/
public int getCrawlDelay(final String userAgent) {
final Directive directive = getMatchedDirective(userAgent);
if (directive == null) {
@@ -46,6 +83,15 @@ public int getCrawlDelay(final String userAgent) {
return directive.getCrawlDelay();
}

/**
* Returns the most specific directive matching the given user agent.
* The method finds the longest matching user agent pattern in the directives,
* excluding the general "*" pattern which matches all bots.
*
* @param userAgent the user agent string to match against directives,
* can be null (treated as empty string)
* @return the most specific matching directive, or null if no directive matches
*/
public Directive getMatchedDirective(final String userAgent) {
final String target;
if (userAgent == null) {
@@ -74,6 +120,12 @@ public Directive getMatchedDirective(final String userAgent) {
return matchedDirective;
}

/**
* Retrieves the robots.txt directive for the specified user agent.
*
* @param userAgent The user agent string to look up in the directives
* @return The Directive object matching the user agent, or null if no matching directive is found or if userAgent is null
*/
public Directive getDirective(final String userAgent) {
if (userAgent == null) {
return null;
@@ -86,18 +138,41 @@ public Directive getDirective(final String userAgent) {
return null;
}

/**
* Adds a directive to the robots.txt rules.
* The user-agent pattern in the directive is converted to a regular expression pattern,
* where '*' is replaced with '.*' for pattern matching, and stored case-insensitively.
*
* @param directive The directive to add to the robots.txt rules
*/
public void addDirective(final Directive directive) {
directiveMap.put(Pattern.compile(directive.getUserAgent().replace("*", ".*"), Pattern.CASE_INSENSITIVE), directive);
}

/**
* Adds a sitemap URL to the list of sitemaps.
*
* @param url The URL of the sitemap to be added
*/
public void addSitemap(final String url) {
sitemapList.add(url);
if (!sitemapList.contains(url)) {
sitemapList.add(url);
}
}

/**
* Returns an array of sitemap URLs.
*
* @return an array of sitemap URLs
*/
public String[] getSitemaps() {
return sitemapList.toArray(new String[sitemapList.size()]);
}

/**
* Represents a directive in a robots.txt file.
* A directive consists of a user agent, crawl delay, allowed paths, and disallowed paths.
*/
public static class Directive {
private final String userAgent;

@@ -138,11 +213,15 @@ public boolean allows(final String path) {
}

public void addAllow(final String path) {
allowedPaths.add(path);
if (!allowedPaths.contains(path)) {
allowedPaths.add(path);
}
}

public void addDisallow(final String path) {
disallowedPaths.add(path);
if (!disallowedPaths.contains(path)) {
disallowedPaths.add(path);
}
}

public String[] getAllows() {
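To tie the pieces of RobotsTxt together, a sketch of directive matching, access checks, and the new sitemap de-duplication. The single-argument Directive constructor is hypothetical (this diff shows only the class's fields and add/get methods); everything else uses methods visible above:

import org.codelibs.fess.crawler.entity.RobotsTxt;
import org.codelibs.fess.crawler.entity.RobotsTxt.Directive;

public class RobotsTxtExample {
    public static void main(final String[] args) {
        final RobotsTxt robotsTxt = new RobotsTxt();

        // A catch-all directive plus a more specific one;
        // getMatchedDirective() prefers the longest non-"*" user-agent match.
        final Directive allBots = new Directive("*"); // constructor assumed
        allBots.addDisallow("/private/");
        robotsTxt.addDirective(allBots);

        final Directive fessBot = new Directive("FessCrawler"); // constructor assumed
        fessBot.addAllow("/private/docs/");
        robotsTxt.addDirective(fessBot);

        // "FessCrawler" matches the specific directive; other agents fall back to "*".
        System.out.println(robotsTxt.allows("/private/docs/", "FessCrawler")); // true
        System.out.println(robotsTxt.allows("/private/", "OtherBot"));         // false

        // addSitemap() now skips duplicates, so two identical adds keep one entry.
        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        System.out.println(robotsTxt.getSitemaps().length); // 1
    }
}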
SitemapFile.java
@@ -18,7 +18,26 @@
import org.codelibs.core.lang.StringUtil;

/**
* @author shinsuke
* Represents a Sitemap file entry, conforming to the Sitemap XML format.
* This class holds information about a single Sitemap, including its location and last modification timestamp.
* It implements the {@link Sitemap} interface.
*
* <p>
* A Sitemap file provides search engines with a list of URLs available for crawling.
* This class encapsulates the essential attributes of a Sitemap entry, allowing for efficient management
* and processing of Sitemap data.
* </p>
*
* <p>
* The {@code loc} attribute specifies the URL of the Sitemap, while the {@code lastmod} attribute
* indicates the last time the Sitemap file was modified. The {@code lastmod} attribute is used by crawlers
* to incrementally fetch sitemaps that have been updated since a certain date.
* </p>
*
* <p>
* This class also provides implementations for {@code equals}, {@code hashCode}, and {@code toString} methods
* to facilitate object comparison and representation.
* </p>
*
*/
public class SitemapFile implements Sitemap {
@@ -75,6 +94,9 @@ public void setLastmod(final String lastmod) {

@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof final SitemapFile sitemapUrl)) {
return false;
}
@@ -86,7 +108,7 @@ public int hashCode() {

@Override
public int hashCode() {
return loc.hashCode() + lastmod.hashCode();
return java.util.Objects.hash(loc, lastmod);
}

@Override
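The hashCode() change here is more than cosmetic: Objects.hash(loc, lastmod) is null-safe and stays consistent with equals(), whereas the old loc.hashCode() + lastmod.hashCode() would throw a NullPointerException on an unpopulated entity. A small sketch of the resulting contract (the setLoc setter is assumed to mirror the setLastmod shown):

import org.codelibs.fess.crawler.entity.SitemapFile;

public class SitemapFileEqualityExample {
    public static void main(final String[] args) {
        final SitemapFile a = new SitemapFile();
        a.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        a.setLastmod("2025-02-16");

        final SitemapFile b = new SitemapFile();
        b.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        b.setLastmod("2025-02-16");

        // Equal fields now imply equal hash codes, so instances behave
        // correctly as keys in hash-based collections.
        System.out.println(a.equals(b));                  // true
        System.out.println(a.hashCode() == b.hashCode()); // true

        // Objects.hash(null, null) is well-defined; the previous
        // implementation would have thrown a NullPointerException here.
        System.out.println(new SitemapFile().hashCode());
    }
}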
SitemapSet.java
@@ -20,7 +20,8 @@
import java.util.List;

/**
* @author shinsuke
* Represents a set of Sitemaps, which can be either a UrlSet or an Index.
* This class provides methods to manage a list of Sitemap objects and determine the type of the SitemapSet.
*
*/
public class SitemapSet implements Serializable {
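A sketch of how a SitemapSet might hold sitemap entries. This hunk shows only the class declaration, so the setType/addSitemap/getSitemaps members and the INDEX constant are assumptions inferred from the description above:

import org.codelibs.fess.crawler.entity.Sitemap;
import org.codelibs.fess.crawler.entity.SitemapFile;
import org.codelibs.fess.crawler.entity.SitemapSet;

public class SitemapSetExample {
    public static void main(final String[] args) {
        // A sitemap index: a SitemapSet whose entries are SitemapFile objects.
        final SitemapSet sitemapSet = new SitemapSet();
        sitemapSet.setType(SitemapSet.INDEX); // type constant and setter assumed

        final SitemapFile file = new SitemapFile();
        file.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        sitemapSet.addSitemap(file);                     // method assumed

        for (final Sitemap sitemap : sitemapSet.getSitemaps()) { // method assumed
            System.out.println(sitemap.getLoc());
        }
    }
}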
SitemapUrl.java
@@ -18,7 +18,18 @@
import org.codelibs.core.lang.StringUtil;

/**
* @author shinsuke
* Represents a URL entry within a sitemap.
*
* <p>
* This class encapsulates the properties of a URL as defined in the sitemap XML format,
* including its location, last modification date, change frequency, and priority.
* It implements the {@link Sitemap} interface.
* </p>
*
* <p>
* The {@code SitemapUrl} class provides getter and setter methods for each of these properties,
* as well as implementations for {@code equals()}, {@code hashCode()}, and {@code toString()} methods.
* </p>
*
*/
public class SitemapUrl implements Sitemap {
@@ -125,6 +136,9 @@ public void setPriority(final String priority) {

@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof final SitemapUrl sitemapUrl)) {
return false;
}
@@ -137,7 +151,7 @@ public int hashCode() {

@Override
public int hashCode() {
return loc.hashCode() + changefreq.hashCode() + lastmod.hashCode() + priority.hashCode();
return java.util.Objects.hash(loc, changefreq, lastmod, priority);
}

@Override
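As with SitemapFile, equals() and hashCode() now agree on the same four fields (loc, changefreq, lastmod, priority) and tolerate nulls. One practical consequence is reliable de-duplication in hash-based collections; a sketch, with the remaining setters assumed to mirror the setPriority shown:

import java.util.HashSet;
import java.util.Set;

import org.codelibs.fess.crawler.entity.SitemapUrl;

public class SitemapUrlDedupExample {
    public static void main(final String[] args) {
        final Set<SitemapUrl> urls = new HashSet<>();
        for (int i = 0; i < 2; i++) {
            final SitemapUrl url = new SitemapUrl();
            url.setLoc("https://example.com/page1"); // setter assumed
            url.setLastmod("2025-02-16");            // setter assumed
            url.setChangefreq("daily");              // setter assumed
            url.setPriority("0.8");
            urls.add(url);
        }
        // Identical fields yield equal objects and equal hash codes,
        // so the set keeps a single entry.
        System.out.println(urls.size()); // 1
    }
}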
UrlQueueImpl.java
@@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.entity;

/**
* @author shinsuke
* Implementation of the {@link UrlQueue} interface.
* This class represents a URL to be crawled, storing its ID, session ID,
* HTTP method, URL, metadata, encoding, parent URL, depth, last modified time,
* creation time, and weight.
*
*/
public class UrlQueueImpl<IDTYPE> implements UrlQueue<IDTYPE> {
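Finally, a sketch of populating the queue entity. The setter names follow the fields enumerated in the new javadoc and are assumptions, since this hunk shows only the class declaration:

import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.entity.UrlQueueImpl;

public class UrlQueueExample {
    public static void main(final String[] args) {
        // IDTYPE parameterizes the queue entry's identifier type.
        final UrlQueue<Long> entry = new UrlQueueImpl<>();
        entry.setId(1L);                                 // setter assumed
        entry.setSessionId("session-001");               // setter assumed
        entry.setMethod("GET");                          // setter assumed
        entry.setUrl("https://example.com/");            // setter assumed
        entry.setDepth(0);                               // setter assumed
        entry.setCreateTime(System.currentTimeMillis()); // setter assumed
        entry.setWeight(1.0f);                           // setter assumed
        System.out.println(entry.getUrl());
    }
}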
