Enhanced documentation, improved sitemap handling, and optimized object equality in request and sitemap entities.
marevol committed Feb 16, 2025
1 parent 0089343 commit 3efbb1d
Showing 6 changed files with 158 additions and 11 deletions.
RequestDataBuilder.java
@@ -20,9 +20,20 @@
import org.codelibs.fess.crawler.entity.RequestData.Method;

/**
* Builder class to create a request.
* Builder class for creating RequestData objects using a fluent interface.
* This class provides a simple way to construct RequestData objects with method chaining.
*
* @author shinsuke
* <p>Usage example:</p>
* <pre>
* RequestData request = RequestDataBuilder.newRequestData()
* .method("GET")
* .url("https://example.com")
* .weight(1.0f)
* .build();
* </pre>
*
* <p>The builder is implemented using an inner class {@link RequestDataContext} that handles
* the actual construction of the RequestData object.</p>
*
*/
public final class RequestDataBuilder {
@@ -70,16 +81,33 @@ public RequestDataContext post() {
return method(Method.POST);
}

/**
* Sets the URL for this request data.
*
* @param url the URL string to be set
* @return the current RequestDataContext instance for method chaining
*/
public RequestDataContext url(final String url) {
data.setUrl(url);
return this;
}

/**
* Sets the weight for the request data.
*
* @param weight the weight to set
* @return the current RequestDataContext instance
*/
public RequestDataContext weight(final float weight) {
data.setWeight(weight);
return this;
}

/**
* Builds and returns the constructed RequestData object.
*
* @return the constructed RequestData object
*/
public RequestData build() {
return data;
}
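As a companion to the new javadoc, a minimal sketch of the fluent API. Everything shown is taken from this diff except the get() shortcut, assumed to mirror the post() method above, and the builder's org.codelibs.fess.crawler.util package, which is an assumption (only the entity package is visible in this hunk):

import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.util.RequestDataBuilder; // package assumed

public class RequestDataBuilderExample {
    public static void main(final String[] args) {
        // Chain the context methods and finish with build().
        final RequestData request = RequestDataBuilder.newRequestData()
                .get()                       // assumed GET counterpart of post()
                .url("https://example.com/") // sets the target URL
                .weight(1.0f)                // sets the crawl weight
                .build();                    // returns the accumulated RequestData
        System.out.println(request.getUrl());
    }
}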
RobotsTxt.java
@@ -23,13 +23,42 @@

import org.codelibs.core.lang.StringUtil;

/**
* Represents a robots.txt file parser and handler.
* This class manages the rules defined in a robots.txt file, including user agent directives,
* allowed/disallowed paths, crawl delays, and sitemap URLs.
*
* <p>The robots.txt protocol is implemented according to the standard specification,
* supporting pattern matching for user agents, path-based access control, and crawl delay settings.</p>
*
* <p>Key features:</p>
* <ul>
* <li>Supports multiple user-agent directives with pattern matching</li>
* <li>Handles Allow and Disallow rules for path-based access control</li>
* <li>Manages crawl delay settings per user agent</li>
* <li>Stores sitemap URLs listed in robots.txt</li>
* </ul>
*
* <p>The class uses case-insensitive pattern matching for user agents and supports
* wildcard characters (*) in user agent strings. When multiple directives match a user agent,
* the most specific (longest) match is used.</p>
*
*/
public class RobotsTxt {
private static final String ALL_BOTS = "*";

protected final Map<Pattern, Directive> directiveMap = new LinkedHashMap<>();

private final List<String> sitemapList = new ArrayList<>();

/**
* Checks if access to a given path is allowed for a specific user agent according to robots.txt rules.
*
* @param path The path to check for access permission
* @param userAgent The user agent string to check against robots.txt directives
* @return true if access is allowed, false if access is disallowed by robots.txt rules.
* Returns true if no matching directive is found for the user agent.
*/
public boolean allows(final String path, final String userAgent) {
final Directive directive = getMatchedDirective(userAgent);
if (directive == null) {
@@ -38,6 +67,14 @@ public boolean allows(final String path, final String userAgent) {
return directive.allows(path);
}

/**
* Gets the crawl delay value for the specified user agent from robots.txt.
* The crawl delay specifies the time (in seconds) to wait between successive requests.
*
* @param userAgent The user agent string to match against robots.txt directives
* @return The crawl delay value in seconds. Returns 0 if no matching directive is found
* or no crawl delay is specified for the matching directive.
*/
public int getCrawlDelay(final String userAgent) {
final Directive directive = getMatchedDirective(userAgent);
if (directive == null) {
@@ -46,6 +83,15 @@ public int getCrawlDelay(final String userAgent) {
return directive.getCrawlDelay();
}

/**
* Returns the most specific directive matching the given user agent.
* The method finds the longest matching user agent pattern in the directives,
* excluding the general "*" pattern which matches all bots.
*
* @param userAgent the user agent string to match against directives,
* can be null (treated as empty string)
* @return the most specific matching directive, or null if no directive matches
*/
public Directive getMatchedDirective(final String userAgent) {
final String target;
if (userAgent == null) {
@@ -74,6 +120,12 @@ public Directive getMatchedDirective(final String userAgent) {
return matchedDirective;
}

/**
* Retrieves the robots.txt directive for the specified user agent.
*
* @param userAgent The user agent string to look up in the directives
* @return The Directive object matching the user agent, or null if no matching directive is found or if userAgent is null
*/
public Directive getDirective(final String userAgent) {
if (userAgent == null) {
return null;
@@ -86,18 +138,41 @@ public Directive getDirective(final String userAgent) {
return null;
}

/**
* Adds a directive to the robots.txt rules.
* The user-agent pattern in the directive is converted to a regular expression pattern,
* where '*' is replaced with '.*' for pattern matching, and stored case-insensitively.
*
* @param directive The directive to add to the robots.txt rules
*/
public void addDirective(final Directive directive) {
directiveMap.put(Pattern.compile(directive.getUserAgent().replace("*", ".*"), Pattern.CASE_INSENSITIVE), directive);
}

/**
* Adds a sitemap URL to the list of sitemaps.
*
* @param url The URL of the sitemap to be added
*/
public void addSitemap(final String url) {
sitemapList.add(url);
if (!sitemapList.contains(url)) {
sitemapList.add(url);
}
}

/**
* Returns an array of sitemap URLs.
*
* @return an array of sitemap URLs
*/
public String[] getSitemaps() {
return sitemapList.toArray(new String[sitemapList.size()]);
}

/**
* Represents a directive in a robots.txt file.
* A directive consists of a user agent, crawl delay, allowed paths, and disallowed paths.
*/
public static class Directive {
private final String userAgent;

@@ -138,11 +213,15 @@ public boolean allows(final String path) {
}

public void addAllow(final String path) {
allowedPaths.add(path);
if (!allowedPaths.contains(path)) {
allowedPaths.add(path);
}
}

public void addDisallow(final String path) {
disallowedPaths.add(path);
if (!disallowedPaths.contains(path)) {
disallowedPaths.add(path);
}
}

public String[] getAllows() {
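To tie the pieces of RobotsTxt together, a sketch of directive matching, access checks, and the new sitemap de-duplication. The single-argument Directive constructor is hypothetical (this diff shows only the class's fields and add/get methods); everything else uses methods visible above:

import org.codelibs.fess.crawler.entity.RobotsTxt;
import org.codelibs.fess.crawler.entity.RobotsTxt.Directive;

public class RobotsTxtExample {
    public static void main(final String[] args) {
        final RobotsTxt robotsTxt = new RobotsTxt();

        // A catch-all directive plus a more specific one;
        // getMatchedDirective() prefers the longest non-"*" user-agent match.
        final Directive allBots = new Directive("*"); // constructor assumed
        allBots.addDisallow("/private/");
        robotsTxt.addDirective(allBots);

        final Directive fessBot = new Directive("FessCrawler"); // constructor assumed
        fessBot.addAllow("/private/docs/");
        robotsTxt.addDirective(fessBot);

        // "FessCrawler" matches the specific directive; other agents fall back to "*".
        System.out.println(robotsTxt.allows("/private/docs/", "FessCrawler")); // true
        System.out.println(robotsTxt.allows("/private/", "OtherBot"));         // false

        // addSitemap() now skips duplicates, so two identical adds keep one entry.
        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        System.out.println(robotsTxt.getSitemaps().length); // 1
    }
}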
SitemapFile.java
@@ -18,7 +18,26 @@
import org.codelibs.core.lang.StringUtil;

/**
* @author shinsuke
* Represents a Sitemap file entry, conforming to the Sitemap XML format.
* This class holds information about a single Sitemap, including its location and last modification timestamp.
* It implements the {@link Sitemap} interface.
*
* <p>
* A Sitemap file provides search engines with a list of URLs available for crawling.
* This class encapsulates the essential attributes of a Sitemap entry, allowing for efficient management
* and processing of Sitemap data.
* </p>
*
* <p>
* The {@code loc} attribute specifies the URL of the Sitemap, while the {@code lastmod} attribute
* indicates the last time the Sitemap file was modified. The {@code lastmod} attribute is used by crawlers
* to incrementally fetch sitemaps that have been updated since a certain date.
* </p>
*
* <p>
* This class also provides implementations for {@code equals}, {@code hashCode}, and {@code toString} methods
* to facilitate object comparison and representation.
* </p>
*
*/
public class SitemapFile implements Sitemap {
@@ -75,6 +94,9 @@ public void setLastmod(final String lastmod) {

@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof final SitemapFile sitemapUrl)) {
return false;
}
@@ -86,7 +108,7 @@ public int hashCode() {

@Override
public int hashCode() {
return loc.hashCode() + lastmod.hashCode();
return java.util.Objects.hash(loc, lastmod);
}

@Override
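The hashCode() change here is more than cosmetic: Objects.hash(loc, lastmod) is null-safe and stays consistent with equals(), whereas the old loc.hashCode() + lastmod.hashCode() would throw a NullPointerException on an unpopulated entity. A small sketch of the resulting contract (the setLoc setter is assumed to mirror the setLastmod shown):

import org.codelibs.fess.crawler.entity.SitemapFile;

public class SitemapFileEqualityExample {
    public static void main(final String[] args) {
        final SitemapFile a = new SitemapFile();
        a.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        a.setLastmod("2025-02-16");

        final SitemapFile b = new SitemapFile();
        b.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        b.setLastmod("2025-02-16");

        // Equal fields now imply equal hash codes, so instances behave
        // correctly as keys in hash-based collections.
        System.out.println(a.equals(b));                  // true
        System.out.println(a.hashCode() == b.hashCode()); // true

        // Objects.hash(null, null) is well-defined; the previous
        // implementation would have thrown a NullPointerException here.
        System.out.println(new SitemapFile().hashCode());
    }
}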
SitemapSet.java
@@ -20,7 +20,8 @@
import java.util.List;

/**
* @author shinsuke
* Represents a set of Sitemaps, which can be either a UrlSet or an Index.
* This class provides methods to manage a list of Sitemap objects and determine the type of the SitemapSet.
*
*/
public class SitemapSet implements Serializable {
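A sketch of how a SitemapSet might hold sitemap entries. This hunk shows only the class declaration, so the setType/addSitemap/getSitemaps members and the INDEX constant are assumptions inferred from the description above:

import org.codelibs.fess.crawler.entity.Sitemap;
import org.codelibs.fess.crawler.entity.SitemapFile;
import org.codelibs.fess.crawler.entity.SitemapSet;

public class SitemapSetExample {
    public static void main(final String[] args) {
        // A sitemap index: a SitemapSet whose entries are SitemapFile objects.
        final SitemapSet sitemapSet = new SitemapSet();
        sitemapSet.setType(SitemapSet.INDEX); // type constant and setter assumed

        final SitemapFile file = new SitemapFile();
        file.setLoc("https://example.com/sitemap1.xml"); // setter assumed
        sitemapSet.addSitemap(file);                     // method assumed

        for (final Sitemap sitemap : sitemapSet.getSitemaps()) { // method assumed
            System.out.println(sitemap.getLoc());
        }
    }
}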
SitemapUrl.java
@@ -18,7 +18,18 @@
import org.codelibs.core.lang.StringUtil;

/**
* @author shinsuke
* Represents a URL entry within a sitemap.
*
* <p>
* This class encapsulates the properties of a URL as defined in the sitemap XML format,
* including its location, last modification date, change frequency, and priority.
* It implements the {@link Sitemap} interface.
* </p>
*
* <p>
* The {@code SitemapUrl} class provides getter and setter methods for each of these properties,
* as well as implementations for {@code equals()}, {@code hashCode()}, and {@code toString()} methods.
* </p>
*
*/
public class SitemapUrl implements Sitemap {
@@ -125,6 +136,9 @@ public void setPriority(final String priority) {

@Override
public boolean equals(final Object obj) {
if (this == obj) {
return true;
}
if (!(obj instanceof final SitemapUrl sitemapUrl)) {
return false;
}
@@ -137,7 +151,7 @@ public int hashCode() {

@Override
public int hashCode() {
return loc.hashCode() + changefreq.hashCode() + lastmod.hashCode() + priority.hashCode();
return java.util.Objects.hash(loc, changefreq, lastmod, priority);
}

@Override
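As with SitemapFile, equals() and hashCode() now agree on the same four fields (loc, changefreq, lastmod, priority) and tolerate nulls. One practical consequence is reliable de-duplication in hash-based collections; a sketch, with the remaining setters assumed to mirror the setPriority shown:

import java.util.HashSet;
import java.util.Set;

import org.codelibs.fess.crawler.entity.SitemapUrl;

public class SitemapUrlDedupExample {
    public static void main(final String[] args) {
        final Set<SitemapUrl> urls = new HashSet<>();
        for (int i = 0; i < 2; i++) {
            final SitemapUrl url = new SitemapUrl();
            url.setLoc("https://example.com/page1"); // setter assumed
            url.setLastmod("2025-02-16");            // setter assumed
            url.setChangefreq("daily");              // setter assumed
            url.setPriority("0.8");
            urls.add(url);
        }
        // Identical fields yield equal objects and equal hash codes,
        // so the set keeps a single entry.
        System.out.println(urls.size()); // 1
    }
}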
UrlQueueImpl.java
@@ -16,7 +16,10 @@
package org.codelibs.fess.crawler.entity;

/**
* @author shinsuke
* Implementation of the {@link UrlQueue} interface.
* This class represents a URL to be crawled, storing its ID, session ID,
* HTTP method, URL, metadata, encoding, parent URL, depth, last modified time,
* creation time, and weight.
*
*/
public class UrlQueueImpl<IDTYPE> implements UrlQueue<IDTYPE> {
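Finally, a sketch of populating the queue entity. The setter names follow the fields enumerated in the new javadoc and are assumptions, since this hunk shows only the class declaration:

import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.entity.UrlQueueImpl;

public class UrlQueueExample {
    public static void main(final String[] args) {
        // IDTYPE parameterizes the queue entry's identifier type.
        final UrlQueue<Long> entry = new UrlQueueImpl<>();
        entry.setId(1L);                                 // setter assumed
        entry.setSessionId("session-001");               // setter assumed
        entry.setMethod("GET");                          // setter assumed
        entry.setUrl("https://example.com/");            // setter assumed
        entry.setDepth(0);                               // setter assumed
        entry.setCreateTime(System.currentTimeMillis()); // setter assumed
        entry.setWeight(1.0f);                           // setter assumed
        System.out.println(entry.getUrl());
    }
}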
