This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 50b1ee63918d264c2064120282185a14d88209f3 Author: Sebastian Nagel <[email protected]> AuthorDate: Fri Dec 12 09:32:14 2025 +0100 NUTCH-3136 Upgrade crawler-commons dependency Robots.txt parser: use URL objects in newly introduced methods to avoid the unnecessary parsing of URLs. --- ivy/ivy.xml | 2 +- .../org/apache/nutch/fetcher/FetcherThread.java | 4 ++-- src/java/org/apache/nutch/protocol/Protocol.java | 21 +++++++++++++++++++++ src/java/org/apache/nutch/util/URLUtil.java | 2 +- .../apache/nutch/protocol/http/api/HttpBase.java | 6 ++++++ .../java/org/apache/nutch/protocol/file/File.java | 10 ++++++++++ .../src/java/org/apache/nutch/protocol/ftp/Ftp.java | 9 +++++++++ 7 files changed, 50 insertions(+), 4 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index f149ce13d..d2cbfc850 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -83,7 +83,7 @@ <dependency org="com.google.guava" name="guava" rev="33.4.8-jre" /> - <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.4" /> + <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" /> <dependency org="com.google.code.gson" name="gson" rev="2.13.1"/> <dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0"> diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 02b7cd3e8..2abcfe9f6 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -366,7 +366,7 @@ public class FetcherThread extends Thread { LOG.debug("redirectCount={}", redirectCount); redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.u); - BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, + BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum, robotsTxtContent); if (robotsTxtContent != null) { outputRobotsTxt(robotsTxtContent); @@ -389,7 +389,7 @@ public class FetcherThread extends Thread { } continue; } - if (!rules.isAllowed(fit.url.toString())) { + if (!rules.isAllowed(fit.u)) { // unblock fetchQueues.finishFetchItem(fit, true); LOG.info("Denied by robots.txt: {}", fit.url); diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index ab4162c87..2514eae33 100644 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.protocol; +import java.net.URL; import java.util.List; import org.apache.hadoop.conf.Configurable; @@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable { BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List<Content> robotsTxtContent); + /** + * Retrieve robot rules applicable for this URL. + * + * @param url + * URL to check + * @param datum + * page datum + * @param robotsTxtContent + * container to store responses when fetching the robots.txt file for + * debugging or archival purposes. Instead of a robots.txt file, it + * may include redirects or an error page (404, etc.). Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * @return robot rules (specific for this URL or default), never null + */ + default BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List<Content> robotsTxtContent) { + return getRobotRules(new Text(url.toString()), datum, robotsTxtContent); + } + } diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 0cfce1c65..afd6f1385 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -103,7 +103,7 @@ public class URLUtil { * <a href= "https://publicsuffix.org/list/public_suffix_list.dat" * >https://publicsuffix.org/list/public_suffix_list.dat</a> and are compared * using <a href= - * "https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/domains/EffectiveTldFinder.html"> + * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html"> * crawler-commons' EffectiveTldFinder</a>. Only ICANN domain suffixes are * used. Because EffectiveTldFinder loads the public suffix list as file * "effective_tld_names.dat" from the Java classpath, it's possible to use the diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 79b45882e..caa3f861e 100755 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -721,6 +721,12 @@ public abstract class HttpBase implements Protocol { return this.robots.getRobotRulesSet(this, url, robotsTxtContent); } + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List<Content> robotsTxtContent) { + return this.robots.getRobotRulesSet(this, url, robotsTxtContent); + } + /** * Transforming a String[] into a HashMap for faster searching * diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index e4d201069..877873b64 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -232,4 +232,14 @@ public class File implements Protocol { return RobotRulesParser.EMPTY_RULES; } + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. + */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List<Content> robotsTxtContent) { + return RobotRulesParser.EMPTY_RULES; + } + } diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 2a47b63d6..8cf58f75e 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -304,6 +304,15 @@ public class Ftp implements Protocol { return robots.getRobotRulesSet(this, url, robotsTxtContent); } + /** + * Get the robots rules for a given url + */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List<Content> robotsTxtContent) { + return robots.getRobotRulesSet(this, url, robotsTxtContent); + } + public int getBufferSize() { return BUFFER_SIZE; }
