This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

The following commit(s) were added to refs/heads/master by this push:
     new f691bae  NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status (#724)
f691bae is described below

commit f691baebc3c04c08ea500f4767e2decb88c30c70
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Tue Jan 18 08:22:36 2022 +0100

    NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status (#724)

    - add properties
      http.robots.503.defer.visits : enable/disable the feature (default: enabled)
      http.robots.503.defer.visits.delay : delay to wait before the next attempt to
        fetch the deferred URL and the corresponding robots.txt (default: wait 5 minutes)
      http.robots.503.defer.visits.retries : max. number of retries before giving up
        and dropping all URLs from the given host / queue (default: give up after the
        3rd retry, i.e. after 4 attempts)
    - handle HTTP 5xx in robots.txt parser
    - handle delay, retries and dropping queues in Fetcher
    - count dropped fetch items in `robots_defer_visits_dropped`
---
 conf/nutch-default.xml                             | 26 +++++++++
 .../org/apache/nutch/fetcher/FetchItemQueues.java  | 64 +++++++++++++++++-----
 .../org/apache/nutch/fetcher/FetcherThread.java    | 39 ++++++++++++-
 .../apache/nutch/protocol/RobotRulesParser.java    | 13 +++++
 .../protocol/http/api/HttpRobotRulesParser.java    | 15 ++++-
 5 files changed, 140 insertions(+), 17 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c305fa8..29a4716 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -130,6 +130,32 @@
 </property>
 
 <property>
+ <name>http.robots.503.defer.visits</name>
+ <value>true</value>
+ <description>Temporarily suspend fetching from a host if the
+ robots.txt response is HTTP 503 or any other 5xx server error. See
+ also http.robots.503.defer.visits.delay and
+ http.robots.503.defer.visits.retries.</description>
+</property>
+
+<property>
+ <name>http.robots.503.defer.visits.delay</name>
+ <value>300000</value>
+ <description>Time in milliseconds to suspend crawling a host if the
+ robots.txt response is HTTP 5xx - see
+ http.robots.503.defer.visits.</description>
+</property>
+
+<property>
+ <name>http.robots.503.defer.visits.retries</name>
+ <value>3</value>
+ <description>Number of retries crawling a host if the robots.txt
+ response is HTTP 5xx - see http.robots.503.defer.visits. After n
+ retries the host queue is dropped for this segment/cycle.
+ </description>
+</property>
+
+<property>
 <name>http.agent.description</name>
 <value></value>
 <description>Further description of our bot- this text is used in
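To illustrate how the three new properties are consumed: they are read through the
standard Hadoop Configuration API with the defaults listed above. A minimal sketch,
not part of the commit (the class name is made up; NutchConfiguration is the usual
entry point for a Nutch configuration):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    public class RobotsDeferConfigDemo {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // feature switch and, if enabled, the per-retry delay and retry budget
        boolean defer = conf.getBoolean("http.robots.503.defer.visits", true);
        long delayMs = conf.getLong("http.robots.503.defer.visits.delay", 5 * 60 * 1000L);
        int retries = conf.getInt("http.robots.503.defer.visits.retries", 3);
        System.out.println(defer + ", " + delayMs + " ms, " + retries + " retries");
      }
    }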
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 00a0784..ceb8cab 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -195,11 +195,19 @@ public class FetchItemQueues {
     return null;
   }
 
+  /**
+   * @return true if the fetcher timelimit is defined and has been exceeded
+   *         ({@code fetcher.timelimit.mins} minutes after fetching started)
+   */
+  public boolean timelimitExceeded() {
+    return timelimit != -1 && System.currentTimeMillis() >= timelimit;
+  }
+
   // called only once the feeder has stopped
   public synchronized int checkTimelimit() {
     int count = 0;
 
-    if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
+    if (timelimitExceeded()) {
       // emptying the queues
       count = emptyQueues();
 
@@ -209,6 +217,7 @@ public class FetchItemQueues {
       if (totalSize.get() != 0 && queues.size() == 0)
         totalSize.set(0);
     }
+
     return count;
   }
 
@@ -220,11 +229,9 @@ public class FetchItemQueues {
       FetchItemQueue fiq = queues.get(id);
       if (fiq.getQueueSize() == 0)
         continue;
-      LOG.info("* queue: " + id + " >> dropping! ");
+      LOG.info("* queue: {} >> dropping!", id);
       int deleted = fiq.emptyQueue();
-      for (int i = 0; i < deleted; i++) {
-        totalSize.decrementAndGet();
-      }
+      totalSize.addAndGet(-deleted);
       count += deleted;
     }
 
@@ -235,26 +242,43 @@ public class FetchItemQueues {
    * Increment the exception counter of a queue in case of an exception e.g.
    * timeout; when higher than a given threshold simply empty the queue.
    *
-   * @param queueid a queue identifier to locate and check
+   * The next fetch is delayed if specified by the param {@code delay} or
+   * configured by the property {@code fetcher.exceptions.per.queue.delay}.
+   *
+   * @param queueid
+   *          a queue identifier to locate and check
+   * @param maxExceptions
+   *          custom-defined maximum number of exceptions - if negative the
+   *          value of the property {@code fetcher.max.exceptions.per.queue}
+   *          is used.
+   * @param delay
+   *          a custom-defined time span in milliseconds to delay the next
+   *          fetch in addition to the delay defined for the given queue.
+   *          If a negative value is passed the delay is chosen by
+   *          {@code fetcher.exceptions.per.queue.delay}.
+   *
    * @return number of purged items
    */
-  public synchronized int checkExceptionThreshold(String queueid) {
+  public synchronized int checkExceptionThreshold(String queueid,
+      int maxExceptions, long delay) {
     FetchItemQueue fiq = queues.get(queueid);
     if (fiq == null) {
       return 0;
     }
     int excCount = fiq.incrementExceptionCounter();
+    if (delay > 0) {
+      fiq.nextFetchTime.getAndAdd(delay);
+      LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
+    }
     if (fiq.getQueueSize() == 0) {
       return 0;
     }
-    if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) {
+    if (maxExceptions != -1 && excCount >= maxExceptions) {
       // too many exceptions for items in this queue - purge it
       int deleted = fiq.emptyQueue();
-      LOG.info("* queue: " + queueid + " >> removed " + deleted
-          + " URLs from queue because " + excCount + " exceptions occurred");
-      for (int i = 0; i < deleted; i++) {
-        totalSize.decrementAndGet();
-      }
+      LOG.info(
+          "* queue: {} >> removed {} URLs from queue because {} exceptions occurred",
+          queueid, deleted, excCount);
+      totalSize.getAndAdd(-deleted);
       // keep queue IDs to ensure that these queues aren't created and filled
       // again, see addFetchItem(FetchItem)
       queuesMaxExceptions.add(queueid);
@@ -264,6 +288,20 @@ public class FetchItemQueues {
   }
 
   /**
+   * Increment the exception counter of a queue in case of an exception e.g.
+   * timeout; when higher than a given threshold simply empty the queue.
+   *
+   * @see #checkExceptionThreshold(String, int, long)
+   *
+   * @param queueid
+   *          queue identifier to locate and check
+   * @return number of purged items
+   */
+  public int checkExceptionThreshold(String queueid) {
+    return checkExceptionThreshold(queueid, this.maxExceptionsPerQueue, -1);
+  }
+
+  /**
    * @param redirUrl
    *          redirect target
    * @return true if redirects are deduplicated and redirUrl has been queued
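To make the new threshold semantics concrete: each call increments the queue's
exception counter, optionally pushes the queue's next fetch time into the future,
and purges the queue once the counter reaches maxExceptions. A condensed,
standalone model of that logic (illustrative only, not the Nutch class; the queue
size is invented):

    import java.util.concurrent.atomic.AtomicLong;

    class QueueModel {
      final AtomicLong nextFetchTime = new AtomicLong(System.currentTimeMillis());
      int exceptionCounter = 0;
      int queueSize = 10; // pretend ten URLs are queued for this host

      // returns the number of purged items, mirroring checkExceptionThreshold
      int checkExceptionThreshold(int maxExceptions, long delay) {
        int excCount = ++exceptionCounter;
        if (delay > 0) {
          nextFetchTime.getAndAdd(delay); // postpone the whole queue
        }
        if (maxExceptions != -1 && excCount >= maxExceptions) {
          int deleted = queueSize; // drop all URLs queued for this host
          queueSize = 0;
          return deleted;
        }
        return 0;
      }
    }

With maxExceptions = 4 and delay = 300000, three calls merely postpone the queue
by five minutes each; the fourth call empties it.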
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 40b7201..d5fe343 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -139,6 +139,8 @@ public class FetcherThread extends Thread {
   private AtomicLong bytes;
 
   private List<Content> robotsTxtContent = null;
+  private long robotsDeferVisitsDelay;
+  private int robotsDeferVisitsRetries;
 
   //Used by the REST service
   private FetchNode fetchNode;
@@ -194,6 +196,14 @@ public class FetcherThread extends Thread {
           URLNormalizers.SCOPE_OUTLINK);
     }
 
+    // NUTCH-2573 defer visits if robots.txt fails with HTTP 5xx
+    if (conf.getBoolean("http.robots.503.defer.visits", true)) {
+      this.robotsDeferVisitsDelay = conf
+          .getLong("http.robots.503.defer.visits.delay", 5 * 60 * 1000L);
+      this.robotsDeferVisitsRetries = conf
+          .getInt("http.robots.503.defer.visits.retries", 3);
+    }
+
     if((activatePublisher=conf.getBoolean("fetcher.publisher", false)))
       this.publisher = new FetcherThreadPublisher(conf);
 
@@ -312,6 +322,25 @@ public class FetcherThread extends Thread {
             outputRobotsTxt(robotsTxtContent);
             robotsTxtContent.clear();
           }
+          if (rules.isDeferVisits()) {
+            LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
+            // retry the fetch item
+            if (fetchQueues.timelimitExceeded()) {
+              fetchQueues.finishFetchItem(fit, true);
+            } else {
+              fetchQueues.addFetchItem(fit);
+            }
+            // but check whether it's time to cancel the queue
+            int killedURLs = fetchQueues.checkExceptionThreshold(
+                fit.getQueueID(), this.robotsDeferVisitsRetries + 1,
+                this.robotsDeferVisitsDelay);
+            if (killedURLs != 0) {
+              context
+                  .getCounter("FetcherStatus", "robots_defer_visits_dropped")
+                  .increment(killedURLs);
+            }
+            continue;
+          }
           if (!rules.isAllowed(fit.url.toString())) {
             // unblock
             fetchQueues.finishFetchItem(fit, true);
@@ -600,6 +629,12 @@ public class FetcherThread extends Thread {
         LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
             redirUrl);
         return null;
+      } else if (fetchQueues.timelimitExceeded()) {
+        redirecting = false;
+        context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+        LOG.debug(" - ignoring redirect from {} to {} - timelimit reached",
+            fit.url, redirUrl);
+        return null;
       }
       CrawlDatum newDatum = createRedirDatum(redirUrl, fit,
           CrawlDatum.STATUS_DB_UNFETCHED);
       fit = FetchItem.create(redirUrl, newDatum, queueMode);
@@ -780,8 +815,10 @@ public class FetcherThread extends Thread {
             reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG,
                 parseData.getContentMeta().get("content-language"));
             publisher.publish(reportEvent, conf);
           }
+
           // Only process depth N outlinks
-          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
+          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth
+              && !fetchQueues.timelimitExceeded()) {
             FetchItem ft = FetchItem.create(url, null, queueMode);
             FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
             queue.alreadyFetched.add(url.toString().hashCode());
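Note that the fetcher passes retries + 1 as the exception threshold, so the retry
budget works out as follows under the default settings (a back-of-the-envelope
sketch, assuming the defaults above are unchanged):

    public class DeferVisitsBudget {
      public static void main(String[] args) {
        long delayMs = 300_000L;         // http.robots.503.defer.visits.delay
        int retries = 3;                 // http.robots.503.defer.visits.retries
        int maxExceptions = retries + 1; // as passed to checkExceptionThreshold
        // each failed attempt postpones the queue by delayMs; the queue is
        // dropped on the (retries + 1)-th consecutive robots.txt 5xx
        System.out.println("attempts before drop: " + maxExceptions);
        System.out.println("approx. time suspended: "
            + (retries * delayMs) / 60_000 + " minutes");
      }
    }

This prints "attempts before drop: 4" and "approx. time suspended: 15 minutes".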
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 97256d2..1493bc2 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -77,6 +77,19 @@ public abstract class RobotRulesParser implements Tool {
   public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(
       RobotRulesMode.ALLOW_NONE);
 
+  /**
+   * A {@link BaseRobotRules} object appropriate for use when the
+   * {@code robots.txt} file failed to fetch with a 503 "Service Unavailable"
+   * (or other 5xx) status code. The crawler should suspend crawling
+   * for a certain (but not too long) time, see property
+   * <code>http.robots.503.defer.visits</code>.
+   */
+  public static final BaseRobotRules DEFER_VISIT_RULES = new SimpleRobotRules(
+      RobotRulesMode.ALLOW_NONE);
+  static {
+    DEFER_VISIT_RULES.setDeferVisits(true);
+  }
+
   private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
   static {
     robotParser.setMaxCrawlDelay(Long.MAX_VALUE);
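The sentinel is simply a deny-all ruleset with the defer flag raised. A small
standalone check (illustrative only; assumes crawler-commons on the classpath,
which Nutch already ships):

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRules;
    import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

    public class DeferRulesDemo {
      public static void main(String[] args) {
        // same shape as DEFER_VISIT_RULES: deny everything, flag deferral
        BaseRobotRules rules = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        rules.setDeferVisits(true);
        System.out.println(rules.isDeferVisits());                   // true
        System.out.println(rules.isAllowed("https://example.org/")); // false
      }
    }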
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 4f3afd3..ad2521b 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -41,7 +41,9 @@ public class HttpRobotRulesParser extends RobotRulesParser {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
+
   protected boolean allowForbidden = false;
+  protected boolean deferVisits503 = false;
 
   HttpRobotRulesParser() {
   }
 
@@ -53,6 +55,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
   public void setConf(Configuration conf) {
     super.setConf(conf);
     allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+    deferVisits503 = conf.getBoolean("http.robots.503.defer.visits", true);
   }
 
   /**
@@ -110,7 +113,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
     if (robotRules != null) {
       return robotRules; // cached rule
     } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
+      LOG.trace("cache miss {}", url);
     }
 
     boolean cacheRule = true;
@@ -163,9 +166,15 @@ public class HttpRobotRulesParser extends RobotRulesParser {
           robotRules = FORBID_ALL_RULES; // use forbid all
         else if (response.getCode() >= 500) {
           cacheRule = false; // try again later to fetch robots.txt
-          robotRules = EMPTY_RULES;
-        } else
+          if (deferVisits503) {
+            // signal fetcher to suspend crawling for this host
+            robotRules = DEFER_VISIT_RULES;
+          } else {
+            robotRules = EMPTY_RULES;
+          }
+        } else {
           robotRules = EMPTY_RULES; // use default rules
+        }
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
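Taken together, the parser now maps robots.txt fetch failures onto rule sets as
summarized below. This is a condensed, illustrative sketch of the branch above
(the helper is hypothetical and not part of the commit; the real code also
parses 2xx responses and handles redirects, and never caches the 5xx result):

    import crawlercommons.robots.BaseRobotRules;
    import org.apache.nutch.protocol.RobotRulesParser;

    public class RobotsStatusSummary {
      // hypothetical helper covering the non-2xx branches only
      static BaseRobotRules rulesForErrorStatus(int code, boolean allowForbidden,
          boolean deferVisits503) {
        if (code == 403 && !allowForbidden) {
          return RobotRulesParser.FORBID_ALL_RULES;  // 403: forbid all paths
        } else if (code >= 500 && deferVisits503) {
          return RobotRulesParser.DEFER_VISIT_RULES; // 5xx: suspend the host queue
        }
        return RobotRulesParser.EMPTY_RULES;         // 404 and others: allow all
      }
    }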