Author: ab
Date: Wed Jul 19 15:07:48 2006
New Revision: 423630

URL: http://svn.apache.org/viewvc?rev=423630&view=rev
Log:
Add support for Crawl-delay in robots.txt (NUTCH-293).
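(For context: sites advertise the delay in their robots.txt, one directive per
User-agent group, with the value given in seconds. A hypothetical example; the
path and value are illustrative:

    User-agent: *
    Disallow: /cgi-bin/
    Crawl-Delay: 5

The parser change below converts the seconds value to milliseconds.)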
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jul 19 15:07:48 2006
@@ -200,6 +200,12 @@
 71. NUTCH-320 DmozParser does not output list of urls to stdout
     but to a log file instead. Original functionality restored.
 
+72. NUTCH-271 - Add ability to limit crawling to the set of initially
+    injected hosts (db.ignore.external.links) (Philippe Eugene,
+    Stefan Neufeind via ab)
+
+73. NUTCH-293 - Support for Crawl-Delay (Stefan Groschupf via ab)
+
 Release 0.7 - 2005-08-17
 
  1. Added support for "type:" in queries.  Search results are limited/qualified

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Jul 19 15:07:48 2006
@@ -183,12 +183,14 @@
         }
       }
       
-      String host = blockAddr(u);
+      long crawlDelay = robots.getCrawlDelay(this, u);
+      long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
+      String host = blockAddr(u, delay);
       Response response;
       try {
         response = getResponse(u, datum, false); // make a request
       } finally {
-        unblockAddr(host);
+        unblockAddr(host, delay);
       }
       
       int code = response.getCode();
@@ -298,7 +300,7 @@
     return useHttp11;
   }
   
-  private String blockAddr(URL url) throws ProtocolException {
+  private String blockAddr(URL url, long crawlDelay) throws ProtocolException {
     
     String host;
     if (byIP) {
@@ -346,7 +348,7 @@
       long now = System.currentTimeMillis();
       long sleep = 0;
       if (done == 0) {                        // address is still in use
-        sleep = serverDelay;                  // wait at least delay
+        sleep = crawlDelay;                   // wait at least delay
 
       } else if (now < done) {                // address is on hold
         sleep = done - now;                   // wait until its free
@@ -359,14 +361,14 @@
     }
   }
   
-  private void unblockAddr(String host) {
+  private void unblockAddr(String host, long crawlDelay) {
     synchronized (BLOCKED_ADDR_TO_TIME) {
       int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue();
       if (addrCount == 1) {
        THREADS_PER_HOST_COUNT.remove(host);
        BLOCKED_ADDR_QUEUE.addFirst(host);
        BLOCKED_ADDR_TO_TIME.put
-          (host, new Long(System.currentTimeMillis() + serverDelay));
+          (host, new Long(System.currentTimeMillis() + crawlDelay));
      } else {
        THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1));
      }

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Wed Jul 19 15:07:48 2006
@@ -73,6 +73,7 @@
     ArrayList tmpEntries = new ArrayList();
     RobotsEntry[] entries = null;
     long expireTime;
+    long crawlDelay = -1;
 
     /**
      */
@@ -126,6 +127,20 @@
       return expireTime;
     }
     
+    /**
+     * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
+     */
+    public long getCrawlDelay() {
+      return crawlDelay;
+    }
+    
+    /**
+     * Set Crawl-Delay, in milliseconds
+     */
+    public void setCrawlDelay(long crawlDelay) {
+      this.crawlDelay = crawlDelay;
+    }
+    
     /**
      * Returns <code>false</code> if the <code>robots.txt</code> file
      * prohibits us from accessing the given <code>path</code>, or
@@ -352,6 +367,19 @@
           if (addRules)
             currentRules.addPrefix(path, true);
         }
+      } else if ( (line.length() >= 12)
+                 && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
+        doneAgents = true;
+        long crawlDelay = -1;
+        String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
+        if (delay.length() > 0) {
+          try {
+            crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+          } catch (Exception e) {
+            LOG.info("can not parse Crawl-Delay:" + e.toString());
+          }
+          currentRules.setCrawlDelay(crawlDelay);
+        }
       }
     }
 
@@ -386,10 +414,9 @@
     return rules;
   }
   
-  public boolean isAllowed(HttpBase http, URL url)
-    throws ProtocolException, IOException {
+  private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
 
-    String host = url.getHost();
+    String host = url.getHost().toLowerCase(); // normalize to lower case
 
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
@@ -414,13 +441,22 @@
       CACHE.put(host, robotRules);              // cache rules for host
     }
     
+    return robotRules;
+  }
+  public boolean isAllowed(HttpBase http, URL url)
+    throws ProtocolException, IOException {
     String path = url.getPath();                // check rules
     if ((path == null) || "".equals(path)) {
       path= "/";
     }
     
-    return robotRules.isAllowed(path);
+    return getRobotRulesSet(http, url).isAllowed(path);
+  }
+  
+  public long getCrawlDelay(HttpBase http, URL url)
+    throws ProtocolException, IOException {
+    return getRobotRulesSet(http, url).getCrawlDelay();
   }
   
   private final static int BUFSIZE= 2048;
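(A rough sketch of how the pieces fit together after this patch: HttpBase asks
RobotRulesParser for the host's Crawl-Delay and falls back to the configured
server delay when none is set. The class below is illustrative only --
CrawlDelayExample, SERVER_DELAY and chooseDelay are made up for the example;
the -1 "not set" convention and the seconds-to-milliseconds conversion are
taken from the patch itself:

    // Illustrative only: mirrors the delay selection added to HttpBase above.
    public class CrawlDelayExample {

      // Stand-in for the fetcher's configured per-host delay, in
      // milliseconds (fetcher.server.delay); the value here is made up.
      private static final long SERVER_DELAY = 5 * 1000;

      // RobotRuleSet.getCrawlDelay() returns -1 when robots.txt has no
      // Crawl-Delay, otherwise milliseconds (the parser multiplies the
      // seconds value by 1000).
      static long chooseDelay(long robotsCrawlDelay) {
        return robotsCrawlDelay > 0 ? robotsCrawlDelay : SERVER_DELAY;
      }

      public static void main(String[] args) {
        System.out.println(chooseDelay(-1));    // no Crawl-Delay -> 5000
        System.out.println(chooseDelay(10000)); // Crawl-Delay: 10 -> 10000
      }
    }
)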