Author: ab
Date: Wed Jul 19 15:07:48 2006
New Revision: 423630

URL: http://svn.apache.org/viewvc?rev=423630&view=rev
Log:
Add support for Crawl-delay in robots.txt (NUTCH-293).
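(For context: sites advertise the delay in their robots.txt, one directive per
User-agent group, with the value given in seconds. A hypothetical example; the
path and value are illustrative:

    User-agent: *
    Disallow: /cgi-bin/
    Crawl-Delay: 5

The parser change below converts the seconds value to milliseconds.)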
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jul 19 15:07:48 2006
@@ -200,6 +200,12 @@
 71. NUTCH-320 DmozParser does not output list of urls to stdout
     but to a log file instead. Original functionality restored.
 
+72. NUTCH-271 - Add ability to limit crawling to the set of initially
+    injected hosts (db.ignore.external.links) (Philippe Eugene,
+    Stefan Neufeind via ab)
+
+73. NUTCH-293 - Support for Crawl-Delay (Stefan Groschupf via ab)
+
 Release 0.7 - 2005-08-17
 
  1. Added support for "type:" in queries.  Search results are limited/qualified

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Jul 19 15:07:48 2006
@@ -183,12 +183,14 @@
         }
       }
       
-      String host = blockAddr(u);
+      long crawlDelay = robots.getCrawlDelay(this, u);
+      long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
+      String host = blockAddr(u, delay);
       Response response;
       try {
         response = getResponse(u, datum, false); // make a request
       } finally {
-        unblockAddr(host);
+        unblockAddr(host, delay);
       }
       
       int code = response.getCode();
@@ -298,7 +300,7 @@
     return useHttp11;
   }
   
-  private String blockAddr(URL url) throws ProtocolException {
+  private String blockAddr(URL url, long crawlDelay) throws ProtocolException {
     
     String host;
     if (byIP) {
@@ -346,7 +348,7 @@
       long now = System.currentTimeMillis();
       long sleep = 0;
       if (done == 0) {                        // address is still in use
-        sleep = serverDelay;                  // wait at least delay
+        sleep = crawlDelay;                   // wait at least delay
 
       } else if (now < done) {                // address is on hold
         sleep = done - now;                   // wait until its free
@@ -359,14 +361,14 @@
     }
   }
   
-  private void unblockAddr(String host) {
+  private void unblockAddr(String host, long crawlDelay) {
     synchronized (BLOCKED_ADDR_TO_TIME) {
       int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue();
       if (addrCount == 1) {
        THREADS_PER_HOST_COUNT.remove(host);
        BLOCKED_ADDR_QUEUE.addFirst(host);
        BLOCKED_ADDR_TO_TIME.put
-          (host, new Long(System.currentTimeMillis() + serverDelay));
+          (host, new Long(System.currentTimeMillis() + crawlDelay));
      } else {
        THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1));
      }

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=423630&r1=423629&r2=423630&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Wed Jul 19 15:07:48 2006
@@ -73,6 +73,7 @@
     ArrayList tmpEntries = new ArrayList();
     RobotsEntry[] entries = null;
     long expireTime;
+    long crawlDelay = -1;
 
     /**
      */
@@ -126,6 +127,20 @@
       return expireTime;
     }
     
+    /**
+     * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
+     */
+    public long getCrawlDelay() {
+      return crawlDelay;
+    }
+    
+    /**
+     * Set Crawl-Delay, in milliseconds
+     */
+    public void setCrawlDelay(long crawlDelay) {
+      this.crawlDelay = crawlDelay;
+    }
+    
     /**
      * Returns <code>false</code> if the <code>robots.txt</code> file
      * prohibits us from accessing the given <code>path</code>, or
@@ -352,6 +367,19 @@
           if (addRules)
             currentRules.addPrefix(path, true);
         }
+      } else if ( (line.length() >= 12)
+                 && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
+        doneAgents = true;
+        long crawlDelay = -1;
+        String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
+        if (delay.length() > 0) {
+          try {
+            crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+          } catch (Exception e) {
+            LOG.info("can not parse Crawl-Delay:" + e.toString());
+          }
+          currentRules.setCrawlDelay(crawlDelay);
+        }
       }
     }
 
@@ -386,10 +414,9 @@
     return rules;
   }
   
-  public boolean isAllowed(HttpBase http, URL url)
-    throws ProtocolException, IOException {
+  private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
 
-    String host = url.getHost();
+    String host = url.getHost().toLowerCase(); // normalize to lower case
 
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
@@ -414,13 +441,22 @@
       CACHE.put(host, robotRules);              // cache rules for host
     }
     
+    return robotRules;
+  }
+  public boolean isAllowed(HttpBase http, URL url)
+    throws ProtocolException, IOException {
     String path = url.getPath();                // check rules
     if ((path == null) || "".equals(path)) {
       path= "/";
     }
     
-    return robotRules.isAllowed(path);
+    return getRobotRulesSet(http, url).isAllowed(path);
+  }
+  
+  public long getCrawlDelay(HttpBase http, URL url)
+    throws ProtocolException, IOException {
+    return getRobotRulesSet(http, url).getCrawlDelay();
   }
   
   private final static int BUFSIZE= 2048;
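(A rough sketch of how the pieces fit together after this patch: HttpBase asks
RobotRulesParser for the host's Crawl-Delay and falls back to the configured
server delay when none is set. The class below is illustrative only --
CrawlDelayExample, SERVER_DELAY and chooseDelay are made up for the example;
the -1 "not set" convention and the seconds-to-milliseconds conversion are
taken from the patch itself:

    // Illustrative only: mirrors the delay selection added to HttpBase above.
    public class CrawlDelayExample {

      // Stand-in for the fetcher's configured per-host delay, in
      // milliseconds (fetcher.server.delay); the value here is made up.
      private static final long SERVER_DELAY = 5 * 1000;

      // RobotRuleSet.getCrawlDelay() returns -1 when robots.txt has no
      // Crawl-Delay, otherwise milliseconds (the parser multiplies the
      // seconds value by 1000).
      static long chooseDelay(long robotsCrawlDelay) {
        return robotsCrawlDelay > 0 ? robotsCrawlDelay : SERVER_DELAY;
      }

      public static void main(String[] args) {
        System.out.println(chooseDelay(-1));    // no Crawl-Delay -> 5000
        System.out.println(chooseDelay(10000)); // Crawl-Delay: 10 -> 10000
      }
    }
)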