Author: siren
Date: Thu May 10 09:29:51 2007
New Revision: 536925

URL: http://svn.apache.org/viewvc?view=rev&rev=536925
Log:
NUTCH-446 RobotRulesParser should ignore Crawl-delay values of other bots in 
robots.txt, contributed by Doğacan Güney

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
    lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:29:51 2007
@@ -11,6 +11,9 @@
     (Eelco Lempsink via ab)
 
  4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren)
+
+ 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other
+    bots in robots.txt (Dogacan Guney via siren)
  
 
 Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Thu May 10 09:29:51 2007
@@ -389,15 +389,17 @@
       } else if ( (line.length() >= 12)
                   && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
         doneAgents = true;
-        long crawlDelay = -1;
-        String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
-        if (delay.length() > 0) {
-          try {
-            crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
-          } catch (Exception e) {
-            LOG.info("can not parse Crawl-Delay:" + e.toString());
+        if (addRules) {
+          long crawlDelay = -1;
+          String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
+          if (delay.length() > 0) {
+            try {
+              crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+            } catch (Exception e) {
+              LOG.info("can not parse Crawl-Delay:" + e.toString());
+            }
+            currentRules.setCrawlDelay(crawlDelay);
           }
-          currentRules.setCrawlDelay(crawlDelay);
         }
       }
     }
@@ -500,7 +502,7 @@
 
   /** command-line main for testing */
   public static void main(String[] argv) {
-    if (argv.length != 3) {
+    if (argv.length < 3) {
       System.out.println("Usage:");
       System.out.println("   java <robots-file> <url-file> <agent-name>+");
       System.out.println("");
@@ -513,7 +515,7 @@
     try { 
       FileInputStream robotsIn= new FileInputStream(argv[0]);
       LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
-      String[] robotNames= new String[argv.length - 1];
+      String[] robotNames= new String[argv.length - 2];
 
       for (int i= 0; i < argv.length - 2; i++) 
         robotNames[i]= argv[i+2];

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu May 10 09:29:51 2007
@@ -262,6 +262,26 @@
       }
     }
   }
+  
+  public void testCrawlDelay() {
+    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+    String delayRule1 = "User-agent: nutchbot" + CR +
+                        "Crawl-delay: 10" + CR +
+                        "User-agent: foobot" + CR +
+                        "Crawl-delay: 20" + CR +
+                        "User-agent: *" + CR + 
+                        "Disallow:/baz" + CR;
+    String delayRule2 = "User-agent: foobot" + CR +
+                        "Crawl-delay: 20" + CR +
+                        "User-agent: *" + CR + 
+                        "Disallow:/baz" + CR;
+    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
+    long crawlDelay = rules.getCrawlDelay();
+    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
+    rules = p.parseRules(delayRule2.getBytes());
+    crawlDelay = rules.getCrawlDelay();
+    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
+  }
 
   // helper
 



-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to