Author: siren
Date: Thu May 10 09:29:51 2007
New Revision: 536925

URL: http://svn.apache.org/viewvc?view=rev&rev=536925
Log:
NUTCH-446 RobotRulesParser should ignore Crawl-delay values of other bots in robots.txt, contributed by Doğacan Güney

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
    lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:29:51 2007
@@ -11,6 +11,9 @@
     (Eelco Lempsink via ab)
 
  4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren)
+
+ 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other
+    bots in robots.txt (Dogacan Guney via siren)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Thu May 10 09:29:51 2007
@@ -389,15 +389,17 @@
       } else if ( (line.length() >= 12)
                   && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
         doneAgents = true;
-        long crawlDelay = -1;
-        String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
-        if (delay.length() > 0) {
-          try {
-            crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
-          } catch (Exception e) {
-            LOG.info("can not parse Crawl-Delay:" + e.toString());
+        if (addRules) {
+          long crawlDelay = -1;
+          String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
+          if (delay.length() > 0) {
+            try {
+              crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+            } catch (Exception e) {
+              LOG.info("can not parse Crawl-Delay:" + e.toString());
+            }
+            currentRules.setCrawlDelay(crawlDelay);
           }
-          currentRules.setCrawlDelay(crawlDelay);
         }
       }
     }
@@ -500,7 +502,7 @@
 
   /** command-line main for testing */
   public static void main(String[] argv) {
-    if (argv.length != 3) {
+    if (argv.length < 3) {
       System.out.println("Usage:");
       System.out.println(" java <robots-file> <url-file> <agent-name>+");
       System.out.println("");
@@ -513,7 +515,7 @@
     try {
       FileInputStream robotsIn= new FileInputStream(argv[0]);
       LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
-      String[] robotNames= new String[argv.length - 1];
+      String[] robotNames= new String[argv.length - 2];
 
       for (int i= 0; i < argv.length - 2; i++)
         robotNames[i]= argv[i+2];

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu May 10 09:29:51 2007
@@ -262,6 +262,26 @@
       }
     }
   }
+
+  public void testCrawlDelay() {
+    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+    String delayRule1 = "User-agent: nutchbot" + CR +
+                        "Crawl-delay: 10" + CR +
+                        "User-agent: foobot" + CR +
+                        "Crawl-delay: 20" + CR +
+                        "User-agent: *" + CR +
+                        "Disallow:/baz" + CR;
+    String delayRule2 = "User-agent: foobot" + CR +
+                        "Crawl-delay: 20" + CR +
+                        "User-agent: *" + CR +
+                        "Disallow:/baz" + CR;
+    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
+    long crawlDelay = rules.getCrawlDelay();
+    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
+    rules = p.parseRules(delayRule2.getBytes());
+    crawlDelay = rules.getCrawlDelay();
+    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
+  }
 
   // helper
 

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs