Author: markus
Date: Mon Jun 25 14:42:05 2012
New Revision: 1353582
URL: http://svn.apache.org/viewvc?rev=1353582&view=rev
Log:
NUTCH-1408 RobotRulesParser main doesn't take URL's
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1353582&r1=1353581&r2=1353582&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 25 14:42:05 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1408 RobotRulesParser main doesn't take URL's (markus)
+
* NUTCH-1400 Remove developer -core option for bin/nutch (jnioche)
* NUTCH-1404 Nutch script fails to find job file in deploy mode (sidabatra,
jnioche)
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=1353582&r1=1353581&r2=1353582&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Mon Jun 25 14:42:05 2012
@@ -182,6 +182,7 @@ public class RobotRulesParser implements
while (pos < end) {
if (path.startsWith(entries[pos].prefix))
return entries[pos].allowed;
+
pos++;
}
@@ -335,6 +336,12 @@ public class RobotRulesParser implements
doneAgents= true;
String path= line.substring(line.indexOf(":") + 1);
path= path.trim();
+
+ // Skip if no path was specified
+ if (path.length() == 0) {
+ // Go to the next token
+ continue;
+ }
try {
path= URLDecoder.decode(path, CHARACTER_ENCODING);
} catch (Exception e) {
@@ -560,7 +567,7 @@ public class RobotRulesParser implements
String testPath= testsIn.readLine().trim();
while (testPath != null) {
- System.out.println( (rules.isAllowed(testPath) ?
+ System.out.println( (rules.isAllowed(new URL(testPath)) ?
"allowed" : "not allowed")
+ ":\t" + testPath);
testPath= testsIn.readLine();