Author: tejasp
Date: Fri Apr 5 23:50:56 2013
New Revision: 1465159
URL: http://svn.apache.org/r1465159
Log:
NUTCH-1031 Delegate parsing of robots.txt to crawler-commons
Added:
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Removed:
nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 5 23:50:56 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
+
* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)
* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Fri Apr 5 23:50:56 2013
@@ -74,6 +74,7 @@
<dependency org="oro" name="oro" rev="2.0.8" />
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
+ <dependency org="com.google.code.crawler-commons"
name="crawler-commons" rev="0.2" />
<!--Configuration: test -->
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Apr 5
23:50:56 2013
@@ -51,6 +51,7 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
+import crawlercommons.robots.BaseRobotRules;
/**
* A queue-based fetcher.
@@ -671,8 +672,8 @@ public class Fetcher extends Configured
}
redirecting = false;
Protocol protocol =
this.protocolFactory.getProtocol(fit.url.toString());
- RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
- if (!rules.isAllowed(fit.u)) {
+ BaseRobotRules rules = protocol.getRobotRules(fit.url,
fit.datum);
+ if (!rules.isAllowed(fit.u.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Fri Apr 5
23:50:56 2013
@@ -25,6 +25,8 @@ import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.plugin.Pluggable;
+import crawlercommons.robots.BaseRobotRules;
+
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol extends Pluggable, Configurable {
@@ -59,5 +61,6 @@ public interface Protocol extends Plugga
* @param datum page datum
* @return robot rules (specific for this url or default), never null
*/
- RobotRules getRobotRules(Text url, CrawlDatum datum);
+ BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
}
+
Added: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1465159&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri
Apr 5 23:50:56 2013
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.StringTokenizer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.io.Files;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of {@code
robots.txt} files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as described in SimpleRobotRulesParser.
+ */
+public abstract class RobotRulesParser implements Configurable {
+
+ public static final Logger LOG =
LoggerFactory.getLogger(RobotRulesParser.class);
+
+ protected static final Hashtable<String, BaseRobotRules> CACHE = new
Hashtable<String, BaseRobotRules> ();
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use
+ * when the {@code robots.txt} file is empty or missing;
+ * all requests are allowed.
+ */
+ public static final BaseRobotRules EMPTY_RULES = new
SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+ * response; all requests are disallowed.
+ */
+ public static BaseRobotRules FORBID_ALL_RULES = new
SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+
+ private static SimpleRobotRulesParser robotParser = new
SimpleRobotRulesParser();
+ private Configuration conf;
+ protected String agentNames;
+
+ public RobotRulesParser() { }
+
+ public RobotRulesParser(Configuration conf) {
+ setConf(conf);
+ }
+
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ // Grab the agent names we advertise to robots files.
+ String agentName = conf.get("http.agent.name");
+ if (null == agentName) {
+ throw new RuntimeException("Agent name not configured!");
+ }
+
+ String agentNames = conf.get("http.robots.agents");
+ StringTokenizer tok = new StringTokenizer(agentNames, ",");
+ ArrayList<String> agents = new ArrayList<String>();
+ while (tok.hasMoreTokens()) {
+ agents.add(tok.nextToken().trim());
+ }
+
+ /**
+ * If there are no agents for robots-parsing, use the
+ * default agent-string. If both are present, our agent-string
+ * should be the first one we advertise to robots-parsing.
+ */
+ if (agents.size() == 0) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error("No agents listed in 'http.robots.agents' property!");
+ }
+ } else {
+ StringBuffer combinedAgentsString = new StringBuffer(agentName);
+ int index = 0;
+
+ if ((agents.get(0)).equalsIgnoreCase(agentName))
+ index++;
+ else if (LOG.isErrorEnabled()) {
+ LOG.error("Agent we advertise (" + agentName
+ + ") not listed first in 'http.robots.agents' property!");
+ }
+
+ // append all the agents from the http.robots.agents property
+ for(; index < agents.size(); index++) {
+ combinedAgentsString.append(", " + agents.get(index));
+ }
+
+ // always make sure "*" is included in the end
+ combinedAgentsString.append(", *");
+ this.agentNames = combinedAgentsString.toString();
+ }
+ }
+
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Parses the robots content using the {@link SimpleRobotRulesParser} from
crawler commons
+ *
+ * @param url A string containing the URL of the robots file
+ * @param content Contents of the robots file in a byte array
+ * @param contentType The content type (e.g. "text/plain") of the robots file
+ * @param robotName A string containing the agent names used for matching rules
+ * @return BaseRobotRules object
+ */
+ public BaseRobotRules parseRules (String url, byte[] content, String
contentType, String robotName) {
+ return robotParser.parseContent(url, content, contentType, robotName);
+ }
+
+ public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+ URL u = null;
+ try {
+ u = new URL(url.toString());
+ } catch (Exception e) {
+ return EMPTY_RULES;
+ }
+ return getRobotRulesSet(protocol, u);
+ }
+
+ public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+ /** command-line main for testing */
+ public static void main(String[] argv) {
+
+ if (argv.length < 3) {
+ System.err.println("Usage: RobotRulesParser <robots-file> <url-file>
<agent-names>\n");
+ System.err.println("\tThe <robots-file> will be parsed as a robots.txt
file,");
+ System.err.println("\tusing the given <agent-name> to select rules.
URLs ");
+ System.err.println("\twill be read (one per line) from <url-file>, and
tested");
+ System.err.println("\tagainst the rules. Multiple agent names can be
specified using spaces.");
+ System.exit(-1);
+ }
+
+ try {
+ StringBuilder agentNames = new StringBuilder();
+ for(int counter = 2; counter < argv.length; counter++)
+ agentNames.append(argv[counter]).append(",");
+
+ agentNames.deleteCharAt(agentNames.length()-1);
+
+ byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
"text/plain", agentNames.toString());
+
+ LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+ String testPath = testsIn.readLine().trim();
+ while (testPath != null) {
+ System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not
allowed") +
+ ":\t" + testPath);
+ testPath = testsIn.readLine();
+ }
+ testsIn.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Fri Apr 5 23:50:56 2013
@@ -32,15 +32,16 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.DeflateUtils;
-
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
/**
* @author Jérôme Charron
*/
@@ -51,7 +52,7 @@ public abstract class HttpBase implement
private static final byte[] EMPTY_CONTENT = new byte[0];
- private RobotRulesParser robots = null;
+ private HttpRobotRulesParser robots = null;
/** The proxy hostname. */
protected String proxyHost = null;
@@ -105,7 +106,7 @@ public abstract class HttpBase implement
if (logger != null) {
this.logger = logger;
}
- robots = new RobotRulesParser();
+ robots = new HttpRobotRulesParser();
}
// Inherited Javadoc
@@ -138,7 +139,6 @@ public abstract class HttpBase implement
String urlString = url.toString();
try {
URL u = new URL(urlString);
- String host = null;
Response response = getResponse(u, datum, false); // make a request
int code = response.getCode();
@@ -381,18 +381,16 @@ public abstract class HttpBase implement
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
- }
-
+ }
}
-
protected abstract Response getResponse(URL url,
CrawlDatum datum,
boolean followRedirects)
throws ProtocolException, IOException;
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}
-
}
+
Added:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1465159&view=auto
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
(added)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Fri Apr 5 23:50:56 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains
+ * Http protocol specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+ public static final Logger LOG =
LoggerFactory.getLogger(HttpRobotRulesParser.class);
+ protected boolean allowForbidden = false;
+
+ HttpRobotRulesParser() { }
+
+ public HttpRobotRulesParser(Configuration conf) {
+ super(conf);
+ allowForbidden = conf.getBoolean("http.robots.403.allow", false);
+ }
+
+ /**
+ * For hosts whose robots rules are not yet cached, sends an HTTP request
+ * to the host corresponding to the {@link URL} passed, fetches the robots
+ * file, parses the rules and caches the rules object to avoid re-work in
+ * the future.
+ *
+ * @param http The {@link Protocol} object
+ * @param url URL
+ *
+ * @return robotRules A {@link BaseRobotRules} object for the rules
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower
case
+ String host = url.getHost().toLowerCase(); // normalize to lower
case
+
+ BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" +
host);
+
+ boolean cacheRule = true;
+
+ if (robotRules == null) { // cache miss
+ URL redir = null;
+ if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+ try {
+ Response response = ((HttpBase)http).getResponse(new URL(url,
"/robots.txt"),
+ new CrawlDatum(), true);
+ // try one level of redirection ?
+ if (response.getCode() == 301 || response.getCode() == 302) {
+ String redirection = response.getHeader("Location");
+ if (redirection == null) {
+ // some versions of MS IIS are known to mangle this header
+ redirection = response.getHeader("location");
+ }
+ if (redirection != null) {
+ if (!redirection.startsWith("http")) {
+ // RFC says it should be absolute, but apparently it isn't
+ redir = new URL(url, redirection);
+ } else {
+ redir = new URL(redirection);
+ }
+
+ response = ((HttpBase)http).getResponse(redir, new CrawlDatum(),
true);
+ }
+ }
+
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"),
+ agentNames);
+
+ else if ( (response.getCode() == 403) && (!allowForbidden) )
+ robotRules = FORBID_ALL_RULES; // use forbid all
+ else if (response.getCode() >= 500) {
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ }else
+ robotRules = EMPTY_RULES; // use default rules
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ }
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ }
+
+ if (cacheRule) {
+ CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equals(host)) {
+ // cache also for the redirected host
+ CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+ }
+ }
+ }
+ return robotRules;
+ }
+}
Modified:
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Fri Apr 5 23:50:56 2013
@@ -17,292 +17,100 @@
package org.apache.nutch.protocol.http.api;
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
import junit.framework.TestCase;
+/**
+ * JUnit test case which tests
+ * 1. that robots filtering is performed correctly as per the agent name
+ * 2. that crawl delay is extracted correctly from the robots file
+ *
+ */
public class TestRobotRulesParser extends TestCase {
- private static final String LF= "\n";
- private static final String CR= "\r";
- private static final String CRLF= "\r\n";
+
+ private static final String CONTENT_TYPE = "text/plain";
+ private static final String SINGLE_AGENT = "Agent1";
+ private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+ private static final String UNKNOWN_AGENT = "AgentABC";
+ private static final String CR = "\r";
- private static final boolean[] ACCEPT_ALL = {
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- };
+ private static final String ROBOTS_STRING =
+ "User-Agent: Agent1 #foo" + CR
+ + "Disallow: /a" + CR
+ + "Disallow: /b/a" + CR
+ + "#Disallow: /c" + CR
+ + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
+ + "" + CR
+ + "" + CR
+ + "User-Agent: Agent2" + CR
+ + "Disallow: /a/bloh" + CR
+ + "Disallow: /c" + CR
+ + "Disallow: /foo" + CR
+ + "Crawl-delay: 20" + CR
+ + "" + CR
+ + "User-Agent: *" + CR
+ + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
- private static final String[] ROBOTS_STRINGS= new String[] {
- "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR
- + "Disallow: /b/a" + CR
- + "#Disallow: /c" + CR
- + "" + CR
- + "" + CR
- + "User-Agent: Agent2 Agent3#foo" + CR
- + "User-Agent: Agent4" + CR
- + "Disallow: /d" + CR
- + "Disallow: /e/d/" + CR
- + "" + CR
- + "User-Agent: *" + CR
- + "Disallow: /foo/bar/" + CR,
- null // Used to test EMPTY_RULES
+ private static final String[] TEST_PATHS = new String[] {
+ "http://example.com/a",
+ "http://example.com/a/bloh/foo.html",
+ "http://example.com/b",
+ "http://example.com/c",
+ "http://example.com/b/a/index.html",
+ "http://example.com/foo/bar/baz.html"
+ };
+
+ private static final boolean[] RESULTS = new boolean[] {
+ false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ true, // /c
+ false, // /b/a/index.html
+ true // /foo/bar/baz.html
};
- private static final String[] AGENT_STRINGS= new String[] {
- "Agent1",
- "Agent2",
- "Agent3",
- "Agent4",
- "Agent5",
- };
-
- private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
- {
- false,
- false,
- false,
- false,
- true,
- },
- {
- false,
- false,
- false,
- false,
- true,
- }
- };
+ private HttpRobotRulesParser parser;
+ private BaseRobotRules rules;
- private static final String[] TEST_PATHS= new String[] {
- "/a",
- "/a/",
- "/a/bloh/foo.html",
- "/b",
- "/b/a",
- "/b/a/index.html",
- "/b/b/foo.html",
- "/c",
- "/c/a",
- "/c/a/index.html",
- "/c/b/foo.html",
- "/d",
- "/d/a",
- "/e/a/index.html",
- "/e/d",
- "/e/d/foo.html",
- "/e/doh.html",
- "/f/index.html",
- "/foo/bar/baz.html",
- "/f/",
- };
-
- private static final boolean[][][] ALLOWED= new boolean[][][] {
- { // ROBOTS_STRINGS[0]
- { // Agent1
- false, // "/a",
- false, // "/a/",
- false, // "/a/bloh/foo.html"
- true, // "/b",
- false, // "/b/a",
- false, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent2
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent3
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent4
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent5/"*"
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- false, // "/foo/bar.html",
- true, // "/f/",
- }
- },
- { // ROBOTS_STRINGS[1]
- ACCEPT_ALL, // Agent 1
- ACCEPT_ALL, // Agent 2
- ACCEPT_ALL, // Agent 3
- ACCEPT_ALL, // Agent 4
- ACCEPT_ALL, // Agent 5
- }
- };
-
public TestRobotRulesParser(String name) {
super(name);
+ parser = new HttpRobotRulesParser();
}
- public void testRobotsOneAgent() {
- for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
- for (int j= 0; j < AGENT_STRINGS.length; j++) {
- testRobots(i, new String[] { AGENT_STRINGS[j] },
- TEST_PATHS, ALLOWED[i][j]);
- }
+ /**
+ * Test that the robots rules are interpreted correctly by the robots rules
parser.
+ */
+ public void testRobotsAgent() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
+
+ for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
+ + "path " + TEST_PATHS[counter]
+ + " got " + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
- }
- public void testRobotsTwoAgents() {
- for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
- for (int j= 0; j < AGENT_STRINGS.length; j++) {
- for (int k= 0; k < AGENT_STRINGS.length; k++) {
- int key= j;
- if (NOT_IN_ROBOTS_STRING[i][j])
- key= k;
- testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
- TEST_PATHS, ALLOWED[i][key]);
- }
- }
- }
- }
-
- public void testCrawlDelay() {
- RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
- String delayRule1 = "User-agent: nutchbot" + CR +
- "Crawl-delay: 10" + CR +
- "User-agent: foobot" + CR +
- "Crawl-delay: 20" + CR +
- "User-agent: *" + CR +
- "Disallow:/baz" + CR;
- String delayRule2 = "User-agent: foobot" + CR +
- "Crawl-delay: 20" + CR +
- "User-agent: *" + CR +
- "Disallow:/baz" + CR;
- RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
- long crawlDelay = rules.getCrawlDelay();
- assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay
== 10000));
- rules = p.parseRules(delayRule2.getBytes());
- crawlDelay = rules.getCrawlDelay();
- assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay
== -1));
- }
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, MULTIPLE_AGENTS);
- // helper
-
- public void testRobots(int robotsString, String[] agents, String[] paths,
- boolean[] allowed) {
- String agentsString= agents[0];
- for (int i= 1; i < agents.length; i++)
- agentsString= agentsString + "," + agents[i];
- RobotRulesParser p= new RobotRulesParser(agents);
- RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
- ? ROBOTS_STRINGS[robotsString].getBytes()
- : null);
- for (int i= 0; i < paths.length; i++) {
- assertTrue("testing robots file "+robotsString+", on agents ("
- + agentsString + "), and path " + TEST_PATHS[i] + "; got "
- + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
- + rules,
- rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+ for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
+ + "path " + TEST_PATHS[counter]
+ + " got " + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
}
-
-
+ /**
+ * Test that the crawl delay is extracted from the robots file for the
+ * respective agent.
+ * If it is not specified for a given agent, the default value must be returned.
+ */
+ public void testCrawlDelay() {
+ // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
+ assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ",
(rules.getCrawlDelay() == 10000));
+
+ // for UNKNOWN_AGENT, the default crawl delay must be returned.
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, UNKNOWN_AGENT);
+ assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ",
(rules.getCrawlDelay() == Long.MIN_VALUE));
+ }
}
Modified:
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
(original)
+++
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Fri Apr 5 23:50:56 2013
@@ -17,35 +17,33 @@
package org.apache.nutch.protocol.file;
+import java.net.URL;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.util.NutchConfiguration;
-import java.net.URL;
+import crawlercommons.robots.BaseRobotRules;
-/************************************
- * File.java deals with file: scheme.
- *
- * Configurable parameters are defined under "FILE properties" section
- * in ./conf/nutch-default.xml or similar.
+/**
+ * This class is a protocol plugin used for file: scheme.
+ * It creates a {@link FileResponse} object and gets the content of the url from
it.
+ * Configurable parameters are {@code file.content.limit} and {@code
file.crawl.parent}
+ * in nutch-default.xml defined under "file properties" section.
*
* @author John Xing
- ***********************************/
+ */
public class File implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(File.class);
@@ -57,13 +55,40 @@ public class File implements Protocol {
private Configuration conf;
- // constructor
- public File() {
- }
+ public File() {}
- /** Set the point at which content is truncated. */
- public void setMaxContentLength(int length) {maxContentLength = length;}
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+ this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+ }
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+   * Set the length after which content is truncated.
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+ /**
+ * Creates a {@link FileResponse} object corresponding to the url and
+   * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url Text containing the url
+ * @param datum The CrawlDatum object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the content of the file
indicated by url
+ */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -99,11 +124,9 @@ public class File implements Protocol {
}
}
-// protected void finalize () {
-// // nothing here
-// }
-
- /** For debugging. */
+ /**
+ * Quick way for running this class. Useful for debugging.
+ */
public static void main(String[] args) throws Exception {
int maxContentLength = Integer.MIN_VALUE;
String logLevel = "info";
@@ -154,17 +177,12 @@ public class File implements Protocol {
file = null;
}
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
- this.crawlParents = conf.getBoolean("file.crawl.parent", true);
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
- return EmptyRobotRules.RULES;
+ /**
+ * No robots parsing is done for file protocol.
+ * So this returns a set of empty rules which will allow every url.
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return RobotRulesParser.EMPTY_RULES;
}
}
+
Modified:
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
(original)
+++
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Fri Apr 5 23:50:56 2013
@@ -24,30 +24,33 @@ import org.apache.commons.net.ftp.FTPFil
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.hadoop.io.Text;
-import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+
+import crawlercommons.robots.BaseRobotRules;
import java.net.URL;
import java.io.IOException;
-/************************************
- * Ftp.java deals with ftp: scheme.
- *
- * Configurable parameters are defined under "FTP properties" section
- * in ./conf/nutch-default.xml or similar.
+/**
+ * This class is a protocol plugin used for ftp: scheme.
+ * It creates {@link FtpResponse} object and gets the content of the url from
it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout},
+ * {@code ftp.server.timeout}, {@code
ftp.password},
+ * {@code ftp.keep.connection} and {@code
ftp.follow.talk}.
+ * For details see "FTP properties" section in {@code nutch-default.xml}.
*
* @author John Xing
- ***********************************/
+ */
public class Ftp implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
@@ -106,6 +109,15 @@ public class Ftp implements Protocol {
this.keepConnection = keepConnection;
}
+ /**
+ * Creates a {@link FtpResponse} object corresponding to the url and
+ * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url Text containing the ftp url
+ * @param datum The CrawlDatum object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the url
+ */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -216,7 +228,9 @@ public class Ftp implements Protocol {
ftp = null;
}
-
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
@@ -228,12 +242,20 @@ public class Ftp implements Protocol {
this.followTalk = conf.getBoolean("ftp.follow.talk", false);
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return this.conf;
}
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
- return EmptyRobotRules.RULES;
+ /**
+ * Currently, no robots parsing is done for ftp protocol
+ * and this returns a set of empty rules which will allow every url.
+   * There is a JIRA issue logged for this: NUTCH-1513.
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return RobotRulesParser.EMPTY_RULES;
}
-
}
+