Author: tejasp
Date: Mon Apr 29 20:26:52 2013
New Revision: 1477319
URL: http://svn.apache.org/r1477319
Log:
NUTCH-1031 Delegate parsing of robots.txt to crawler-commons
Added:
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Removed:
nutch/branches/2.x/src/java/org/apache/nutch/protocol/EmptyRobotRules.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Apr 29 20:26:52 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
+
* NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via
tejasp)
* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel +
lewismc)
Modified: nutch/branches/2.x/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Mon Apr 29 20:26:52 2013
@@ -70,6 +70,7 @@
<dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default" />
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
+ <dependency org="com.google.code.crawler-commons" name="crawler-commons"
rev="0.2" />
<!--Configuration: test -->
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
Mon Apr 29 20:26:52 2013
@@ -61,6 +61,8 @@ import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
+import crawlercommons.robots.BaseRobotRules;
+
public class FetcherReducer
extends GoraReducer<IntWritable, FetchEntry, String, WebPage> {
@@ -152,9 +154,6 @@ extends GoraReducer<IntWritable, FetchEn
return "FetchItem [queueID=" + queueID + ", url=" + url + ", u=" + u
+ ", page=" + page + "]";
}
-
-
-
}
/**
@@ -489,8 +488,8 @@ extends GoraReducer<IntWritable, FetchEn
// fetch the page
final Protocol protocol =
this.protocolFactory.getProtocol(fit.url);
- final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
- if (!rules.isAllowed(fit.u)) {
+ final BaseRobotRules rules = protocol.getRobotRules(fit.url,
fit.page);
+ if (!rules.isAllowed(fit.u.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Mon Apr
29 20:26:52 2013
@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configurab
import org.apache.nutch.plugin.FieldPluggable;
import org.apache.nutch.storage.WebPage;
+import crawlercommons.robots.BaseRobotRules;
+
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol extends FieldPluggable, Configurable {
/** The name of the extension point. */
@@ -46,7 +48,8 @@ public interface Protocol extends FieldP
*/
public final static String CHECK_ROBOTS = "protocol.plugin.check.robots";
- /** Returns the {@link Content} for a fetchlist entry.
+ /*
+ * Returns the {@link Content} for a fetchlist entry.
*/
ProtocolOutput getProtocolOutput(String url, WebPage page);
@@ -56,5 +59,5 @@ public interface Protocol extends FieldP
* @param page
* @return robot rules (specific for this url or default), never null
*/
- RobotRules getRobotRules(String url, WebPage page);
+ BaseRobotRules getRobotRules(String url, WebPage page);
}
Added:
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1477319&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
(added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Mon Apr 29 20:26:52 2013
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.StringTokenizer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.io.Files;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of {@code
robots.txt} files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as described in SimpleRobotRulesParser.
+ */
+public abstract class RobotRulesParser implements Configurable {
+
+ public static final Logger LOG =
LoggerFactory.getLogger(RobotRulesParser.class);
+
+ protected static final Hashtable<String, BaseRobotRules> CACHE = new
Hashtable<String, BaseRobotRules> ();
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use
+ * when the {@code robots.txt} file is empty or missing;
+ * all requests are allowed.
+ */
+ public static final BaseRobotRules EMPTY_RULES = new
SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+ * response; all requests are disallowed.
+ */
+ public static BaseRobotRules FORBID_ALL_RULES = new
SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+
+ private static SimpleRobotRulesParser robotParser = new
SimpleRobotRulesParser();
+ private Configuration conf;
+ protected String agentNames;
+
+ public RobotRulesParser() { }
+
+ public RobotRulesParser(Configuration conf) {
+ setConf(conf);
+ }
+
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ // Grab the agent names we advertise to robots files.
+ String agentName = conf.get("http.agent.name");
+ if (null == agentName) {
+ throw new RuntimeException("Agent name not configured!");
+ }
+
+ String agentNames = conf.get("http.robots.agents");
+ StringTokenizer tok = new StringTokenizer(agentNames, ",");
+ ArrayList<String> agents = new ArrayList<String>();
+ while (tok.hasMoreTokens()) {
+ agents.add(tok.nextToken().trim());
+ }
+
+ /**
+ * If there are no agents for robots-parsing, use the
+ * default agent-string. If both are present, our agent-string
+ * should be the first one we advertise to robots-parsing.
+ */
+ if (agents.size() == 0) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error("No agents listed in 'http.robots.agents' property!");
+ }
+ } else {
+ StringBuffer combinedAgentsString = new StringBuffer(agentName);
+ int index = 0;
+
+ if ((agents.get(0)).equalsIgnoreCase(agentName))
+ index++;
+ else if (LOG.isErrorEnabled()) {
+ LOG.error("Agent we advertise (" + agentName
+ + ") not listed first in 'http.robots.agents' property!");
+ }
+
+ // append all the agents from the http.robots.agents property
+ for(; index < agents.size(); index++) {
+ combinedAgentsString.append(", " + agents.get(index));
+ }
+
+ // always make sure "*" is included in the end
+ combinedAgentsString.append(", *");
+ this.agentNames = combinedAgentsString.toString();
+ }
+ }
+
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Parses the robots content using the {@link SimpleRobotRulesParser} from
crawler commons
+ *
+ * @param url A string containing url
+ * @param content Contents of the robots file in a byte array
+   * @param contentType The content type (MIME type) of the robots file
+   * @param robotName A string containing the agent name(s) to match rules for
+ * @return BaseRobotRules object
+ */
+ public BaseRobotRules parseRules (String url, byte[] content, String
contentType, String robotName) {
+ return robotParser.parseContent(url, content, contentType, robotName);
+ }
+
+ public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
+ URL u = null;
+ try {
+ u = new URL(url);
+ } catch (Exception e) {
+ return EMPTY_RULES;
+ }
+ return getRobotRulesSet(protocol, u);
+ }
+
+ public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+ /** command-line main for testing */
+ public static void main(String[] argv) {
+
+ if (argv.length < 3) {
+ System.err.println("Usage: RobotRulesParser <robots-file> <url-file>
<agent-names>\n");
+ System.err.println(" <robots-file> - Input robots.txt file which will
be parsed.");
+ System.err.println(" <url-file> - Contains input URLs (1 per line)
which are tested against the rules.");
+ System.err.println(" <agent-names> - Input agent name. Multiple agent
names can be specified using spaces.");
+ System.exit(-1);
+ }
+
+ try {
+ StringBuilder agentNames = new StringBuilder();
+ for(int counter = 2; counter < argv.length; counter++)
+ agentNames.append(argv[counter]).append(",");
+
+ agentNames.deleteCharAt(agentNames.length()-1);
+
+ byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
"text/plain", agentNames.toString());
+
+ LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+ String testPath = testsIn.readLine().trim();
+ while (testPath != null) {
+ System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not
allowed") +
+ ":\t" + testPath);
+ testPath = testsIn.readLine();
+ }
+ testsIn.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
Modified:
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Mon Apr 29 20:26:52 2013
@@ -32,23 +32,21 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatusCodes;
import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.DeflateUtils;
import org.apache.nutch.util.MimeUtil;
-/**
- * @author Jérôme Charron
- */
-public abstract class HttpBase implements Protocol {
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+public abstract class HttpBase implements Protocol {
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
- private RobotRulesParser robots = null;
+ private HttpRobotRulesParser robots = null;
/** The proxy hostname. */
protected String proxyHost = null;
@@ -102,7 +100,7 @@ public abstract class HttpBase implement
if (logger != null) {
this.logger = logger;
}
- robots = new RobotRulesParser();
+ robots = new HttpRobotRulesParser();
}
// Inherited Javadoc
@@ -128,13 +126,10 @@ public abstract class HttpBase implement
return this.conf;
}
-
-
public ProtocolOutput getProtocolOutput(String url, WebPage page) {
try {
URL u = new URL(url);
- String host = null;
Response response = getResponse(u, page, false); // make a request
int code = response.getCode();
byte[] content = response.getContent();
@@ -145,7 +140,6 @@ public abstract class HttpBase implement
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
-
} else if (code == 410) { // page is gone
return new ProtocolOutput(c,
ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: "
+ code + " url=" + url));
@@ -206,8 +200,6 @@ public abstract class HttpBase implement
/* -------------------------- *
* </implementation:Protocol> *
* -------------------------- */
-
-
public String getProxyHost() {
return proxyHost;
}
@@ -367,10 +359,6 @@ public abstract class HttpBase implement
url = args[i];
}
- // if (verbose) {
- // LOGGER.setLevel(Level.FINE);
- // }
-
ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
Content content = out.getContent();
@@ -383,17 +371,14 @@ public abstract class HttpBase implement
String text = new String(content.getContent());
System.out.println(text);
}
-
}
-
protected abstract Response getResponse(URL url,
WebPage page, boolean followRedirects)
throws ProtocolException, IOException;
@Override
- public RobotRules getRobotRules(String url, WebPage page) {
+ public BaseRobotRules getRobotRules(String url, WebPage page) {
return robots.getRobotRulesSet(this, url);
}
-
}
Added:
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1477319&view=auto
==============================================================================
---
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
(added)
+++
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Mon Apr 29 20:26:52 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.storage.WebPage;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains
+ * Http protocol specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+ public static final Logger LOG =
LoggerFactory.getLogger(HttpRobotRulesParser.class);
+ protected boolean allowForbidden = false;
+
+ HttpRobotRulesParser() { }
+
+ public HttpRobotRulesParser(Configuration conf) {
+ super(conf);
+ allowForbidden = conf.getBoolean("http.robots.403.allow", false);
+ }
+
+ /**
+   * Get the rules from robots.txt which applies for the given {@code url}.
+   * For a host whose robots rules are not yet cached, this sends an HTTP
+   * request to the host corresponding to the {@link URL} passed, fetches the
+   * robots file, parses the rules and caches the rules object
+   * to avoid re-work in future.
+ *
+ * @param http The {@link Protocol} object
+ * @param url URL
+ *
+ * @return robotRules A {@link BaseRobotRules} object for the rules
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower
case
+ String host = url.getHost().toLowerCase(); // normalize to lower
case
+
+ BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" +
host);
+
+ boolean cacheRule = true;
+
+ if (robotRules == null) { // cache miss
+ URL redir = null;
+ if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+ try {
+ Response response = ((HttpBase)http).getResponse(new URL(url,
"/robots.txt"),
+ new WebPage(), true);
+ // try one level of redirection ?
+ if (response.getCode() == 301 || response.getCode() == 302) {
+ String redirection = response.getHeader("Location");
+ if (redirection == null) {
+ // some versions of MS IIS are known to mangle this header
+ redirection = response.getHeader("location");
+ }
+ if (redirection != null) {
+ if (!redirection.startsWith("http")) {
+ // RFC says it should be absolute, but apparently it isn't
+ redir = new URL(url, redirection);
+ } else {
+ redir = new URL(redirection);
+ }
+
+ response = ((HttpBase)http).getResponse(redir, new WebPage(),
true);
+ }
+ }
+
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"),
+ agentNames);
+
+ else if ( (response.getCode() == 403) && (!allowForbidden) )
+ robotRules = FORBID_ALL_RULES; // use forbid all
+ else if (response.getCode() >= 500) {
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ }else
+ robotRules = EMPTY_RULES; // use default rules
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ }
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ }
+
+ if (cacheRule) {
+ CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equals(host)) {
+ // cache also for the redirected host
+ CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+ }
+ }
+ }
+ return robotRules;
+ }
+}
Modified:
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
(original)
+++
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Mon Apr 29 20:26:52 2013
@@ -17,292 +17,100 @@
package org.apache.nutch.protocol.http.api;
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
import junit.framework.TestCase;
+/**
+ * JUnit test case which tests
+ * 1. that robots filtering is performed correctly as per the agent name
+ * 2. that crawl delay is extracted correctly from the robots file
+ *
+ */
public class TestRobotRulesParser extends TestCase {
- private static final String LF= "\n";
- private static final String CR= "\r";
- private static final String CRLF= "\r\n";
+
+ private static final String CONTENT_TYPE = "text/plain";
+ private static final String SINGLE_AGENT = "Agent1";
+ private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+ private static final String UNKNOWN_AGENT = "AgentABC";
+ private static final String CR = "\r";
- private static final boolean[] ACCEPT_ALL = {
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- };
+ private static final String ROBOTS_STRING =
+ "User-Agent: Agent1 #foo" + CR
+ + "Disallow: /a" + CR
+ + "Disallow: /b/a" + CR
+ + "#Disallow: /c" + CR
+ + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec
+ + "" + CR
+ + "" + CR
+ + "User-Agent: Agent2" + CR
+ + "Disallow: /a/bloh" + CR
+ + "Disallow: /c" + CR
+ + "Disallow: /foo" + CR
+ + "Crawl-delay: 20" + CR
+ + "" + CR
+ + "User-Agent: *" + CR
+ + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
- private static final String[] ROBOTS_STRINGS= new String[] {
- "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR
- + "Disallow: /b/a" + CR
- + "#Disallow: /c" + CR
- + "" + CR
- + "" + CR
- + "User-Agent: Agent2 Agent3#foo" + CR
- + "User-Agent: Agent4" + CR
- + "Disallow: /d" + CR
- + "Disallow: /e/d/" + CR
- + "" + CR
- + "User-Agent: *" + CR
- + "Disallow: /foo/bar/" + CR,
- null // Used to test EMPTY_RULES
+ private static final String[] TEST_PATHS = new String[] {
+ "http://example.com/a",
+ "http://example.com/a/bloh/foo.html",
+ "http://example.com/b",
+ "http://example.com/c",
+ "http://example.com/b/a/index.html",
+ "http://example.com/foo/bar/baz.html"
+ };
+
+ private static final boolean[] RESULTS = new boolean[] {
+ false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ true, // /c
+ false, // /b/a/index.html
+ true // /foo/bar/baz.html
};
- private static final String[] AGENT_STRINGS= new String[] {
- "Agent1",
- "Agent2",
- "Agent3",
- "Agent4",
- "Agent5",
- };
-
- private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
- {
- false,
- false,
- false,
- false,
- true,
- },
- {
- false,
- false,
- false,
- false,
- true,
- }
- };
+ private HttpRobotRulesParser parser;
+ private BaseRobotRules rules;
- private static final String[] TEST_PATHS= new String[] {
- "/a",
- "/a/",
- "/a/bloh/foo.html",
- "/b",
- "/b/a",
- "/b/a/index.html",
- "/b/b/foo.html",
- "/c",
- "/c/a",
- "/c/a/index.html",
- "/c/b/foo.html",
- "/d",
- "/d/a",
- "/e/a/index.html",
- "/e/d",
- "/e/d/foo.html",
- "/e/doh.html",
- "/f/index.html",
- "/foo/bar/baz.html",
- "/f/",
- };
-
- private static final boolean[][][] ALLOWED= new boolean[][][] {
- { // ROBOTS_STRINGS[0]
- { // Agent1
- false, // "/a",
- false, // "/a/",
- false, // "/a/bloh/foo.html"
- true, // "/b",
- false, // "/b/a",
- false, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent2
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent3
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent4
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- false, // "/d",
- false, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- false, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- true, // "/foo/bar.html",
- true, // "/f/",
- },
- { // Agent5/"*"
- true, // "/a",
- true, // "/a/",
- true, // "/a/bloh/foo.html"
- true, // "/b",
- true, // "/b/a",
- true, // "/b/a/index.html",
- true, // "/b/b/foo.html",
- true, // "/c",
- true, // "/c/a",
- true, // "/c/a/index.html",
- true, // "/c/b/foo.html",
- true, // "/d",
- true, // "/d/a",
- true, // "/e/a/index.html",
- true, // "/e/d",
- true, // "/e/d/foo.html",
- true, // "/e/doh.html",
- true, // "/f/index.html",
- false, // "/foo/bar.html",
- true, // "/f/",
- }
- },
- { // ROBOTS_STRINGS[1]
- ACCEPT_ALL, // Agent 1
- ACCEPT_ALL, // Agent 2
- ACCEPT_ALL, // Agent 3
- ACCEPT_ALL, // Agent 4
- ACCEPT_ALL, // Agent 5
- }
- };
-
public TestRobotRulesParser(String name) {
super(name);
+ parser = new HttpRobotRulesParser();
}
- public void testRobotsOneAgent() {
- for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
- for (int j= 0; j < AGENT_STRINGS.length; j++) {
- testRobots(i, new String[] { AGENT_STRINGS[j] },
- TEST_PATHS, ALLOWED[i][j]);
- }
+ /**
+ * Test that the robots rules are interpreted correctly by the robots rules
parser.
+ */
+ public void testRobotsAgent() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
+
+ for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
+ + "path " + TEST_PATHS[counter]
+ + " got " + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
- }
- public void testRobotsTwoAgents() {
- for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
- for (int j= 0; j < AGENT_STRINGS.length; j++) {
- for (int k= 0; k < AGENT_STRINGS.length; k++) {
- int key= j;
- if (NOT_IN_ROBOTS_STRING[i][j])
- key= k;
- testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
- TEST_PATHS, ALLOWED[i][key]);
- }
- }
- }
- }
-
- public void testCrawlDelay() {
- RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
- String delayRule1 = "User-agent: nutchbot" + CR +
- "Crawl-delay: 10" + CR +
- "User-agent: foobot" + CR +
- "Crawl-delay: 20" + CR +
- "User-agent: *" + CR +
- "Disallow:/baz" + CR;
- String delayRule2 = "User-agent: foobot" + CR +
- "Crawl-delay: 20" + CR +
- "User-agent: *" + CR +
- "Disallow:/baz" + CR;
- RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
- long crawlDelay = rules.getCrawlDelay();
- assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay
== 10000));
- rules = p.parseRules(delayRule2.getBytes());
- crawlDelay = rules.getCrawlDelay();
- assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay
== -1));
- }
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, MULTIPLE_AGENTS);
- // helper
-
- public void testRobots(int robotsString, String[] agents, String[] paths,
- boolean[] allowed) {
- String agentsString= agents[0];
- for (int i= 1; i < agents.length; i++)
- agentsString= agentsString + "," + agents[i];
- RobotRulesParser p= new RobotRulesParser(agents);
- RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
- ? ROBOTS_STRINGS[robotsString].getBytes()
- : null);
- for (int i= 0; i < paths.length; i++) {
- assertTrue("testing robots file "+robotsString+", on agents ("
- + agentsString + "), and path " + TEST_PATHS[i] + "; got "
- + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
- + rules,
- rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+ for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+ assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
+ + "path " + TEST_PATHS[counter]
+ + " got " + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
}
-
-
+ /**
+ * Test that the crawl delay is extracted from the robots file for respective
agent.
+ * If its not specified for a given agent, default value must be returned.
+ */
+ public void testCrawlDelay() {
+ // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
+ assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ",
(rules.getCrawlDelay() == 10000));
+
+ // for UNKNOWN_AGENT, the default crawl delay must be returned.
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, UNKNOWN_AGENT);
+ assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ",
(rules.getCrawlDelay() == Long.MIN_VALUE));
+ }
}
Modified:
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Mon Apr 29 20:26:52 2013
@@ -14,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.protocol.file;
import java.net.URL;
@@ -23,28 +22,30 @@ import java.util.HashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatusCodes;
import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.storage.ProtocolStatus;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.NutchConfiguration;
-/************************************
- * File.java deals with file: scheme.
- *
- * Configurable parameters are defined under "FILE properties" section in
- * ./conf/nutch-default.xml or similar.
- *
- * @author John Xing
- ***********************************/
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for file: scheme.
+ * It creates a {@link FileResponse} object and gets the content of the url
from it.
+ * Configurable parameters are {@code file.content.limit} and {@code
file.crawl.parent}
+ * in nutch-default.xml defined under "file properties" section.
+ */
public class File implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(File.class);
@@ -65,14 +66,40 @@ public class File implements Protocol {
private Configuration conf;
// constructor
- public File() {
- }
+ public File() { }
- /** Set the point at which content is truncated. */
- public void setMaxContentLength(int length) {
- maxContentLength = length;
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+ this.crawlParents = conf.getBoolean("file.crawl.parent", true);
}
-
+
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Set the point at which content is truncated.
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+ /**
+ * Creates a {@link FileResponse} object corresponding to the url and
+ * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url String containing the url
+ * @param page The WebPage object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the content of the file
indicated by url
+ */
public ProtocolOutput getProtocolOutput(String url, WebPage page) {
String urlString = url.toString();
try {
@@ -82,16 +109,11 @@ public class File implements Protocol {
while (true) {
FileResponse response;
- response = new FileResponse(u, page, this, getConf()); // make
- // a
- // request
-
+ response = new FileResponse(u, page, this, getConf()); // make a
request
int code = response.getCode();
if (code == 200) { // got a good response
- return new ProtocolOutput(response.toContent()); // return
- // it
-
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
throw new FileException("Too many redirects: " + url);
@@ -114,16 +136,13 @@ public class File implements Protocol {
}
@Override
- public RobotRules getRobotRules(String url, WebPage page) {
- return EmptyRobotRules.RULES;
- }
-
- @Override
public Collection<Field> getFields() {
return FIELDS;
}
- /** For debugging. */
+ /**
+ * Quick way for running this class. Useful for debugging.
+ */
public static void main(String[] args) throws Exception {
int maxContentLength = Integer.MIN_VALUE;
boolean dumpContent = false;
@@ -154,9 +173,6 @@ public class File implements Protocol {
if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
file.setMaxContentLength(maxContentLength);
- // set log level
- // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
Content content = file.getProtocolOutput(urlString, new WebPage())
.getContent();
@@ -172,13 +188,11 @@ public class File implements Protocol {
file = null;
}
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
- this.crawlParents = conf.getBoolean("file.crawl.parent", true);
- }
-
- public Configuration getConf() {
- return this.conf;
- }
+ /**
+ * No robots parsing is done for the file protocol.
+ * So this returns a set of empty rules which will allow every url.
+ */
+ public BaseRobotRules getRobotRules(String url, WebPage page) {
+ return RobotRulesParser.EMPTY_RULES;
+ }
}
Modified:
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Mon Apr 29 20:26:52 2013
@@ -28,23 +28,27 @@ import org.apache.commons.net.ftp.FTPFil
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatusCodes;
import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.storage.ProtocolStatus;
import org.apache.nutch.storage.WebPage;
-/************************************
- * Ftp.java deals with ftp: scheme.
- *
- * Configurable parameters are defined under "FTP properties" section in
- * ./conf/nutch-default.xml or similar.
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for ftp: scheme.
+ * It creates a {@link FtpResponse} object and gets the content of the url
from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout},
+ * {@code ftp.server.timeout},
+ * {@code ftp.keep.connection} and {@code ftp.follow.talk}.
+ * For details see "FTP properties" section in {@code nutch-default.xml}.
*
* @author John Xing
- ***********************************/
+ */
public class Ftp implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
@@ -109,6 +113,15 @@ public class Ftp implements Protocol {
this.keepConnection = keepConnection;
}
+ /**
+ * Creates a {@link FtpResponse} object corresponding to the url and
+ * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url String containing the ftp url
+ * @param page The WebPage object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the url
+ */
public ProtocolOutput getProtocolOutput(String url, WebPage page) {
try {
URL u = new URL(url);
@@ -154,6 +167,9 @@ public class Ftp implements Protocol {
}
}
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
@@ -165,14 +181,13 @@ public class Ftp implements Protocol {
this.followTalk = conf.getBoolean("ftp.follow.talk", false);
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return this.conf;
}
- public RobotRules getRobotRules(String url, WebPage page) {
- return EmptyRobotRules.RULES;
- }
-
/** For debugging. */
public static void main(String[] args) throws Exception {
int timeout = Integer.MIN_VALUE;
@@ -222,9 +237,6 @@ public class Ftp implements Protocol {
if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
ftp.setMaxContentLength(maxContentLength);
- // set log level
- // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
Content content = ftp.getProtocolOutput(urlString, new WebPage())
.getContent();
@@ -244,4 +256,12 @@ public class Ftp implements Protocol {
return FIELDS;
}
+ /**
+ * Currently, no robots parsing is done for ftp protocol
+ * and this returns a set of empty rules which will allow every url.
+ * There is a JIRA logged for this: NUTCH-1513
+ */
+ public BaseRobotRules getRobotRules(String url, WebPage page) {
+ return RobotRulesParser.EMPTY_RULES;
+ }
}
Modified:
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
(original)
+++
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Mon Apr 29 20:26:52 2013
@@ -38,9 +38,9 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.protocol.RobotRulesParser;
//JSCH imports
import com.jcraft.jsch.ChannelSftp;
@@ -50,6 +50,8 @@ import com.jcraft.jsch.Session;
import com.jcraft.jsch.SftpException;
import com.jcraft.jsch.ChannelSftp.LsEntry;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* This class uses the Jsch package to fetch content using the Sftp protocol.
*
@@ -224,10 +226,16 @@ public class Sftp implements Protocol {
}
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return configuration;
}
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration arg0) {
configuration = arg0;
@@ -288,34 +296,9 @@ public class Sftp implements Protocol {
}
}
- /*
- * (non-Javadoc)
- *
- * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,
- * org.apache.nutch.storage.WebPage)
- */
@Override
- public RobotRules getRobotRules(String url, WebPage page) {
- return new RobotRules() {
-
- @Override
- public boolean isAllowed(URL url) {
- // they're all allowed for now.
- return true;
- }
-
- @Override
- public long getExpireTime() {
- // set to 0 for never expire
- return 0;
- }
-
- @Override
- public long getCrawlDelay() {
- // no delay
- return 0;
- }
- };
+ public BaseRobotRules getRobotRules(String url, WebPage page) {
+ return RobotRulesParser.EMPTY_RULES;
}
/*
@@ -327,5 +310,4 @@ public class Sftp implements Protocol {
public Collection<Field> getFields() {
return Collections.emptySet();
}
-
}