Author: ab
Date: Mon May 15 15:18:34 2006
New Revision: 406757
URL: http://svn.apache.org/viewcvs?rev=406757&view=rev
Log:
Fix NUTCH-268. The defaults of generate.max.per.host.by.ip (false) and
fetcher.threads.per.host.by.ip (true) still differ, to avoid DoS-ing
remote DNS servers during fetchlist generation.
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 15 15:18:34 2006
@@ -308,6 +308,20 @@
fetchlist. -1 if unlimited.</description>
</property>
+<property>
+ <name>generate.max.per.host.by.ip</name>
+ <value>false</value>
+ <description>If false, URLs are counted per host name. If true,
+ host names are resolved to IP addresses and URLs are counted per IP address.
+
+ -+-+-+- WARNING !!! -+-+-+-
+ When set to true, Generator will create a lot of DNS lookup
+ requests, rapidly. This may cause a DOS attack on
+ remote DNS servers, not to mention increased external traffic
+ and latency. For these reasons when using this option it is
+ required that a local caching DNS be used.</description>
+</property>
+
<!-- fetcher properties -->
<property>
@@ -330,6 +344,16 @@
<value>1</value>
<description>This number is the maximum number of threads that
should be allowed to access a host at one time.</description>
+</property>
+
+<property>
+ <name>fetcher.threads.per.host.by.ip</name>
+ <value>true</value>
+ <description>If true, then fetcher will count threads by IP address,
+ to which the URL's host name resolves. If false, only host name will be
+ used. NOTE: this should be set to the same value as
+ "generate.max.per.host.by.ip" - default settings are different only for
+ reasons of backward-compatibility.</description>
</property>
<property>
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon May 15 15:18:34 2006
@@ -26,7 +26,6 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.LogFormatter;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.mapred.lib.*;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.net.URLFilterException;
@@ -71,11 +70,13 @@
private URLFilters filters;
private SelectorEntry entry = new SelectorEntry();
private FloatWritable sortValue = new FloatWritable();
+ private boolean byIP;
public void configure(JobConf job) {
curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
maxPerHost = job.getInt("generate.max.per.host", -1);
+ byIP = job.getBoolean("generate.max.per.host.by.ip", false);
filters = new URLFilters(job);
}
@@ -127,15 +128,32 @@
if (maxPerHost > 0) { // are we counting hosts?
String host = new URL(url.toString()).getHost();
- Integer hostCount = (Integer)hostCounts.get(host);
+ if (host == null) {
+ // unknown host, skip
+ continue;
+ }
+ host = host.toLowerCase();
+ if (byIP) {
+ try {
+ InetAddress ia = InetAddress.getByName(host);
+ host = ia.getHostAddress();
+ } catch (UnknownHostException uhe) {
+ LOG.fine("DNS lookup failed: " + host + ", skipping.");
+ continue;
+ }
+ }
+ IntWritable hostCount = (IntWritable)hostCounts.get(host);
+ if (hostCount == null) {
+ hostCount = new IntWritable();
+ hostCounts.put(host, hostCount);
+ }
// increment hostCount
- hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1);
- hostCounts.put(host, hostCount);
+ hostCount.set(hostCount.get() + 1);
// skip URL if above the limit per host.
- if (hostCount.intValue() > maxPerHost) {
- if (hostCount.intValue() == maxPerHost + 1) {
+ if (hostCount.get() > maxPerHost) {
+ if (hostCount.get() == maxPerHost + 1) {
LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+
" Skipping additional.");
}
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon May 15 15:18:34 2006
@@ -90,20 +90,20 @@
/**
- * Maps from InetAddress to a Long naming the time it should be unblocked.
- * The Long is zero while the address is in use, then set to now+wait when
- * a request finishes. This way only one thread at a time accesses an
- * address.
+ * Maps from host to a Long naming the time it should be unblocked.
+ * The Long is zero while the host is in use, then set to now+wait when
+ * a request finishes. This way only one thread at a time accesses a
+ * host.
*/
private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();
/**
- * Maps an address to the number of threads accessing that address.
+ * Maps a host to the number of threads accessing that host.
*/
private static HashMap THREADS_PER_HOST_COUNT = new HashMap();
/**
- * Queue of blocked InetAddress. This contains all of the non-zero entries
+ * Queue of blocked hosts. This contains all of the non-zero entries
* from BLOCKED_ADDR_TO_TIME, ordered by increasing time.
*/
private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
@@ -116,6 +116,9 @@
/** The nutch configuration */
private Configuration conf = null;
+
+ /** Do we block by IP addresses or by hostnames? */
+ private boolean byIP = true;
/** Creates a new instance of HttpBase */
@@ -144,6 +147,8 @@
this.userAgent = getAgentString(conf.get("http.agent.name"),
conf.get("http.agent.version"), conf
.get("http.agent.description"), conf.get("http.agent.url"),
conf.get("http.agent.email"));
this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f)
* 1000);
+ // backward-compatible default setting
+ this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
this.robots.setConf(conf);
logConf();
}
@@ -170,12 +175,12 @@
logger.fine("Exception checking robot rules for " + url + ": " + e);
}
- InetAddress addr = blockAddr(u);
+ String host = blockAddr(u);
Response response;
try {
response = getResponse(u, datum, false); // make a request
} finally {
- unblockAddr(addr);
+ unblockAddr(host);
}
int code = response.getCode();
@@ -282,13 +287,22 @@
}
- private InetAddress blockAddr(URL url) throws ProtocolException {
+ private String blockAddr(URL url) throws ProtocolException {
- InetAddress addr;
- try {
- addr = InetAddress.getByName(url.getHost());
- } catch (UnknownHostException e) {
- throw new HttpException(e);
+ String host;
+ if (byIP) {
+ try {
+ InetAddress addr = InetAddress.getByName(url.getHost());
+ host = addr.getHostAddress();
+ } catch (UnknownHostException e) {
+ // unable to resolve it, so don't fall back to host name
+ throw new HttpException(e);
+ }
+ } else {
+ host = url.getHost();
+ if (host == null)
+ throw new HttpException("Unknown host for url: " + url);
+ host = host.toLowerCase();
}
int delays = 0;
@@ -297,20 +311,20 @@
Long time;
synchronized (BLOCKED_ADDR_TO_TIME) {
- time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
+ time = (Long) BLOCKED_ADDR_TO_TIME.get(host);
if (time == null) { // address is free
// get # of threads already accessing this addr
- Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(addr);
+ Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(host);
int count = (counter == null) ? 0 : counter.intValue();
count++; // increment & store
- THREADS_PER_HOST_COUNT.put(addr, new Integer(count));
+ THREADS_PER_HOST_COUNT.put(host, new Integer(count));
if (count >= maxThreadsPerHost) {
- BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
+ BLOCKED_ADDR_TO_TIME.put(host, new Long(0)); // block it
}
- return addr;
+ return host;
}
}
@@ -334,16 +348,16 @@
}
}
- private void unblockAddr(InetAddress addr) {
+ private void unblockAddr(String host) {
synchronized (BLOCKED_ADDR_TO_TIME) {
- int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
+ int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue();
if (addrCount == 1) {
- THREADS_PER_HOST_COUNT.remove(addr);
- BLOCKED_ADDR_QUEUE.addFirst(addr);
+ THREADS_PER_HOST_COUNT.remove(host);
+ BLOCKED_ADDR_QUEUE.addFirst(host);
BLOCKED_ADDR_TO_TIME.put
- (addr, new Long(System.currentTimeMillis() + serverDelay));
+ (host, new Long(System.currentTimeMillis() + serverDelay));
} else {
- THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
+ THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1));
}
}
}
@@ -351,10 +365,10 @@
private static void cleanExpiredServerBlocks() {
synchronized (BLOCKED_ADDR_TO_TIME) {
while (!BLOCKED_ADDR_QUEUE.isEmpty()) {
- InetAddress addr = (InetAddress) BLOCKED_ADDR_QUEUE.getLast();
- long time = ((Long) BLOCKED_ADDR_TO_TIME.get(addr)).longValue();
+ String host = (String) BLOCKED_ADDR_QUEUE.getLast();
+ long time = ((Long) BLOCKED_ADDR_TO_TIME.get(host)).longValue();
if (time <= System.currentTimeMillis()) {
- BLOCKED_ADDR_TO_TIME.remove(addr);
+ BLOCKED_ADDR_TO_TIME.remove(host);
BLOCKED_ADDR_QUEUE.removeLast();
} else {
break;