Author: ab Date: Mon May 15 15:18:34 2006 New Revision: 406757 URL: http://svn.apache.org/viewcvs?rev=406757&view=rev Log: Fix NUTCH-268. The defaults of generate.max.per.host.by.ip (false) and fetcher.threads.per.host.by.ip (true) still differ, to avoid DoS-ing remote DNS servers during fetchlist generation.
Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=406757&r1=406756&r2=406757&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 15 15:18:34 2006 @@ -308,6 +308,20 @@ fetchlist. -1 if unlimited.</description> </property> +<property> + <name>generate.max.per.host.by.ip</name> + <value>false</value> + <description>If false, same host names are counted. If true, + hosts' IP addresses are resolved and the same IP-s are counted. + + -+-+-+- WARNING !!! -+-+-+- + When set to true, Generator will create a lot of DNS lookup + requests, rapidly. This may cause a DOS attack on + remote DNS servers, not to mention increased external traffic + and latency. For these reasons when using this option it is + required that a local caching DNS be used.</description> +</property> + <!-- fetcher properties --> <property> @@ -330,6 +344,16 @@ <value>1</value> <description>This number is the maximum number of threads that should be allowed to access a host at one time.</description> +</property> + +<property> + <name>fetcher.threads.per.host.by.ip</name> + <value>true</value> + <description>If true, then fetcher will count threads by IP address, + to which the URL's host name resolves. If false, only host name will be + used. 
NOTE: this should be set to the same value as + "generate.max.per.host.by.ip" - default settings are different only for + reasons of backward-compatibility.</description> </property> <property> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=406757&r1=406756&r2=406757&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon May 15 15:18:34 2006 @@ -26,7 +26,6 @@ import org.apache.hadoop.conf.*; import org.apache.hadoop.util.LogFormatter; import org.apache.hadoop.mapred.*; -import org.apache.hadoop.mapred.lib.*; import org.apache.hadoop.fs.Path; import org.apache.nutch.net.URLFilterException; @@ -71,11 +70,13 @@ private URLFilters filters; private SelectorEntry entry = new SelectorEntry(); private FloatWritable sortValue = new FloatWritable(); + private boolean byIP; public void configure(JobConf job) { curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis()); limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks(); maxPerHost = job.getInt("generate.max.per.host", -1); + byIP = job.getBoolean("generate.max.per.host.by.ip", false); filters = new URLFilters(job); } @@ -127,15 +128,32 @@ if (maxPerHost > 0) { // are we counting hosts? 
String host = new URL(url.toString()).getHost(); - Integer hostCount = (Integer)hostCounts.get(host); + if (host == null) { + // unknown host, skip + continue; + } + host = host.toLowerCase(); + if (byIP) { + try { + InetAddress ia = InetAddress.getByName(host); + host = ia.getHostAddress(); + } catch (UnknownHostException uhe) { + LOG.fine("DNS lookup failed: " + host + ", skipping."); + continue; + } + } + IntWritable hostCount = (IntWritable)hostCounts.get(host); + if (hostCount == null) { + hostCount = new IntWritable(); + hostCounts.put(host, hostCount); + } // increment hostCount - hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1); - hostCounts.put(host, hostCount); + hostCount.set(hostCount.get() + 1); // skip URL if above the limit per host. - if (hostCount.intValue() > maxPerHost) { - if (hostCount.intValue() == maxPerHost + 1) { + if (hostCount.get() > maxPerHost) { + if (hostCount.get() == maxPerHost + 1) { LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+ " Skipping additional."); } Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=406757&r1=406756&r2=406757&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon May 15 15:18:34 2006 @@ -90,20 +90,20 @@ /** - * Maps from InetAddress to a Long naming the time it should be unblocked. - * The Long is zero while the address is in use, then set to now+wait when - * a request finishes. This way only one thread at a time accesses an - * address. + * Maps from host to a Long naming the time it should be unblocked. 
+ * The Long is zero while the host is in use, then set to now+wait when + * a request finishes. This way only one thread at a time accesses a + * host. */ private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap(); /** - * Maps an address to the number of threads accessing that address. + * Maps a host to the number of threads accessing that host. */ private static HashMap THREADS_PER_HOST_COUNT = new HashMap(); /** - * Queue of blocked InetAddress. This contains all of the non-zero entries + * Queue of blocked hosts. This contains all of the non-zero entries * from BLOCKED_ADDR_TO_TIME, ordered by increasing time. */ private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList(); @@ -116,6 +116,9 @@ /** The nutch configuration */ private Configuration conf = null; + + /** Do we block by IP addresses or by hostnames? */ + private boolean byIP = true; /** Creates a new instance of HttpBase */ @@ -144,6 +147,8 @@ this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); + // backward-compatible default setting + this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); this.robots.setConf(conf); logConf(); } @@ -170,12 +175,12 @@ logger.fine("Exception checking robot rules for " + url + ": " + e); } - InetAddress addr = blockAddr(u); + String host = blockAddr(u); Response response; try { response = getResponse(u, datum, false); // make a request } finally { - unblockAddr(addr); + unblockAddr(host); } int code = response.getCode(); @@ -282,13 +287,22 @@ } - private InetAddress blockAddr(URL url) throws ProtocolException { + private String blockAddr(URL url) throws ProtocolException { - InetAddress addr; - try { - addr = InetAddress.getByName(url.getHost()); - } catch (UnknownHostException e) { - throw new HttpException(e); + String host; + if 
(byIP) { + try { + InetAddress addr = InetAddress.getByName(url.getHost()); + host = addr.getHostAddress(); + } catch (UnknownHostException e) { + // unable to resolve it, so don't fall back to host name + throw new HttpException(e); + } + } else { + host = url.getHost(); + if (host == null) + throw new HttpException("Unknown host for url: " + url); + host = host.toLowerCase(); } int delays = 0; @@ -297,20 +311,20 @@ Long time; synchronized (BLOCKED_ADDR_TO_TIME) { - time = (Long) BLOCKED_ADDR_TO_TIME.get(addr); + time = (Long) BLOCKED_ADDR_TO_TIME.get(host); if (time == null) { // address is free // get # of threads already accessing this addr - Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(addr); + Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(host); int count = (counter == null) ? 0 : counter.intValue(); count++; // increment & store - THREADS_PER_HOST_COUNT.put(addr, new Integer(count)); + THREADS_PER_HOST_COUNT.put(host, new Integer(count)); if (count >= maxThreadsPerHost) { - BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it + BLOCKED_ADDR_TO_TIME.put(host, new Long(0)); // block it } - return addr; + return host; } } @@ -334,16 +348,16 @@ } } - private void unblockAddr(InetAddress addr) { + private void unblockAddr(String host) { synchronized (BLOCKED_ADDR_TO_TIME) { - int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue(); + int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue(); if (addrCount == 1) { - THREADS_PER_HOST_COUNT.remove(addr); - BLOCKED_ADDR_QUEUE.addFirst(addr); + THREADS_PER_HOST_COUNT.remove(host); + BLOCKED_ADDR_QUEUE.addFirst(host); BLOCKED_ADDR_TO_TIME.put - (addr, new Long(System.currentTimeMillis() + serverDelay)); + (host, new Long(System.currentTimeMillis() + serverDelay)); } else { - THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1)); + THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1)); } } } @@ -351,10 +365,10 @@ private static void 
cleanExpiredServerBlocks() { synchronized (BLOCKED_ADDR_TO_TIME) { while (!BLOCKED_ADDR_QUEUE.isEmpty()) { - InetAddress addr = (InetAddress) BLOCKED_ADDR_QUEUE.getLast(); - long time = ((Long) BLOCKED_ADDR_TO_TIME.get(addr)).longValue(); + String host = (String) BLOCKED_ADDR_QUEUE.getLast(); + long time = ((Long) BLOCKED_ADDR_TO_TIME.get(host)).longValue(); if (time <= System.currentTimeMillis()) { - BLOCKED_ADDR_TO_TIME.remove(addr); + BLOCKED_ADDR_TO_TIME.remove(host); BLOCKED_ADDR_QUEUE.removeLast(); } else { break; ------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs