Author: ab
Date: Mon May 15 15:18:34 2006
New Revision: 406757

URL: http://svn.apache.org/viewcvs?rev=406757&view=rev
Log:
Fix NUTCH-268. The default settings for Generator and Fetcher still differ,
to avoid DoS-ing remote DNS servers during fetchlist generation.

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 15 15:18:34 2006
@@ -308,6 +308,20 @@
   fetchlist.  -1 if unlimited.</description>
 </property>
 
+<property>
+  <name>generate.max.per.host.by.ip</name>
+  <value>false</value>
+  <description>If false, URLs are counted per host name. If true,
+  host names are resolved to IP addresses and URLs are counted per IP address.
+  
+  -+-+-+- WARNING !!! -+-+-+-
+  When set to true, Generator will rapidly issue a large number of DNS
+  lookup requests. This may amount to a DoS attack on
+  remote DNS servers, not to mention increased external traffic
+  and latency. For these reasons, a local caching DNS server is
+  required when using this option.</description>
+</property>
+
 <!-- fetcher properties -->
 
 <property>
@@ -330,6 +344,16 @@
   <value>1</value>
   <description>This number is the maximum number of threads that
     should be allowed to access a host at one time.</description>
+</property>
+
+<property>
+  <name>fetcher.threads.per.host.by.ip</name>
+  <value>true</value>
+  <description>If true, the fetcher counts threads by the IP address
+  to which the URL's host name resolves. If false, only the host name is
+  used. NOTE: this should be set to the same value as
+  "generate.max.per.host.by.ip" - the default settings differ only for
+  reasons of backward compatibility.</description>
 </property>
 
 <property>

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon May 15 15:18:34 2006
@@ -26,7 +26,6 @@
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.util.LogFormatter;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.mapred.lib.*;
 import org.apache.hadoop.fs.Path;
 
 import org.apache.nutch.net.URLFilterException;
@@ -71,11 +70,13 @@
     private URLFilters filters;
     private SelectorEntry entry = new SelectorEntry();
     private FloatWritable sortValue = new FloatWritable();
+    private boolean byIP;
 
     public void configure(JobConf job) {
       curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
       limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
       maxPerHost = job.getInt("generate.max.per.host", -1);
+      byIP = job.getBoolean("generate.max.per.host.by.ip", false);
       filters = new URLFilters(job);
     }
 
@@ -127,15 +128,32 @@
 
         if (maxPerHost > 0) {                     // are we counting hosts?
           String host = new URL(url.toString()).getHost();
-          Integer hostCount = (Integer)hostCounts.get(host);
+          if (host == null) {
+            // unknown host, skip
+            continue;
+          }
+          host = host.toLowerCase();
+          if (byIP) {
+            try {
+              InetAddress ia = InetAddress.getByName(host);
+              host = ia.getHostAddress();
+            } catch (UnknownHostException uhe) {
+              LOG.fine("DNS lookup failed: " + host + ", skipping.");
+              continue;
+            }
+          }
+          IntWritable hostCount = (IntWritable)hostCounts.get(host);
+          if (hostCount == null) {
+            hostCount = new IntWritable();
+            hostCounts.put(host, hostCount);
+          }
 
           // increment hostCount
-          hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1);
-          hostCounts.put(host, hostCount);
+          hostCount.set(hostCount.get() + 1);
 
           // skip URL if above the limit per host.
-          if (hostCount.intValue() > maxPerHost) {
-            if (hostCount.intValue() == maxPerHost + 1) {
+          if (hostCount.get() > maxPerHost) {
+            if (hostCount.get() == maxPerHost + 1) {
               LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+
                        " Skipping additional.");
             }

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=406757&r1=406756&r2=406757&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon May 15 15:18:34 2006
@@ -90,20 +90,20 @@
 
     
   /**
-   * Maps from InetAddress to a Long naming the time it should be unblocked.
-   * The Long is zero while the address is in use, then set to now+wait when
-   * a request finishes.  This way only one thread at a time accesses an
-   * address.
+   * Maps from host to a Long naming the time it should be unblocked.
+   * The Long is zero while the host is in use, then set to now+wait when
+   * a request finishes.  This way only one thread at a time accesses a
+   * host.
    */
   private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();
   
   /**
-   * Maps an address to the number of threads accessing that address.
+   * Maps a host to the number of threads accessing that host.
    */
   private static HashMap THREADS_PER_HOST_COUNT = new HashMap();
   
   /**
-   * Queue of blocked InetAddress.  This contains all of the non-zero entries
+   * Queue of blocked hosts.  This contains all of the non-zero entries
    * from BLOCKED_ADDR_TO_TIME, ordered by increasing time.
    */
   private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();
@@ -116,6 +116,9 @@
  
   /** The nutch configuration */
   private Configuration conf = null;
+  
+  /** Do we block by IP addresses or by hostnames? */
+  private boolean byIP = true;
  
 
   /** Creates a new instance of HttpBase */
@@ -144,6 +147,8 @@
         this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
                 .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
         this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+        // backward-compatible default setting
+        this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
         this.robots.setConf(conf);
         logConf();
     }
@@ -170,12 +175,12 @@
         logger.fine("Exception checking robot rules for " + url + ": " + e);
       }
       
-      InetAddress addr = blockAddr(u);
+      String host = blockAddr(u);
       Response response;
       try {
         response = getResponse(u, datum, false); // make a request
       } finally {
-        unblockAddr(addr);
+        unblockAddr(host);
       }
       
       int code = response.getCode();
@@ -282,13 +287,22 @@
   }
 
 
-  private InetAddress blockAddr(URL url) throws ProtocolException {
+  private String blockAddr(URL url) throws ProtocolException {
     
-    InetAddress addr;
-    try {
-      addr = InetAddress.getByName(url.getHost());
-    } catch (UnknownHostException e) {
-      throw new HttpException(e);
+    String host;
+    if (byIP) {
+      try {
+        InetAddress addr = InetAddress.getByName(url.getHost());
+        host = addr.getHostAddress();
+      } catch (UnknownHostException e) {
+        // unable to resolve it, so don't fall back to host name
+        throw new HttpException(e);
+      }
+    } else {
+      host = url.getHost();
+      if (host == null)
+        throw new HttpException("Unknown host for url: " + url);
+      host = host.toLowerCase();
     }
     
     int delays = 0;
@@ -297,20 +311,20 @@
       
       Long time;
       synchronized (BLOCKED_ADDR_TO_TIME) {
-        time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
+        time = (Long) BLOCKED_ADDR_TO_TIME.get(host);
         if (time == null) {                       // address is free
           
           // get # of threads already accessing this addr
-          Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(addr);
+          Integer counter = (Integer)THREADS_PER_HOST_COUNT.get(host);
           int count = (counter == null) ? 0 : counter.intValue();
           
           count++;                              // increment & store
-          THREADS_PER_HOST_COUNT.put(addr, new Integer(count));
+          THREADS_PER_HOST_COUNT.put(host, new Integer(count));
           
           if (count >= maxThreadsPerHost) {
-            BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
+            BLOCKED_ADDR_TO_TIME.put(host, new Long(0)); // block it
           }
-          return addr;
+          return host;
         }
       }
       
@@ -334,16 +348,16 @@
     }
   }
   
-  private void unblockAddr(InetAddress addr) {
+  private void unblockAddr(String host) {
     synchronized (BLOCKED_ADDR_TO_TIME) {
-      int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
+      int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(host)).intValue();
       if (addrCount == 1) {
-        THREADS_PER_HOST_COUNT.remove(addr);
-        BLOCKED_ADDR_QUEUE.addFirst(addr);
+        THREADS_PER_HOST_COUNT.remove(host);
+        BLOCKED_ADDR_QUEUE.addFirst(host);
         BLOCKED_ADDR_TO_TIME.put
-                (addr, new Long(System.currentTimeMillis() + serverDelay));
+                (host, new Long(System.currentTimeMillis() + serverDelay));
       } else {
-        THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
+        THREADS_PER_HOST_COUNT.put(host, new Integer(addrCount - 1));
       }
     }
   }
@@ -351,10 +365,10 @@
   private static void cleanExpiredServerBlocks() {
     synchronized (BLOCKED_ADDR_TO_TIME) {
       while (!BLOCKED_ADDR_QUEUE.isEmpty()) {
-        InetAddress addr = (InetAddress) BLOCKED_ADDR_QUEUE.getLast();
-        long time = ((Long) BLOCKED_ADDR_TO_TIME.get(addr)).longValue();
+        String host = (String) BLOCKED_ADDR_QUEUE.getLast();
+        long time = ((Long) BLOCKED_ADDR_TO_TIME.get(host)).longValue();
         if (time <= System.currentTimeMillis()) {
-          BLOCKED_ADDR_TO_TIME.remove(addr);
+          BLOCKED_ADDR_TO_TIME.remove(host);
           BLOCKED_ADDR_QUEUE.removeLast();
         } else {
           break;




-------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to