Author: otis
Date: Tue Jan 13 14:15:58 2009
New Revision: 734257

URL: http://svn.apache.org/viewvc?rev=734257&view=rev
Log:
NUTCH-627 - Minimize host address lookup while running generate

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=734257&r1=734256&r2=734257&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jan 13 14:15:58 2009
@@ -308,6 +308,8 @@
 
 115. NUTCH-652 - AdaptiveFetchSchedule#setFetchSchedule doesn't calculate
                  fetch interval correctly. (dogacan)
+
+116. NUTCH-627 - Minimize host address lookup (Otis Gospodnetic)
      
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=734257&r1=734256&r2=734257&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Jan 13 14:15:58 2009
@@ -88,6 +88,8 @@
     private HashMap<String, IntWritable> hostCounts =
       new HashMap<String, IntWritable>();
     private int maxPerHost;
+    private HashSet<String> maxedHosts = new HashSet<String>();
+    private HashSet<String> dnsFailureHosts = new HashSet<String>();
     private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost();
     private URLFilters filters;
     private URLNormalizers normalizers;
@@ -195,17 +197,28 @@
         
         String host = u.getHost();
         host = host.toLowerCase();
-        
+        String hostname = host;
+
         // partitioning by ip will generate lots of DNS requests here, and will
         // be up to double the overall dns load, do not run this way unless you
         // are running a local caching DNS server or a two layer DNS cache
         if (byIP) {
+          if (maxedHosts.contains(host)) {
+            if (LOG.isDebugEnabled()) { LOG.debug("Host already maxed out: " + host); }
+            continue;
+          }
+          if (dnsFailureHosts.contains(host)) {
+            if (LOG.isDebugEnabled()) { LOG.debug("Host name lookup already failed: " + host); }
+            continue;
+          }
           try {
             InetAddress ia = InetAddress.getByName(host);
             host = ia.getHostAddress();
             urlString = new URL(u.getProtocol(), host, u.getPort(), u.getFile()).toString();
           } 
           catch (UnknownHostException uhe) {
+            // remember hostnames that could not be looked up
+            dnsFailureHosts.add(hostname);
             if (LOG.isDebugEnabled()) {
               LOG.debug("DNS lookup failed: " + host + ", skipping.");
             }
@@ -241,6 +254,8 @@
           // skip URL if above the limit per host.
           if (hostCount.get() > maxPerHost) {
             if (hostCount.get() == maxPerHost + 1) {
+              // remember the raw hostname that is maxed out
+              maxedHosts.add(hostname);
               if (LOG.isInfoEnabled()) {
                 LOG.info("Host " + host + " has more than " + maxPerHost +
                          " URLs." + " Skipping additional.");


Reply via email to