Author: otis
Date: Tue Jan 13 14:15:58 2009
New Revision: 734257

URL: http://svn.apache.org/viewvc?rev=734257&view=rev
Log:
NUTCH-627 - Minimize host address lookup while running generate

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=734257&r1=734256&r2=734257&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jan 13 14:15:58 2009 @@ -308,6 +308,8 @@ 115. NUTCH-652 - AdaptiveFetchSchedule#setFetchSchedule doesn't calculate fetch interval correctly. (dogacan) + +116. NUTCH-627 - Minimize host address lookup (Otis Gospodnetic) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=734257&r1=734256&r2=734257&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Jan 13 14:15:58 2009 @@ -88,6 +88,8 @@ private HashMap<String, IntWritable> hostCounts = new HashMap<String, IntWritable>(); private int maxPerHost; + private HashSet<String> maxedHosts = new HashSet<String>(); + private HashSet<String> dnsFailureHosts = new HashSet<String>(); private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost(); private URLFilters filters; private URLNormalizers normalizers; @@ -195,17 +197,28 @@ String host = u.getHost(); host = host.toLowerCase(); - + String hostname = host; + // partitioning by ip will generate lots of DNS requests here, and will // be up to double the overall dns load, do not run this way unless you // are running a local caching DNS server or a two layer DNS cache if (byIP) { + if (maxedHosts.contains(host)) { + if (LOG.isDebugEnabled()) { LOG.debug("Host already maxed out: " + host); } + continue; + } + if 
(dnsFailureHosts.contains(host)) { + if (LOG.isDebugEnabled()) { LOG.debug("Host name lookup already failed: " + host); } + continue; + } try { InetAddress ia = InetAddress.getByName(host); host = ia.getHostAddress(); urlString = new URL(u.getProtocol(), host, u.getPort(), u.getFile()).toString(); } catch (UnknownHostException uhe) { + // remember hostnames that could not be looked up + dnsFailureHosts.add(hostname); if (LOG.isDebugEnabled()) { LOG.debug("DNS lookup failed: " + host + ", skipping."); } @@ -241,6 +254,8 @@ // skip URL if above the limit per host. if (hostCount.get() > maxPerHost) { if (hostCount.get() == maxPerHost + 1) { + // remember the raw hostname that is maxed out + maxedHosts.add(hostname); if (LOG.isInfoEnabled()) { LOG.info("Host " + host + " has more than " + maxPerHost + " URLs." + " Skipping additional.");