Author: cutting
Date: Thu Nov 10 13:03:16 2005
New Revision: 332371

URL: http://svn.apache.org/viewcvs?rev=332371&view=rev
Log:
Fix to not increment count of urls when urls are filtered by
maxPerHost limit.  Patch contributed by Rod Taylor.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Nov 10 13:03:16 2005
@@ -76,23 +76,27 @@
                        OutputCollector output, Reporter reporter)
       throws IOException {
 
-      while (values.hasNext() && ++count < limit) {
+      while (values.hasNext() && count < limit) {
 
         UTF8 url = (UTF8)values.next();
 
-        if (maxPerHost > 0) {                       // are we counting hosts?
+        if (maxPerHost > 0) {                     // are we counting hosts?
           String host = new URL(url.toString()).getHost();
-          Integer count = (Integer)hostCounts.get(host);
-          if (count != null) {
-            if (count.intValue() >= maxPerHost)
+          Integer hostCount = (Integer)hostCounts.get(host);
+          if (hostCount != null) {
+            if (hostCount.intValue() >= maxPerHost)
               continue;                           // too many from host
-            hostCounts.put(host, new Integer(count.intValue()+1));
+            hostCounts.put(host, new Integer(hostCount.intValue()+1));
           } else {                                // update host count
             hostCounts.put(host, new Integer(1));
           }
         }
 
         output.collect(key, url);
+
+        // Count is incremented only when we keep the URL
+        // maxPerHost may cause us to skip it.
+        count++;
       }
 
     }


Reply via email to