Repository: nutch
Updated Branches:
  refs/heads/master 6b141fb10 -> ecf2bb011


CrawlDb statistics: avoid overflow in sum of fetch times for large CrawlDb


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/4800ad91
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/4800ad91
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/4800ad91

Branch: refs/heads/master
Commit: 4800ad91ab911aed8b139b39527bc437d82f0de3
Parents: 39f6c71
Author: Sebastian Nagel <[email protected]>
Authored: Thu Jun 23 16:32:48 2016 +0200
Committer: Sebastian Nagel <[email protected]>
Committed: Sat Jul 2 12:06:04 2016 +0200

----------------------------------------------------------------------
 src/java/org/apache/nutch/crawl/CrawlDbReader.java | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/4800ad91/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 3cf6ff3..5db5f95 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -198,8 +198,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
       output.collect(new Text("sc"), new LongWritable(
           (long) (value.getScore() * 1000.0)));
-      output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
-      output.collect(new Text("fi"), new LongWritable(value.getFetchInterval()));
+      // fetch time (in minutes to prevent from overflows when summing up)
+      output.collect(new Text("ft"),
+          new LongWritable(value.getFetchTime() / (1000 * 60)));
+      // fetch interval (in seconds)
+      output.collect(new Text("fi"),
+          new LongWritable(value.getFetchInterval()));
       if (sort) {
         URL u = new URL(key.toString());
         String host = u.getHost();
@@ -449,12 +453,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           LOG.info("avg score:\t"
               + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
         } else if (k.equals("ftn")) {
-          LOG.info("earliest fetch time:\t" + new Date(val.get()));
+          LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftx")) {
-          LOG.info("latest fetch time:\t" + new Date(val.get()));
+          LOG.info("latest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftt")) {
           LOG.info("avg of fetch times:\t"
-              + new Date(val.get() / totalCnt.get()));
+              + new Date(1000 * 60 * (val.get() / totalCnt.get())));
         } else if (k.equals("fin")) {
           LOG.info("shortest fetch interval:\t{}",
               TimingUtil.secondsToDaysHMS(val.get()));

Reply via email to