Repository: nutch
Updated Branches:
  refs/heads/master 6b141fb10 -> ecf2bb011
CrawlDb statistics: avoid overflow in sum of fetch times for large CrawlDb

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/4800ad91
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/4800ad91
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/4800ad91

Branch: refs/heads/master
Commit: 4800ad91ab911aed8b139b39527bc437d82f0de3
Parents: 39f6c71
Author: Sebastian Nagel <[email protected]>
Authored: Thu Jun 23 16:32:48 2016 +0200
Committer: Sebastian Nagel <[email protected]>
Committed: Sat Jul 2 12:06:04 2016 +0200

----------------------------------------------------------------------
 src/java/org/apache/nutch/crawl/CrawlDbReader.java | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/4800ad91/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 3cf6ff3..5db5f95 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -198,8 +198,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
       output.collect(new Text("sc"), new LongWritable(
           (long) (value.getScore() * 1000.0)));
-      output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
-      output.collect(new Text("fi"), new LongWritable(value.getFetchInterval()));
+      // fetch time (in minutes to prevent from overflows when summing up)
+      output.collect(new Text("ft"),
+          new LongWritable(value.getFetchTime() / (1000 * 60)));
+      // fetch interval (in seconds)
+      output.collect(new Text("fi"),
+          new LongWritable(value.getFetchInterval()));
       if (sort) {
         URL u = new URL(key.toString());
         String host = u.getHost();
@@ -449,12 +453,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           LOG.info("avg score:\t"
               + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
         } else if (k.equals("ftn")) {
-          LOG.info("earliest fetch time:\t" + new Date(val.get()));
+          LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftx")) {
-          LOG.info("latest fetch time:\t" + new Date(val.get()));
+          LOG.info("latest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftt")) {
           LOG.info("avg of fetch times:\t"
-              + new Date(val.get() / totalCnt.get()));
+              + new Date(1000 * 60 * (val.get() / totalCnt.get())));
         } else if (k.equals("fin")) {
           LOG.info("shortest fetch interval:\t{}",
               TimingUtil.secondsToDaysHMS(val.get()));
