CrawlDb statistics: add fetch interval (shortest, longest, average)
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/39f6c713 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/39f6c713 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/39f6c713 Branch: refs/heads/master Commit: 39f6c713974240d19d54a515cd04372878739456 Parents: ea2843b Author: Sebastian Nagel <sna...@apache.org> Authored: Wed Jun 22 16:22:33 2016 +0200 Committer: Sebastian Nagel <sna...@apache.org> Committed: Sat Jul 2 12:06:04 2016 +0200 ---------------------------------------------------------------------- .../org/apache/nutch/crawl/CrawlDbReader.java | 35 ++++++++----- src/java/org/apache/nutch/util/TimingUtil.java | 53 ++++++++++++-------- 2 files changed, 55 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/crawl/CrawlDbReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 381cec5..3cf6ff3 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -69,6 +69,7 @@ import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.TimingUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.apache.commons.lang.time.DateUtils; @@ -195,9 +196,10 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { output.collect(new Text("status " + value.getStatus()), COUNT_1); output .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); - output.collect(new Text("s"), new LongWritable( + output.collect(new Text("sc"), new LongWritable( (long) (value.getScore() * 1000.0))); - output.collect(new Text("f"), new LongWritable(value.getFetchTime())); + output.collect(new Text("ft"), new LongWritable(value.getFetchTime())); + output.collect(new Text("fi"), new LongWritable(value.getFetchInterval())); if (sort) { URL u = new URL(key.toString()); String host = u.getHost(); @@ -244,10 +246,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { throws IOException { val.set(0L); String k = key.toString(); - if (k.equals("s")) { - reduceMinMaxTotal("sc", values, output, reporter); - } else if (k.equals("f")) { - reduceMinMaxTotal("ft", values, output, reporter); + if (k.equals("sc") || k.equals("ft") || k.equals("fi")) { + reduceMinMaxTotal(k, values, output, reporter); } else { while (values.hasNext()) { LongWritable cnt = values.next(); @@ -286,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(cnt.get() + val.get()); } output.collect(key, cnt); - } else if (k.equals("scx") || k.equals("ftx")) { + } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) { LongWritable cnt = new LongWritable(Long.MIN_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -294,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("scn") || k.equals("ftn")) { + } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) { LongWritable cnt = new LongWritable(Long.MAX_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -302,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("sct") || k.equals("ftt")) { + } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) { LongWritable cnt = new LongWritable(); while (values.hasNext()) { LongWritable val = values.next(); @@ -402,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { LongWritable val = stats.get(k); if (val == null) { val = new LongWritable(); - if (k.equals("scx") || k.equals("ftx")) + if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) val.set(Long.MIN_VALUE); - if (k.equals("scn") || k.equals("ftn")) + if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) val.set(Long.MAX_VALUE); stats.put(k, val); } - if (k.equals("scx") || k.equals("ftx")) { + if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) { if (val.get() < value.get()) val.set(value.get()); - } else if (k.equals("scn") || k.equals("ftn")) { + } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) { if (val.get() > value.get()) val.set(value.get()); } else { @@ -455,6 +455,15 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { } else if (k.equals("ftt")) { LOG.info("avg of fetch times:\t" + new Date(val.get() / totalCnt.get())); + } else if (k.equals("fin")) { + LOG.info("shortest fetch interval:\t{}", + TimingUtil.secondsToDaysHMS(val.get())); + } else if (k.equals("fix")) { + LOG.info("longest fetch interval:\t{}", + TimingUtil.secondsToDaysHMS(val.get())); + } else if (k.equals("fit")) { + LOG.info("avg fetch interval:\t{}", + TimingUtil.secondsToDaysHMS(val.get() / totalCnt.get())); } else if (k.startsWith("status")) { String[] st = k.split(" "); int code = Integer.parseInt(st[1]); http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/util/TimingUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java index 8f77969..c4af356 100644 --- a/src/java/org/apache/nutch/util/TimingUtil.java +++ b/src/java/org/apache/nutch/util/TimingUtil.java @@ -17,12 +17,10 @@ package org.apache.nutch.util; -import java.text.NumberFormat; +import java.util.concurrent.TimeUnit; public class TimingUtil { - private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 }; - /** * Calculate the elapsed time between two times specified in milliseconds. * @@ -37,23 +35,38 @@ public class TimingUtil { if (start > end) { return null; } + return secondsToHMS((end-start)/1000); + } + + /** + * Show time in seconds as hours, minutes and seconds (hh:mm:ss) + * + * @param seconds + * (elapsed) time in seconds + * @return human readable time string "hh:mm:ss" + */ + public static String secondsToHMS(long seconds) { + long hours = TimeUnit.SECONDS.toHours(seconds); + long minutes = TimeUnit.SECONDS.toMinutes(seconds) + % TimeUnit.HOURS.toMinutes(1); + seconds = TimeUnit.SECONDS.toSeconds(seconds) + % TimeUnit.MINUTES.toSeconds(1); + return String.format("%02d:%02d:%02d", hours, minutes, seconds); + } - long[] elapsedTime = new long[TIME_FACTOR.length]; - - for (int i = 0; i < TIME_FACTOR.length; i++) { - elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i]; - start += TIME_FACTOR[i] * elapsedTime[i]; - } - - NumberFormat nf = NumberFormat.getInstance(); - nf.setMinimumIntegerDigits(2); - StringBuffer buf = new StringBuffer(); - for (int i = 0; i < elapsedTime.length; i++) { - if (i > 0) { - buf.append(":"); - } - buf.append(nf.format(elapsedTime[i])); - } - return buf.toString(); + /** + * Show time in seconds as days, hours, minutes and seconds (d days, hh:mm:ss) + * + * @param seconds + * (elapsed) time in seconds + * @return human readable time string "d days, hh:mm:ss" + */ + public static String secondsToDaysHMS(long seconds) { + long days = TimeUnit.SECONDS.toDays(seconds); + if (days == 0) + return secondsToHMS(seconds); + String hhmmss = secondsToHMS(seconds % TimeUnit.DAYS.toSeconds(1)); + return String.format("%d days, %s", days, hhmmss); } + }