CrawlDb statistics: add fetch time (earliest, latest, average)
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ea2843b9 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ea2843b9 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ea2843b9 Branch: refs/heads/master Commit: ea2843b9be6569e17963031d7370f5db42261809 Parents: 6b141fb Author: Sebastian Nagel <[email protected]> Authored: Mon Jun 20 14:42:04 2016 +0200 Committer: Sebastian Nagel <[email protected]> Committed: Sat Jul 2 12:06:04 2016 +0200 ---------------------------------------------------------------------- .../org/apache/nutch/crawl/CrawlDbReader.java | 76 ++++++++++++-------- 1 file changed, 46 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/ea2843b9/src/java/org/apache/nutch/crawl/CrawlDbReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 8f42ac4..381cec5 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -197,6 +197,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); output.collect(new Text("s"), new LongWritable( (long) (value.getScore() * 1000.0))); + output.collect(new Text("f"), new LongWritable(value.getFetchTime())); if (sort) { URL u = new URL(key.toString()); String host = u.getHost(); @@ -219,32 +220,40 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { public void close() { } + private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> values, + OutputCollector<Text, LongWritable> output, Reporter reporter) + throws IOException { + long total = 0; + long min = Long.MAX_VALUE; + long max = Long.MIN_VALUE; + while (values.hasNext()) { + LongWritable cnt = values.next(); + if (cnt.get() < min) + min = cnt.get(); + if (cnt.get() > max) + max = cnt.get(); + total += cnt.get(); + } + output.collect(new Text(keyPrefix+"n"), new LongWritable(min)); + output.collect(new Text(keyPrefix+"x"), new LongWritable(max)); + output.collect(new Text(keyPrefix+"t"), new LongWritable(total)); + } + public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException { val.set(0L); String k = key.toString(); - if (!k.equals("s")) { + if (k.equals("s")) { + reduceMinMaxTotal("sc", values, output, reporter); + } else if (k.equals("f")) { + reduceMinMaxTotal("ft", values, output, reporter); + } else { while (values.hasNext()) { LongWritable cnt = values.next(); val.set(val.get() + cnt.get()); } output.collect(key, val); - } else { - long total = 0; - long min = Long.MAX_VALUE; - long max = Long.MIN_VALUE; - while (values.hasNext()) { - LongWritable cnt = values.next(); - if (cnt.get() < min) - min = cnt.get(); - if (cnt.get() > max) - max = cnt.get(); - total += cnt.get(); - } - output.collect(new Text("scn"), new LongWritable(min)); - output.collect(new Text("scx"), new LongWritable(max)); - output.collect(new Text("sct"), new LongWritable(total)); } } } @@ -277,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(cnt.get() + val.get()); } output.collect(key, cnt); - } else if (k.equals("scx")) { + } else if (k.equals("scx") || k.equals("ftx")) { LongWritable cnt = new LongWritable(Long.MIN_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -285,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("scn")) { + } else if (k.equals("scn") || k.equals("ftn")) { LongWritable cnt = new LongWritable(Long.MAX_VALUE); while (values.hasNext()) { LongWritable val = values.next(); @@ -293,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { cnt.set(val.get()); } output.collect(key, cnt); - } else if (k.equals("sct")) { + } else if (k.equals("sct") || k.equals("ftt")) { LongWritable cnt = new LongWritable(); while (values.hasNext()) { LongWritable val = values.next(); @@ -393,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { LongWritable val = stats.get(k); if (val == null) { val = new LongWritable(); - if (k.equals("scx")) + if (k.equals("scx") || k.equals("ftx")) val.set(Long.MIN_VALUE); - if (k.equals("scn")) + if (k.equals("scn") || k.equals("ftn")) val.set(Long.MAX_VALUE); stats.put(k, val); } - if (k.equals("scx")) { + if (k.equals("scx") || k.equals("ftx")) { if (val.get() < value.get()) val.set(value.get()); - } else if (k.equals("scn")) { + } else if (k.equals("scn") || k.equals("ftn")) { if (val.get() > value.get()) val.set(value.get()); } else { @@ -439,6 +448,13 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { } else if (k.equals("sct")) { LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0)); + } else if (k.equals("ftn")) { + LOG.info("earliest fetch time:\t" + new Date(val.get())); + } else if (k.equals("ftx")) { + LOG.info("latest fetch time:\t" + new Date(val.get())); + } else if (k.equals("ftt")) { + LOG.info("avg of fetch times:\t" + + new Date(val.get() / totalCnt.get())); } else if (k.startsWith("status")) { String[] st = k.split(" "); int code = Integer.parseInt(st[1]); @@ -732,11 +748,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { return 0; } - public static void main(String[] args) throws Exception { - int result = ToolRunner.run(NutchConfiguration.create(), - new CrawlDbReader(), args); - System.exit(result); - } + public static void main(String[] args) throws Exception { + int result = ToolRunner.run(NutchConfiguration.create(), + new CrawlDbReader(), args); + System.exit(result); + } + public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception { @@ -759,7 +776,6 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { String k = entry.getKey(); LongWritable val = entry.getValue(); if (k.equals("scn")) { - results.put("minScore", String.valueOf((val.get() / 1000.0f))); } else if (k.equals("scx")) { results.put("maxScore", String.valueOf((val.get() / 1000.0f))); @@ -854,5 +870,5 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { return results; } return results; - } + } } \ No newline at end of file
