Author: mattmann
Date: Fri Oct 30 22:03:27 2015
New Revision: 1711562
URL: http://svn.apache.org/viewvc?rev=1711562&view=rev
Log:
Fix for NUTCH-2150 Add ProtocolStatus Utility contributed by Michael Joyce
<[email protected]> this closes #82.
Added:
nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/bin/crawl (props changed)
nutch/trunk/src/bin/nutch
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1711562&r1=1711561&r2=1711562&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Oct 30 22:03:27 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-2150 Add protocolstats utility (Michael Joyce via mattmann)
+
* NUTCH-2146 hashCode on the Outlink class (jorgelbg via mattmann)
* NUTCH-2155 Create a "crawl completeness" utility (Michael Joyce via mattmann)
Propchange: nutch/trunk/src/bin/crawl
------------------------------------------------------------------------------
svn:executable = *
Modified: nutch/trunk/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1711562&r1=1711561&r2=1711562&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Fri Oct 30 22:03:27 2015
@@ -53,7 +53,7 @@ done
# if no args specified, show usage
if [ $# = 0 ]; then
- echo "nutch 1.10-SNAPSHOT"
+ echo "nutch 1.11"
echo "Usage: nutch COMMAND"
echo "where COMMAND is one of:"
echo " readdb read / dump crawl db"
@@ -80,6 +80,7 @@ if [ $# = 0 ]; then
echo " parsechecker check the parser for a given url"
echo " indexchecker check the indexing filters for a given url"
echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
echo " crawlcomplete calculate crawl completion stats from crawldb"
echo " webgraph generate a web graph from existing segments"
echo " linkrank run a link analysis program on the generated web graph"
@@ -261,6 +262,8 @@ elif [ "$COMMAND" = "indexchecker" ] ; t
CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
elif [ "$COMMAND" = "domainstats" ] ; then
CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+ CLASS=org.apache.nutch.util.ProtocolStatusStatistics
elif [ "$COMMAND" = "crawlcomplete" ] ; then
CLASS=org.apache.nutch.util.CrawlCompletionStats
elif [ "$COMMAND" = "webgraph" ] ; then
Added: nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java?rev=1711562&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java Fri Oct 30 22:03:27 2015
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Extracts protocol status code information from the crawl database.
+ *
+ * ProtocolStatusStatistics will give you information on the count
+ * of all status codes encountered on your crawl. This can be useful
+ * for checking a number of things.
+ *
+ * An example output run, showing the number of records per encountered
+ * status code (such as 200, 301 and 302) and a count of un-fetched records:
+ *
+ * 38 200
+ * 19 301
+ * 2 302
+ * 665 UNFETCHED
+ *
+ */
+public class ProtocolStatusStatistics extends Configured implements Tool {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(ProtocolStatusStatistics.class);
+
+ private static final Text UNFETCHED_TEXT = new Text("UNFETCHED");
+
+ public static Configuration conf;
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 2) {
+ System.out
+ .println("usage: ProtocolStatistics <crawl db> <output dir>
[numOfReducer]");
+ return 1;
+ }
+ String inputDir = args[0];
+ String outputDir = args[1];
+
+ System.out.println(inputDir);
+ System.out.println(outputDir);
+
+ int numOfReducers = 1;
+
+ if (args.length > 3) {
+ numOfReducers = Integer.parseInt(args[3]);
+ }
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+
+ String jobName = "ProtocolStatistics";
+
+ conf = getConf();
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+ Job job = Job.getInstance(conf, jobName);
+ job.setJarByClass(ProtocolStatusStatistics.class);
+
+ String[] inputDirsSpecs = inputDir.split(",");
+ for (int i = 0; i < inputDirsSpecs.length; i++) {
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
+ }
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ FileOutputFormat.setOutputPath(job, new Path(outputDir));
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(LongWritable.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(LongWritable.class);
+
+ job.setMapperClass(ProtocolStatusStatisticsMapper.class);
+ job.setReducerClass(ProtocolStatusStatisticsReducer.class);
+ job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
+ job.setNumReduceTasks(numOfReducers);
+
+ try {
+ job.waitForCompletion(true);
+ } catch (Exception e) {
+ throw e;
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ",
elapsed: "
+ + TimingUtil.elapsedTime(start, end));
+ return 0;
+ }
+
+ static class ProtocolStatusStatisticsMapper extends
+ Mapper<Text, CrawlDatum, Text, LongWritable> {
+
+ public void map(Text urlText, CrawlDatum datum, Context context)
+ throws IOException, InterruptedException {
+ if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ context.write((Text)
datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1));
+ } else {
+ context.write(UNFETCHED_TEXT, new LongWritable(1));
+ }
+ }
+ }
+
+ static class ProtocolStatusStatisticsReducer extends
+ Reducer<Text, LongWritable, LongWritable, Text> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context
context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+
+ context.write(new LongWritable(total), key);
+ }
+ }
+
+ public static class ProtocolStatusStatisticsCombiner extends
+ Reducer<Text, LongWritable, Text, LongWritable> {
+ public void reduce(Text key, Iterable<LongWritable> values, Context
context)
+ throws IOException, InterruptedException {
+ long total = 0;
+
+ for (LongWritable val : values) {
+ total += val.get();
+ }
+ context.write(key, new LongWritable(total));
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(NutchConfiguration.create(), new
ProtocolStatusStatistics(), args);
+ }
+
+}