Author: mattmann
Date: Fri Oct 30 22:03:27 2015
New Revision: 1711562

URL: http://svn.apache.org/viewvc?rev=1711562&view=rev
Log:
Fix for NUTCH-2150 Add ProtocolStatus Utility contributed by Michael Joyce 
<[email protected]> this closes #82.

Added:
    nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/crawl   (props changed)
    nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1711562&r1=1711561&r2=1711562&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Oct 30 22:03:27 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2150 Add protocolstats utility (Michael Joyce via mattmann)
+
 * NUTCH-2146 hashCode on the Outlink class (jorgelbg via mattmann)
 
 * NUTCH-2155 Create a "crawl completeness" utility (Michael Joyce via mattmann)

Propchange: nutch/trunk/src/bin/crawl
------------------------------------------------------------------------------
    svn:executable = *

Modified: nutch/trunk/src/bin/nutch
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1711562&r1=1711561&r2=1711562&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Fri Oct 30 22:03:27 2015
@@ -53,7 +53,7 @@ done
 
 # if no args specified, show usage
 if [ $# = 0 ]; then
-  echo "nutch 1.10-SNAPSHOT"
+  echo "nutch 1.11"
   echo "Usage: nutch COMMAND"
   echo "where COMMAND is one of:"
   echo "  readdb            read / dump crawl db"
@@ -80,6 +80,7 @@ if [ $# = 0 ]; then
   echo "  parsechecker      check the parser for a given url"
   echo "  indexchecker      check the indexing filters for a given url"
   echo "  domainstats       calculate domain statistics from crawldb"
+  echo "  protocolstats     calculate protocol status code stats from crawldb"
   echo "  crawlcomplete     calculate crawl completion stats from crawldb"
   echo "  webgraph          generate a web graph from existing segments"
   echo "  linkrank          run a link analysis program on the generated web 
graph"
@@ -261,6 +262,8 @@ elif [ "$COMMAND" = "indexchecker" ] ; t
   CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
 elif [ "$COMMAND" = "domainstats" ] ; then 
   CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+   CLASS=org.apache.nutch.util.ProtocolStatusStatistics
 elif [ "$COMMAND" = "crawlcomplete" ] ; then
   CLASS=org.apache.nutch.util.CrawlCompletionStats
 elif [ "$COMMAND" = "webgraph" ] ; then

Added: nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java?rev=1711562&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java 
(added)
+++ nutch/trunk/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java 
Fri Oct 30 22:03:27 2015
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Extracts protocol status code information from the crawl database.
+ *
+ * ProtocolStatusStatistics will give you information on the count
+ * of all status codes encountered on your crawl. This can be useful
+ * for checking a number of things.
+ *
+ * An example output run showing the number of encountered status
+ * codes such as 200, 300, and a count of un-fetched record.
+ *
+ * 38  200
+ * 19  301
+ * 2   302
+ * 665 UNFETCHED
+ *
+ */
+public class ProtocolStatusStatistics extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ProtocolStatusStatistics.class);
+
+  private static final Text UNFETCHED_TEXT = new Text("UNFETCHED");
+
+  public static Configuration conf;
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.out
+          .println("usage: ProtocolStatistics <crawl db> <output dir> 
[numOfReducer]");
+      return 1;
+    }
+    String inputDir = args[0];
+    String outputDir = args[1];
+
+    System.out.println(inputDir);
+    System.out.println(outputDir);
+
+    int numOfReducers = 1;
+
+    if (args.length > 3) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+
+    String jobName = "ProtocolStatistics";
+
+    conf = getConf();
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(ProtocolStatusStatistics.class);
+
+    String[] inputDirsSpecs = inputDir.split(",");
+    for (int i = 0; i < inputDirsSpecs.length; i++) {
+      FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+
+    job.setMapperClass(ProtocolStatusStatisticsMapper.class);
+    job.setReducerClass(ProtocolStatusStatisticsReducer.class);
+    job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    try {
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", 
elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  static class ProtocolStatusStatisticsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+      if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+        context.write((Text) 
datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1));
+      } else {
+        context.write(UNFETCHED_TEXT, new LongWritable(1));
+      }
+    }
+  }
+
+  static class ProtocolStatusStatisticsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context 
context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  public static class ProtocolStatusStatisticsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context 
context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(NutchConfiguration.create(), new 
ProtocolStatusStatistics(), args);
+  }
+
+}


Reply via email to