[
https://issues.apache.org/jira/browse/NUTCH-2155?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14979335#comment-14979335
]
ASF GitHub Bot commented on NUTCH-2155:
---------------------------------------
Github user MJJoyce commented on a diff in the pull request:
https://github.com/apache/nutch/pull/83#discussion_r43325287
--- Diff: src/java/org/apache/nutch/util/CrawlCompletionStats.java ---
@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Extracts some simple crawl completion stats from the crawldb
+ *
+ * Stats will be sorted by host/domain and will be of the form:
+ * 1 www.spitzer.caltech.edu FETCHED
+ * 50 www.spitzer.caltech.edu UNFETCHED
+ *
+ */
+public class CrawlCompletionStats extends Configured implements Tool {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(CrawlCompletionStats.class);
+
+ private static final int MODE_HOST = 1;
+ private static final int MODE_DOMAIN = 2;
+
+ private int mode = 0;
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 2) {
+ System.out
+ .println("usage: CrawlCompletionStats inputDirs outDir host|domain [numOfReducer]");
+ return 1;
+ }
+ String inputDir = args[0];
+ String outputDir = args[1];
+ int numOfReducers = 1;
+
+ if (args.length > 3) {
+ numOfReducers = Integer.parseInt(args[3]);
+ }
+
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("CrawlCompletionStats: starting at " + sdf.format(start));
+
+ int mode = 0;
+ String jobName = "CrawlCompletionStats";
+ if (args[2].equals("host")) {
+ jobName = "Host CrawlCompletionStats";
+ mode = MODE_HOST;
+ } else if (args[2].equals("domain")) {
+ jobName = "Domain CrawlCompletionStats";
+ mode = MODE_DOMAIN;
+ }
+
+ Configuration conf = getConf();
+ conf.setInt("domain.statistics.mode", mode);
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+ Job job = Job.getInstance(conf, jobName);
+ job.setJarByClass(CrawlCompletionStats.class);
+
+ String[] inputDirsSpecs = inputDir.split(",");
+ for (int i = 0; i < inputDirsSpecs.length; i++) {
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
+ }
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ FileOutputFormat.setOutputPath(job, new Path(outputDir));
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(LongWritable.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(LongWritable.class);
+
+ job.setMapperClass(CrawlCompletionStatsMapper.class);
+ job.setReducerClass(CrawlCompletionStatsReducer.class);
+ job.setCombinerClass(CrawlCompletionStatsCombiner.class);
+ job.setNumReduceTasks(numOfReducers);
+
+ try {
+ job.waitForCompletion(true);
+ } catch (Exception e) {
+ throw e;
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("CrawlCompletionStats: finished at " + sdf.format(end) + ", elapsed: "
--- End diff --
Noted and updated
> Create a "crawl completeness" utility
> -------------------------------------
>
> Key: NUTCH-2155
> URL: https://issues.apache.org/jira/browse/NUTCH-2155
> Project: Nutch
> Issue Type: Improvement
> Components: util
> Affects Versions: 1.10
> Reporter: Michael Joyce
> Fix For: 1.12
>
>
> I've found it useful to have a tool for dumping some "completeness"
> information from a crawl similar to how domainstats does but including
> fetched and unfetched counts per domain/host. This is especially nice when
> doing vertical crawls over a few domains or just to see how much of a
> host/domain you've covered with your crawl so far.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)