Author: lewismc
Date: Fri May 8 23:29:45 2015
New Revision: 1678459
URL: http://svn.apache.org/r1678459
Log:
NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 8 23:29:45 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel via lewismc)
+
* NUTCH-1934 Refactor Fetcher in trunk (lewismc)
* NUTCH-2004 ParseChecker does not handle redirects (mjoyce via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri May 8 23:29:45 2015
@@ -23,7 +23,6 @@ import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -210,6 +209,7 @@ public class IndexerMapReduce extends Co
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
output.collect(key, action);
+          reporter.incrCounter("IndexerStatus", "deleted (robots=noindex)", 1);
return;
}
}
@@ -224,7 +224,7 @@ public class IndexerMapReduce extends Co
if (delete && fetchDatum != null && dbDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
- reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
@@ -236,8 +236,7 @@ public class IndexerMapReduce extends Co
|| fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
- reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
- reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted redirects", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
@@ -253,7 +252,7 @@ public class IndexerMapReduce extends Co
// Whether to delete pages marked as duplicates
if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
- reporter.incrCounter("IndexerStatus", "Duplicates deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted duplicates", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
output.collect(key, action);
@@ -262,7 +261,7 @@ public class IndexerMapReduce extends Co
// Whether to skip DB_NOTMODIFIED pages
if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
- reporter.incrCounter("IndexerStatus", "Skipped", 1);
+ reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
return;
}
@@ -305,13 +304,13 @@ public class IndexerMapReduce extends Co
if (LOG.isWarnEnabled()) {
LOG.warn("Error indexing " + key + ": " + e);
}
- reporter.incrCounter("IndexerStatus", "Errors", 1);
+ reporter.incrCounter("IndexerStatus", "errors (IndexingFilter)", 1);
return;
}
// skip documents discarded by indexing filters
if (doc == null) {
- reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
+ reporter.incrCounter("IndexerStatus", "skipped by indexing filters", 1);
return;
}
@@ -321,6 +320,7 @@ public class IndexerMapReduce extends Co
boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
inlinks, boost);
} catch (final ScoringFilterException e) {
+ reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
if (LOG.isWarnEnabled()) {
LOG.warn("Error calculating score " + key + ": " + e);
}
@@ -331,7 +331,7 @@ public class IndexerMapReduce extends Co
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
- reporter.incrCounter("IndexerStatus", "Documents added", 1);
+ reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
output.collect(key, action);
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Fri May 8 23:29:45 2015
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.Random;
import org.apache.nutch.segment.SegmentChecker;
@@ -31,6 +32,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -110,12 +113,18 @@ public class IndexingJob extends Configu
FileOutputFormat.setOutputPath(job, tmp);
try {
- JobClient.runJob(job);
+ RunningJob indexJob = JobClient.runJob(job);
// do the commits once and for all the reducers in one go
if (!noCommit) {
writers.open(job, "commit");
writers.commit();
}
+ LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
+      for (Counter counter : indexJob.getCounters().getGroup("IndexerStatus")) {
+ LOG.info("Indexer: {} {}",
+ String.format(Locale.ROOT, "%6d", counter.getValue()),
+ counter.getName());
+ }
long end = System.currentTimeMillis();
LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
@@ -127,7 +136,7 @@ public class IndexingJob extends Configu
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
IndexWriters writers = new IndexWriters(getConf());
System.err.println(writers.describe());
return -1;