Author: lewismc
Date: Fri May 8 23:29:45 2015
New Revision: 1678459
URL: http://svn.apache.org/r1678459
Log:
NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri May 8 23:29:45 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel via lewismc)
+
* NUTCH-1934 Refactor Fetcher in trunk (lewismc)
* NUTCH-2004 ParseChecker does not handle redirects (mjoyce via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri May 8 23:29:45 2015
@@ -23,7 +23,6 @@ import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -210,6 +209,7 @@ public class IndexerMapReduce extends Co
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
output.collect(key, action);
+          reporter.incrCounter("IndexerStatus", "deleted (robots=noindex)", 1);
return;
}
}
@@ -224,7 +224,7 @@ public class IndexerMapReduce extends Co
if (delete && fetchDatum != null && dbDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
- reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
@@ -236,8 +236,7 @@ public class IndexerMapReduce extends Co
|| fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
- reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
- reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted redirects", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
@@ -253,7 +252,7 @@ public class IndexerMapReduce extends Co
// Whether to delete pages marked as duplicates
if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
- reporter.incrCounter("IndexerStatus", "Duplicates deleted", 1);
+ reporter.incrCounter("IndexerStatus", "deleted duplicates", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
output.collect(key, action);
@@ -262,7 +261,7 @@ public class IndexerMapReduce extends Co
// Whether to skip DB_NOTMODIFIED pages
if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
- reporter.incrCounter("IndexerStatus", "Skipped", 1);
+ reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
return;
}
@@ -305,13 +304,13 @@ public class IndexerMapReduce extends Co
if (LOG.isWarnEnabled()) {
LOG.warn("Error indexing " + key + ": " + e);
}
- reporter.incrCounter("IndexerStatus", "Errors", 1);
+ reporter.incrCounter("IndexerStatus", "errors (IndexingFilter)", 1);
return;
}
// skip documents discarded by indexing filters
if (doc == null) {
- reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
+ reporter.incrCounter("IndexerStatus", "skipped by indexing filters", 1);
return;
}
@@ -321,6 +320,7 @@ public class IndexerMapReduce extends Co
boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
inlinks, boost);
} catch (final ScoringFilterException e) {
+ reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
if (LOG.isWarnEnabled()) {
LOG.warn("Error calculating score " + key + ": " + e);
}
@@ -331,7 +331,7 @@ public class IndexerMapReduce extends Co
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
- reporter.incrCounter("IndexerStatus", "Documents added", 1);
+ reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
output.collect(key, action);
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1678459&r1=1678458&r2=1678459&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java Fri May 8 23:29:45 2015
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.Random;
import org.apache.nutch.segment.SegmentChecker;
@@ -31,6 +32,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -110,12 +113,18 @@ public class IndexingJob extends Configu
FileOutputFormat.setOutputPath(job, tmp);
try {
- JobClient.runJob(job);
+ RunningJob indexJob = JobClient.runJob(job);
// do the commits once and for all the reducers in one go
if (!noCommit) {
writers.open(job, "commit");
writers.commit();
}
+ LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
+      for (Counter counter : indexJob.getCounters().getGroup("IndexerStatus")) {
+ LOG.info("Indexer: {} {}",
+ String.format(Locale.ROOT, "%6d", counter.getValue()),
+ counter.getName());
+ }
long end = System.currentTimeMillis();
LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
@@ -127,7 +136,7 @@ public class IndexingJob extends Configu
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
IndexWriters writers = new IndexWriters(getConf());
System.err.println(writers.describe());
return -1;