Author: markus
Date: Tue Oct  1 12:50:06 2013
New Revision: 1528072

URL: http://svn.apache.org/r1528072
Log:
NUTCH-1646 IndexerMapReduce to consider DB status

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1528072&r1=1528071&r2=1528072&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct  1 12:50:06 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1646 IndexerMapReduce to consider DB status (markus)
+
 * NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel)
 
 * NUTCH-1637 URLUtil is missing getProtocol (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1528072&r1=1528071&r2=1528072&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Oct  1 12:50:06 2013
@@ -189,14 +189,18 @@ implements Mapper<Text, Writable, Text, 
              * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.
              */
             if (delete) {
-              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
+              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || 
dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
                 reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
 
                 NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);
                 output.collect(key, action);
                 return;
               }
-              if (fetchDatum.getStatus() == 
CrawlDatum.STATUS_FETCH_REDIR_PERM) {
+              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM 
||
+                  fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP 
||
+                  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+                  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+                reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
                 reporter.incrCounter("IndexerStatus", "Perm redirects 
deleted", 1);
 
                 NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE);


Reply via email to