Author: snagel
Date: Thu May 14 10:08:15 2015
New Revision: 1679335

URL: http://svn.apache.org/r1679335

Log:
NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for deletions
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679335&r1=1679334&r2=1679335&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu May 14 10:08:15 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for deletions (snagel) + * NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann) * NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel via lewismc) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1679335&r1=1679334&r2=1679335&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu May 14 10:08:15 2015 @@ -77,6 +77,10 @@ public class IndexerMapReduce extends Co private URLNormalizers urlNormalizers; private URLFilters urlFilters; + /** Predefined action to delete documents from the index */ + private static final NutchIndexAction DELETE_ACTION = new NutchIndexAction( + null, NutchIndexAction.DELETE); + public void configure(JobConf job) { setConf(job); this.filters = new IndexingFilters(getConf()); @@ -206,9 +210,7 @@ public class IndexerMapReduce extends Co if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) { // Delete it! 
- NutchIndexAction action = new NutchIndexAction(null, - NutchIndexAction.DELETE); - output.collect(key, action); + output.collect(key, DELETE_ACTION); reporter.incrCounter("IndexerStatus", "deleted (robots=noindex)", 1); return; } @@ -225,10 +227,7 @@ public class IndexerMapReduce extends Co if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { reporter.incrCounter("IndexerStatus", "deleted (gone)", 1); - - NutchIndexAction action = new NutchIndexAction(null, - NutchIndexAction.DELETE); - output.collect(key, action); + output.collect(key, DELETE_ACTION); return; } @@ -237,10 +236,7 @@ public class IndexerMapReduce extends Co || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { reporter.incrCounter("IndexerStatus", "deleted redirects", 1); - - NutchIndexAction action = new NutchIndexAction(null, - NutchIndexAction.DELETE); - output.collect(key, action); + output.collect(key, DELETE_ACTION); return; } } @@ -253,9 +249,7 @@ public class IndexerMapReduce extends Co // Whether to delete pages marked as duplicates if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { reporter.incrCounter("IndexerStatus", "deleted duplicates", 1); - NutchIndexAction action = new NutchIndexAction(null, - NutchIndexAction.DELETE); - output.collect(key, action); + output.collect(key, DELETE_ACTION); return; }