Author: snagel
Date: Thu May 14 10:08:15 2015
New Revision: 1679335
URL: http://svn.apache.org/r1679335
Log:
NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for
deletions
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1679335&r1=1679334&r2=1679335&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu May 14 10:08:15 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2008 IndexerMapReduce to use single instance of NutchIndexAction for deletions (snagel)
+
* NUTCH-1998 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann)
* NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1679335&r1=1679334&r2=1679335&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu May 14 10:08:15 2015
@@ -77,6 +77,10 @@ public class IndexerMapReduce extends Co
private URLNormalizers urlNormalizers;
private URLFilters urlFilters;
+ /** Predefined action to delete documents from the index */
+ private static final NutchIndexAction DELETE_ACTION = new NutchIndexAction(
+ null, NutchIndexAction.DELETE);
+
public void configure(JobConf job) {
setConf(job);
this.filters = new IndexingFilters(getConf());
@@ -206,9 +210,7 @@ public class IndexerMapReduce extends Co
if (robotsMeta != null
&& robotsMeta.toLowerCase().indexOf("noindex") != -1) {
// Delete it!
- NutchIndexAction action = new NutchIndexAction(null,
- NutchIndexAction.DELETE);
- output.collect(key, action);
+ output.collect(key, DELETE_ACTION);
reporter.incrCounter("IndexerStatus", "deleted (robots=noindex)",
1);
return;
}
@@ -225,10 +227,7 @@ public class IndexerMapReduce extends Co
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
-
- NutchIndexAction action = new NutchIndexAction(null,
- NutchIndexAction.DELETE);
- output.collect(key, action);
+ output.collect(key, DELETE_ACTION);
return;
}
@@ -237,10 +236,7 @@ public class IndexerMapReduce extends Co
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
reporter.incrCounter("IndexerStatus", "deleted redirects", 1);
-
- NutchIndexAction action = new NutchIndexAction(null,
- NutchIndexAction.DELETE);
- output.collect(key, action);
+ output.collect(key, DELETE_ACTION);
return;
}
}
@@ -253,9 +249,7 @@ public class IndexerMapReduce extends Co
// Whether to delete pages marked as duplicates
if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
reporter.incrCounter("IndexerStatus", "deleted duplicates", 1);
- NutchIndexAction action = new NutchIndexAction(null,
- NutchIndexAction.DELETE);
- output.collect(key, action);
+ output.collect(key, DELETE_ACTION);
return;
}