Author: markus
Date: Tue Oct 1 12:50:06 2013
New Revision: 1528072
URL: http://svn.apache.org/r1528072
Log:
NUTCH-1646 IndexerMapReduce to consider DB status
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1528072&r1=1528071&r2=1528072&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 1 12:50:06 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1646 IndexerMapReduce to consider DB status (markus)
+
* NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel)
* NUTCH-1637 URLUtil is missing getProtocol (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1528072&r1=1528071&r2=1528072&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Oct 1 12:50:06 2013
@@ -189,14 +189,18 @@ implements Mapper<Text, Writable, Text,
* Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.
*/
if (delete) {
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
+ fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
+ dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+ dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+ reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null,
NutchIndexAction.DELETE);