Author: markus
Date: Tue Oct 1 12:50:06 2013
New Revision: 1528072
URL: http://svn.apache.org/r1528072
Log: NUTCH-1646 IndexerMapReduce to consider DB status
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1528072&r1=1528071&r2=1528072&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Oct 1 12:50:06 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1646 IndexerMapReduce to consider DB status (markus) + * NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel) * NUTCH-1637 URLUtil is missing getProtocol (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1528072&r1=1528071&r2=1528072&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Oct 1 12:50:06 2013 @@ -189,14 +189,18 @@ implements Mapper<Text, Writable, Text, * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT. 
*/ if (delete) { - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { reporter.incrCounter("IndexerStatus", "Documents deleted", 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); output.collect(key, action); return; } - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || + fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { + reporter.incrCounter("IndexerStatus", "Deleted redirects", 1); reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);