Author: snagel
Date: Fri Mar 7 18:15:50 2014
New Revision: 1575351

URL: http://svn.apache.org/r1575351
Log:
NUTCH-1706 IndexerMapReduce does not remove db_redir_temp

Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575351&r1=1575350&r2=1575351&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 7 18:15:50 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1706 IndexerMapReduce does not remove db_redir_temp (markus, snagel) + * NUTCH-1113 SegmentMerger can now be safely used to merge segments (Edward Drapkin, markus, snagel) * NUTCH-1729 Upgrade to Tika 1.5 (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1575351&r1=1575350&r2=1575351&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Mar 7 18:15:50 2014 @@ -180,36 +180,10 @@ implements Mapper<Text, Writable, Text, dbDatum = datum; } else if (CrawlDatum.hasFetchStatus(datum)) { - // don't index unmodified (empty) pages if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) { fetchDatum = datum; - - /** - * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT. 
- */ - if (delete) { - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { - reporter.incrCounter("IndexerStatus", "Documents deleted", 1); - - NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); - output.collect(key, action); - return; - } - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || - fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || - dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || - dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { - reporter.incrCounter("IndexerStatus", "Deleted redirects", 1); - reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1); - - NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); - output.collect(key, action); - return; - } - } } - } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() || CrawlDatum.STATUS_SIGNATURE == datum.getStatus() || CrawlDatum.STATUS_PARSE_META == datum.getStatus()) { @@ -239,6 +213,29 @@ implements Mapper<Text, Writable, Text, LOG.warn("Unrecognized type: "+value.getClass()); } } + + // Whether to delete GONE or REDIRECTS + if (delete && fetchDatum != null && dbDatum != null) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { + reporter.incrCounter("IndexerStatus", "Documents deleted", 1); + + NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); + output.collect(key, action); + return; + } + + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || + fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { + reporter.incrCounter("IndexerStatus", "Deleted redirects", 1); + reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1); + + NutchIndexAction action = new NutchIndexAction(null, 
NutchIndexAction.DELETE); + output.collect(key, action); + return; + } + } if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {