Author: markus
Date: Fri Jan 8 11:10:38 2016
New Revision: 1723688

URL: http://svn.apache.org/viewvc?rev=1723688&view=rev
Log:
NUTCH-1449 Optionally delete documents skipped by IndexingFilters

Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723688&r1=1723687&r2=1723688&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 11:10:38 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) + * NUTCH-2189 Domain filter must deactivate if no rules are present (markus) * NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1723688&r1=1723687&r2=1723688&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Jan 8 11:10:38 2016 @@ -1043,6 +1043,20 @@ </description> </property> +<property> + <name>indexer.delete.robots.noindex</name> + <value>false</value> + <description>Whether the indexer will delete documents marked by robots=noindex + </description> +</property> + +<property> + <name>indexer.delete.skipped.by.indexingfilter</name> + <value>false</value> + <description>Whether the indexer will delete documents that were skipped by indexing filters + </description> +</property> + <!-- URL normalizer properties --> <property> Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1723688&r1=1723687&r2=1723688&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ 
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jan 8 11:10:38 2016 @@ -63,6 +63,7 @@ public class IndexerMapReduce extends Co public static final String INDEXER_PARAMS = "indexer.additional.params"; public static final String INDEXER_DELETE = "indexer.delete"; public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex"; + public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter"; public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified"; public static final String URL_FILTERING = "indexer.url.filters"; public static final String URL_NORMALIZING = "indexer.url.normalizers"; @@ -71,6 +72,7 @@ public class IndexerMapReduce extends Co private boolean skip = false; private boolean delete = false; private boolean deleteRobotsNoIndex = false; + private boolean deleteSkippedByIndexingFilter = false; private boolean base64 = false; private IndexingFilters filters; private ScoringFilters scfilters; @@ -94,6 +96,8 @@ public class IndexerMapReduce extends Co this.delete = job.getBoolean(INDEXER_DELETE, false); this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false); + this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED, + false); this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false); this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false); @@ -245,7 +249,7 @@ public class IndexerMapReduce extends Co || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { - reporter.incrCounter("IndexerStatus", "deleted redirects", 1); + reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1); output.collect(key, DELETE_ACTION); return; } @@ -258,7 +262,7 @@ public class IndexerMapReduce extends Co // Whether to delete pages marked as duplicates if (delete && dbDatum.getStatus() == 
CrawlDatum.STATUS_DB_DUPLICATE) { - reporter.incrCounter("IndexerStatus", "deleted duplicates", 1); + reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1); output.collect(key, DELETE_ACTION); return; } @@ -284,8 +288,25 @@ public class IndexerMapReduce extends Co // add digest, used by dedup doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY)); - + final Parse parse = new ParseImpl(parseText, parseData); + float boost = 1.0f; + // run scoring filters + try { + boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, + inlinks, boost); + } catch (final ScoringFilterException e) { + reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1); + if (LOG.isWarnEnabled()) { + LOG.warn("Error calculating score {}: {}", key, e); + } + return; + } + // apply boost to all indexed fields. + doc.setWeight(boost); + // store boost for use by explain and dedup + doc.add("boost", Float.toString(boost)); + try { // Indexing filters may also be interested in the signature fetchDatum.setSignature(dbDatum.getSignature()); @@ -317,26 +338,16 @@ public class IndexerMapReduce extends Co // skip documents discarded by indexing filters if (doc == null) { - reporter.incrCounter("IndexerStatus", "skipped by indexing filters", 1); - return; - } - - float boost = 1.0f; - // run scoring filters - try { - boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, - inlinks, boost); - } catch (final ScoringFilterException e) { - reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1); - if (LOG.isWarnEnabled()) { - LOG.warn("Error calculating score {}: {}", key, e); + // https://issues.apache.org/jira/browse/NUTCH-1449 + if (deleteSkippedByIndexingFilter) { + NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); + output.collect(key, action); + reporter.incrCounter("IndexerStatus", "deleted (IndexingFilter)", 1); + } else { + reporter.incrCounter("IndexerStatus", "skipped (IndexingFilter)", 1); } return; } - 
// apply boost to all indexed fields. - doc.setWeight(boost); - // store boost for use by explain and dedup - doc.add("boost", Float.toString(boost)); if (content != null) { // Get the original unencoded content