Author: markus Date: Fri Jun 8 07:37:42 2012 New Revision: 1347909 URL: http://svn.apache.org/viewvc?rev=1347909&view=rev Log: NUTCH-1336 Optionally not index db_notmodified pages
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347909&r1=1347908&r2=1347909&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jun 8 07:37:42 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1336 Optionally not index db_notmodified pages (markus) + * NUTCH-1346 Follow outlinks to ignore external (markus) * NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1347909&r1=1347908&r2=1347909&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Jun 8 07:37:42 2012 @@ -875,6 +875,13 @@ </description> </property> +<property> + <name>indexer.skip.notmodified</name> + <value>false</value> + <description>Whether the indexer will skip records with a db_notmodified status. + </description> +</property> + <!-- URL normalizer properties --> <property> Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1347909&r1=1347908&r2=1347909&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jun 8 07:37:42 2012 @@ -55,7 +55,9 @@ implements Mapper<Text, Writable, Text, public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class); public static final String INDEXER_DELETE = "indexer.delete"; + public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified"; + private boolean skip = false; private boolean delete = false; private IndexingFilters filters; private ScoringFilters scfilters; @@ -65,6 +67,7 @@ implements Mapper<Text, Writable, Text, this.filters = new IndexingFilters(getConf()); this.scfilters = new ScoringFilters(getConf()); this.delete = job.getBoolean(INDEXER_DELETE, false); + this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false); } public void map(Text key, Writable value, @@ -87,8 +90,15 @@ implements Mapper<Text, Writable, Text, inlinks = (Inlinks)value; } else if (value instanceof CrawlDatum) { final CrawlDatum datum = (CrawlDatum)value; - if (CrawlDatum.hasDbStatus(datum)) + if (CrawlDatum.hasDbStatus(datum)) { dbDatum = datum; + + // Whether to skip DB_NOTMODIFIED pages + if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { + reporter.incrCounter("IndexerStatus", "Skipped", 1); + return; + } + } else if (CrawlDatum.hasFetchStatus(datum)) { // don't index unmodified (empty) pages @@ -104,14 +114,14 @@ implements Mapper<Text, Writable, Text, NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); output.collect(key, action); - continue; + return; } if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) { reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); output.collect(key, action); - continue; + return; } } }