Author: markus
Date: Wed Jul 1 06:56:32 2015
New Revision: 1688561
URL: http://svn.apache.org/r1688561
Log:
NUTCH-1684 ParseMeta to be added before fetch schedulers are run
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688561&r1=1688560&r2=1688561&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 06:56:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) + * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) (Asitang Mishra, snagel via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1688561&r1=1688560&r2=1688561&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Jul 1 06:56:32 2015 @@ -209,6 +209,13 @@ public class CrawlDbReducer implements case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected case CrawlDatum.STATUS_FETCH_REDIR_PERM: case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified + // https://issues.apache.org/jira/browse/NUTCH-1656 + if (metaFromParse != null) { + for (Entry<Writable, Writable> e : metaFromParse.entrySet()) { + result.getMetaData().put(e.getKey(), e.getValue()); + } + } + // determine the modification status int modified = FetchSchedule.STATUS_UNKNOWN; if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) { @@ -260,13 +267,6 @@ public class CrawlDbReducer implements result.setSignature(signature); } - // https://issues.apache.org/jira/browse/NUTCH-1656 - if (metaFromParse != null) { - for (Entry<Writable, Writable> e : 
metaFromParse.entrySet()) { - result.getMetaData().put(e.getKey(), e.getValue()); - } - } - // if fetchInterval is larger than the system-wide maximum, trigger // an unconditional recrawl. This prevents the page to be stuck at // NOTMODIFIED state, when the old fetched copy was already removed with