Author: dogacan
Date: Wed Feb 11 09:12:15 2009
New Revision: 743277

URL: http://svn.apache.org/viewvc?rev=743277&view=rev
Log:
NUTCH-683 - NUTCH-676 broke CrawlDbMerger
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=743277&r1=743276&r2=743277&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 11 09:12:15 2009
@@ -338,6 +338,7 @@
 126. NUTCH-636 - Httpclient plugin https doesn't work on IBM JRE
      (Curtis d'Entremont, ab)
+127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan)
 
 Release 0.9 - 2007-04-02
 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=743277&r1=743276&r2=743277&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Feb 11 09:12:15 2009
@@ -19,6 +19,7 @@
 
 import java.io.IOException;
 import java.util.*;
+import java.util.Map.Entry;
 
 // Commons Logging imports
 import org.apache.commons.logging.Log;
@@ -28,6 +29,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.*;
@@ -53,7 +55,7 @@
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
 
   public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
+    private org.apache.hadoop.io.MapWritable meta;
     private CrawlDatum res = new CrawlDatum();
     private FetchSchedule schedule;
@@ -67,26 +69,32 @@
         throws IOException {
       long resTime = 0L;
       boolean resSet = false;
-      meta.clear();
+      meta = new org.apache.hadoop.io.MapWritable();
       while (values.hasNext()) {
         CrawlDatum val = values.next();
         if (!resSet) {
           res.set(val);
           resSet = true;
           resTime = schedule.calculateLastFetchTime(res);
-          meta.putAll(res.getMetaData());
+          for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
+            meta.put(e.getKey(), e.getValue());
+          }
           continue;
         }
         // compute last fetch time, and pick the latest
         long valTime = schedule.calculateLastFetchTime(val);
         if (valTime > resTime) {
           // collect all metadata, newer values override older values
-          meta.putAll(val.getMetaData());
+          for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
+            meta.put(e.getKey(), e.getValue());
+          }
           res.set(val);
           resTime = valTime ;
         } else {
           // insert older metadata before newer
-          val.getMetaData().putAll(meta);
+          for (Entry<Writable, Writable> e : meta.entrySet()) {
+            val.getMetaData().put(e.getKey(), e.getValue());
+          }
           meta = val.getMetaData();
         }
       }
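For readers following the NUTCH-683 fix, here is a minimal, self-contained sketch of the copy pattern the patch switches to. Reading the diff: the reducer used to keep one shared MapWritable field and clear() it at the start of every reduce() call, but the "meta = val.getMetaData()" branch can leave that field aliased to a CrawlDatum's own metadata map, so the next clear() would silently mutate another record's metadata. Allocating a fresh map per call and copying entries one by one with put() removes the aliasing hazard and avoids relying on MapWritable.putAll(). Everything below except the Hadoop classes (MapWritable, Text, Writable) is illustrative and not part of the Nutch codebase.

import java.util.Map.Entry;

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * Hypothetical sketch of the defensive-copy pattern the patch adopts:
 * build a fresh MapWritable and copy entries explicitly with put(),
 * instead of reusing/clearing a shared instance or calling putAll().
 */
public class MetaCopyExample {

  /** Copy every entry of src into a brand-new MapWritable. */
  static MapWritable copyMeta(MapWritable src) {
    MapWritable dst = new MapWritable();
    for (Entry<Writable, Writable> e : src.entrySet()) {
      // put() also updates MapWritable's internal class registry,
      // which it needs later to serialize the stored Writable types.
      dst.put(e.getKey(), e.getValue());
    }
    return dst;
  }

  public static void main(String[] args) {
    MapWritable original = new MapWritable();
    original.put(new Text("sample-key"), new Text("sample-value"));

    MapWritable copy = copyMeta(original);
    copy.put(new Text("another-key"), new Text("another-value"));

    // The copy is an independent map: mutating (or clearing) it later
    // cannot corrupt the metadata of the datum it was copied from.
    System.out.println("original size = " + original.size()); // prints 1
    System.out.println("copy size     = " + copy.size());     // prints 2
  }
}

Note that this is a shallow copy, matching what the patched reducer does: keys and values are shared by reference, and only the map structure itself is duplicated.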