Author: siren Date: Tue Feb 24 10:09:36 2009 New Revision: 747324 URL: http://svn.apache.org/viewvc?rev=747324&view=rev Log: NUTCH-698 - CrawlDb is corrupted after a few crawl cycles, contributed by dogacan
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747324&r1=747323&r2=747324&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 10:09:36 2009 @@ -359,6 +359,9 @@ 134. NUTCH-247 - Robot parser to restrict (kubes, siren) +135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan + via siren) + Release 0.9 - 2007-04-02 1. Changed log4j configuration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=747324&r1=747323&r2=747324&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Feb 24 10:09:36 2009 @@ -204,7 +204,17 @@ } public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) { - this.metaData = mapWritable; + this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable); + } + + /** Add all metadata from other CrawlDatum to this CrawlDatum. 
+ * + * @param other CrawlDatum + */ + public void putAllMetaData(CrawlDatum other) { + for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) { + metaData.put(e.getKey(), e.getValue()); + } } /** Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=747324&r1=747323&r2=747324&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Feb 24 10:09:36 2009 @@ -131,10 +131,10 @@ if (oldSet) { // copy metadata from old, if exists if (old.getMetaData().size() > 0) { - result.getMetaData().putAll(old.getMetaData()); + result.putAllMetaData(old); // overlay with new, if any if (fetch.getMetaData().size() > 0) - result.getMetaData().putAll(fetch.getMetaData()); + result.putAllMetaData(fetch); } // set the most recent valid value of modifiedTime if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {