Author: dogacan Date: Tue Sep 8 13:15:03 2009 New Revision: 812497 URL: http://svn.apache.org/viewvc?rev=812497&view=rev Log: NUTCH-702 - Lazy Instantiation of Metadata in CrawlDatum. Contributed by Julien Nioche.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=812497&r1=812496&r2=812497&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Sep 8 13:15:03 2009 @@ -7,6 +7,8 @@ 2. NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) + 3. NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=812497&r1=812496&r2=812497&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Sep 8 13:15:03 2009 @@ -131,9 +131,7 @@ return false; } - public CrawlDatum() { - metaData = new org.apache.hadoop.io.MapWritable(); - } + public CrawlDatum() { } public CrawlDatum(int status, int fetchInterval) { this(); @@ -213,7 +211,7 @@ */ public void putAllMetaData(CrawlDatum other) { for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) { - metaData.put(e.getKey(), e.getValue()); + getMetaData().put(e.getKey(), e.getValue()); } } @@ -257,11 +255,14 @@ in.readFully(signature); } else signature = null; } - metaData = new org.apache.hadoop.io.MapWritable(); + if (version > 3) { + boolean hasMetadata = false; if (version < 7) { MapWritable oldMetaData = new MapWritable(); if (in.readBoolean()) { + hasMetadata = true; + metaData = new org.apache.hadoop.io.MapWritable(); oldMetaData.readFields(in); } for (Writable 
key : oldMetaData.keySet()) { @@ -269,9 +270,12 @@ } } else { if (in.readBoolean()) { + hasMetadata = true; + metaData = new org.apache.hadoop.io.MapWritable(); metaData.readFields(in); } } + if (hasMetadata==false) metaData = null; } // translate status codes if (version < 5) { @@ -301,7 +305,7 @@ out.writeByte(signature.length); out.write(signature); } - if (metaData.size() > 0) { + if (metaData != null && metaData.size() > 0) { out.writeBoolean(true); metaData.write(out); } else { @@ -318,7 +322,9 @@ this.score = that.score; this.modifiedTime = that.modifiedTime; this.signature = that.signature; - this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy + if (that.metaData != null) { + this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy + } } @@ -400,16 +406,25 @@ buf.append("Score: " + getScore() + "\n"); buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n"); buf.append("Metadata: "); - for (Entry<Writable, Writable> e : metaData.entrySet()) { - buf.append(e.getKey()); - buf.append(": "); - buf.append(e.getValue()); + if (metaData != null) { + for (Entry<Writable, Writable> e : metaData.entrySet()) { + buf.append(e.getKey()); + buf.append(": "); + buf.append(e.getValue()); + } } buf.append('\n'); return buf.toString(); } private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) { + if (metaData==null || metaData.size() ==0) { + return otherMetaData == null || otherMetaData.size() == 0; + } + if (otherMetaData == null) { + // we already know that the current object is not null or empty + return false; + } HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable,Writable>>(metaData.entrySet()); HashSet<Entry<Writable, Writable>> set2 = @@ -441,7 +456,9 @@ signature[i+2] << 8 + signature[i+3]); } } - res ^= metaData.entrySet().hashCode(); + if (metaData != null) { + res ^= metaData.entrySet().hashCode(); + } return res ^ status ^ 
((int)fetchTime) ^