Author: dogacan
Date: Wed Jan 21 11:26:27 2009
New Revision: 736385

URL: http://svn.apache.org/viewvc?rev=736385&view=rev
Log:
NUTCH-676 - MapWritable is written inefficiently and confusingly.
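The patch replaces Nutch's custom org.apache.nutch.crawl.MapWritable with Hadoop's org.apache.hadoop.io.MapWritable throughout CrawlDatum and the tools that touch it. A minimal sketch of what calling code looks like after the switch (the demo class, keys and values are illustrative, not taken from Nutch):

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.MapWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;

    public class MetaDataExample {          // hypothetical demo class
      public static void main(String[] args) {
        CrawlDatum datum = new CrawlDatum();
        // getMetaData() now returns org.apache.hadoop.io.MapWritable,
        // which implements java.util.Map<Writable, Writable>.
        MapWritable meta = datum.getMetaData();
        meta.put(new Text("depth"), new IntWritable(3));
        IntWritable depth = (IntWritable) meta.get(new Text("depth"));
        System.out.println("depth = " + depth.get());
      }
    }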
Removed:
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 21 11:26:27 2009
@@ -316,6 +316,9 @@
 
 118. NUTCH-681 - parse-mp3 compilation problem.
      (Wildan Maulana via dogacan)
+
+119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
+     (dogacan)
 
 Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Jan 21 11:26:27 2009
@@ -19,17 +19,18 @@
 
 import java.io.*;
 import java.util.*;
+import java.util.Map.Entry;
 
 import org.apache.hadoop.io.*;
 import org.apache.nutch.util.*;
 
 /* The crawl state of a url. */
-public class CrawlDatum implements WritableComparable, Cloneable {
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   public static final String GENERATE_DIR_NAME = "crawl_generate";
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
 
-  private final static byte CUR_VERSION = 6;
+  private final static byte CUR_VERSION = 7;
 
   /** Compatibility values for on-the-fly conversion from versions < 5. */
   private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -118,7 +119,7 @@
   private float score = 1.0f;
   private byte[] signature = null;
   private long modifiedTime;
-  private MapWritable metaData;
+  private org.apache.hadoop.io.MapWritable metaData;
 
   public static boolean hasDbStatus(CrawlDatum datum) {
     if (datum.status <= STATUS_DB_MAX) return true;
@@ -131,10 +132,11 @@
   }
 
   public CrawlDatum() {
-    metaData = new MapWritable();
+    metaData = new org.apache.hadoop.io.MapWritable();
   }
 
   public CrawlDatum(int status, int fetchInterval) {
+    this();
     this.status = (byte)status;
     this.fetchInterval = fetchInterval;
   }
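A note on the this() addition above: previously new CrawlDatum(status, fetchInterval) left the metaData field null, and write(), equals() and hashCode() all carried null checks for it. Chaining through the no-argument constructor guarantees an (empty) map on every construction path, which is what lets those null checks be dropped in the hunks below. A small illustration (the status and interval values are invented):

    // After this patch, metaData is initialized by every constructor.
    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30);
    assert d.getMetaData().isEmpty();   // an empty map, never null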
@@ -201,14 +203,16 @@
     this.signature = signature;
   }
 
-  public void setMetaData(MapWritable mapWritable) {this.metaData = mapWritable; }
+  public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+    this.metaData = mapWritable;
+  }
 
   /**
    * returns a MapWritable if it was set or read in @see readFields(DataInput),
    * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
    */
-  public MapWritable getMetaData() {
-    if (this.metaData == null) this.metaData = new MapWritable();
+  public org.apache.hadoop.io.MapWritable getMetaData() {
+    if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
     return this.metaData;
   }
 
@@ -223,7 +227,6 @@
     return result;
   }
 
-
   public void readFields(DataInput in) throws IOException {
     byte version = in.readByte();                 // read version
     if (version > CUR_VERSION)                    // check version
@@ -244,10 +247,20 @@
         in.readFully(signature);
       } else signature = null;
     }
+    metaData = new org.apache.hadoop.io.MapWritable();
     if (version > 3) {
-      metaData.clear();
-      if (in.readBoolean()) {
-        metaData.readFields(in);
+      if (version < 7) {
+        MapWritable oldMetaData = new MapWritable();
+        if (in.readBoolean()) {
+          oldMetaData.readFields(in);
+        }
+        for (Writable key : oldMetaData.keySet()) {
+          metaData.put(key, oldMetaData.get(key));
+        }
+      } else {
+        if (in.readBoolean()) {
+          metaData.readFields(in);
+        }
+      }
       }
     }
     // translate status codes
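CUR_VERSION is bumped to 7 because the on-disk metadata encoding changes along with the class: the old Nutch MapWritable and Hadoop's MapWritable serialize differently, so pre-7 records are decoded with the legacy class and copied into the Hadoop map on the fly, while the leading "has metadata" boolean is read in both branches for wire compatibility. A sketch of the same pattern as a standalone method (readMetaData is a hypothetical helper name, not in the patch):

    // Version-gated read: route on the version byte, then normalize
    // into the current in-memory representation.
    private org.apache.hadoop.io.MapWritable readMetaData(DataInput in, byte version)
        throws IOException {
      org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
      if (version < 7) {
        // Legacy encoding: decode with the deprecated Nutch class first.
        org.apache.nutch.crawl.MapWritable legacy = new org.apache.nutch.crawl.MapWritable();
        if (in.readBoolean())               // "has metadata" flag
          legacy.readFields(in);
        for (Writable key : legacy.keySet())
          meta.put(key, legacy.get(key));   // copy into the Hadoop map
      } else if (in.readBoolean()) {
        meta.readFields(in);                // current encoding: Hadoop MapWritable
      }
      return meta;
    }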
@@ -278,7 +291,7 @@
       out.writeByte(signature.length);
       out.write(signature);
     }
-    if (metaData != null && metaData.size() > 0) {
+    if (metaData.size() > 0) {
       out.writeBoolean(true);
       metaData.write(out);
     } else {
@@ -295,7 +308,7 @@
     this.score = that.score;
     this.modifiedTime = that.modifiedTime;
     this.signature = that.signature;
-    this.metaData = new MapWritable(that.metaData); // make a deep copy
+    this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
   }
 
@@ -304,8 +317,7 @@
   //
 
   /** Sort by decreasing score. */
-  public int compareTo(Object o) {
-    CrawlDatum that = (CrawlDatum)o;
+  public int compareTo(CrawlDatum that) {
     if (that.score != this.score)
       return (that.score - this.score) > 0 ? 1 : -1;
     if (that.status != this.status)
@@ -367,7 +379,7 @@
   //
 
   public String toString() {
-    StringBuffer buf = new StringBuffer();
+    StringBuilder buf = new StringBuilder();
     buf.append("Version: " + CUR_VERSION + "\n");
     buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
@@ -377,9 +389,23 @@
                (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
     buf.append("Score: " + getScore() + "\n");
     buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
-    buf.append("Metadata: " + (metaData != null ?
-                metaData.toString() : "null") + "\n");
+    buf.append("Metadata: ");
+    for (Entry<Writable, Writable> e : metaData.entrySet()) {
+      buf.append(e.getKey());
+      buf.append(": ");
+      buf.append(e.getValue());
+    }
+    buf.append('\n');
     return buf.toString();
   }
+
+  private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+    HashSet<Entry<Writable, Writable>> set1 =
+      new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
+    HashSet<Entry<Writable, Writable>> set2 =
+      new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
+    return set1.equals(set2);
+  }
 
   public boolean equals(Object o) {
     if (!(o instanceof CrawlDatum))
@@ -394,18 +420,7 @@
       (SignatureComparator._compare(this.signature, other.signature) == 0) &&
       (this.score == other.score);
     if (!res) return res;
-    // allow zero-sized metadata to be equal to null metadata
-    if (this.metaData == null) {
-      if (other.metaData != null && other.metaData.size() > 0) return false;
-      else return true;
-    } else {
-      if (other.metaData == null) {
-        if (this.metaData.size() == 0) return true;
-        else return false;
-      } else {
-        return this.metaData.equals(other.metaData);
-      }
-    }
+    return metadataEquals(other.metaData);
   }
 
   public int hashCode() {
@@ -416,7 +431,7 @@
                signature[i+2] << 8 + signature[i+3]);
       }
     }
-    if (metaData != null) res ^= metaData.hashCode();
+    res ^= metaData.entrySet().hashCode();
     return res ^ status ^
       ((int)fetchTime) ^
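metadataEquals() compares the two maps as sets of Map.Entry, and hashCode() now hashes entrySet() rather than the map object, so equal metadata yields equal hash codes regardless of insertion order. An illustrative fragment (values invented; the fetch times are aligned explicitly so the two datums differ only in metadata insertion order):

    CrawlDatum a = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30);
    CrawlDatum b = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30);
    b.setFetchTime(a.getFetchTime());     // differ only in metadata order
    a.getMetaData().put(new Text("x"), new IntWritable(1));
    a.getMetaData().put(new Text("y"), new IntWritable(2));
    b.getMetaData().put(new Text("y"), new IntWritable(2));
    b.getMetaData().put(new Text("x"), new IntWritable(1));
    // same entries, different insertion order: still equal, same hash
    assert a.equals(b) && a.hashCode() == b.hashCode();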
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Jan 21 11:26:27 2009
@@ -53,7 +53,7 @@
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
 
   public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    private MapWritable meta = new MapWritable();
+    private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
     private CrawlDatum res = new CrawlDatum();
     private FetchSchedule schedule;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Wed Jan 21 11:26:27 2009
@@ -57,6 +57,7 @@
  * into the header of each MapWritable that uses these types.
  *
  * @author Stefan Groschupf
+ * @deprecated Use org.apache.hadoop.io.MapWritable instead.
  */
 public class MapWritable implements Writable {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Wed Jan 21 11:26:27 2009
@@ -26,6 +26,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.Writable;
@@ -44,7 +45,6 @@
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Wed Jan 21 11:26:27 2009
@@ -21,6 +21,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.mapred.FileInputFormat;
@@ -37,7 +38,6 @@
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.webgraph.Node;
 import org.apache.nutch.util.FSUtils;
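For downstream jobs and plugins that imported the old class, the migration is normally just the import swap shown in the two compat tools above; the deprecated org.apache.nutch.crawl.MapWritable remains primarily so that pre-version-7 CrawlDb data can still be read (see the readFields() hunk). Since Hadoop's class implements java.util.Map, the standard collection operations also become available. A short example (names invented):

    import java.util.Map;
    import org.apache.hadoop.io.MapWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;

    MapWritable meta = new MapWritable();
    meta.put(new Text("k"), new Text("v"));
    // entrySet(), putAll(), isEmpty() etc. come from java.util.Map,
    // which the deprecated Nutch class did not implement.
    for (Map.Entry<Writable, Writable> e : meta.entrySet()) {
      System.out.println(e.getKey() + "=" + e.getValue());
    }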