Author: dogacan
Date: Wed Jan 21 11:26:27 2009
New Revision: 736385

URL: http://svn.apache.org/viewvc?rev=736385&view=rev
Log:
NUTCH-676 - MapWritable is written inefficiently and confusingly.
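
In short, the patch replaces Nutch's custom org.apache.nutch.crawl.MapWritable
with org.apache.hadoop.io.MapWritable throughout, bumps CrawlDatum's
serialization version from 6 to 7, initializes metadata in every constructor
(the two-arg constructor now chains to this()), and keeps readFields()
backward compatible by copying pre-7 metadata entry by entry into the Hadoop
map. A minimal round-trip sketch, not part of this patch; DataOutputBuffer and
DataInputBuffer are Hadoop I/O helpers used here for illustration, and the
key/value names are made up:

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;

    public class CrawlDatumRoundTrip {
      public static void main(String[] args) throws Exception {
        // Metadata now goes through the Hadoop MapWritable returned
        // by getMetaData(), which is never null after this patch.
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 30);
        datum.getMetaData().put(new Text("depth"), new IntWritable(3));

        // Serialization always writes CUR_VERSION = 7 from now on.
        DataOutputBuffer out = new DataOutputBuffer();
        datum.write(out);

        // readFields() reads version 7 natively; versions 4-6 are
        // converted on the fly from the deprecated Nutch MapWritable.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CrawlDatum copy = new CrawlDatum();
        copy.readFields(in);

        System.out.println(copy);
      }
    }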

Removed:
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 21 11:26:27 2009
@@ -316,6 +316,9 @@
 
 118. NUTCH-681 - parse-mp3 compilation problem. 
                  (Wildan Maulana via dogacan)
+
+119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
+                 (dogacan)
      
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Jan 21 11:26:27 2009
@@ -19,17 +19,18 @@
 
 import java.io.*;
 import java.util.*;
+import java.util.Map.Entry;
 
 import org.apache.hadoop.io.*;
 import org.apache.nutch.util.*;
 
 /* The crawl state of a url. */
-public class CrawlDatum implements WritableComparable, Cloneable {
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   public static final String GENERATE_DIR_NAME = "crawl_generate";
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
 
-  private final static byte CUR_VERSION = 6;
+  private final static byte CUR_VERSION = 7;
 
   /** Compatibility values for on-the-fly conversion from versions < 5. */
   private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -118,7 +119,7 @@
   private float score = 1.0f;
   private byte[] signature = null;
   private long modifiedTime;
-  private MapWritable metaData;
+  private org.apache.hadoop.io.MapWritable metaData;
   
   public static boolean hasDbStatus(CrawlDatum datum) {
     if (datum.status <= STATUS_DB_MAX) return true;
@@ -131,10 +132,11 @@
   }
 
   public CrawlDatum() {
-    metaData = new MapWritable();
+    metaData = new org.apache.hadoop.io.MapWritable();
   }
 
   public CrawlDatum(int status, int fetchInterval) {
+    this();
     this.status = (byte)status;
     this.fetchInterval = fetchInterval;
   }
@@ -201,14 +203,16 @@
     this.signature = signature;
   }
   
-   public void setMetaData(MapWritable mapWritable) {this.metaData = mapWritable; }
+   public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+     this.metaData = mapWritable;
+   }
 
   /**
   * returns a MapWritable if it was set or read in @see readFields(DataInput),
   * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
    */
-  public MapWritable getMetaData() {
-    if (this.metaData == null) this.metaData = new MapWritable();
+  public org.apache.hadoop.io.MapWritable getMetaData() {
+    if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
     return this.metaData;
   }
   
@@ -223,7 +227,6 @@
     return result;
   }
 
-
   public void readFields(DataInput in) throws IOException {
     byte version = in.readByte();                 // read version
     if (version > CUR_VERSION)                   // check version
@@ -244,10 +247,20 @@
         in.readFully(signature);
       } else signature = null;
     }
+    metaData = new org.apache.hadoop.io.MapWritable();
     if (version > 3) {
-      metaData.clear();
-      if (in.readBoolean()) {
-        metaData.readFields(in);
+      if (version < 7) {
+        MapWritable oldMetaData = new MapWritable();
+        if (in.readBoolean()) {
+          oldMetaData.readFields(in);
+        }
+        for (Writable key : oldMetaData.keySet()) {
+          metaData.put(key, oldMetaData.get(key));
+        }
+      } else {
+        if (in.readBoolean()) {
+          metaData.readFields(in);
+        }
       }
     }
     // translate status codes
@@ -278,7 +291,7 @@
       out.writeByte(signature.length);
       out.write(signature);
     }
-    if (metaData != null && metaData.size() > 0) {
+    if (metaData.size() > 0) {
       out.writeBoolean(true);
       metaData.write(out);
     } else {
@@ -295,7 +308,7 @@
     this.score = that.score;
     this.modifiedTime = that.modifiedTime;
     this.signature = that.signature;
-    this.metaData = new MapWritable(that.metaData); // make a deep copy
+    this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
   }
 
 
@@ -304,8 +317,7 @@
   //
   
   /** Sort by decreasing score. */
-  public int compareTo(Object o) {
-    CrawlDatum that = (CrawlDatum)o; 
+  public int compareTo(CrawlDatum that) {
     if (that.score != this.score)
       return (that.score - this.score) > 0 ? 1 : -1;
     if (that.status != this.status)
@@ -367,7 +379,7 @@
   //
 
   public String toString() {
-    StringBuffer buf = new StringBuffer();
+    StringBuilder buf = new StringBuilder();
     buf.append("Version: " + CUR_VERSION + "\n");
    buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
@@ -377,9 +389,23 @@
         (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
     buf.append("Score: " + getScore() + "\n");
     buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
-    buf.append("Metadata: " + (metaData != null ? metaData.toString() : "null") + "\n");
+    buf.append("Metadata: ");
+    for (Entry<Writable, Writable> e : metaData.entrySet()) {
+      buf.append(e.getKey());
+      buf.append(": ");
+      buf.append(e.getValue());
+    }
+    buf.append('\n');
     return buf.toString();
   }
+  
+  private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+    HashSet<Entry<Writable, Writable>> set1 =
+      new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
+    HashSet<Entry<Writable, Writable>> set2 =
+      new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
+    return set1.equals(set2);
+  }
 
   public boolean equals(Object o) {
     if (!(o instanceof CrawlDatum))
@@ -394,18 +420,7 @@
       (SignatureComparator._compare(this.signature, other.signature) == 0) &&
       (this.score == other.score);
     if (!res) return res;
-    // allow zero-sized metadata to be equal to null metadata
-    if (this.metaData == null) {
-      if (other.metaData != null && other.metaData.size() > 0) return false;
-      else return true;
-    } else {
-      if (other.metaData == null) {
-        if (this.metaData.size() == 0) return true;
-        else return false;
-      } else {
-        return this.metaData.equals(other.metaData);
-      }
-    }
+    return metadataEquals(other.metaData);
   }
 
   public int hashCode() {
@@ -416,7 +431,7 @@
                 signature[i+2] << 8 + signature[i+3]);
       }
     }
-    if (metaData != null) res ^= metaData.hashCode();
+    res ^= metaData.entrySet().hashCode();
     return
       res ^ status ^
       ((int)fetchTime) ^

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Jan 21 11:26:27 2009
@@ -53,7 +53,7 @@
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
 
  public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    private MapWritable meta = new MapWritable();
+    private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
     private CrawlDatum res = new CrawlDatum();
     private FetchSchedule schedule;
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Wed Jan 21 11:26:27 2009
@@ -57,6 +57,7 @@
  * into the header of each MapWritable that uses these types.
  *
  * @author Stefan Groschupf
+ * @deprecated Use org.apache.hadoop.io.MapWritable instead.
  */
 public class MapWritable implements Writable {
 

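For code outside Nutch's core that still uses the deprecated class, the
migration is essentially an import swap, as this commit does for
CrawlDbConverter and ReprUrlFixer below. A hypothetical downstream snippet
(identifiers are illustrative, not from this patch):

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.MapWritable;  // replaces org.apache.nutch.crawl.MapWritable
    import org.apache.hadoop.io.Text;

    public class MetaExample {
      public static void main(String[] args) {
        // Hadoop's MapWritable implements java.util.Map<Writable, Writable>,
        // so put/get call sites compile unchanged after the import swap.
        MapWritable meta = new MapWritable();
        meta.put(new Text("depth"), new IntWritable(3));
        System.out.println(meta.get(new Text("depth")));  // prints 3
      }
    }
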
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Wed Jan 21 11:26:27 2009
@@ -26,6 +26,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.Writable;
@@ -44,7 +45,6 @@
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Wed Jan 21 11:26:27 2009
@@ -21,6 +21,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.mapred.FileInputFormat;
@@ -37,7 +38,6 @@
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.webgraph.Node;
 import org.apache.nutch.util.FSUtils;

