Author: dogacan
Date: Wed Jan 21 11:26:27 2009
New Revision: 736385
URL: http://svn.apache.org/viewvc?rev=736385&view=rev
Log:
NUTCH-676 - MapWritable is written inefficiently and confusingly.
Removed:
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 21 11:26:27 2009
@@ -316,6 +316,9 @@
118. NUTCH-681 - parse-mp3 compilation problem.
(Wildan Maulana via dogacan)
+
+119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
+ (dogacan)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Jan 21 11:26:27 2009
@@ -19,17 +19,18 @@
import java.io.*;
import java.util.*;
+import java.util.Map.Entry;
import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;
/* The crawl state of a url. */
-public class CrawlDatum implements WritableComparable, Cloneable {
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final String GENERATE_DIR_NAME = "crawl_generate";
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 6;
+ private final static byte CUR_VERSION = 7;
/** Compatibility values for on-the-fly conversion from versions < 5. */
private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -118,7 +119,7 @@
private float score = 1.0f;
private byte[] signature = null;
private long modifiedTime;
- private MapWritable metaData;
+ private org.apache.hadoop.io.MapWritable metaData;
public static boolean hasDbStatus(CrawlDatum datum) {
if (datum.status <= STATUS_DB_MAX) return true;
@@ -131,10 +132,11 @@
}
public CrawlDatum() {
- metaData = new MapWritable();
+ metaData = new org.apache.hadoop.io.MapWritable();
}
public CrawlDatum(int status, int fetchInterval) {
+ this();
this.status = (byte)status;
this.fetchInterval = fetchInterval;
}
@@ -201,14 +203,16 @@
this.signature = signature;
}
- public void setMetaData(MapWritable mapWritable) {this.metaData = mapWritable; }
+ public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+ this.metaData = mapWritable;
+ }
/**
* returns a MapWritable if it was set or read in @see readFields(DataInput),
* returns empty map in case CrawlDatum was freshly created (lazily instantiated).
*/
- public MapWritable getMetaData() {
- if (this.metaData == null) this.metaData = new MapWritable();
+ public org.apache.hadoop.io.MapWritable getMetaData() {
+ if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
return this.metaData;
}
@@ -223,7 +227,6 @@
return result;
}
-
public void readFields(DataInput in) throws IOException {
byte version = in.readByte(); // read version
if (version > CUR_VERSION) // check version
@@ -244,10 +247,20 @@
in.readFully(signature);
} else signature = null;
}
+ metaData = new org.apache.hadoop.io.MapWritable();
if (version > 3) {
- metaData.clear();
- if (in.readBoolean()) {
- metaData.readFields(in);
+ if (version < 7) {
+ MapWritable oldMetaData = new MapWritable();
+ if (in.readBoolean()) {
+ oldMetaData.readFields(in);
+ }
+ for (Writable key : oldMetaData.keySet()) {
+ metaData.put(key, oldMetaData.get(key));
+ }
+ } else {
+ if (in.readBoolean()) {
+ metaData.readFields(in);
+ }
}
}
// translate status codes
@@ -278,7 +291,7 @@
out.writeByte(signature.length);
out.write(signature);
}
- if (metaData != null && metaData.size() > 0) {
+ if (metaData.size() > 0) {
out.writeBoolean(true);
metaData.write(out);
} else {
@@ -295,7 +308,7 @@
this.score = that.score;
this.modifiedTime = that.modifiedTime;
this.signature = that.signature;
- this.metaData = new MapWritable(that.metaData); // make a deep copy
+ this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
}
@@ -304,8 +317,7 @@
//
/** Sort by decreasing score. */
- public int compareTo(Object o) {
- CrawlDatum that = (CrawlDatum)o;
+ public int compareTo(CrawlDatum that) {
if (that.score != this.score)
return (that.score - this.score) > 0 ? 1 : -1;
if (that.status != this.status)
@@ -367,7 +379,7 @@
//
public String toString() {
- StringBuffer buf = new StringBuffer();
+ StringBuilder buf = new StringBuilder();
buf.append("Version: " + CUR_VERSION + "\n");
buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) +
")\n");
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
@@ -377,9 +389,23 @@
(getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
buf.append("Score: " + getScore() + "\n");
buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
- buf.append("Metadata: " + (metaData != null ? metaData.toString() :
"null") + "\n");
+ buf.append("Metadata: ");
+ for (Entry<Writable, Writable> e : metaData.entrySet()) {
+ buf.append(e.getKey());
+ buf.append(": ");
+ buf.append(e.getValue());
+ }
+ buf.append('\n');
return buf.toString();
}
+
+ private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+ HashSet<Entry<Writable, Writable>> set1 =
+ new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
+ HashSet<Entry<Writable, Writable>> set2 =
+ new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
+ return set1.equals(set2);
+ }
public boolean equals(Object o) {
if (!(o instanceof CrawlDatum))
@@ -394,18 +420,7 @@
(SignatureComparator._compare(this.signature, other.signature) == 0) &&
(this.score == other.score);
if (!res) return res;
- // allow zero-sized metadata to be equal to null metadata
- if (this.metaData == null) {
- if (other.metaData != null && other.metaData.size() > 0) return false;
- else return true;
- } else {
- if (other.metaData == null) {
- if (this.metaData.size() == 0) return true;
- else return false;
- } else {
- return this.metaData.equals(other.metaData);
- }
- }
+ return metadataEquals(other.metaData);
}
public int hashCode() {
@@ -416,7 +431,7 @@
signature[i+2] << 8 + signature[i+3]);
}
}
- if (metaData != null) res ^= metaData.hashCode();
+ res ^= metaData.entrySet().hashCode();
return
res ^ status ^
((int)fetchTime) ^
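The CrawlDatum changes above bump the serialization version to 7 and keep metadata in an org.apache.hadoop.io.MapWritable, while readFields() still converts metadata written by the deprecated Nutch MapWritable (versions 4-6). The following is a minimal round-trip sketch of the new format, assuming the Nutch and Hadoop classes above are on the classpath; the class name, metadata key, and values are illustrative, not part of this commit.

// Round-trip sketch for the version-7 CrawlDatum format (illustrative values).
import java.io.*;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumRoundTrip {
  public static void main(String[] args) throws IOException {
    // A datum with one metadata entry; key/value are made up for the example.
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 86400);
    datum.getMetaData().put(new Text("example-key"), new Text("example-value"));

    // Serialize: write() now emits CUR_VERSION = 7 and the Hadoop MapWritable.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    datum.write(new DataOutputStream(bytes));

    // Deserialize into a fresh datum.
    CrawlDatum copy = new CrawlDatum();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    // equals() compares metadata via entrySet(), so this should print true.
    System.out.println(datum.equals(copy));
  }
}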
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Jan 21 11:26:27 2009
@@ -53,7 +53,7 @@
private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
- private MapWritable meta = new MapWritable();
+ private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
private CrawlDatum res = new CrawlDatum();
private FetchSchedule schedule;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Wed Jan 21 11:26:27 2009
@@ -57,6 +57,7 @@
* into the header of each MapWritable that uses these types.
*
* @author Stefan Groschupf
+ * @deprecated Use org.apache.hadoop.io.MapWritable instead.
*/
public class MapWritable implements Writable {
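With org.apache.nutch.crawl.MapWritable now deprecated, call sites such as the tools below only need to swap the import; for plain Writable keys and values the Hadoop class exposes the same Map-style API. A small migration sketch follows (the class name, key, and values are illustrative).

// Migration sketch: only the import changes for simple Writable keys/values.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;   // was: org.apache.nutch.crawl.MapWritable
import org.apache.hadoop.io.Text;

public class MapWritableMigration {
  public static void main(String[] args) {
    MapWritable meta = new MapWritable();
    meta.put(new Text("retries"), new IntWritable(3));   // illustrative key/value
    IntWritable retries = (IntWritable) meta.get(new Text("retries"));
    System.out.println(retries.get());   // prints 3
  }
}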
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Wed Jan 21 11:26:27 2009
@@ -26,6 +26,7 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Writable;
@@ -44,7 +45,6 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java?rev=736385&r1=736384&r2=736385&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java Wed Jan 21 11:26:27 2009
@@ -21,6 +21,7 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
@@ -37,7 +38,6 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.util.FSUtils;