Author: dogacan
Date: Wed Jun 27 00:05:52 2007
New Revision: 551081

URL: http://svn.apache.org/viewvc?view=rev&rev=551081
Log:
NUTCH-474 - Replace usage of ObjectWritable with something based on 
GenericWritable.

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jun 27 00:05:52 2007
@@ -67,6 +67,9 @@
 21. NUTCH-497 -  Extreme Nested Tags causes StackOverflowException in 
        DomContentUtils...Spider Trap. (kubes)
 
+22. NUTCH-434 - Replace usage of ObjectWritable with something based on 
+    GenericWritable. (dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?view=auto&rev=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Wed 
Jun 27 00:05:52 2007
@@ -0,0 +1,65 @@
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.util.GenericWritableConfigurable;
+
+/**
+ * A {@link GenericWritableConfigurable} that can wrap an instance of any of
+ * the Writable classes listed in {@code CLASSES}.
+ *
+ * <p>The position of a class in {@code CLASSES} is significant:
+ * {@link GenericWritableConfigurable#readFields} reads one type byte and uses
+ * it as an index into the array returned by {@link #getTypes()}. Reordering or
+ * removing entries therefore changes how previously written data is decoded;
+ * add new classes at the end.
+ */
+public class NutchWritable extends GenericWritableConfigurable {
+
+  /** Wrappable classes; the array index is the type byte read by readFields. */
+  @SuppressWarnings("unchecked") // a generic array can only be created raw
+  private static final Class<? extends Writable>[] CLASSES =
+    (Class<? extends Writable>[]) new Class[] {
+      org.apache.hadoop.io.NullWritable.class,
+      org.apache.hadoop.io.LongWritable.class,
+      org.apache.hadoop.io.BytesWritable.class,
+      org.apache.hadoop.io.FloatWritable.class,
+      org.apache.hadoop.io.IntWritable.class,
+      org.apache.hadoop.io.Text.class,
+      org.apache.hadoop.io.MD5Hash.class,
+      org.apache.nutch.crawl.CrawlDatum.class,
+      org.apache.nutch.crawl.Inlink.class,
+      org.apache.nutch.crawl.Inlinks.class,
+      org.apache.nutch.crawl.MapWritable.class,
+      org.apache.nutch.fetcher.FetcherOutput.class,
+      org.apache.nutch.metadata.Metadata.class,
+      org.apache.nutch.parse.Outlink.class,
+      org.apache.nutch.parse.ParseText.class,
+      org.apache.nutch.parse.ParseData.class,
+      org.apache.nutch.parse.ParseImpl.class,
+      org.apache.nutch.parse.ParseStatus.class,
+      org.apache.nutch.protocol.Content.class,
+      org.apache.nutch.protocol.ProtocolStatus.class,
+      org.apache.nutch.searcher.Hit.class,
+      org.apache.nutch.searcher.HitDetails.class,
+      org.apache.nutch.searcher.Hits.class
+    };
+
+  /** Creates a wrapper with no wrapped instance set. */
+  public NutchWritable() { }
+
+  /**
+   * Creates a wrapper around the given instance.
+   *
+   * @param instance the Writable to wrap
+   */
+  public NutchWritable(Writable instance) {
+    set(instance);
+  }
+
+  /** Returns the table of wrappable classes, indexed by type byte. */
+  @Override
+  protected Class<? extends Writable>[] getTypes() {
+    return CLASSES;
+  }
+
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jun 
27 00:05:52 2007
@@ -32,6 +32,7 @@
 import org.apache.hadoop.util.ToolBase;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
@@ -321,9 +322,9 @@
       }
 
       try {
-        output.collect(key, new ObjectWritable(datum));
+        output.collect(key, new NutchWritable(datum));
         if (content != null && storingContent)
-          output.collect(key, new ObjectWritable(content));
+          output.collect(key, new NutchWritable(content));
         if (parseResult != null) {
           for (Entry<Text, Parse> entry : parseResult) {
             Text url = entry.getKey();
@@ -357,7 +358,7 @@
                 LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
               }
             }
-            output.collect(url, new ObjectWritable(
+            output.collect(url, new NutchWritable(
                     new ParseImpl(new ParseText(parse.getText()), 
                                   parse.getData(), parse.isCanonical())));
           }
@@ -493,7 +494,7 @@
     job.setOutputPath(segment);
     job.setOutputFormat(FetcherOutputFormat.class);
     job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(ObjectWritable.class);
+    job.setOutputValueClass(NutchWritable.class);
 
     JobClient.runJob(job);
     if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Wed Jun 
27 00:05:52 2007
@@ -36,6 +36,7 @@
 import org.apache.hadoop.util.StringUtils;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
@@ -695,9 +696,9 @@
       }
 
       try {
-        output.collect(key, new ObjectWritable(datum));
+        output.collect(key, new NutchWritable(datum));
         if (content != null && storingContent)
-          output.collect(key, new ObjectWritable(content));
+          output.collect(key, new NutchWritable(content));
         if (parseResult != null) {
           for (Entry<Text, Parse> entry : parseResult) {
             Text url = entry.getKey();
@@ -731,7 +732,7 @@
                 LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
               }
             }
-            output.collect(url, new ObjectWritable(
+            output.collect(url, new NutchWritable(
                     new ParseImpl(new ParseText(parse.getText()), 
                                   parse.getData(), parse.isCanonical())));
           }
@@ -873,7 +874,7 @@
     job.setOutputPath(segment);
     job.setOutputFormat(FetcherOutputFormat.class);
     job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(ObjectWritable.class);
+    job.setOutputValueClass(NutchWritable.class);
 
     JobClient.runJob(job);
     if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
Wed Jun 27 00:05:52 2007
@@ -20,11 +20,11 @@
 import java.io.IOException;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.ObjectWritable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.Text;
@@ -81,7 +81,7 @@
         public void write(WritableComparable key, Writable value)
           throws IOException {
 
-          Writable w = (Writable)((ObjectWritable)value).get();
+          Writable w = ((NutchWritable)value).get();
           
           if (w instanceof CrawlDatum)
             fetchOut.append(key, w);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Jun 
27 00:05:52 2007
@@ -43,6 +43,7 @@
 import org.apache.nutch.crawl.CrawlDb;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.crawl.NutchWritable;
 
 import org.apache.lucene.index.*;
 import org.apache.lucene.document.*;
@@ -55,6 +56,32 @@
   public static final String DONE_NAME = "index.done";
 
   public static final Log LOG = LogFactory.getLog(Indexer.class);
+  
+  /** In-memory holder that carries a Lucene Document from Indexer.reduce to
+   * Indexer.OutputFormat as a Writable value.
+   * Note: despite its name, it cannot serialize or deserialize the document;
+   * write() and readFields() are no-ops, so only the original instance has it.
+   */
+  private static class LuceneDocumentWrapper implements Writable {
+    private Document doc; // assigned only by the constructor
+         
+    public LuceneDocumentWrapper(Document doc) {
+      this.doc = doc;
+    }
+    
+    public Document get() {
+      return doc;
+    }
+
+    public void readFields(DataInput in) throws IOException { 
+      // intentionally left blank: nothing is read and doc is left untouched
+    }
+               
+    public void write(DataOutput out) throws IOException {
+      // intentionally left blank: the document is never written to out
+    }
+         
+  }
 
   /** Unwrap Lucene Documents created by reduce and add them to an index. */
   public static class OutputFormat
@@ -87,7 +114,7 @@
 
           public void write(WritableComparable key, Writable value)
             throws IOException {                  // unwrap & index doc
-            Document doc = (Document)((ObjectWritable)value).get();
+            Document doc = ((LuceneDocumentWrapper) value).get();
             NutchAnalyzer analyzer = factory.get(doc.get("lang"));
             if (LOG.isInfoEnabled()) {
               LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" 
+
@@ -156,7 +183,7 @@
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
-      Object value = ((ObjectWritable)values.next()).get(); // unwrap
+      Writable value = ((NutchWritable)values.next()).get(); // unwrap
       if (value instanceof Inlinks) {
         inlinks = (Inlinks)value;
       } else if (value instanceof CrawlDatum) {
@@ -240,7 +267,7 @@
     doc.add(new Field("boost", Float.toString(boost),
             Field.Store.YES, Field.Index.NO));
 
-    output.collect(key, new ObjectWritable(doc));
+    output.collect(key, new LuceneDocumentWrapper(doc));
   }
 
   public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
@@ -274,7 +301,7 @@
     job.setOutputPath(indexDir);
     job.setOutputFormat(OutputFormat.class);
     job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(ObjectWritable.class);
+    job.setOutputValueClass(NutchWritable.class);
 
     JobClient.runJob(job);
     if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
@@ -309,7 +336,7 @@
 
   public void map(WritableComparable key, Writable value,
       OutputCollector output, Reporter reporter) throws IOException {
-    output.collect(key, new ObjectWritable(value));
+    output.collect(key, new NutchWritable(value));
   }
 
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Wed 
Jun 27 00:05:52 2007
@@ -22,17 +22,18 @@
 import java.io.IOException;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.NutchWritable;
 
 /**
- * This is a simple decorator that adds metadata to any Object-s that can be
- * serialized by <tt>ObjectWritable</tt>. This is useful when data needs to be
+ * This is a simple decorator that adds metadata to any Writable-s that can be
+ * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
  * temporarily enriched during processing, but this
  * temporary metadata doesn't need to be permanently stored after the job is 
done.
  * 
  * @author Andrzej Bialecki
  */
-public class MetaWrapper extends ObjectWritable {
+public class MetaWrapper extends NutchWritable {
   private Metadata metadata;
   
   public MetaWrapper() {
@@ -40,14 +41,14 @@
     metadata = new Metadata();
   }
   
-  public MetaWrapper(Object object, Configuration conf) {
-    super(object);
+  public MetaWrapper(Writable instance, Configuration conf) {
+    super(instance);
     metadata = new Metadata();
     setConf(conf);
   }
   
-  public MetaWrapper(Metadata metadata, Object object, Configuration conf) {
-    super(object);
+  public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) 
{
+    super(instance);
     if (metadata == null) metadata = new Metadata();
     this.metadata = metadata;
     setConf(conf);

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Wed 
Jun 27 00:05:52 2007
@@ -153,7 +153,7 @@
             
             MetaWrapper wrapper = (MetaWrapper) value;
             try {
-              wrapper.set(getValueClass().newInstance());
+              wrapper.set((Writable)getValueClass().newInstance());
             } catch (Exception e) {
               throw new IOException(e.toString());
             }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed 
Jun 27 00:05:52 2007
@@ -42,7 +42,6 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.ObjectWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.UTF8;
@@ -61,6 +60,7 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.util.Progressable;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
@@ -87,7 +87,7 @@
         newKey.set(key.toString());
         key = newKey;
       }
-      collector.collect(key, new ObjectWritable(value));
+      collector.collect(key, new NutchWritable(value));
     }
     
   }
@@ -104,8 +104,7 @@
       final PrintStream printStream = new 
PrintStream(fs.create(segmentDumpFile));
       return new RecordWriter() {
         public synchronized void write(WritableComparable key, Writable value) 
throws IOException {
-          ObjectWritable writable = (ObjectWritable) value;
-          printStream.println((String) writable.get());
+          printStream.println(value);
         }
 
         public synchronized void close(Reporter reporter) throws IOException {
@@ -170,7 +169,7 @@
     dump.append("\nRecno:: ").append(recNo++).append("\n");
     dump.append("URL:: " + key.toString() + "\n");
     while (values.hasNext()) {
-      Object value = ((ObjectWritable) values.next()).get(); // unwrap
+      Writable value = ((NutchWritable) values.next()).get(); // unwrap
       if (value instanceof CrawlDatum) {
         dump.append("\nCrawlDatum::\n").append(((CrawlDatum) 
value).toString());
       } else if (value instanceof Content) {
@@ -183,7 +182,7 @@
         LOG.warn("Unrecognized type: " + value.getClass());
       }
     }
-    output.collect(key, new ObjectWritable(dump.toString()));
+    output.collect(key, new Text(dump.toString()));
   }
 
   public void dump(Path segment, Path output) throws IOException {
@@ -212,7 +211,7 @@
     job.setOutputPath(tempDir);
     job.setOutputFormat(TextOutputFormat.class);
     job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(ObjectWritable.class);
+    job.setOutputValueClass(NutchWritable.class);
 
     JobClient.runJob(job);
 

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?view=auto&rev=551081
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 (added)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 Wed Jun 27 00:05:52 2007
@@ -0,0 +1,71 @@
+package org.apache.nutch.util;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.GenericWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A generic Writable wrapper that can inject Configuration to
+ * {@link Configurable}s.
+ */
+public abstract class GenericWritableConfigurable extends GenericWritable
+                                                  implements Configurable {
+
+  /** Configuration injected into wrapped Configurables; null until set. */
+  private Configuration conf;
+
+  /** Returns the configuration last passed to {@link #setConf}, or null. */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration that {@link #readFields} injects into wrapped
+   * {@link Configurable} instances.
+   *
+   * @param conf the configuration to store
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Reads a type byte, instantiates the class at that index of
+   * {@link #getTypes()}, injects the stored configuration if the new instance
+   * is {@link Configurable}, and then lets the instance read its own fields.
+   *
+   * @param in the input to deserialize from
+   * @throws IOException if the type byte is not a valid index into
+   *         {@link #getTypes()}, if the class cannot be instantiated, or if
+   *         the wrapped instance fails to read its fields
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    byte type = in.readByte();
+    Class<?>[] types = getTypes();
+    // A corrupt stream must surface as an IOException, not as an
+    // ArrayIndexOutOfBoundsException from the lookup below.
+    if (type < 0 || type >= types.length) {
+      throw new IOException("Unknown type id: " + type);
+    }
+    Class<?> clazz = types[type];
+    try {
+      set((Writable) clazz.newInstance());
+    } catch (Exception e) {
+      // initCause keeps the root failure attached to the IOException.
+      IOException ioe = new IOException("Cannot initialize the class: " + clazz);
+      ioe.initCause(e);
+      throw ioe;
+    }
+    Writable w = get();
+    if (w instanceof Configurable) {
+      ((Configurable) w).setConf(conf);
+    }
+    w.readFields(in);
+  }
+
+}


Reply via email to