Author: dogacan Date: Wed Jun 27 00:05:52 2007 New Revision: 551081 URL: http://svn.apache.org/viewvc?view=rev&rev=551081 Log: NUTCH-434 - Replace usage of ObjectWritable with something based on GenericWritable.
Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Jun 27 00:05:52 2007 @@ -67,6 +67,9 @@ 21. NUTCH-497 - Extreme Nested Tags causes StackOverflowException in DomContentUtils...Spider Trap. (kubes) +22. NUTCH-434 - Replace usage of ObjectWritable with something based on + GenericWritable. (dogacan) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?view=auto&rev=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Wed Jun 27 00:05:52 2007 @@ -0,0 +1,49 @@ +package org.apache.nutch.crawl; + +import org.apache.hadoop.io.Writable; +import org.apache.nutch.util.GenericWritableConfigurable; + +public class NutchWritable extends GenericWritableConfigurable { + + private static Class<? extends Writable>[] CLASSES = null; + + static { + CLASSES = (Class<? extends Writable>[]) new Class[] { + org.apache.hadoop.io.NullWritable.class, + org.apache.hadoop.io.LongWritable.class, + org.apache.hadoop.io.BytesWritable.class, + org.apache.hadoop.io.FloatWritable.class, + org.apache.hadoop.io.IntWritable.class, + org.apache.hadoop.io.Text.class, + org.apache.hadoop.io.MD5Hash.class, + org.apache.nutch.crawl.CrawlDatum.class, + org.apache.nutch.crawl.Inlink.class, + org.apache.nutch.crawl.Inlinks.class, + org.apache.nutch.crawl.MapWritable.class, + org.apache.nutch.fetcher.FetcherOutput.class, + org.apache.nutch.metadata.Metadata.class, + org.apache.nutch.parse.Outlink.class, + org.apache.nutch.parse.ParseText.class, + org.apache.nutch.parse.ParseData.class, + org.apache.nutch.parse.ParseImpl.class, + org.apache.nutch.parse.ParseStatus.class, + org.apache.nutch.protocol.Content.class, + org.apache.nutch.protocol.ProtocolStatus.class, + org.apache.nutch.searcher.Hit.class, + org.apache.nutch.searcher.HitDetails.class, + org.apache.nutch.searcher.Hits.class + }; + } + + public NutchWritable() { } + + public NutchWritable(Writable instance) { + set(instance); + } + + @Override + protected Class<? 
extends Writable>[] getTypes() { + return CLASSES; + } + +} Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jun 27 00:05:52 2007 @@ -32,6 +32,7 @@ import org.apache.hadoop.util.ToolBase; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; @@ -321,9 +322,9 @@ } try { - output.collect(key, new ObjectWritable(datum)); + output.collect(key, new NutchWritable(datum)); if (content != null && storingContent) - output.collect(key, new ObjectWritable(content)); + output.collect(key, new NutchWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); @@ -357,7 +358,7 @@ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } - output.collect(url, new ObjectWritable( + output.collect(url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical()))); } @@ -493,7 +494,7 @@ job.setOutputPath(segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ObjectWritable.class); + job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=551081&r1=551080&r2=551081 
============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Wed Jun 27 00:05:52 2007 @@ -36,6 +36,7 @@ import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; @@ -695,9 +696,9 @@ } try { - output.collect(key, new ObjectWritable(datum)); + output.collect(key, new NutchWritable(datum)); if (content != null && storingContent) - output.collect(key, new ObjectWritable(content)); + output.collect(key, new NutchWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); @@ -731,7 +732,7 @@ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } - output.collect(url, new ObjectWritable( + output.collect(url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical()))); } @@ -873,7 +874,7 @@ job.setOutputPath(segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ObjectWritable.class); + job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Wed Jun 27 00:05:52 2007 @@ 
-20,11 +20,11 @@ import java.io.IOException; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; -import org.apache.hadoop.io.ObjectWritable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.Text; @@ -81,7 +81,7 @@ public void write(WritableComparable key, Writable value) throws IOException { - Writable w = (Writable)((ObjectWritable)value).get(); + Writable w = ((NutchWritable)value).get(); if (w instanceof CrawlDatum) fetchOut.append(key, w); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Jun 27 00:05:52 2007 @@ -43,6 +43,7 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; +import org.apache.nutch.crawl.NutchWritable; import org.apache.lucene.index.*; import org.apache.lucene.document.*; @@ -55,6 +56,32 @@ public static final String DONE_NAME = "index.done"; public static final Log LOG = LogFactory.getLog(Indexer.class); + + /** A utility class used to pass a lucene document from Indexer.reduce + * to Indexer.OutputFormat. + * Note: Despite its name, it can't properly wrap a lucene document - it + * doesn't know how to serialize/deserialize a lucene document. 
+ */ + private static class LuceneDocumentWrapper implements Writable { + private Document doc; + + public LuceneDocumentWrapper(Document doc) { + this.doc = doc; + } + + public Document get() { + return doc; + } + + public void readFields(DataInput in) throws IOException { + // intentionally left blank + } + + public void write(DataOutput out) throws IOException { + // intentionally left blank + } + + } /** Unwrap Lucene Documents created by reduce and add them to an index. */ public static class OutputFormat @@ -87,7 +114,7 @@ public void write(WritableComparable key, Writable value) throws IOException { // unwrap & index doc - Document doc = (Document)((ObjectWritable)value).get(); + Document doc = ((LuceneDocumentWrapper) value).get(); NutchAnalyzer analyzer = factory.get(doc.get("lang")); if (LOG.isInfoEnabled()) { LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" + @@ -156,7 +183,7 @@ ParseData parseData = null; ParseText parseText = null; while (values.hasNext()) { - Object value = ((ObjectWritable)values.next()).get(); // unwrap + Writable value = ((NutchWritable)values.next()).get(); // unwrap if (value instanceof Inlinks) { inlinks = (Inlinks)value; } else if (value instanceof CrawlDatum) { @@ -240,7 +267,7 @@ doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO)); - output.collect(key, new ObjectWritable(doc)); + output.collect(key, new LuceneDocumentWrapper(doc)); } public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) @@ -274,7 +301,7 @@ job.setOutputPath(indexDir); job.setOutputFormat(OutputFormat.class); job.setOutputKeyClass(Text.class); - job.setOutputValueClass(ObjectWritable.class); + job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); } @@ -309,7 +336,7 @@ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { - output.collect(key, new 
ObjectWritable(value)); + output.collect(key, new NutchWritable(value)); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Wed Jun 27 00:05:52 2007 @@ -22,17 +22,18 @@ import java.io.IOException; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.ObjectWritable; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.crawl.NutchWritable; /** - * This is a simple decorator that adds metadata to any Object-s that can be - * serialized by <tt>ObjectWritable</tt>. This is useful when data needs to be + * This is a simple decorator that adds metadata to any Writable-s that can be + * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be * temporarily enriched during processing, but this * temporary metadata doesn't need to be permanently stored after the job is done. 
* * @author Andrzej Bialecki */ -public class MetaWrapper extends ObjectWritable { +public class MetaWrapper extends NutchWritable { private Metadata metadata; public MetaWrapper() { @@ -40,14 +41,14 @@ metadata = new Metadata(); } - public MetaWrapper(Object object, Configuration conf) { - super(object); + public MetaWrapper(Writable instance, Configuration conf) { + super(instance); metadata = new Metadata(); setConf(conf); } - public MetaWrapper(Metadata metadata, Object object, Configuration conf) { - super(object); + public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) { + super(instance); if (metadata == null) metadata = new Metadata(); this.metadata = metadata; setConf(conf); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Wed Jun 27 00:05:52 2007 @@ -153,7 +153,7 @@ MetaWrapper wrapper = (MetaWrapper) value; try { - wrapper.set(getValueClass().newInstance()); + wrapper.set((Writable)getValueClass().newInstance()); } catch (Exception e) { throw new IOException(e.toString()); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diff&rev=551081&r1=551080&r2=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jun 27 00:05:52 2007 @@ -42,7 +42,6 @@ import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.MapFile; -import org.apache.hadoop.io.ObjectWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.UTF8; @@ -61,6 +60,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.util.Progressable; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; @@ -87,7 +87,7 @@ newKey.set(key.toString()); key = newKey; } - collector.collect(key, new ObjectWritable(value)); + collector.collect(key, new NutchWritable(value)); } } @@ -104,8 +104,7 @@ final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile)); return new RecordWriter() { public synchronized void write(WritableComparable key, Writable value) throws IOException { - ObjectWritable writable = (ObjectWritable) value; - printStream.println((String) writable.get()); + printStream.println(value); } public synchronized void close(Reporter reporter) throws IOException { @@ -170,7 +169,7 @@ dump.append("\nRecno:: ").append(recNo++).append("\n"); dump.append("URL:: " + key.toString() + "\n"); while (values.hasNext()) { - Object value = ((ObjectWritable) values.next()).get(); // unwrap + Writable value = ((NutchWritable) values.next()).get(); // unwrap if (value instanceof CrawlDatum) { dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString()); } else if (value instanceof Content) { @@ -183,7 +182,7 @@ LOG.warn("Unrecognized type: " + value.getClass()); } } - output.collect(key, new ObjectWritable(dump.toString())); + output.collect(key, new Text(dump.toString())); } public void dump(Path segment, Path output) throws IOException { @@ -212,7 +211,7 @@ job.setOutputPath(tempDir); job.setOutputFormat(TextOutputFormat.class); job.setOutputKeyClass(Text.class); - 
job.setOutputValueClass(ObjectWritable.class); + job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?view=auto&rev=551081 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Wed Jun 27 00:05:52 2007 @@ -0,0 +1,41 @@ +package org.apache.nutch.util; + +import java.io.DataInput; +import java.io.IOException; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.GenericWritable; +import org.apache.hadoop.io.Writable; + +/** A generic Writable wrapper that can inject Configuration to {@link Configurable}s */ +public abstract class GenericWritableConfigurable extends GenericWritable + implements Configurable { + + private Configuration conf; + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public void readFields(DataInput in) throws IOException { + byte type = in.readByte(); + Class clazz = getTypes()[type]; + try { + set((Writable) clazz.newInstance()); + } catch (Exception e) { + e.printStackTrace(); + throw new IOException("Cannot initialize the class: " + clazz); + } + Writable w = get(); + if (w instanceof Configurable) + ((Configurable)w).setConf(conf); + w.readFields(in); + } + +} ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. 
http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs