Author: cutting Date: Mon Aug 15 11:10:23 2005 New Revision: 232841 URL: http://svn.apache.org/viewcvs?rev=232841&view=rev Log: Lazily decompress content.
Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java?rev=232841&view=auto ============================================================================== --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Mon Aug 15 11:10:23 2005 @@ -0,0 +1,81 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.io; + +import java.io.IOException; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.DataInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; + +/** A base-class for Writables which store themselves compressed and lazily + * inflate on field access. This is useful for large objects whose fields are + * not be altered during a map or reduce operation: leaving the field data + * compressed makes copying the instance from one file to another much + * faster. 
*/ +public abstract class CompressedWritable implements Writable { + // if non-null, the compressed field data of this instance. + private byte[] compressed; + + public CompressedWritable() {} + + public final void readFields(DataInput in) throws IOException { + compressed = new byte[in.readInt()]; + in.readFully(compressed, 0, compressed.length); + } + + /** Must be called by all methods which access fields to ensure that the data + * has been uncompressed. */ + protected void ensureInflated() { + if (compressed != null) { + try { + ByteArrayInputStream deflated = new ByteArrayInputStream(compressed); + DataInput inflater = + new DataInputStream(new InflaterInputStream(deflated)); + readFieldsCompressed(inflater); + compressed = null; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + /** Subclasses implement this instead of {@link #readFields(DataInput)}. */ + protected abstract void readFieldsCompressed(DataInput in) + throws IOException; + + public final void write(DataOutput out) throws IOException { + if (compressed == null) { + ByteArrayOutputStream deflated = new ByteArrayOutputStream(); + DataOutputStream deflater = + new DataOutputStream(new DeflaterOutputStream(deflated)); + writeCompressed(deflater); + deflater.close(); + compressed = deflated.toByteArray(); + } + out.writeInt(compressed.length); + out.write(compressed); + } + + /** Subclasses implement this instead of {@link #write(DataOutput)}. 
*/ + protected abstract void writeCompressed(DataOutput out) throws IOException; + +} Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=232841&r1=232840&r2=232841&view=diff ============================================================================== --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Mon Aug 15 11:10:23 2005 @@ -23,12 +23,13 @@ import org.apache.nutch.fs.*; import org.apache.nutch.util.*; -public final class Content extends VersionedWritable { +public final class Content extends CompressedWritable { public static final String DIR_NAME = "content"; private final static byte VERSION = 1; + private byte version; private String url; private String base; private byte[] content; @@ -53,15 +54,16 @@ this.metadata = metadata; } - public byte getVersion() { return VERSION; } - - public final void readFields(DataInput in) throws IOException { - super.readFields(in); // check version + protected final void readFieldsCompressed(DataInput in) throws IOException { + version = in.readByte(); + if (version > VERSION) + throw new VersionMismatchException(VERSION, version); url = UTF8.readString(in); // read url base = UTF8.readString(in); // read base - content = WritableUtils.readCompressedByteArray(in); + content = new byte[in.readInt()]; // read content + in.readFully(content); contentType = UTF8.readString(in); // read contentType @@ -72,13 +74,14 @@ } } - public final void write(DataOutput out) throws IOException { - super.write(out); // write version + protected final void writeCompressed(DataOutput out) throws IOException { + out.writeByte(version); UTF8.writeString(out, url); // write url UTF8.writeString(out, base); // write base - WritableUtils.writeCompressedByteArray(out, content); 
// write content + out.writeInt(content.length); // write content + out.write(content); UTF8.writeString(out, contentType); // write contentType @@ -102,32 +105,55 @@ // /** The url fetched. */ - public String getUrl() { return url; } + public String getUrl() { + ensureInflated(); + return url; + } /** The base url for relative links contained in the content. * Maybe be different from url if the request redirected. */ - public String getBaseUrl() { return base; } + public String getBaseUrl() { + ensureInflated(); + return base; + } /** The binary content retrieved. */ - public byte[] getContent() { return content; } - public void setContent(byte[] content) { this.content = content; } + public byte[] getContent() { + ensureInflated(); + return content; + } + public void setContent(byte[] content) { + ensureInflated(); + this.content = content; + } /** The media type of the retrieved content. * @see http://www.iana.org/assignments/media-types/ */ - public String getContentType() { return contentType; } + public String getContentType() { + ensureInflated(); + return contentType; + } public void setContentType(String contentType) { + ensureInflated(); this.contentType = contentType; } /** Other protocol-specific data. */ - public Properties getMetadata() { return metadata; } + public Properties getMetadata() { + ensureInflated(); + return metadata; + } /** Return the value of a metadata property. */ - public String get(String name) { return getMetadata().getProperty(name); } + public String get(String name) { + ensureInflated(); + return getMetadata().getProperty(name); + } public boolean equals(Object o) { + ensureInflated(); if (!(o instanceof Content)){ return false; } @@ -141,6 +167,7 @@ } public String toString() { + ensureInflated(); StringBuffer buffer = new StringBuffer(); buffer.append("url: " + url + "\n" );