Author: srowen
Date: Tue May 11 19:22:23 2010
New Revision: 943236
URL: http://svn.apache.org/viewvc?rev=943236&view=rev
Log:
MAHOUT-391
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java?rev=943236&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
(added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/Varint.java
Tue May 11 19:22:23 2010
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * <p>Encodes signed and unsigned values using a common variable-length
+ * scheme, found for example in
+ * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+ * Google's Protocol Buffers</a>. It uses fewer bytes to encode smaller values,
+ * but will use slightly more bytes to encode large values.</p>
+ *
+ * <p>Signed values are further encoded using so-called zig-zag encoding
+ * in order to make them "compatible" with variable-length encoding.</p>
+ */
+public final class Varint {
+
+ public static final long MIN_SIGNED_VAR_LONG = -(1L << 62);
+ public static final long MAX_SIGNED_VAR_LONG = (1L << 62) - 1;
+ public static final int MIN_SIGNED_VAR_INT = -(1 << 30);
+ public static final int MAX_SIGNED_VAR_INT = (1 << 30) - 1;
+
+ private Varint() {
+ }
+
+ /**
+ * Encodes a value using the variable-length encoding from
+ * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+ * Google Protocol Buffers</a>. It uses zig-zag encoding to efficiently
+ * encode signed values. If values are known to be nonnegative,
+ * {...@link #writeUnsignedVarLong(long, DataOutput)} should be used.
+ *
+ * @param value value to encode
+ * @param out to write bytes to
+ * @throws IOException if {...@link DataOutput} throws {...@link IOException}
+ * @throws IllegalArgumentException if value is not between {...@link
#MIN_SIGNED_VAR_LONG}
+ * and {...@link #MAX_SIGNED_VAR_LONG} inclusive
+ */
+ public static void writeSignedVarLong(long value, DataOutput out) throws
IOException {
+ if (value < MIN_SIGNED_VAR_LONG || value > MAX_SIGNED_VAR_LONG) {
+ throw new IllegalArgumentException("Can't encode value as signed: " +
value);
+ }
+ // Great trick from
http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
+ writeUnsignedVarLong((value << 1) ^ (value >> 63), out);
+ }
+
+ /**
+ * Encodes a value using the variable-length encoding from
+ * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
+ * Google Protocol Buffers</a>. Zig-zag is not used, so input must not be
negative.
+ * If values can be negative, use {...@link #writeSignedVarLong(long,
DataOutput)}
+ * instead.
+ *
+ * @param value value to encode
+ * @param out to write bytes to
+ * @throws IOException if {...@link DataOutput} throws {...@link IOException}
+ * @throws IllegalArgumentException if value is negative
+ */
+ public static void writeUnsignedVarLong(long value, DataOutput out) throws
IOException {
+ if (value < 0L) {
+ throw new IllegalArgumentException("Can't encode negative value: " +
value);
+ }
+ while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
+ out.writeByte(((int) value & 0x7F) | 0x80);
+ value >>>= 7;
+ }
+ out.writeByte((int) value & 0x7F);
+ }
+
+ /**
+ * See {...@link #writeSignedVarLong(long, DataOutput)}
+ */
+ public static void writeSignedVarInt(int value, DataOutput out) throws
IOException {
+ if (value < MIN_SIGNED_VAR_INT || value > MAX_SIGNED_VAR_INT) {
+ throw new IllegalArgumentException("Can't encode value as signed: " +
value);
+ }
+ // Great trick from
http://code.google.com/apis/protocolbuffers/docs/encoding.html#types
+ writeUnsignedVarInt((value << 1) ^ (value >> 31), out);
+ }
+
+ /**
+ * See {...@link #writeUnsignedVarLong(long, DataOutput)}
+ */
+ public static void writeUnsignedVarInt(int value, DataOutput out) throws
IOException {
+ if (value < 0) {
+ throw new IllegalArgumentException("Can't encode negative value: " +
value);
+ }
+ while ((value & 0xFFFFFF80) != 0L) {
+ out.writeByte((value & 0x7F) | 0x80);
+ value >>>= 7;
+ }
+ out.writeByte(value & 0x7F);
+ }
+
+ /**
+ * See {...@link #writeSignedVarLong(long, DataOutput)}.
+ *
+ * @param in to read bytes from
+ * @return decode value
+ * @throws IOException if {...@link DataInput} throws {...@link IOException}
+ * @throws IllegalArgumentException if variable-length value does not
terminate
+ * after 8 bytes have been read
+ */
+ public static long readSignedVarLong(DataInput in) throws IOException {
+ long raw = readUnsignedVarLong(in);
+ // This undoes the trick in writeSignedVarLong()
+ return (((raw << 63) >> 63) ^ raw) >> 1;
+ }
+
+ /**
+ * See {...@link #writeUnsignedVarLong(long, DataOutput)}.
+ *
+ * @param in to read bytes from
+ * @return decode value
+ * @throws IOException if {...@link DataInput} throws {...@link IOException}
+ * @throws IllegalArgumentException if variable-length value does not
terminate
+ * after 8 bytes have been read
+ */
+ public static long readUnsignedVarLong(DataInput in) throws IOException {
+ long value = 0L;
+ int i = 0;
+ long b;
+ while (((b = in.readByte()) & 0x80L) != 0) {
+ value |= (b & 0x7F) << i;
+ i += 7;
+ if (i > 56) {
+ throw new IllegalArgumentException("Variable length quantity is too
long");
+ }
+ }
+ return value | (b << i);
+ }
+
+ /**
+ * See {...@link #readSignedVarLong(DataInput)}
+ */
+ public static int readSignedVarInt(DataInput in) throws IOException {
+ int raw = readUnsignedVarInt(in);
+ // This undoes the trick in writeSignedVarInt()
+ return (((raw << 31) >> 31) ^ raw) >> 1;
+ }
+
+ /**
+ * See {...@link #readUnsignedVarLong(DataInput)}
+ */
+ public static int readUnsignedVarInt(DataInput in) throws IOException {
+ int value = 0;
+ int i = 0;
+ int b;
+ while (((b = in.readByte()) & 0x80) != 0) {
+ value |= (b & 0x7F) << i;
+ i += 7;
+ if (i > 28) {
+ throw new IllegalArgumentException("Variable length quantity is too
long");
+ }
+ }
+ return value | (b << i);
+ }
+
+}
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java?rev=943236&r1=943235&r2=943236&view=diff
==============================================================================
---
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java
(original)
+++
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/VectorWritable.java
Tue May 11 19:22:23 2010
@@ -81,8 +81,8 @@ public class VectorWritable extends Conf
(named ? FLAG_NAMED : 0) |
(writesLaxPrecision ? FLAG_LAX_PRECISION : 0));
+ Varint.writeUnsignedVarInt(vector.size(), out);
if (dense) {
- out.writeInt(vector.size());
for (Vector.Element element : vector) {
if (writesLaxPrecision) {
out.writeFloat((float) element.get());
@@ -91,16 +91,31 @@ public class VectorWritable extends Conf
}
}
} else {
- out.writeInt(vector.size());
- out.writeInt(vector.getNumNondefaultElements());
+ Varint.writeUnsignedVarInt(vector.getNumNondefaultElements(), out);
Iterator<Vector.Element> iter = vector.iterateNonZero();
- while (iter.hasNext()) {
- Vector.Element element = iter.next();
- out.writeInt(element.index());
- if (writesLaxPrecision) {
- out.writeFloat((float) element.get());
- } else {
- out.writeDouble(element.get());
+ if (sequential) {
+ int lastIndex = 0;
+ while (iter.hasNext()) {
+ Vector.Element element = iter.next();
+ int thisIndex = element.index();
+ // Delta-code indices:
+ Varint.writeUnsignedVarInt(thisIndex - lastIndex, out);
+ lastIndex = thisIndex;
+ if (writesLaxPrecision) {
+ out.writeFloat((float) element.get());
+ } else {
+ out.writeDouble(element.get());
+ }
+ }
+ } else {
+ while (iter.hasNext()) {
+ Vector.Element element = iter.next();
+ Varint.writeUnsignedVarInt(element.index(), out);
+ if (writesLaxPrecision) {
+ out.writeFloat((float) element.get());
+ } else {
+ out.writeDouble(element.get());
+ }
}
}
}
@@ -120,26 +135,34 @@ public class VectorWritable extends Conf
boolean named = (flags & FLAG_NAMED) != 0;
boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;
+ int size = Varint.readUnsignedVarInt(in);
Vector v;
if (dense) {
- int size = in.readInt();
double[] values = new double[size];
for (int i = 0; i < size; i++) {
values[i] = laxPrecision ? in.readFloat() : in.readDouble();
}
v = new DenseVector(values);
} else {
- int size = in.readInt();
- int numNonDefaultElements = in.readInt();
+ int numNonDefaultElements = Varint.readUnsignedVarInt(in);
+ v = sequential ?
+ new SequentialAccessSparseVector(size, numNonDefaultElements) :
+ new RandomAccessSparseVector(size, numNonDefaultElements);
if (sequential) {
- v = new SequentialAccessSparseVector(size, numNonDefaultElements);
+ int lastIndex = 0;
+ for (int i = 0; i < numNonDefaultElements; i++) {
+ int delta = Varint.readUnsignedVarInt(in);
+ int index = lastIndex + delta;
+ lastIndex = index;
+ double value = laxPrecision ? in.readFloat() : in.readDouble();
+ v.setQuick(index, value);
+ }
} else {
- v = new RandomAccessSparseVector(size, numNonDefaultElements);
- }
- for (int i = 0; i < numNonDefaultElements; i++) {
- int index = in.readInt();
- double value = laxPrecision ? in.readFloat() : in.readDouble();
- v.setQuick(index, value);
+ for (int i = 0; i < numNonDefaultElements; i++) {
+ int index = Varint.readUnsignedVarInt(in);
+ double value = laxPrecision ? in.readFloat() : in.readDouble();
+ v.setQuick(index, value);
+ }
}
}
if (named) {
Added:
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java?rev=943236&view=auto
==============================================================================
---
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
(added)
+++
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/VarintTest.java
Tue May 11 19:22:23 2010
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+/**
+ * Tests {...@link Varint}.
+ */
+public final class VarintTest extends MahoutTestCase {
+
+ public void testUnsignedLong() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ Varint.writeUnsignedVarLong(0L, out);
+ for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
+ Varint.writeUnsignedVarLong(i-1, out);
+ Varint.writeUnsignedVarLong(i, out);
+ }
+ Varint.writeUnsignedVarLong(Long.MAX_VALUE, out);
+
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ assertEquals(0L, Varint.readUnsignedVarLong(in));
+ for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) {
+ assertEquals(i-1, Varint.readUnsignedVarLong(in));
+ assertEquals(i, Varint.readUnsignedVarLong(in));
+ }
+ assertEquals(Long.MAX_VALUE, Varint.readUnsignedVarLong(in));
+ }
+
+ public void testSignedPositiveLong() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ Varint.writeSignedVarLong(0L, out);
+ for (long i = 1L; i <= (1L << 61); i <<= 1) {
+ Varint.writeSignedVarLong(i-1, out);
+ Varint.writeSignedVarLong(i, out);
+ }
+ Varint.writeSignedVarLong((1L << 62) - 1, out);
+
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ assertEquals(0L, Varint.readSignedVarLong(in));
+ for (long i = 1L; i <= (1L << 61); i <<= 1) {
+ assertEquals(i-1, Varint.readSignedVarLong(in));
+ assertEquals(i, Varint.readSignedVarLong(in));
+ }
+ assertEquals((1L << 62) - 1, Varint.readSignedVarLong(in));
+ }
+
+ public void testSignedNegativeLong() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ for (long i = -1L; i >= -(1L << 62); i <<= 1) {
+ Varint.writeSignedVarLong(i, out);
+ Varint.writeSignedVarLong(i+1, out);
+ }
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ for (long i = -1L; i >= -(1L << 62); i <<= 1) {
+ assertEquals(i, Varint.readSignedVarLong(in));
+ assertEquals(i+1, Varint.readSignedVarLong(in));
+ }
+ }
+
+ public void testUnsignedInt() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ Varint.writeUnsignedVarInt(0, out);
+ for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
+ Varint.writeUnsignedVarLong(i-1, out);
+ Varint.writeUnsignedVarLong(i, out);
+ }
+ Varint.writeUnsignedVarLong(Integer.MAX_VALUE, out);
+
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ assertEquals(0, Varint.readUnsignedVarInt(in));
+ for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) {
+ assertEquals(i-1, Varint.readUnsignedVarInt(in));
+ assertEquals(i, Varint.readUnsignedVarInt(in));
+ }
+ assertEquals(Integer.MAX_VALUE, Varint.readUnsignedVarInt(in));
+ }
+
+ public void testSignedPositiveInt() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ Varint.writeSignedVarInt(0, out);
+ for (int i = 1; i <= (1 << 29); i <<= 1) {
+ Varint.writeSignedVarLong(i-1, out);
+ Varint.writeSignedVarLong(i, out);
+ }
+ Varint.writeSignedVarInt((1 << 30) - 1, out);
+
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ assertEquals(0, Varint.readSignedVarInt(in));
+ for (int i = 1; i <= (1 << 29); i <<= 1) {
+ assertEquals(i-1, Varint.readSignedVarInt(in));
+ assertEquals(i, Varint.readSignedVarInt(in));
+ }
+ assertEquals((1 << 30) - 1, Varint.readSignedVarInt(in));
+ }
+
+ public void testSignedNegativeInt() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ for (int i = -1; i >= -(1 << 30); i <<= 1) {
+ Varint.writeSignedVarInt(i, out);
+ Varint.writeSignedVarInt(i+1, out);
+ }
+ DataInput in = new DataInputStream(new
ByteArrayInputStream(baos.toByteArray()));
+ for (int i = -1; i >= -(1 << 30); i <<= 1) {
+ assertEquals(i, Varint.readSignedVarInt(in));
+ assertEquals(i+1, Varint.readSignedVarInt(in));
+ }
+ }
+
+ public void testUnsignedSize() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ int expectedSize = 0;
+ for (int exponent = 0; exponent <= 62; exponent++) {
+ Varint.writeUnsignedVarLong(1L << exponent, out);
+ expectedSize += 1 + exponent / 7;
+ assertEquals(expectedSize, baos.size());
+ }
+ }
+
+ public void testSignedSize() throws IOException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput out = new DataOutputStream(baos);
+ int expectedSize = 0;
+ for (int exponent = 0; exponent <= 61; exponent++) {
+ Varint.writeSignedVarLong(1L << exponent, out);
+ expectedSize += 1 + ((exponent + 1) / 7);
+ assertEquals(expectedSize, baos.size());
+ }
+ for (int exponent = 0; exponent <= 61; exponent++) {
+ Varint.writeSignedVarLong(-(1L << exponent)-1, out);
+ expectedSize += 1 + ((exponent + 1) / 7);
+ assertEquals(expectedSize, baos.size());
+ }
+ }
+
+ public void testExceptions() throws IOException {
+ try {
+ Varint.writeUnsignedVarLong(-1L, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ try {
+ Varint.writeSignedVarLong(Varint.MIN_SIGNED_VAR_LONG - 1, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ try {
+ Varint.writeSignedVarLong(Varint.MAX_SIGNED_VAR_LONG + 1, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ try {
+ Varint.writeUnsignedVarInt(-1, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ try {
+ Varint.writeSignedVarInt(Varint.MIN_SIGNED_VAR_INT - 1, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ try {
+ Varint.writeSignedVarInt(Varint.MAX_SIGNED_VAR_INT + 1, null);
+ fail();
+ } catch (IllegalArgumentException iae) {
+ // good
+ }
+ }
+
+}