Repository: mahout Updated Branches: refs/heads/master b044a0779 -> a8e09cd3a
MAHOUT-1578 Optimizations in matrix serialization (ssc) closes apache/mahout#16 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/a8e09cd3 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/a8e09cd3 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/a8e09cd3 Branch: refs/heads/master Commit: a8e09cd3aa1c9d6fdda2eaf84f86a06a33963658 Parents: b044a07 Author: ssc <[email protected]> Authored: Thu Jun 12 19:03:19 2014 +0200 Committer: ssc <[email protected]> Committed: Thu Jun 12 19:03:19 2014 +0200 ---------------------------------------------------------------------- CHANGELOG | 2 + .../org/apache/mahout/math/MatrixWritable.java | 30 +++++++------ .../org/apache/mahout/math/VectorWritable.java | 47 +++++++++++++++++--- 3 files changed, 59 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/a8e09cd3/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index 2f604e1..7111122 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 1.0 - unreleased + MAHOUT-1578: Optimizations in matrix serialization (ssc) + MAHOUT-1572: blockify() to detect (naively) the data sparsity in the loaded data (dlyubimov) MAHOUT-1571: Functional Views are not serialized as dense/sparse correctly (dlyubimov) http://git-wip-us.apache.org/repos/asf/mahout/blob/a8e09cd3/mrlegacy/src/main/java/org/apache/mahout/math/MatrixWritable.java ---------------------------------------------------------------------- diff --git a/mrlegacy/src/main/java/org/apache/mahout/math/MatrixWritable.java b/mrlegacy/src/main/java/org/apache/mahout/math/MatrixWritable.java index 7f0c1bc..c521f3e 100644 --- a/mrlegacy/src/main/java/org/apache/mahout/math/MatrixWritable.java +++ b/mrlegacy/src/main/java/org/apache/mahout/math/MatrixWritable.java @@ -113,27 +113,28 @@ public class MatrixWritable implements Writable { int rows = in.readInt(); int columns = in.readInt(); + byte vectorFlags = in.readByte(); + Matrix matrix; + if (dense) { matrix = new DenseMatrix(rows, columns); - } else { - if (isSparseRowMatrix) { - matrix = new SparseRowMatrix(rows, columns, sequential); - } else { - matrix = new SparseMatrix(rows, columns); + for (int row = 0; row < rows; row++) { + matrix.assignRow(row, VectorWritable.readVector(in, vectorFlags, columns)); } - } - - if (dense || isSparseRowMatrix) { + } else if (isSparseRowMatrix) { + Vector[] rowVectors = new Vector[rows]; for (int row = 0; row < rows; row++) { - matrix.assignRow(row, VectorWritable.readVector(in)); + rowVectors[row] = VectorWritable.readVector(in, vectorFlags, columns); } + matrix = new SparseRowMatrix(rows, columns, rowVectors, true, !sequential); } else { + matrix = new SparseMatrix(rows, columns); int numNonZeroRows = in.readInt(); int rowsRead = 0; while (rowsRead++ < numNonZeroRows) { int rowIndex = in.readInt(); - matrix.assignRow(rowIndex, VectorWritable.readVector(in)); + matrix.assignRow(rowIndex, VectorWritable.readVector(in, vectorFlags, columns)); } } @@ -172,13 +173,16 @@ public class MatrixWritable implements Writable { } out.writeInt(flags); - out.writeInt(matrix.rowSize()); out.writeInt(matrix.columnSize()); + // We only use vectors of the same type, so we write out the type information only once! + byte vectorFlags = VectorWritable.flags(matrix.viewRow(0), false); + out.writeByte(vectorFlags); + if (isDense || isSparseRowMatrix) { for (int i = 0; i < matrix.rowSize(); i++) { - VectorWritable.writeVector(out, matrix.viewRow(i), false); + VectorWritable.writeVectorContents(out, matrix.viewRow(i), vectorFlags); } } else { IntArrayList rowIndices = ((SparseMatrix) matrix).nonZeroRowIndices(); @@ -187,7 +191,7 @@ public class MatrixWritable implements Writable { for (int i = 0; i < numNonZeroRows; i++) { int rowIndex = rowIndices.getQuick(i); out.writeInt(rowIndex); - VectorWritable.writeVector(out, matrix.viewRow(rowIndex), false); + VectorWritable.writeVectorContents(out, matrix.viewRow(rowIndex), vectorFlags); } } http://git-wip-us.apache.org/repos/asf/mahout/blob/a8e09cd3/mrlegacy/src/main/java/org/apache/mahout/math/VectorWritable.java ---------------------------------------------------------------------- diff --git a/mrlegacy/src/main/java/org/apache/mahout/math/VectorWritable.java b/mrlegacy/src/main/java/org/apache/mahout/math/VectorWritable.java index d1ffc73..c06cff6 100644 --- a/mrlegacy/src/main/java/org/apache/mahout/math/VectorWritable.java +++ b/mrlegacy/src/main/java/org/apache/mahout/math/VectorWritable.java @@ -85,13 +85,18 @@ public final class VectorWritable extends Configured implements Writable { @Override public void readFields(DataInput in) throws IOException { int flags = in.readByte(); + int size = Varint.readUnsignedVarInt(in); + readFields(in, (byte) flags, size); + } + + private void readFields(DataInput in, byte flags, int size) throws IOException { + Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2)); boolean dense = (flags & FLAG_DENSE) != 0; boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; boolean named = (flags & FLAG_NAMED) != 0; boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0; - int size = Varint.readUnsignedVarInt(in); Vector v; if (dense) { double[] values = new double[size]; @@ -133,17 +138,39 @@ public final class VectorWritable extends Configured implements Writable { writeVector(out, vector, false); } - public static void writeVector(DataOutput out, Vector vector, boolean laxPrecision) throws IOException { + public static byte flags(Vector vector, boolean laxPrecision) { boolean dense = vector.isDense(); boolean sequential = vector.isSequentialAccess(); boolean named = vector instanceof NamedVector; - out.writeByte((dense ? FLAG_DENSE : 0) - | (sequential ? FLAG_SEQUENTIAL : 0) - | (named ? FLAG_NAMED : 0) - | (laxPrecision ? FLAG_LAX_PRECISION : 0)); + return (byte) ((dense ? FLAG_DENSE : 0) + | (sequential ? FLAG_SEQUENTIAL : 0) + | (named ? FLAG_NAMED : 0) + | (laxPrecision ? FLAG_LAX_PRECISION : 0)); + } + + /** Write out type information and size of the vector */ + public static void writeVectorFlagsAndSize(DataOutput out, byte flags, int size) throws IOException { + out.writeByte(flags); + Varint.writeUnsignedVarInt(size, out); + } + + public static void writeVector(DataOutput out, Vector vector, boolean laxPrecision) throws IOException { + + byte flags = flags(vector, laxPrecision); + + writeVectorFlagsAndSize(out, flags, vector.size()); + writeVectorContents(out, vector, flags); + } + + /** Write out contents of the vector */ + public static void writeVectorContents(DataOutput out, Vector vector, byte flags) throws IOException { + + boolean dense = (flags & FLAG_DENSE) != 0; + boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; + boolean named = (flags & FLAG_NAMED) != 0; + boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0; - Varint.writeUnsignedVarInt(vector.size(), out); if (dense) { for (Vector.Element element : vector.all()) { if (laxPrecision) { @@ -200,6 +227,12 @@ public final class VectorWritable extends Configured implements Writable { return v.get(); } + public static Vector readVector(DataInput in, byte vectorFlags, int size) throws IOException { + VectorWritable v = new VectorWritable(); + v.readFields(in, vectorFlags, size); + return v.get(); + } + public static VectorWritable merge(Iterator<VectorWritable> vectors) { return new VectorWritable(mergeToVector(vectors)); }
