http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java ---------------------------------------------------------------------- diff --git a/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java b/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java deleted file mode 100644 index b8fc461..0000000 --- a/hdfs/src/main/java/org/apache/mahout/math/MatrixWritable.java +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.math; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.list.IntArrayList; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -public class MatrixWritable implements Writable { - - private static final int FLAG_DENSE = 0x01; - private static final int FLAG_SEQUENTIAL = 0x02; - private static final int FLAG_LABELS = 0x04; - private static final int FLAG_SPARSE_ROW = 0x08; - private static final int NUM_FLAGS = 4; - - private Matrix matrix; - - public MatrixWritable() {} - - public MatrixWritable(Matrix m) { - this.matrix = m; - } - - public Matrix get() { - return matrix; - } - - public void set(Matrix matrix) { - this.matrix = matrix; - } - - @Override - public void write(DataOutput out) throws IOException { - writeMatrix(out, matrix); - } - - @Override - public void readFields(DataInput in) throws IOException { - matrix = readMatrix(in); - } - - public static void readLabels(DataInput in, - Map<String, Integer> columnLabelBindings, - Map<String, Integer> rowLabelBindings) throws IOException { - int colSize = in.readInt(); - if (colSize > 0) { - for (int i = 0; i < colSize; i++) { - columnLabelBindings.put(in.readUTF(), in.readInt()); - } - } - int rowSize = in.readInt(); - if (rowSize > 0) { - for (int i = 0; i < rowSize; i++) { - rowLabelBindings.put(in.readUTF(), in.readInt()); - } - } - } - - public static void writeLabelBindings(DataOutput out, - Map<String, Integer> columnLabelBindings, - Map<String, Integer> rowLabelBindings) throws IOException { - if (columnLabelBindings == null) { - out.writeInt(0); - } else { - out.writeInt(columnLabelBindings.size()); - for (Map.Entry<String, Integer> stringIntegerEntry : columnLabelBindings.entrySet()) { - out.writeUTF(stringIntegerEntry.getKey()); - out.writeInt(stringIntegerEntry.getValue()); - } - } - if (rowLabelBindings == null) 
{ - out.writeInt(0); - } else { - out.writeInt(rowLabelBindings.size()); - for (Map.Entry<String, Integer> stringIntegerEntry : rowLabelBindings.entrySet()) { - out.writeUTF(stringIntegerEntry.getKey()); - out.writeInt(stringIntegerEntry.getValue()); - } - } - } - - /** Reads a typed Matrix instance from the input stream */ - public static Matrix readMatrix(DataInput in) throws IOException { - int flags = in.readInt(); - Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2)); - boolean dense = (flags & FLAG_DENSE) != 0; - boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; - boolean hasLabels = (flags & FLAG_LABELS) != 0; - boolean isSparseRowMatrix = (flags & FLAG_SPARSE_ROW) != 0; - - int rows = in.readInt(); - int columns = in.readInt(); - - byte vectorFlags = in.readByte(); - - Matrix matrix; - - if (dense) { - matrix = new DenseMatrix(rows, columns); - for (int row = 0; row < rows; row++) { - matrix.assignRow(row, VectorWritable.readVector(in, vectorFlags, columns)); - } - } else if (isSparseRowMatrix) { - Vector[] rowVectors = new Vector[rows]; - for (int row = 0; row < rows; row++) { - rowVectors[row] = VectorWritable.readVector(in, vectorFlags, columns); - } - matrix = new SparseRowMatrix(rows, columns, rowVectors, true, !sequential); - } else { - matrix = new SparseMatrix(rows, columns); - int numNonZeroRows = in.readInt(); - int rowsRead = 0; - while (rowsRead++ < numNonZeroRows) { - int rowIndex = in.readInt(); - matrix.assignRow(rowIndex, VectorWritable.readVector(in, vectorFlags, columns)); - } - } - - if (hasLabels) { - Map<String,Integer> columnLabelBindings = new HashMap<>(); - Map<String,Integer> rowLabelBindings = new HashMap<>(); - readLabels(in, columnLabelBindings, rowLabelBindings); - if (!columnLabelBindings.isEmpty()) { - matrix.setColumnLabelBindings(columnLabelBindings); - } - if (!rowLabelBindings.isEmpty()) { - matrix.setRowLabelBindings(rowLabelBindings); - } - } - - return matrix; - } 
- - /** Writes a typed Matrix instance to the output stream */ - public static void writeMatrix(final DataOutput out, Matrix matrix) throws IOException { - int flags = 0; - Vector row = matrix.viewRow(0); - boolean isDense = row.isDense(); - if (isDense) { - flags |= FLAG_DENSE; - } - if (row.isSequentialAccess()) { - flags |= FLAG_SEQUENTIAL; - } - if (matrix.getRowLabelBindings() != null || matrix.getColumnLabelBindings() != null) { - flags |= FLAG_LABELS; - } - boolean isSparseRowMatrix = matrix instanceof SparseRowMatrix; - if (isSparseRowMatrix) { - flags |= FLAG_SPARSE_ROW; - } - - out.writeInt(flags); - out.writeInt(matrix.rowSize()); - out.writeInt(matrix.columnSize()); - - // We only use vectors of the same type, so we write out the type information only once! - byte vectorFlags = VectorWritable.flags(matrix.viewRow(0), false); - out.writeByte(vectorFlags); - - if (isDense || isSparseRowMatrix) { - for (int i = 0; i < matrix.rowSize(); i++) { - VectorWritable.writeVectorContents(out, matrix.viewRow(i), vectorFlags); - } - } else { - IntArrayList rowIndices = ((SparseMatrix) matrix).nonZeroRowIndices(); - int numNonZeroRows = rowIndices.size(); - out.writeInt(numNonZeroRows); - for (int i = 0; i < numNonZeroRows; i++) { - int rowIndex = rowIndices.getQuick(i); - out.writeInt(rowIndex); - VectorWritable.writeVectorContents(out, matrix.viewRow(rowIndex), vectorFlags); - } - } - - if ((flags & FLAG_LABELS) != 0) { - writeLabelBindings(out, matrix.getColumnLabelBindings(), matrix.getRowLabelBindings()); - } - } -}
// http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
// ----------------------------------------------------------------------
// diff --git a/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
// deleted file mode 100644
// index e5cb173..0000000
// --- a/hdfs/src/main/java/org/apache/mahout/math/VarIntWritable.java
// +++ /dev/null
// @@ -1,86 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// package org.apache.mahout.math;  (kept as a comment: this listing concatenates several source files)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * An {@code int} holder that is serialized through {@code Varint.writeSignedVarInt} and
 * ordered by numeric value.
 */
public class VarIntWritable implements WritableComparable<VarIntWritable>, Cloneable {

  private int value;

  public VarIntWritable() {
  }

  public VarIntWritable(int value) {
    this.value = value;
  }

  public int get() {
    return value;
  }

  public void set(int value) {
    this.value = value;
  }

  /** Two instances are equal when {@code other} is a {@code VarIntWritable} holding the same value. */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof VarIntWritable)) {
      return false;
    }
    VarIntWritable that = (VarIntWritable) other;
    return that.value == value;
  }

  /** The wrapped value is its own hash. */
  @Override
  public int hashCode() {
    return value;
  }

  @Override
  public String toString() {
    return Integer.toString(value);
  }

  /** @return a new instance carrying the same value */
  @Override
  public VarIntWritable clone() {
    return new VarIntWritable(value);
  }

  /** @return -1, 0 or 1 as this value is less than, equal to or greater than {@code other}'s */
  @Override
  public int compareTo(VarIntWritable other) {
    return value < other.value ? -1 : (value > other.value ? 1 : 0);
  }

  @Override
  public void write(DataOutput out) throws IOException {
    Varint.writeSignedVarInt(value, out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    value = Varint.readSignedVarInt(in);
  }

}

// http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
// ----------------------------------------------------------------------
// diff --git a/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
// deleted file mode 100644
// index 7b0d9c4..0000000
// --- a/hdfs/src/main/java/org/apache/mahout/math/VarLongWritable.java
// +++ /dev/null
// @@ -1,83 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// package org.apache.mahout.math;  (kept as a comment: this listing concatenates several source files)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import com.google.common.primitives.Longs;
import org.apache.hadoop.io.WritableComparable;

/**
 * A {@code long} holder that is serialized through {@code Varint.writeSignedVarLong} and
 * ordered by numeric value.
 */
public class VarLongWritable implements WritableComparable<VarLongWritable> {

  private long value;

  public VarLongWritable() {
  }

  public VarLongWritable(long value) {
    this.value = value;
  }

  public long get() {
    return value;
  }

  public void set(long value) {
    this.value = value;
  }

  /** Two instances are equal when they have exactly the same class and hold the same value. */
  @Override
  public boolean equals(Object other) {
    if (other == null || !getClass().equals(other.getClass())) {
      return false;
    }
    return ((VarLongWritable) other).value == value;
  }

  @Override
  public int hashCode() {
    return Longs.hashCode(value);
  }

  @Override
  public String toString() {
    return Long.toString(value);
  }

  /** @return -1, 0 or 1 as this value is less than, equal to or greater than {@code other}'s */
  @Override
  public int compareTo(VarLongWritable other) {
    return value < other.value ? -1 : (value > other.value ? 1 : 0);
  }

  @Override
  public void write(DataOutput out) throws IOException {
    Varint.writeSignedVarLong(value, out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    value = Varint.readSignedVarLong(in);
  }

}

// http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/main/java/org/apache/mahout/math/Varint.java
// ----------------------------------------------------------------------
// diff --git a/hdfs/src/main/java/org/apache/mahout/math/Varint.java b/hdfs/src/main/java/org/apache/mahout/math/Varint.java
// deleted file mode 100644
// index f380c6c..0000000
// --- a/hdfs/src/main/java/org/apache/mahout/math/Varint.java
// +++ /dev/null
// @@ -1,167 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// package org.apache.mahout.math;  (kept as a comment: this listing concatenates several source files)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * <p>Encodes signed and unsigned values using a common variable-length
 * scheme, found for example in
 * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
 * Google's Protocol Buffers</a>. It uses fewer bytes to encode smaller values,
 * but will use slightly more bytes to encode large values.</p>
 *
 * <p>Each output byte carries 7 payload bits, least-significant group first; the high bit of a
 * byte is set when another byte follows.</p>
 *
 * <p>Signed values are further encoded using so-called zig-zag encoding
 * in order to make them "compatible" with variable-length encoding.</p>
 */
public final class Varint {

  /** Shift applied to the fifth and last byte of an encoded {@code int} (4 * 7 bits already consumed). */
  private static final int MAX_INT_SHIFT = 28;

  /** Shift applied to the tenth and last byte of an encoded {@code long} (9 * 7 bits already consumed). */
  private static final int MAX_LONG_SHIFT = 63;

  private Varint() {
  }

  /**
   * Encodes a value using the variable-length encoding from
   * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
   * Google Protocol Buffers</a>. It uses zig-zag encoding to efficiently
   * encode signed values. If values are known to be nonnegative,
   * {@link #writeUnsignedVarLong(long, java.io.DataOutput)} should be used.
   *
   * @param value value to encode
   * @param out to write bytes to
   * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
   */
  public static void writeSignedVarLong(long value, DataOutput out) throws IOException {
    // Zig-zag: the sign moves to bit 0 so that values of small magnitude stay short.
    writeUnsignedVarLong((value << 1) ^ (value >> 63), out);
  }

  /**
   * Encodes a value using the variable-length encoding from
   * <a href="http://code.google.com/apis/protocolbuffers/docs/encoding.html">
   * Google Protocol Buffers</a>. Zig-zag is not used, so input must not be negative.
   * If values can be negative, use {@link #writeSignedVarLong(long, java.io.DataOutput)}
   * instead. This method treats negative input like a large unsigned value.
   *
   * @param value value to encode
   * @param out to write bytes to
   * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
   */
  public static void writeUnsignedVarLong(long value, DataOutput out) throws IOException {
    while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
      out.writeByte(((int) value & 0x7F) | 0x80);
      value >>>= 7;
    }
    out.writeByte((int) value & 0x7F);
  }

  /**
   * @param value value to encode
   * @param out to write bytes to
   * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
   * @see #writeSignedVarLong(long, java.io.DataOutput)
   */
  public static void writeSignedVarInt(int value, DataOutput out) throws IOException {
    // Zig-zag: the sign moves to bit 0 so that values of small magnitude stay short.
    writeUnsignedVarInt((value << 1) ^ (value >> 31), out);
  }

  /**
   * @param value value to encode; negative input is treated like a large unsigned value
   * @param out to write bytes to
   * @throws java.io.IOException if {@link java.io.DataOutput} throws {@link java.io.IOException}
   * @see #writeUnsignedVarLong(long, java.io.DataOutput)
   */
  public static void writeUnsignedVarInt(int value, DataOutput out) throws IOException {
    while ((value & 0xFFFFFF80) != 0) {
      out.writeByte((value & 0x7F) | 0x80);
      value >>>= 7;
    }
    out.writeByte(value & 0x7F);
  }

  /**
   * @param in to read bytes from
   * @return decoded value
   * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
   * @throws IllegalArgumentException if variable-length value does not terminate
   *  within 10 bytes
   * @see #writeSignedVarLong(long, java.io.DataOutput)
   */
  public static long readSignedVarLong(DataInput in) throws IOException {
    long raw = readUnsignedVarLong(in);
    // Undo zig-zag: bit 0 is the sign, the remaining bits are the magnitude. The unsigned shift
    // keeps the largest magnitudes intact even when the raw value has its top bit set.
    return (raw >>> 1) ^ -(raw & 1L);
  }

  /**
   * @param in to read bytes from
   * @return decoded value
   * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
   * @throws IllegalArgumentException if variable-length value does not terminate
   *  within 10 bytes
   * @see #writeUnsignedVarLong(long, java.io.DataOutput)
   */
  public static long readUnsignedVarLong(DataInput in) throws IOException {
    long value = 0L;
    int shift = 0;
    long b;
    while (((b = in.readByte()) & 0x80L) != 0) {
      value |= (b & 0x7F) << shift;
      shift += 7;
      if (shift > MAX_LONG_SHIFT) {
        throw new IllegalArgumentException("Variable length quantity is too long (must be <= 63)");
      }
    }
    return value | (b << shift);
  }

  /**
   * @param in to read bytes from
   * @return decoded value
   * @throws IllegalArgumentException if variable-length value does not terminate
   *  within 5 bytes
   * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
   * @see #readSignedVarLong(java.io.DataInput)
   */
  public static int readSignedVarInt(DataInput in) throws IOException {
    int raw = readUnsignedVarInt(in);
    // Undo zig-zag: bit 0 is the sign, the remaining bits are the magnitude. The unsigned shift
    // keeps the largest magnitudes intact even when the raw value has its top bit set.
    return (raw >>> 1) ^ -(raw & 1);
  }

  /**
   * @param in to read bytes from
   * @return decoded value
   * @throws IllegalArgumentException if variable-length value does not terminate
   *  within 5 bytes
   * @throws java.io.IOException if {@link java.io.DataInput} throws {@link java.io.IOException}
   * @see #readUnsignedVarLong(java.io.DataInput)
   */
  public static int readUnsignedVarInt(DataInput in) throws IOException {
    int value = 0;
    int shift = 0;
    int b;
    while (((b = in.readByte()) & 0x80) != 0) {
      value |= (b & 0x7F) << shift;
      shift += 7;
      // An int needs at most five bytes, so the last legal shift is 28. Allowing 35 would accept a
      // sixth byte whose shift wraps modulo 32 and silently corrupts the low bits.
      if (shift > MAX_INT_SHIFT) {
        throw new IllegalArgumentException("Variable length quantity is too long (must be <= 35)");
      }
    }
    return value | (b << shift);
  }

}

// http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
// ----------------------------------------------------------------------
// diff --git a/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java b/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
// deleted file mode 100644
// index 491ae3b..0000000
// --- a/hdfs/src/main/java/org/apache/mahout/math/VectorWritable.java
// +++ /dev/null
// @@ -1,267 +0,0 @@
// /**
//  * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
//  * agreements. See the NOTICE file distributed with this work for additional information regarding
//  * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
//  * "License"); you may not use this file except in compliance with the License. You may obtain a
//  * copy of the License at
//  *
//  * http://www.apache.org/licenses/LICENSE-2.0
//  *
//  * Unless required by applicable law or agreed to in writing, software distributed under the License
//  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
//  * or implied. See the License for the specific language governing permissions and limitations under
//  * the License.
- */ - -package org.apache.mahout.math; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Iterator; - -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.Vector.Element; - -import com.google.common.base.Preconditions; - -public final class VectorWritable extends Configured implements Writable { - - public static final int FLAG_DENSE = 0x01; - public static final int FLAG_SEQUENTIAL = 0x02; - public static final int FLAG_NAMED = 0x04; - public static final int FLAG_LAX_PRECISION = 0x08; - public static final int NUM_FLAGS = 4; - - private Vector vector; - private boolean writesLaxPrecision; - - public VectorWritable() {} - - public VectorWritable(boolean writesLaxPrecision) { - setWritesLaxPrecision(writesLaxPrecision); - } - - public VectorWritable(Vector vector) { - this.vector = vector; - } - - public VectorWritable(Vector vector, boolean writesLaxPrecision) { - this(vector); - setWritesLaxPrecision(writesLaxPrecision); - } - - /** - * @return {@link org.apache.mahout.math.Vector} that this is to write, or has - * just read - */ - public Vector get() { - return vector; - } - - public void set(Vector vector) { - this.vector = vector; - } - - /** - * @return true if this is allowed to encode {@link org.apache.mahout.math.Vector} - * values using fewer bytes, possibly losing precision. In particular this means - * that floating point values will be encoded as floats, not doubles. 
- */ - public boolean isWritesLaxPrecision() { - return writesLaxPrecision; - } - - public void setWritesLaxPrecision(boolean writesLaxPrecision) { - this.writesLaxPrecision = writesLaxPrecision; - } - - @Override - public void write(DataOutput out) throws IOException { - writeVector(out, this.vector, this.writesLaxPrecision); - } - - @Override - public void readFields(DataInput in) throws IOException { - int flags = in.readByte(); - int size = Varint.readUnsignedVarInt(in); - readFields(in, (byte) flags, size); - } - - private void readFields(DataInput in, byte flags, int size) throws IOException { - - Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2)); - boolean dense = (flags & FLAG_DENSE) != 0; - boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; - boolean named = (flags & FLAG_NAMED) != 0; - boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0; - - Vector v; - if (dense) { - double[] values = new double[size]; - for (int i = 0; i < size; i++) { - values[i] = laxPrecision ? in.readFloat() : in.readDouble(); - } - v = new DenseVector(values); - } else { - int numNonDefaultElements = Varint.readUnsignedVarInt(in); - v = sequential - ? new SequentialAccessSparseVector(size, numNonDefaultElements) - : new RandomAccessSparseVector(size, numNonDefaultElements); - if (sequential) { - int lastIndex = 0; - for (int i = 0; i < numNonDefaultElements; i++) { - int delta = Varint.readUnsignedVarInt(in); - int index = lastIndex + delta; - lastIndex = index; - double value = laxPrecision ? in.readFloat() : in.readDouble(); - v.setQuick(index, value); - } - } else { - for (int i = 0; i < numNonDefaultElements; i++) { - int index = Varint.readUnsignedVarInt(in); - double value = laxPrecision ? 
in.readFloat() : in.readDouble(); - v.setQuick(index, value); - } - } - } - if (named) { - String name = in.readUTF(); - v = new NamedVector(v, name); - } - vector = v; - } - - /** Write the vector to the output */ - public static void writeVector(DataOutput out, Vector vector) throws IOException { - writeVector(out, vector, false); - } - - public static byte flags(Vector vector, boolean laxPrecision) { - boolean dense = vector.isDense(); - boolean sequential = vector.isSequentialAccess(); - boolean named = vector instanceof NamedVector; - - return (byte) ((dense ? FLAG_DENSE : 0) - | (sequential ? FLAG_SEQUENTIAL : 0) - | (named ? FLAG_NAMED : 0) - | (laxPrecision ? FLAG_LAX_PRECISION : 0)); - } - - /** Write out type information and size of the vector */ - public static void writeVectorFlagsAndSize(DataOutput out, byte flags, int size) throws IOException { - out.writeByte(flags); - Varint.writeUnsignedVarInt(size, out); - } - - public static void writeVector(DataOutput out, Vector vector, boolean laxPrecision) throws IOException { - - byte flags = flags(vector, laxPrecision); - - writeVectorFlagsAndSize(out, flags, vector.size()); - writeVectorContents(out, vector, flags); - } - - /** Write out contents of the vector */ - public static void writeVectorContents(DataOutput out, Vector vector, byte flags) throws IOException { - - boolean dense = (flags & FLAG_DENSE) != 0; - boolean sequential = (flags & FLAG_SEQUENTIAL) != 0; - boolean named = (flags & FLAG_NAMED) != 0; - boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0; - - if (dense) { - for (Element element : vector.all()) { - if (laxPrecision) { - out.writeFloat((float) element.get()); - } else { - out.writeDouble(element.get()); - } - } - } else { - Varint.writeUnsignedVarInt(vector.getNumNonZeroElements(), out); - Iterator<Element> iter = vector.nonZeroes().iterator(); - if (sequential) { - int lastIndex = 0; - while (iter.hasNext()) { - Element element = iter.next(); - if (element.get() == 0) { - 
continue; - } - int thisIndex = element.index(); - // Delta-code indices: - Varint.writeUnsignedVarInt(thisIndex - lastIndex, out); - lastIndex = thisIndex; - if (laxPrecision) { - out.writeFloat((float) element.get()); - } else { - out.writeDouble(element.get()); - } - } - } else { - while (iter.hasNext()) { - Element element = iter.next(); - if (element.get() == 0) { - // TODO(robinanil): Fix the damn iterator for the zero element. - continue; - } - Varint.writeUnsignedVarInt(element.index(), out); - if (laxPrecision) { - out.writeFloat((float) element.get()); - } else { - out.writeDouble(element.get()); - } - } - } - } - if (named) { - String name = ((NamedVector) vector).getName(); - out.writeUTF(name == null ? "" : name); - } - } - - public static Vector readVector(DataInput in) throws IOException { - VectorWritable v = new VectorWritable(); - v.readFields(in); - return v.get(); - } - - public static Vector readVector(DataInput in, byte vectorFlags, int size) throws IOException { - VectorWritable v = new VectorWritable(); - v.readFields(in, vectorFlags, size); - return v.get(); - } - - public static VectorWritable merge(Iterator<VectorWritable> vectors) { - return new VectorWritable(mergeToVector(vectors)); - } - - public static Vector mergeToVector(Iterator<VectorWritable> vectors) { - Vector accumulator = vectors.next().get(); - while (vectors.hasNext()) { - VectorWritable v = vectors.next(); - if (v != null) { - for (Element nonZeroElement : v.get().nonZeroes()) { - accumulator.setQuick(nonZeroElement.index(), nonZeroElement.get()); - } - } - } - return accumulator; - } - - @Override - public boolean equals(Object o) { - return o instanceof VectorWritable && vector.equals(((VectorWritable) o).get()); - } - - @Override - public int hashCode() { - return vector.hashCode(); - } - - @Override - public String toString() { - return vector.toString(); - } -} 
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java ---------------------------------------------------------------------- diff --git a/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java b/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java deleted file mode 100644 index 31e6947..0000000 --- a/hdfs/src/test/java/org/apache/mahout/math/MatrixWritableTest.java +++ /dev/null @@ -1,141 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.math; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.apache.hadoop.io.Writable; -import org.junit.Test; - -public final class MatrixWritableTest extends MahoutTestCase { - - @Test - public void testSparseMatrixWritable() throws Exception { - Matrix m = new SparseMatrix(5, 5); - m.set(1, 2, 3.0); - m.set(3, 4, 5.0); - Map<String, Integer> bindings = new HashMap<>(); - bindings.put("A", 0); - bindings.put("B", 1); - bindings.put("C", 2); - bindings.put("D", 3); - bindings.put("default", 4); - m.setRowLabelBindings(bindings); - m.setColumnLabelBindings(bindings); - doTestMatrixWritableEquals(m); - } - - @Test - public void testSparseRowMatrixWritable() throws Exception { - Matrix m = new SparseRowMatrix(5, 5); - m.set(1, 2, 3.0); - m.set(3, 4, 5.0); - Map<String, Integer> bindings = new HashMap<>(); - bindings.put("A", 0); - bindings.put("B", 1); - bindings.put("C", 2); - bindings.put("D", 3); - bindings.put("default", 4); - m.setRowLabelBindings(bindings); - m.setColumnLabelBindings(bindings); - doTestMatrixWritableEquals(m); - } - - @Test - public void testDenseMatrixWritable() throws Exception { - Matrix m = new DenseMatrix(5,5); - m.set(1, 2, 3.0); - m.set(3, 4, 5.0); - Map<String, Integer> bindings = new HashMap<>(); - bindings.put("A", 0); - bindings.put("B", 1); - bindings.put("C", 2); - bindings.put("D", 3); - bindings.put("default", 4); - m.setRowLabelBindings(bindings); - m.setColumnLabelBindings(bindings); - doTestMatrixWritableEquals(m); - } - - private static void doTestMatrixWritableEquals(Matrix m) throws IOException { - Writable matrixWritable = new MatrixWritable(m); - MatrixWritable matrixWritable2 = new MatrixWritable(); - writeAndRead(matrixWritable, matrixWritable2); - Matrix m2 = matrixWritable2.get(); - compareMatrices(m, 
m2); - doCheckBindings(m2.getRowLabelBindings()); - doCheckBindings(m2.getColumnLabelBindings()); - } - - private static void compareMatrices(Matrix m, Matrix m2) { - assertEquals(m.numRows(), m2.numRows()); - assertEquals(m.numCols(), m2.numCols()); - for (int r = 0; r < m.numRows(); r++) { - for (int c = 0; c < m.numCols(); c++) { - assertEquals(m.get(r, c), m2.get(r, c), EPSILON); - } - } - Map<String,Integer> bindings = m.getRowLabelBindings(); - Map<String, Integer> bindings2 = m2.getRowLabelBindings(); - assertEquals(bindings == null, bindings2 == null); - if (bindings != null) { - assertEquals(bindings.size(), m.numRows()); - assertEquals(bindings.size(), bindings2.size()); - for (Map.Entry<String,Integer> entry : bindings.entrySet()) { - assertEquals(entry.getValue(), bindings2.get(entry.getKey())); - } - } - bindings = m.getColumnLabelBindings(); - bindings2 = m2.getColumnLabelBindings(); - assertEquals(bindings == null, bindings2 == null); - if (bindings != null) { - assertEquals(bindings.size(), bindings2.size()); - for (Map.Entry<String,Integer> entry : bindings.entrySet()) { - assertEquals(entry.getValue(), bindings2.get(entry.getKey())); - } - } - } - - private static void doCheckBindings(Map<String,Integer> labels) { - assertTrue("Missing label", labels.keySet().contains("A")); - assertTrue("Missing label", labels.keySet().contains("B")); - assertTrue("Missing label", labels.keySet().contains("C")); - assertTrue("Missing label", labels.keySet().contains("D")); - assertTrue("Missing label", labels.keySet().contains("default")); - } - - private static void writeAndRead(Writable toWrite, Writable toRead) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (DataOutputStream dos = new DataOutputStream(baos)){ - toWrite.write(dos); - } - - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - try (DataInputStream dis = new DataInputStream(bais)) { - toRead.readFields(dis); - } - } - - -} 
http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java ---------------------------------------------------------------------- diff --git a/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java b/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java deleted file mode 100644 index 0b1a664..0000000 --- a/hdfs/src/test/java/org/apache/mahout/math/VarintTest.java +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math; - -import org.junit.Test; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.DataOutput; -import java.io.DataOutputStream; - -/** - * Tests {@link Varint}. 
- */ -public final class VarintTest extends MahoutTestCase { - - @Test - public void testUnsignedLong() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - Varint.writeUnsignedVarLong(0L, out); - for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) { - Varint.writeUnsignedVarLong(i-1, out); - Varint.writeUnsignedVarLong(i, out); - } - Varint.writeUnsignedVarLong(Long.MAX_VALUE, out); - - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(0L, Varint.readUnsignedVarLong(in)); - for (long i = 1L; i > 0L && i <= (1L << 62); i <<= 1) { - assertEquals(i-1, Varint.readUnsignedVarLong(in)); - assertEquals(i, Varint.readUnsignedVarLong(in)); - } - assertEquals(Long.MAX_VALUE, Varint.readUnsignedVarLong(in)); - } - - @Test - public void testSignedPositiveLong() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - Varint.writeSignedVarLong(0L, out); - for (long i = 1L; i <= (1L << 61); i <<= 1) { - Varint.writeSignedVarLong(i-1, out); - Varint.writeSignedVarLong(i, out); - } - Varint.writeSignedVarLong((1L << 62) - 1, out); - Varint.writeSignedVarLong((1L << 62), out); - Varint.writeSignedVarLong(Long.MAX_VALUE, out); - - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(0L, Varint.readSignedVarLong(in)); - for (long i = 1L; i <= (1L << 61); i <<= 1) { - assertEquals(i-1, Varint.readSignedVarLong(in)); - assertEquals(i, Varint.readSignedVarLong(in)); - } - assertEquals((1L << 62) - 1, Varint.readSignedVarLong(in)); - assertEquals((1L << 62), Varint.readSignedVarLong(in)); - assertEquals(Long.MAX_VALUE, Varint.readSignedVarLong(in)); - } - - @Test - public void testSignedNegativeLong() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - for (long i = -1L; i >= 
-(1L << 62); i <<= 1) { - Varint.writeSignedVarLong(i, out); - Varint.writeSignedVarLong(i+1, out); - } - Varint.writeSignedVarLong(Long.MIN_VALUE, out); - Varint.writeSignedVarLong(Long.MIN_VALUE+1, out); - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - for (long i = -1L; i >= -(1L << 62); i <<= 1) { - assertEquals(i, Varint.readSignedVarLong(in)); - assertEquals(i+1, Varint.readSignedVarLong(in)); - } - assertEquals(Long.MIN_VALUE, Varint.readSignedVarLong(in)); - assertEquals(Long.MIN_VALUE+1, Varint.readSignedVarLong(in)); - } - - @Test - public void testUnsignedInt() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - Varint.writeUnsignedVarInt(0, out); - for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) { - Varint.writeUnsignedVarLong(i-1, out); - Varint.writeUnsignedVarLong(i, out); - } - Varint.writeUnsignedVarLong(Integer.MAX_VALUE, out); - - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(0, Varint.readUnsignedVarInt(in)); - for (int i = 1; i > 0 && i <= (1 << 30); i <<= 1) { - assertEquals(i-1, Varint.readUnsignedVarInt(in)); - assertEquals(i, Varint.readUnsignedVarInt(in)); - } - assertEquals(Integer.MAX_VALUE, Varint.readUnsignedVarInt(in)); - } - - @Test - public void testSignedPositiveInt() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - Varint.writeSignedVarInt(0, out); - for (int i = 1; i <= (1 << 29); i <<= 1) { - Varint.writeSignedVarLong(i-1, out); - Varint.writeSignedVarLong(i, out); - } - Varint.writeSignedVarInt((1 << 30) - 1, out); - Varint.writeSignedVarInt((1 << 30), out); - Varint.writeSignedVarInt(Integer.MAX_VALUE, out); - - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(0, Varint.readSignedVarInt(in)); - for (int i = 1; i <= (1 << 29); i <<= 1) { - 
assertEquals(i-1, Varint.readSignedVarInt(in)); - assertEquals(i, Varint.readSignedVarInt(in)); - } - assertEquals((1L << 30) - 1, Varint.readSignedVarInt(in)); - assertEquals((1L << 30), Varint.readSignedVarInt(in)); - assertEquals(Integer.MAX_VALUE, Varint.readSignedVarInt(in)); - } - - @Test - public void testSignedNegativeInt() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - for (int i = -1; i >= -(1 << 30); i <<= 1) { - Varint.writeSignedVarInt(i, out); - Varint.writeSignedVarInt(i+1, out); - } - Varint.writeSignedVarInt(Integer.MIN_VALUE, out); - Varint.writeSignedVarInt(Integer.MIN_VALUE+1, out); - DataInput in = new DataInputStream(new ByteArrayInputStream(baos.toByteArray())); - for (int i = -1; i >= -(1 << 30); i <<= 1) { - assertEquals(i, Varint.readSignedVarInt(in)); - assertEquals(i+1, Varint.readSignedVarInt(in)); - } - assertEquals(Integer.MIN_VALUE, Varint.readSignedVarInt(in)); - assertEquals(Integer.MIN_VALUE+1, Varint.readSignedVarInt(in)); - } - - @Test - public void testUnsignedSize() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - int expectedSize = 0; - for (int exponent = 0; exponent <= 62; exponent++) { - Varint.writeUnsignedVarLong(1L << exponent, out); - expectedSize += 1 + exponent / 7; - assertEquals(expectedSize, baos.size()); - } - } - - @Test - public void testSignedSize() throws Exception { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutput out = new DataOutputStream(baos); - int expectedSize = 0; - for (int exponent = 0; exponent <= 61; exponent++) { - Varint.writeSignedVarLong(1L << exponent, out); - expectedSize += 1 + ((exponent + 1) / 7); - assertEquals(expectedSize, baos.size()); - } - for (int exponent = 0; exponent <= 61; exponent++) { - Varint.writeSignedVarLong(-(1L << exponent)-1, out); - expectedSize += 1 + ((exponent + 1) / 7); - 
assertEquals(expectedSize, baos.size()); - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java ---------------------------------------------------------------------- diff --git a/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java b/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java deleted file mode 100644 index 991be6e..0000000 --- a/hdfs/src/test/java/org/apache/mahout/math/VectorWritableTest.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more contributor license - * agreements. See the NOTICE file distributed with this work for additional information regarding - * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed under the License - * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. See the License for the specific language governing permissions and limitations under - * the License. 
- */ - -package org.apache.mahout.math; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; - -import org.apache.hadoop.io.Writable; -import org.apache.mahout.math.Vector.Element; -import org.junit.Test; - -import com.carrotsearch.randomizedtesting.RandomizedTest; -import com.carrotsearch.randomizedtesting.annotations.Repeat; - -public final class VectorWritableTest extends RandomizedTest { - private static final int MAX_VECTOR_SIZE = 100; - - public void createRandom(Vector v) { - int size = randomInt(v.size() - 1); - for (int i = 0; i < size; ++i) { - v.set(randomInt(v.size() - 1), randomDouble()); - } - - int zeros = Math.max(2, size / 4); - for (Element e : v.nonZeroes()) { - if (e.index() % zeros == 0) { - e.set(0.0); - } - } - } - - @Test - @Repeat(iterations = 20) - public void testViewSequentialAccessSparseVectorWritable() throws Exception { - Vector v = new SequentialAccessSparseVector(MAX_VECTOR_SIZE); - createRandom(v); - Vector view = new VectorView(v, 0, v.size()); - doTestVectorWritableEquals(view); - } - - @Test - @Repeat(iterations = 20) - public void testSequentialAccessSparseVectorWritable() throws Exception { - Vector v = new SequentialAccessSparseVector(MAX_VECTOR_SIZE); - createRandom(v); - doTestVectorWritableEquals(v); - } - - @Test - @Repeat(iterations = 20) - public void testRandomAccessSparseVectorWritable() throws Exception { - Vector v = new RandomAccessSparseVector(MAX_VECTOR_SIZE); - createRandom(v); - doTestVectorWritableEquals(v); - } - - @Test - @Repeat(iterations = 20) - public void testDenseVectorWritable() throws Exception { - Vector v = new DenseVector(MAX_VECTOR_SIZE); - createRandom(v); - doTestVectorWritableEquals(v); - } - - @Test - @Repeat(iterations = 20) - public void testNamedVectorWritable() throws Exception { - Vector v = new DenseVector(MAX_VECTOR_SIZE); - v = new NamedVector(v, "Victor"); - 
createRandom(v); - doTestVectorWritableEquals(v); - } - - private static void doTestVectorWritableEquals(Vector v) throws IOException { - Writable vectorWritable = new VectorWritable(v); - VectorWritable vectorWritable2 = new VectorWritable(); - writeAndRead(vectorWritable, vectorWritable2); - Vector v2 = vectorWritable2.get(); - if (v instanceof NamedVector) { - assertTrue(v2 instanceof NamedVector); - NamedVector nv = (NamedVector) v; - NamedVector nv2 = (NamedVector) v2; - assertEquals(nv.getName(), nv2.getName()); - assertEquals("Victor", nv.getName()); - } - assertEquals(v, v2); - } - - private static void writeAndRead(Writable toWrite, Writable toRead) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (DataOutputStream dos = new DataOutputStream(baos)){ - toWrite.write(dos); - } - - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - try (DataInputStream dis = new DataInputStream(bais)) { - toRead.readFields(dis); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/bin/prep_asf_mail_archives.sh ---------------------------------------------------------------------- diff --git a/integration/bin/prep_asf_mail_archives.sh b/integration/bin/prep_asf_mail_archives.sh deleted file mode 100755 index 77f5d13..0000000 --- a/integration/bin/prep_asf_mail_archives.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# -# Performs the setup procedures for clustering the ASF mail archives -# described in Taming Text. -# -# Required Command-line Parameters: -# -# $1 - Path to this script's working directory, you will need about -# 22GB of free space to run this script. -# -# $2 - Path to where the ASF Public Archive data is, untarred. -# If you are running Hadoop and the files are in HDFS, then -# this will need to be an HDFS path. Default is $1/input -# $3 - Path to where this script saves the SequenceFile output. 
#      If you are running Hadoop and you want the sequence files
#      saved to your HDFS then you need to set this value to an
#      HDFS path and make sure you set HADOOP_HOME so Mahout can
#      find Hadoop. Default is $1/sequence-files
#
#
# Required Environment Variables:
#
#   MAHOUT_HOME
#          Root directory of your Mahout distribution
#
#   HADOOP_HOME
#          Only needed if you want to send output to HDFS
#
# Example:
#   ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output
#
#   This runs the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives job
#   over the already-extracted archives in the input path to create Hadoop
#   SequenceFiles in /mnt/asf-mail-archives/output. The script itself does not
#   download or extract anything.
#
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements.  See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License.  You may obtain a copy of the License at
# *
# *     http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */

if [ -z "$MAHOUT_HOME" ]; then
  echo "Error: MAHOUT_HOME is not set." >&2
  exit 1
fi

if [ -z "$1" ]; then
  # printf interprets \n and \t; plain bash echo would print them literally.
  printf 'Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: %s workingDir inputPath outputPath\n\n' "$0" >&2
  exit 1
fi

# Location where this script saves files
PREP_DIR=$1

# Input defaults to $PREP_DIR/input when $2 is empty or missing.
SEQFILE_INPUT_DIR=${2:-$PREP_DIR/input}

# Output defaults to $PREP_DIR/sequence-files; pass an HDFS path as $3 if you are running Hadoop.
SEQFILE_OUTPUT_DIR=${3:-$PREP_DIR/sequence-files}

# If output sent to HDFS, clear MAHOUT_LOCAL and make sure HADOOP_HOME is set;
# otherwise MAHOUT_LOCAL is set to the prep directory.
if [[ "$SEQFILE_OUTPUT_DIR" = hdfs://* ]]; then
  export MAHOUT_LOCAL=
  if [ -z "$HADOOP_HOME" ]; then
    echo "Error: HADOOP_HOME must be set if you want to send output to HDFS." >&2
    exit 1
  fi
else
  export MAHOUT_LOCAL=$PREP_DIR
fi

echo "Running $0 with:
  PREP_DIR = $PREP_DIR
  SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR
  SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR
  MAHOUT_LOCAL = $MAHOUT_LOCAL
  HADOOP_HOME = $HADOOP_HOME"

# convert the extracted gz files into Hadoop SequenceFiles
# (paths are quoted so directories containing spaces are passed as single arguments)
echo "Converting extracted directories to SequenceFiles ..."
"$MAHOUT_HOME/bin/mahout" org.apache.mahout.text.SequenceFilesFromMailArchives \
  --input "$SEQFILE_INPUT_DIR" --output "$SEQFILE_OUTPUT_DIR" --subject --body \
  -c UTF-8 -chunk 1024 -prefix asf_archives
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout</artifactId> - <version>0.13.1-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - - <artifactId>mahout-integration</artifactId> - <name>Mahout Integration</name> - <description>Optional components of Mahout which generally support interaction with third party systems, - formats, APIs, etc.</description> - - <packaging>jar</packaging> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-remote-resources-plugin</artifactId> - <configuration> - <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory> - <resourceBundles> - <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle> - </resourceBundles> - <supplementalModels> - <supplementalModel>supplemental-models.xml</supplementalModel> - </supplementalModels> - </configuration> - </plugin> - - <plugin> - <artifactId>maven-javadoc-plugin</artifactId> - </plugin> - - <plugin> - <artifactId>maven-source-plugin</artifactId> - 
</plugin> - - </plugins> - - </build> - - <dependencies> - - <!-- own modules --> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-hdfs</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-mr</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-hdfs</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-mr</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-math</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-math</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - - <!-- 3rd party --> - - <dependency> - <groupId>commons-dbcp</groupId> - <artifactId>commons-dbcp</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>commons-pool</groupId> - <artifactId>commons-pool</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - </dependency> - - <dependency> - <groupId>org.apache.solr</groupId> - <artifactId>solr-commons-csv</artifactId> - <version>3.5.0</version> - </dependency> - - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-benchmark</artifactId> - <optional>true</optional> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-analyzers-common</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.mongodb</groupId> - <artifactId>mongo-java-driver</artifactId> - <version>2.11.2</version> - <optional>true</optional> - </dependency> - - <dependency> - 
<groupId>org.mongodb</groupId> - <artifactId>bson</artifactId> - <version>2.11.2</version> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase-client</artifactId> - </dependency> - - <dependency> - <groupId>org.hectorclient</groupId> - <artifactId>hector-core</artifactId> - <version>1.1-4</version> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-jcl</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>com.carrotsearch.randomizedtesting</groupId> - <artifactId>randomizedtesting-runner</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>org.easymock</groupId> - <artifactId>easymock</artifactId> - <scope>test</scope> - </dependency> - - </dependencies> - -</project> http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java deleted file mode 100644 index 549cf2c..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import java.util.Random; -import java.util.concurrent.TimeUnit; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.math.Vector; - -import com.google.common.base.Function; - -public final class BenchmarkRunner { - private static final int BUCKET_SIZE = 10000; - private static final Random R = RandomUtils.getRandom(); - private final long maxTimeUsec; - private final long leadTimeUsec; - - public BenchmarkRunner(long leadTimeMs, long maxTimeMs) { - maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(maxTimeMs); - leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(leadTimeMs); - } - - public abstract static class BenchmarkFn implements Function<Integer, Boolean> { - protected int randIndex() { - return BenchmarkRunner.randIndex(); - } - - protected boolean randBool() { - return BenchmarkRunner.randBool(); - } - - /** - * Adds a random data dependency so that JVM does not remove dead code. - */ - protected boolean depends(Vector v) { - return randIndex() < v.getNumNondefaultElements(); - } - } - - public abstract static class BenchmarkFnD implements Function<Integer, Double> { - protected int randIndex() { - return BenchmarkRunner.randIndex(); - } - - protected boolean randBool() { - return BenchmarkRunner.randBool(); - } - - /** - * Adds a random data dependency so that JVM does not remove dead code. 
- */ - protected boolean depends(Vector v) { - return randIndex() < v.getNumNondefaultElements(); - } - } - - private static int randIndex() { - return R.nextInt(BUCKET_SIZE); - } - - private static boolean randBool() { - return R.nextBoolean(); - } - - public TimingStatistics benchmark(BenchmarkFn function) { - TimingStatistics stats = new TimingStatistics(); - boolean result = false; - while (true) { - int i = R.nextInt(BUCKET_SIZE); - TimingStatistics.Call call = stats.newCall(leadTimeUsec); - result = result ^ function.apply(i); - if (call.end(maxTimeUsec)) { - break; - } - } - return stats; - } - - public TimingStatistics benchmarkD(BenchmarkFnD function) { - TimingStatistics stats = new TimingStatistics(); - double result = 0; - while (true) { - int i = R.nextInt(BUCKET_SIZE); - TimingStatistics.Call call = stats.newCall(leadTimeUsec); - result += function.apply(i); - if (call.end(maxTimeUsec)) { - break; - } - } - // print result to prevent hotspot from eliminating deadcode - System.err.println("Result = " + result); - return stats; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java deleted file mode 100644 index 5e6ab4d..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; - -public class CloneBenchmark { - public static final String CLONE = "Clone"; - private final VectorBenchmarks mark; - - public CloneBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[0][mark.vIndex(i)] = mark.vectors[0][mark.vIndex(i)].clone(); - - return depends(mark.vectors[0][mark.vIndex(i)]); - } - }), CLONE, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[1][mark.vIndex(i)] = mark.vectors[1][mark.vIndex(i)].clone(); - - return depends(mark.vectors[1][mark.vIndex(i)]); - } - }), CLONE, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[2][mark.vIndex(i)] = mark.vectors[2][mark.vIndex(i)].clone(); - - return depends(mark.vectors[2][mark.vIndex(i)]); - } - }), CLONE, SEQ_SPARSE_VECTOR); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java 
---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java deleted file mode 100644 index b1c2ded..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.benchmark; - -import java.io.IOException; -import java.util.Random; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.SparseMatrix; -import org.apache.mahout.math.Vector; - -public class ClosestCentroidBenchmark { - private final VectorBenchmarks mark; - - public ClosestCentroidBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark(DistanceMeasure measure) throws IOException { - SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters); - for (int i = 0; i < mark.numClusters; i++) { - for (int j = 0; j < mark.numClusters; j++) { - double distance = Double.POSITIVE_INFINITY; - if (i != j) { - distance = measure.distance(mark.clusters[i], mark.clusters[j]); - } - clusterDistances.setQuick(i, j, distance); - } - } - - long distanceCalculations = 0; - TimingStatistics stats = new TimingStatistics(); - for (int l = 0; l < mark.loop; l++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - for (int i = 0; i < mark.numVectors; i++) { - Vector vector = mark.vectors[1][mark.vIndex(i)]; - double minDistance = Double.MAX_VALUE; - for (int k = 0; k < mark.numClusters; k++) { - double distance = measure.distance(vector, mark.clusters[k]); - distanceCalculations++; - if (distance < minDistance) { - minDistance = distance; - } - } - } - if (call.end(mark.maxTimeUsec)) { - break; - } - } - mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = " - + distanceCalculations); - - distanceCalculations = 0; - stats = new TimingStatistics(); - Random rand = RandomUtils.getRandom(); - for (int l = 0; l < mark.loop; l++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - for (int i = 0; i < mark.numVectors; i++) { - Vector vector = mark.vectors[1][mark.vIndex(i)]; - int 
closestCentroid = rand.nextInt(mark.numClusters); - double dist = measure.distance(vector, mark.clusters[closestCentroid]); - distanceCalculations++; - for (int k = 0; k < mark.numClusters; k++) { - if (closestCentroid != k) { - double centroidDist = clusterDistances.getQuick(k, closestCentroid); - if (centroidDist < 2 * dist) { - dist = measure.distance(vector, mark.clusters[k]); - closestCentroid = k; - distanceCalculations++; - } - } - } - } - if (call.end(mark.maxTimeUsec)) { - break; - } - } - mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = " - + distanceCalculations); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java deleted file mode 100644 index 25d0ad7..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; -import org.apache.mahout.common.distance.DistanceMeasure; - -public class DistanceBenchmark { - private final VectorBenchmarks mark; - - public DistanceBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark(final DistanceMeasure measure) { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double 
apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_FN_RAND); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java deleted file mode 100644 index fc7f911..0000000 --- 
a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; - -public class DotBenchmark { - private static final String DOT_PRODUCT = "DotProduct"; - private static final String NORM1 = "Norm1"; - private static final String NORM2 = "Norm2"; - private static final String LOG_NORMALIZE = 
"LogNormalize"; - private final VectorBenchmarks mark; - - public DotBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - benchmarkDot(); - benchmarkNorm1(); - benchmarkNorm2(); - benchmarkLogNormalize(); - } - - private void benchmarkLogNormalize() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[0][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[1][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[2][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR); - } - - private void benchmarkNorm1() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].norm(1); - } - }), NORM1, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].norm(1); - } - }), NORM1, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].norm(1); - } - }), NORM1, SEQ_SPARSE_VECTOR); - } - - private void benchmarkNorm2() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].norm(2); - } - }), NORM2, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return 
mark.vectors[1][mark.vIndex(i)].norm(2); - } - }), NORM2, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].norm(2); - } - }), NORM2, SEQ_SPARSE_VECTOR); - } - - private void benchmarkDot() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - 
}), DOT_PRODUCT, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_FN_RAND); - } - - public static void main(String[] args) { - VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1); - mark.createData(); - new DotBenchmark(mark).benchmarkNorm2(); - System.out.println(mark); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/99a5358f/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java deleted file mode 100644 index 82fb693..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.math.Vector; - -public class MinusBenchmark { - - private static final String MINUS = "Minus"; - private final VectorBenchmarks mark; - - public MinusBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_SPARSE_VECTOR); - - 
mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_FN_RAND); - } -}
