http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/ssvd.props ---------------------------------------------------------------------- diff --git a/community/mahout-mr/conf/ssvd.props b/community/mahout-mr/conf/ssvd.props new file mode 100644 index 0000000..26a52c7 --- /dev/null +++ b/community/mahout-mr/conf/ssvd.props @@ -0,0 +1,14 @@ +#i|input = +#o|output = +#k|rank = +#t|tempDir = +#p|oversampling = +#r|blockHeight = +#s|minSplitSize = +#U|computeU = +#uhs|uHalfSigma = +#V|computeV = +#vhs|vHalfSigma = +#t|reduceTasks = +#w|wide = +#q|powerIter =
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/svd.props ---------------------------------------------------------------------- diff --git a/community/mahout-mr/conf/svd.props b/community/mahout-mr/conf/svd.props new file mode 100644 index 0000000..8c9a467 --- /dev/null +++ b/community/mahout-mr/conf/svd.props @@ -0,0 +1,6 @@ +#i|input = +#o|output = +#nr|numRows = +#nc|numCols = +#r|rank = +#t|tempDir = \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/trainlogistic.props ---------------------------------------------------------------------- diff --git a/community/mahout-mr/conf/trainlogistic.props b/community/mahout-mr/conf/trainlogistic.props new file mode 100644 index 0000000..f474942 --- /dev/null +++ b/community/mahout-mr/conf/trainlogistic.props @@ -0,0 +1,2 @@ +#lambda|lambda = +#passes|passes = http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/transpose.props ---------------------------------------------------------------------- diff --git a/community/mahout-mr/conf/transpose.props b/community/mahout-mr/conf/transpose.props new file mode 100644 index 0000000..025f945 --- /dev/null +++ b/community/mahout-mr/conf/transpose.props @@ -0,0 +1,2 @@ +#i|input = +#o|output = http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/conf/vectordump.props ---------------------------------------------------------------------- diff --git a/community/mahout-mr/conf/vectordump.props b/community/mahout-mr/conf/vectordump.props new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/community/mahout-mr/conf/vectordump.props @@ -0,0 +1 @@ + http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh 
b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh new file mode 100755 index 0000000..77f5d13 --- /dev/null +++ b/community/mahout-mr/integration/bin/prep_asf_mail_archives.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# Performs the setup procedures for clustering the ASF mail archives +# described in Taming Text. +# +# Command-line Parameters ($1 is required; $2 and $3 are optional): +# +# $1 - Path to this script's working directory, you will need about +# 22GB of free space to run this script. +# +# $2 - Path to where the ASF Public Archive data is, untarred. +# If you are running Hadoop and the files are in HDFS, then +# this will need to be an HDFS path. Default is $1/input +# $3 - Path to where this script saves the SequenceFile output. +# If you are running Hadoop and you want the sequence files +# saved to your HDFS then you need to set this value to an +# HDFS path and make sure you set HADOOP_HOME so Mahout can +# find Hadoop. Default is $1/sequence-files +# +# +# Required Environment Variables: +# +# MAHOUT_HOME +# Root directory of your Mahout distribution +# +# HADOOP_HOME +# Only needed if you want to send output to HDFS +# +# Example: +# ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output +# +# This does not download or extract anything: it reads the untarred archives and +# runs the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives job +# to create Hadoop SequenceFiles in /mnt/asf-mail-archives/output +# +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. 
You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ + +if [ "$MAHOUT_HOME" = "" ]; then + echo "Error: MAHOUT_HOME is not set." + exit 1 +fi + +if [ "$1" = "" ]; then + echo "Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: $0 workingDir inputPath outputPath\n" + exit 1 +fi + +# Location where this script saves files +PREP_DIR=$1 + +if [ "$2" != "" ]; then + SEQFILE_INPUT_DIR=$2 +else + SEQFILE_INPUT_DIR=$PREP_DIR/input +fi + + +# Change this to an HDFS path if you are running Hadoop +if [ "$3" != "" ]; then + SEQFILE_OUTPUT_DIR=$3 +else + SEQFILE_OUTPUT_DIR=$PREP_DIR/sequence-files +fi + +# If output sent to HDFS, clear MAHOUT_LOCAL and make sure HADOOP_HOME is set +if [[ "$SEQFILE_OUTPUT_DIR" = hdfs://* ]]; then + export MAHOUT_LOCAL= + if [ "$HADOOP_HOME" = "" ]; then + echo "Error: HADOOP_HOME must be set if you want to send output to HDFS." + exit 1 + fi +else + export MAHOUT_LOCAL=$PREP_DIR +fi + +echo "Running $0 with: + PREP_DIR = $PREP_DIR + SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR + SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR + MAHOUT_LOCAL = $MAHOUT_LOCAL + HADOOP_HOME = $HADOOP_HOME" + +# Run Mahout in Local mode! Remove this if you want the +# sequence files stored in your HDFS + + +# convert the extracted gz files into Hadoop SequenceFiles +echo "Converting extracted directories to SequenceFiles ..." 
+$MAHOUT_HOME/bin/mahout org.apache.mahout.text.SequenceFilesFromMailArchives \ +--input $SEQFILE_INPUT_DIR --output $SEQFILE_OUTPUT_DIR --subject --body \ +-c UTF-8 -chunk 1024 -prefix asf_archives http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml new file mode 100644 index 0000000..cb0c19a --- /dev/null +++ b/community/mahout-mr/integration/pom.xml @@ -0,0 +1,198 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.mahout</groupId> + <artifactId>mahout</artifactId> + <version>0.13.1-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>mahout-integration</artifactId> + <name>Mahout Integration</name> + <description>Optional components of Mahout which generally support interaction with third party systems, + formats, APIs, etc.</description> + + <packaging>jar</packaging> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-remote-resources-plugin</artifactId> + <configuration> + <appendedResourcesDirectory>../community/mahout-mr/src/appended-resources</appendedResourcesDirectory> + <resourceBundles> + <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle> + </resourceBundles> + <supplementalModels> + <supplementalModel>supplemental-models.xml</supplementalModel> + </supplementalModels> + </configuration> + </plugin> + + <plugin> + <artifactId>maven-javadoc-plugin</artifactId> + </plugin> + + <plugin> + <artifactId>maven-source-plugin</artifactId> + </plugin> + + </plugins> + + </build> + + <dependencies> + + <!-- own modules --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-mr</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-mr</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + 
<groupId>${project.groupId}</groupId> + <artifactId>mahout-math</artifactId> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-math</artifactId> + <type>test-jar</type> + <scope>test</scope> + </dependency> + + <!-- 3rd party --> + + <dependency> + <groupId>commons-dbcp</groupId> + <artifactId>commons-dbcp</artifactId> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>commons-pool</groupId> + <artifactId>commons-pool</artifactId> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-commons-csv</artifactId> + <version>3.5.0</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-benchmark</artifactId> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>org.mongodb</groupId> + <artifactId>mongo-java-driver</artifactId> + <version>2.11.2</version> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>org.mongodb</groupId> + <artifactId>bson</artifactId> + <version>2.11.2</version> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase-client</artifactId> + </dependency> + + <dependency> + <groupId>org.hectorclient</groupId> + <artifactId>hector-core</artifactId> + <version>1.1-4</version> + <optional>true</optional> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-jcl</artifactId> + <scope>test</scope> + 
</dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>com.carrotsearch.randomizedtesting</groupId> + <artifactId>randomizedtesting-runner</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.easymock</groupId> + <artifactId>easymock</artifactId> + <scope>test</scope> + </dependency> + + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java new file mode 100644 index 0000000..549cf2c --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.benchmark; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import org.apache.mahout.common.RandomUtils; +import org.apache.mahout.common.TimingStatistics; +import org.apache.mahout.math.Vector; + +import com.google.common.base.Function; + +public final class BenchmarkRunner { + private static final int BUCKET_SIZE = 10000; + private static final Random R = RandomUtils.getRandom(); + private final long maxTimeUsec; + private final long leadTimeUsec; + + public BenchmarkRunner(long leadTimeMs, long maxTimeMs) { + maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(maxTimeMs); + leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(leadTimeMs); + } + + public abstract static class BenchmarkFn implements Function<Integer, Boolean> { + protected int randIndex() { + return BenchmarkRunner.randIndex(); + } + + protected boolean randBool() { + return BenchmarkRunner.randBool(); + } + + /** + * Adds a random data dependency so that JVM does not remove dead code. + */ + protected boolean depends(Vector v) { + return randIndex() < v.getNumNondefaultElements(); + } + } + + public abstract static class BenchmarkFnD implements Function<Integer, Double> { + protected int randIndex() { + return BenchmarkRunner.randIndex(); + } + + protected boolean randBool() { + return BenchmarkRunner.randBool(); + } + + /** + * Adds a random data dependency so that JVM does not remove dead code. 
+ */ + protected boolean depends(Vector v) { + return randIndex() < v.getNumNondefaultElements(); + } + } + + private static int randIndex() { + return R.nextInt(BUCKET_SIZE); + } + + private static boolean randBool() { + return R.nextBoolean(); + } + + public TimingStatistics benchmark(BenchmarkFn function) { + TimingStatistics stats = new TimingStatistics(); + boolean result = false; + while (true) { + int i = R.nextInt(BUCKET_SIZE); + TimingStatistics.Call call = stats.newCall(leadTimeUsec); + result = result ^ function.apply(i); + if (call.end(maxTimeUsec)) { + break; + } + } + return stats; + } + + public TimingStatistics benchmarkD(BenchmarkFnD function) { + TimingStatistics stats = new TimingStatistics(); + double result = 0; + while (true) { + int i = R.nextInt(BUCKET_SIZE); + TimingStatistics.Call call = stats.newCall(leadTimeUsec); + result += function.apply(i); + if (call.end(maxTimeUsec)) { + break; + } + } + // print result to prevent hotspot from eliminating deadcode + System.err.println("Result = " + result); + return stats; + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java new file mode 100644 index 0000000..5e6ab4d --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; + +public class CloneBenchmark { + public static final String CLONE = "Clone"; + private final VectorBenchmarks mark; + + public CloneBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark() { + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + mark.vectors[0][mark.vIndex(i)] = mark.vectors[0][mark.vIndex(i)].clone(); + + return depends(mark.vectors[0][mark.vIndex(i)]); + } + }), CLONE, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + mark.vectors[1][mark.vIndex(i)] = mark.vectors[1][mark.vIndex(i)].clone(); + + return depends(mark.vectors[1][mark.vIndex(i)]); + } + }), CLONE, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + mark.vectors[2][mark.vIndex(i)] = mark.vectors[2][mark.vIndex(i)].clone(); + + return depends(mark.vectors[2][mark.vIndex(i)]); + } + }), CLONE, SEQ_SPARSE_VECTOR); + 
} +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java new file mode 100644 index 0000000..b1c2ded --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.benchmark; + +import java.io.IOException; +import java.util.Random; + +import org.apache.mahout.common.RandomUtils; +import org.apache.mahout.common.TimingStatistics; +import org.apache.mahout.common.distance.DistanceMeasure; +import org.apache.mahout.math.SparseMatrix; +import org.apache.mahout.math.Vector; + +public class ClosestCentroidBenchmark { + private final VectorBenchmarks mark; + + public ClosestCentroidBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark(DistanceMeasure measure) throws IOException { + SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters); + for (int i = 0; i < mark.numClusters; i++) { + for (int j = 0; j < mark.numClusters; j++) { + double distance = Double.POSITIVE_INFINITY; + if (i != j) { + distance = measure.distance(mark.clusters[i], mark.clusters[j]); + } + clusterDistances.setQuick(i, j, distance); + } + } + + long distanceCalculations = 0; + TimingStatistics stats = new TimingStatistics(); + for (int l = 0; l < mark.loop; l++) { + TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); + for (int i = 0; i < mark.numVectors; i++) { + Vector vector = mark.vectors[1][mark.vIndex(i)]; + double minDistance = Double.MAX_VALUE; + for (int k = 0; k < mark.numClusters; k++) { + double distance = measure.distance(vector, mark.clusters[k]); + distanceCalculations++; + if (distance < minDistance) { + minDistance = distance; + } + } + } + if (call.end(mark.maxTimeUsec)) { + break; + } + } + mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = " + + distanceCalculations); + + distanceCalculations = 0; + stats = new TimingStatistics(); + Random rand = RandomUtils.getRandom(); + for (int l = 0; l < mark.loop; l++) { + TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); + for (int i = 0; i < mark.numVectors; i++) { + Vector vector = mark.vectors[1][mark.vIndex(i)]; + int 
closestCentroid = rand.nextInt(mark.numClusters); + double dist = measure.distance(vector, mark.clusters[closestCentroid]); + distanceCalculations++; + for (int k = 0; k < mark.numClusters; k++) { + if (closestCentroid != k) { + double centroidDist = clusterDistances.getQuick(k, closestCentroid); + if (centroidDist < 2 * dist) { + dist = measure.distance(vector, mark.clusters[k]); + closestCentroid = k; + distanceCalculations++; + } + } + } + } + if (call.end(mark.maxTimeUsec)) { + break; + } + } + mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = " + + distanceCalculations); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java new file mode 100644 index 0000000..25d0ad7 --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; +import org.apache.mahout.common.distance.DistanceMeasure; + +public class DistanceBenchmark { + private final VectorBenchmarks mark; + + public DistanceBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark(final DistanceMeasure measure) { + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), 
SEQ_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), DENSE_FN_RAND); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), DENSE_FN_SEQ); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), RAND_FN_DENSE); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), RAND_FN_SEQ); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), SEQ_FN_DENSE); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); + } + }), measure.getClass().getName(), SEQ_FN_RAND); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java 
b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java new file mode 100644 index 0000000..fc7f911 --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; + +public class DotBenchmark { + private static final String DOT_PRODUCT = "DotProduct"; + private static final String NORM1 = "Norm1"; + private static final String NORM2 = "Norm2"; + private static final String LOG_NORMALIZE = "LogNormalize"; + private final VectorBenchmarks mark; + + public DotBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark() { + benchmarkDot(); + benchmarkNorm1(); + benchmarkNorm2(); + benchmarkLogNormalize(); + } + + private void benchmarkLogNormalize() { + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + return depends(mark.vectors[0][mark.vIndex(i)].logNormalize()); + } + }), LOG_NORMALIZE, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + return depends(mark.vectors[1][mark.vIndex(i)].logNormalize()); + } + }), LOG_NORMALIZE, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + return 
depends(mark.vectors[2][mark.vIndex(i)].logNormalize()); + } + }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR); + } + + private void benchmarkNorm1() { + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[0][mark.vIndex(i)].norm(1); + } + }), NORM1, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[1][mark.vIndex(i)].norm(1); + } + }), NORM1, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[2][mark.vIndex(i)].norm(1); + } + }), NORM1, SEQ_SPARSE_VECTOR); + } + + private void benchmarkNorm2() { + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[0][mark.vIndex(i)].norm(2); + } + }), NORM2, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[1][mark.vIndex(i)].norm(2); + } + }), NORM2, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[2][mark.vIndex(i)].norm(2); + } + }), NORM2, SEQ_SPARSE_VECTOR); + } + + private void benchmarkDot() { + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + 
public Double apply(Integer i) { + return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, SEQ_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, DENSE_FN_RAND); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, DENSE_FN_SEQ); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, RAND_FN_DENSE); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, RAND_FN_SEQ); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, SEQ_FN_DENSE); + + mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { + @Override + public Double apply(Integer i) { + return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); + } + }), DOT_PRODUCT, SEQ_FN_RAND); + } + + public static void main(String[] args) { + VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1); + mark.createData(); + new DotBenchmark(mark).benchmarkNorm2(); + System.out.println(mark); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java 
---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java new file mode 100644 index 0000000..82fb693 --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; +import org.apache.mahout.math.Vector; + +public class MinusBenchmark { + + private static final String MINUS = "Minus"; + private final VectorBenchmarks mark; + + public MinusBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark() { + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, SEQ_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = 
mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, DENSE_FN_RAND); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, DENSE_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, RAND_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, RAND_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, SEQ_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), MINUS, SEQ_FN_RAND); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java new file mode 100644 index 0000000..bd76e94 --- /dev/null +++ 
b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; +import org.apache.mahout.math.Vector; + +public class PlusBenchmark { + + private static final String PLUS = "Plus"; + private final VectorBenchmarks mark; + + public PlusBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark() { + 
mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, SEQ_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, DENSE_FN_RAND); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, DENSE_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, RAND_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, RAND_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = 
mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, SEQ_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), PLUS, SEQ_FN_RAND); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java new file mode 100644 index 0000000..cd403c2 --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.benchmark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.TimingStatistics;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.VectorWritable;

import java.io.IOException;

import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR;
import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR;
import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR;

/**
 * Times writing the benchmark vectors to sequence files under {@code /tmp}
 * and reading them back.
 */
public class SerializationBenchmark {
  public static final String SERIALIZE = "Serialize";
  public static final String DESERIALIZE = "Deserialize";
  private final VectorBenchmarks mark;

  public SerializationBenchmark(VectorBenchmarks mark) {
    this.mark = mark;
  }

  /**
   * Runs the serialization benchmark followed by the deserialization
   * benchmark; the second reads the files written by the first.
   *
   * @throws IOException if a sequence file cannot be written or read
   */
  public void benchmark() throws IOException {
    serializeBenchmark();
    deserializeBenchmark();
  }

  /**
   * Writes vectors of each of the three implementations to its own sequence
   * file and reports the per-append timing statistics.
   *
   * @throws IOException if the file system or a writer fails
   */
  public void serializeBenchmark() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Row indices into mark.vectors: 0, 1 and 2 select the three vector implementations.
    serializeVectors(fs, conf, 0, "/tmp/dense-vector", DENSE_VECTOR);
    serializeVectors(fs, conf, 1, "/tmp/randsparse-vector", RAND_SPARSE_VECTOR);
    serializeVectors(fs, conf, 2, "/tmp/seqsparse-vector", SEQ_SPARSE_VECTOR);
  }

  /**
   * Appends vectors from one row of {@code mark.vectors} to a sequence file,
   * timing every append, until {@code call.end(mark.maxTimeUsec)} signals
   * that the loop should stop or {@code mark.loop} iterations have run.
   *
   * @param fs         file system the sequence file is created on
   * @param conf       Hadoop configuration used by the writer
   * @param implIndex  row of {@code mark.vectors} to serialize
   * @param pathString destination path of the sequence file
   * @param implName   label under which the statistics are recorded
   * @throws IOException if the writer cannot be created, written or closed
   */
  private void serializeVectors(FileSystem fs, Configuration conf, int implIndex,
                                String pathString, String implName) throws IOException {
    // Every record uses the same constant key; only the value changes.
    Writable key = new IntWritable(0);
    VectorWritable vec = new VectorWritable();
    TimingStatistics stats = new TimingStatistics();

    try (SequenceFile.Writer writer =
             new SequenceFile.Writer(fs, conf, new Path(pathString),
                 IntWritable.class, VectorWritable.class)) {
      for (int i = 0; i < mark.loop; i++) {
        TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
        vec.set(mark.vectors[implIndex][mark.vIndex(i)]);
        writer.append(key, vec);
        if (call.end(mark.maxTimeUsec)) {
          break;
        }
      }
    }
    mark.printStats(stats, SERIALIZE, implName);
  }

  /**
   * Reads back the three files produced by {@link #serializeBenchmark()} and
   * reports the per-record timing statistics.
   *
   * @throws IOException if a sequence file cannot be read
   */
  public void deserializeBenchmark() throws IOException {
    doDeserializeBenchmark(DENSE_VECTOR, "/tmp/dense-vector");
    doDeserializeBenchmark(RAND_SPARSE_VECTOR, "/tmp/randsparse-vector");
    doDeserializeBenchmark(SEQ_SPARSE_VECTOR, "/tmp/seqsparse-vector");
  }

  /**
   * Iterates over every value of one sequence file, timing each read.
   *
   * @param name       label under which the statistics are recorded
   * @param pathString path of the sequence file to read
   * @throws IOException if the file cannot be opened, read or closed
   */
  private void doDeserializeBenchmark(String name, String pathString) throws IOException {
    TimingStatistics stats = new TimingStatistics();
    // The first timed call starts before the iterator is constructed, so it
    // also covers opening the file.
    TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec);
    SequenceFileValueIterator<Writable> iterator = new SequenceFileValueIterator<>(new Path(pathString), true,
        new Configuration());
    try {
      while (iterator.hasNext()) {
        iterator.next();
        call.end();
        call = stats.newCall(mark.leadTimeUsec);
      }
    } finally {
      // Close the iterator even when reading fails part-way through.
      iterator.close();
    }
    mark.printStats(stats, DESERIALIZE, name);
  }

}
b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java new file mode 100644 index 0000000..bf81228 --- /dev/null +++ b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.benchmark; + +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; +import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; +import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; + +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; +import org.apache.mahout.math.Vector; + +public class TimesBenchmark { + + private static final String TIMES = "Times"; + private final VectorBenchmarks mark; + + public TimesBenchmark(VectorBenchmarks mark) { + this.mark = mark; + } + + public void benchmark() { + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, DENSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, RAND_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, SEQ_SPARSE_VECTOR); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = 
mark.vectors[0][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, DENSE_FN_RAND); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, DENSE_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, RAND_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, RAND_FN_SEQ); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, SEQ_FN_DENSE); + + mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); + return depends(v); + } + }), TIMES, SEQ_FN_RAND); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java new file mode 100644 index 0000000..a076322 --- /dev/null +++ 
b/community/mahout-mr/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java @@ -0,0 +1,497 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.benchmark; + +import org.apache.commons.cli2.CommandLine; +import org.apache.commons.cli2.Group; +import org.apache.commons.cli2.Option; +import org.apache.commons.cli2.OptionException; +import org.apache.commons.cli2.builder.ArgumentBuilder; +import org.apache.commons.cli2.builder.DefaultOptionBuilder; +import org.apache.commons.cli2.builder.GroupBuilder; +import org.apache.commons.cli2.commandline.Parser; +import org.apache.commons.lang3.StringUtils; +import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; +import org.apache.mahout.common.CommandLineUtil; +import org.apache.mahout.common.RandomUtils; +import org.apache.mahout.common.TimingStatistics; +import org.apache.mahout.common.commandline.DefaultOptionCreator; +import org.apache.mahout.common.distance.ChebyshevDistanceMeasure; +import org.apache.mahout.common.distance.CosineDistanceMeasure; +import org.apache.mahout.common.distance.EuclideanDistanceMeasure; +import org.apache.mahout.common.distance.ManhattanDistanceMeasure; +import 
org.apache.mahout.common.distance.MinkowskiDistanceMeasure; +import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; +import org.apache.mahout.common.distance.TanimotoDistanceMeasure; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.SequentialAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +public class VectorBenchmarks { + private static final int MAX_TIME_MS = 5000; + private static final int LEAD_TIME_MS = 15000; + public static final String CLUSTERS = "Clusters"; + public static final String CREATE_INCREMENTALLY = "Create (incrementally)"; + public static final String CREATE_COPY = "Create (copy)"; + + public static final String DENSE_FN_SEQ = "Dense.fn(Seq)"; + public static final String RAND_FN_DENSE = "Rand.fn(Dense)"; + public static final String SEQ_FN_RAND = "Seq.fn(Rand)"; + public static final String RAND_FN_SEQ = "Rand.fn(Seq)"; + public static final String SEQ_FN_DENSE = "Seq.fn(Dense)"; + public static final String DENSE_FN_RAND = "Dense.fn(Rand)"; + public static final String SEQ_SPARSE_VECTOR = "SeqSparseVector"; + public static final String RAND_SPARSE_VECTOR = "RandSparseVector"; + public static final String DENSE_VECTOR = "DenseVector"; + + private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class); + private static final Pattern TAB_NEWLINE_PATTERN = Pattern.compile("[\n\t]"); + private static final String[] EMPTY = new String[0]; + private static final DecimalFormat DF = new DecimalFormat("#.##"); + + /* 
package private */ + final Vector[][] vectors; + final Vector[] clusters; + final int cardinality; + final int numNonZeros; + final int numVectors; + final int numClusters; + final int loop = Integer.MAX_VALUE; + final int opsPerUnit; + final long maxTimeUsec; + final long leadTimeUsec; + + private final List<Vector> randomVectors = new ArrayList<>(); + private final List<int[]> randomVectorIndices = new ArrayList<>(); + private final List<double[]> randomVectorValues = new ArrayList<>(); + private final Map<String, Integer> implType = new HashMap<>(); + private final Map<String, List<String[]>> statsMap = new HashMap<>(); + private final BenchmarkRunner runner; + private final Random r = RandomUtils.getRandom(); + + public VectorBenchmarks(int cardinality, int numNonZeros, int numVectors, int numClusters, + int opsPerUnit) { + runner = new BenchmarkRunner(LEAD_TIME_MS, MAX_TIME_MS); + maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(MAX_TIME_MS); + leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(LEAD_TIME_MS); + + this.cardinality = cardinality; + this.numNonZeros = numNonZeros; + this.numVectors = numVectors; + this.numClusters = numClusters; + this.opsPerUnit = opsPerUnit; + + setUpVectors(cardinality, numNonZeros, numVectors); + + vectors = new Vector[3][numVectors]; + clusters = new Vector[numClusters]; + } + + private void setUpVectors(int cardinality, int numNonZeros, int numVectors) { + for (int i = 0; i < numVectors; i++) { + Vector v = new SequentialAccessSparseVector(cardinality, numNonZeros); // sparsity! 
+ BitSet featureSpace = new BitSet(cardinality); + int[] indexes = new int[numNonZeros]; + double[] values = new double[numNonZeros]; + int j = 0; + while (j < numNonZeros) { + double value = r.nextGaussian(); + int index = r.nextInt(cardinality); + if (!featureSpace.get(index) && value != 0) { + featureSpace.set(index); + indexes[j] = index; + values[j++] = value; + v.set(index, value); + } + } + randomVectorIndices.add(indexes); + randomVectorValues.add(values); + randomVectors.add(v); + } + } + + void printStats(TimingStatistics stats, String benchmarkName, String implName, String content) { + printStats(stats, benchmarkName, implName, content, 1); + } + + void printStats(TimingStatistics stats, String benchmarkName, String implName) { + printStats(stats, benchmarkName, implName, "", 1); + } + + private void printStats(TimingStatistics stats, String benchmarkName, String implName, + String content, int multiplier) { + float speed = multiplier * stats.getNCalls() * (numNonZeros * 1000.0f * 12 / stats.getSumTime()); + float opsPerSec = stats.getNCalls() * 1000000000.0f / stats.getSumTime(); + log.info("{} {} \n{} {} \nOps = {} Units/sec\nIOps = {} MBytes/sec", benchmarkName, + implName, content, stats.toString(), DF.format(opsPerSec), DF.format(speed)); + + if (!implType.containsKey(implName)) { + implType.put(implName, implType.size()); + } + int implId = implType.get(implName); + if (!statsMap.containsKey(benchmarkName)) { + statsMap.put(benchmarkName, new ArrayList<String[]>()); + } + List<String[]> implStats = statsMap.get(benchmarkName); + while (implStats.size() < implId + 1) { + implStats.add(EMPTY); + } + implStats.set( + implId, + TAB_NEWLINE_PATTERN.split(stats + "\tSpeed = " + DF.format(opsPerSec) + " /sec\tRate = " + + DF.format(speed) + " MB/s")); + } + + public void createData() { + for (int i = 0; i < Math.max(numVectors, numClusters); ++i) { + vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i))); + vectors[1][vIndex(i)] = new 
RandomAccessSparseVector(randomVectors.get(vIndex(i))); + vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i))); + if (numClusters > 0) { + clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); + } + } + } + + public void createBenchmark() { + printStats(runner.benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i))); + return depends(vectors[0][vIndex(i)]); + } + }), CREATE_COPY, DENSE_VECTOR); + + printStats(runner.benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); + return depends(vectors[1][vIndex(i)]); + } + }), CREATE_COPY, RAND_SPARSE_VECTOR); + + printStats(runner.benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i))); + return depends(vectors[2][vIndex(i)]); + } + }), CREATE_COPY, SEQ_SPARSE_VECTOR); + + if (numClusters > 0) { + printStats(runner.benchmark(new BenchmarkFn() { + @Override + public Boolean apply(Integer i) { + clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); + return depends(clusters[cIndex(i)]); + } + }), CREATE_COPY, CLUSTERS); + } + } + + private boolean buildVectorIncrementally(TimingStatistics stats, int randomIndex, Vector v, boolean useSetQuick) { + int[] indexes = randomVectorIndices.get(randomIndex); + double[] values = randomVectorValues.get(randomIndex); + List<Integer> randomOrder = new ArrayList<>(); + for (int i = 0; i < indexes.length; i++) { + randomOrder.add(i); + } + Collections.shuffle(randomOrder); + int[] permutation = new int[randomOrder.size()]; + for (int i = 0; i < randomOrder.size(); i++) { + permutation[i] = randomOrder.get(i); + } + + TimingStatistics.Call call = stats.newCall(leadTimeUsec); + if (useSetQuick) 
{ + for (int i : permutation) { + v.setQuick(indexes[i], values[i]); + } + } else { + for (int i : permutation) { + v.set(indexes[i], values[i]); + } + } + return call.end(maxTimeUsec); + } + + public void incrementalCreateBenchmark() { + TimingStatistics stats = new TimingStatistics(); + for (int i = 0; i < loop; i++) { + vectors[0][vIndex(i)] = new DenseVector(cardinality); + if (buildVectorIncrementally(stats, vIndex(i), vectors[0][vIndex(i)], false)) { + break; + } + } + printStats(stats, CREATE_INCREMENTALLY, DENSE_VECTOR); + + stats = new TimingStatistics(); + for (int i = 0; i < loop; i++) { + vectors[1][vIndex(i)] = new RandomAccessSparseVector(cardinality); + if (buildVectorIncrementally(stats, vIndex(i), vectors[1][vIndex(i)], false)) { + break; + } + } + printStats(stats, CREATE_INCREMENTALLY, RAND_SPARSE_VECTOR); + + stats = new TimingStatistics(); + for (int i = 0; i < loop; i++) { + vectors[2][vIndex(i)] = new SequentialAccessSparseVector(cardinality); + if (buildVectorIncrementally(stats, vIndex(i), vectors[2][vIndex(i)], false)) { + break; + } + } + printStats(stats, CREATE_INCREMENTALLY, SEQ_SPARSE_VECTOR); + + if (numClusters > 0) { + stats = new TimingStatistics(); + for (int i = 0; i < loop; i++) { + clusters[cIndex(i)] = new RandomAccessSparseVector(cardinality); + if (buildVectorIncrementally(stats, vIndex(i), clusters[cIndex(i)], false)) { + break; + } + } + printStats(stats, CREATE_INCREMENTALLY, CLUSTERS); + } + } + + public int vIndex(int i) { + return i % numVectors; + } + + public int cIndex(int i) { + return i % numClusters; + } + + public static void main(String[] args) throws IOException { + DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); + ArgumentBuilder abuilder = new ArgumentBuilder(); + GroupBuilder gbuilder = new GroupBuilder(); + + Option vectorSizeOpt = obuilder + .withLongName("vectorSize") + .withRequired(false) + .withArgument(abuilder.withName("vs").withDefault(1000000).create()) + .withDescription("Cardinality 
of the vector. Default: 1000000").withShortName("vs").create(); + Option numNonZeroOpt = obuilder + .withLongName("numNonZero") + .withRequired(false) + .withArgument(abuilder.withName("nz").withDefault(1000).create()) + .withDescription("Size of the vector. Default: 1000").withShortName("nz").create(); + Option numVectorsOpt = obuilder + .withLongName("numVectors") + .withRequired(false) + .withArgument(abuilder.withName("nv").withDefault(25).create()) + .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create(); + Option numClustersOpt = obuilder + .withLongName("numClusters") + .withRequired(false) + .withArgument(abuilder.withName("nc").withDefault(0).create()) + .withDescription("Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0") + .withShortName("nc").create(); + Option numOpsOpt = obuilder + .withLongName("numOps") + .withRequired(false) + .withArgument(abuilder.withName("numOps").withDefault(10).create()) + .withDescription( + "Number of operations to do per timer. " + + "E.g In distance measure, the distance is calculated numOps times" + + " and the total time is measured. 
Default: 10").withShortName("no").create(); + + Option helpOpt = DefaultOptionCreator.helpOption(); + + Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt) + .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt).create(); + + try { + Parser parser = new Parser(); + parser.setGroup(group); + CommandLine cmdLine = parser.parse(args); + + if (cmdLine.hasOption(helpOpt)) { + CommandLineUtil.printHelpWithGenericOptions(group); + return; + } + + int cardinality = 1000000; + if (cmdLine.hasOption(vectorSizeOpt)) { + cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt)); + + } + + int numClusters = 0; + if (cmdLine.hasOption(numClustersOpt)) { + numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt)); + } + + int numNonZero = 1000; + if (cmdLine.hasOption(numNonZeroOpt)) { + numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt)); + } + + int numVectors = 25; + if (cmdLine.hasOption(numVectorsOpt)) { + numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt)); + + } + + int numOps = 10; + if (cmdLine.hasOption(numOpsOpt)) { + numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt)); + + } + VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps); + runBenchmark(mark); + + // log.info("\n{}", mark); + log.info("\n{}", mark.asCsvString()); + } catch (OptionException e) { + CommandLineUtil.printHelp(group); + } + } + + private static void runBenchmark(VectorBenchmarks mark) throws IOException { + // Required to set up data. + mark.createData(); + + mark.createBenchmark(); + if (mark.cardinality < 200000) { + // Too slow. 
+ mark.incrementalCreateBenchmark(); + } + + new CloneBenchmark(mark).benchmark(); + new DotBenchmark(mark).benchmark(); + new PlusBenchmark(mark).benchmark(); + new MinusBenchmark(mark).benchmark(); + new TimesBenchmark(mark).benchmark(); + new SerializationBenchmark(mark).benchmark(); + + DistanceBenchmark distanceBenchmark = new DistanceBenchmark(mark); + distanceBenchmark.benchmark(new CosineDistanceMeasure()); + distanceBenchmark.benchmark(new SquaredEuclideanDistanceMeasure()); + distanceBenchmark.benchmark(new EuclideanDistanceMeasure()); + distanceBenchmark.benchmark(new ManhattanDistanceMeasure()); + distanceBenchmark.benchmark(new TanimotoDistanceMeasure()); + distanceBenchmark.benchmark(new ChebyshevDistanceMeasure()); + distanceBenchmark.benchmark(new MinkowskiDistanceMeasure()); + + if (mark.numClusters > 0) { + ClosestCentroidBenchmark centroidBenchmark = new ClosestCentroidBenchmark(mark); + centroidBenchmark.benchmark(new CosineDistanceMeasure()); + centroidBenchmark.benchmark(new SquaredEuclideanDistanceMeasure()); + centroidBenchmark.benchmark(new EuclideanDistanceMeasure()); + centroidBenchmark.benchmark(new ManhattanDistanceMeasure()); + centroidBenchmark.benchmark(new TanimotoDistanceMeasure()); + centroidBenchmark.benchmark(new ChebyshevDistanceMeasure()); + centroidBenchmark.benchmark(new MinkowskiDistanceMeasure()); + } + } + + private String asCsvString() { + List<String> keys = new ArrayList<>(statsMap.keySet()); + Collections.sort(keys); + Map<Integer,String> implMap = new HashMap<>(); + for (Entry<String,Integer> e : implType.entrySet()) { + implMap.put(e.getValue(), e.getKey()); + } + + StringBuilder sb = new StringBuilder(1000); + for (String benchmarkName : keys) { + int i = 0; + for (String[] stats : statsMap.get(benchmarkName)) { + if (stats.length < 8) { + continue; + } + sb.append(benchmarkName).append(','); + sb.append(implMap.get(i++)).append(','); + sb.append(stats[7].trim().split("=|/")[1].trim()); + sb.append('\n'); + } + } + 
sb.append('\n'); + return sb.toString(); + } + + @Override + public String toString() { + int pad = 24; + StringBuilder sb = new StringBuilder(1000); + sb.append(StringUtils.rightPad("BenchMarks", pad)); + for (int i = 0; i < implType.size(); i++) { + for (Entry<String,Integer> e : implType.entrySet()) { + if (e.getValue() == i) { + sb.append(StringUtils.rightPad(e.getKey(), pad).substring(0, pad)); + break; + } + } + } + sb.append('\n'); + List<String> keys = new ArrayList<>(statsMap.keySet()); + Collections.sort(keys); + for (String benchmarkName : keys) { + List<String[]> implTokenizedStats = statsMap.get(benchmarkName); + int maxStats = 0; + for (String[] stat : implTokenizedStats) { + maxStats = Math.max(maxStats, stat.length); + } + + for (int i = 0; i < maxStats; i++) { + boolean printedName = false; + for (String[] stats : implTokenizedStats) { + if (i == 0 && !printedName) { + sb.append(StringUtils.rightPad(benchmarkName, pad)); + printedName = true; + } else if (!printedName) { + printedName = true; + sb.append(StringUtils.rightPad("", pad)); + } + if (stats.length > i) { + sb.append(StringUtils.rightPad(stats[i], pad)); + } else { + sb.append(StringUtils.rightPad("", pad)); + } + + } + sb.append('\n'); + } + sb.append('\n'); + } + return sb.toString(); + } + + public BenchmarkRunner getRunner() { + return runner; + } +}
