http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/pom.xml ---------------------------------------------------------------------- diff --git a/integration/pom.xml b/integration/pom.xml deleted file mode 100644 index 5a873a6..0000000 --- a/integration/pom.xml +++ /dev/null @@ -1,198 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout</artifactId> - <version>0.13.1-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - - <artifactId>mahout-integration</artifactId> - <name>Mahout Integration</name> - <description>Optional components of Mahout which generally support interaction with third party systems, - formats, APIs, etc.</description> - - <packaging>jar</packaging> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-remote-resources-plugin</artifactId> - <configuration> - <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory> - <resourceBundles> - <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle> - </resourceBundles> - <supplementalModels> - <supplementalModel>supplemental-models.xml</supplementalModel> - </supplementalModels> - </configuration> - </plugin> - - <plugin> - <artifactId>maven-javadoc-plugin</artifactId> - </plugin> - - <plugin> - <artifactId>maven-source-plugin</artifactId> - </plugin> - - </plugins> - - </build> - - <dependencies> - - <!-- own modules --> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-hdfs</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-mr</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-hdfs</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-mr</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-math</artifactId> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>mahout-math</artifactId> - <type>test-jar</type> - <scope>test</scope> - </dependency> - - <!-- 3rd party --> - - <dependency> - <groupId>commons-dbcp</groupId> - <artifactId>commons-dbcp</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>commons-pool</groupId> - <artifactId>commons-pool</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - </dependency> - - <dependency> - <groupId>org.apache.solr</groupId> - <artifactId>solr-commons-csv</artifactId> - <version>3.5.0</version> - </dependency> - - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-benchmark</artifactId> - <optional>true</optional> - </dependency> - <dependency> - <groupId>org.apache.lucene</groupId> - <artifactId>lucene-analyzers-common</artifactId> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.mongodb</groupId> - <artifactId>mongo-java-driver</artifactId> - <version>2.11.2</version> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.mongodb</groupId> - <artifactId>bson</artifactId> - <version>2.11.2</version> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase-client</artifactId> - </dependency> - - <dependency> - <groupId>org.hectorclient</groupId> - <artifactId>hector-core</artifactId> - <version>1.1-4</version> - <optional>true</optional> - </dependency> - - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-jcl</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>com.carrotsearch.randomizedtesting</groupId> - <artifactId>randomizedtesting-runner</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>org.easymock</groupId> - <artifactId>easymock</artifactId> - <scope>test</scope> - </dependency> - - </dependencies> - -</project>
http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java b/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java deleted file mode 100644 index 549cf2c..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/BenchmarkRunner.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import java.util.Random; -import java.util.concurrent.TimeUnit; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.math.Vector; - -import com.google.common.base.Function; - -public final class BenchmarkRunner { - private static final int BUCKET_SIZE = 10000; - private static final Random R = RandomUtils.getRandom(); - private final long maxTimeUsec; - private final long leadTimeUsec; - - public BenchmarkRunner(long leadTimeMs, long maxTimeMs) { - maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(maxTimeMs); - leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(leadTimeMs); - } - - public abstract static class BenchmarkFn implements Function<Integer, Boolean> { - protected int randIndex() { - return BenchmarkRunner.randIndex(); - } - - protected boolean randBool() { - return BenchmarkRunner.randBool(); - } - - /** - * Adds a random data dependency so that JVM does not remove dead code. - */ - protected boolean depends(Vector v) { - return randIndex() < v.getNumNondefaultElements(); - } - } - - public abstract static class BenchmarkFnD implements Function<Integer, Double> { - protected int randIndex() { - return BenchmarkRunner.randIndex(); - } - - protected boolean randBool() { - return BenchmarkRunner.randBool(); - } - - /** - * Adds a random data dependency so that JVM does not remove dead code. - */ - protected boolean depends(Vector v) { - return randIndex() < v.getNumNondefaultElements(); - } - } - - private static int randIndex() { - return R.nextInt(BUCKET_SIZE); - } - - private static boolean randBool() { - return R.nextBoolean(); - } - - public TimingStatistics benchmark(BenchmarkFn function) { - TimingStatistics stats = new TimingStatistics(); - boolean result = false; - while (true) { - int i = R.nextInt(BUCKET_SIZE); - TimingStatistics.Call call = stats.newCall(leadTimeUsec); - result = result ^ function.apply(i); - if (call.end(maxTimeUsec)) { - break; - } - } - return stats; - } - - public TimingStatistics benchmarkD(BenchmarkFnD function) { - TimingStatistics stats = new TimingStatistics(); - double result = 0; - while (true) { - int i = R.nextInt(BUCKET_SIZE); - TimingStatistics.Call call = stats.newCall(leadTimeUsec); - result += function.apply(i); - if (call.end(maxTimeUsec)) { - break; - } - } - // print result to prevent hotspot from eliminating deadcode - System.err.println("Result = " + result); - return stats; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java deleted file mode 100644 index 5e6ab4d..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/CloneBenchmark.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; - -public class CloneBenchmark { - public static final String CLONE = "Clone"; - private final VectorBenchmarks mark; - - public CloneBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[0][mark.vIndex(i)] = mark.vectors[0][mark.vIndex(i)].clone(); - - return depends(mark.vectors[0][mark.vIndex(i)]); - } - }), CLONE, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[1][mark.vIndex(i)] = mark.vectors[1][mark.vIndex(i)].clone(); - - return depends(mark.vectors[1][mark.vIndex(i)]); - } - }), CLONE, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - mark.vectors[2][mark.vIndex(i)] = mark.vectors[2][mark.vIndex(i)].clone(); - - return depends(mark.vectors[2][mark.vIndex(i)]); - } - }), CLONE, SEQ_SPARSE_VECTOR); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java deleted file mode 100644 index b1c2ded..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/ClosestCentroidBenchmark.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import java.io.IOException; -import java.util.Random; - -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.common.distance.DistanceMeasure; -import org.apache.mahout.math.SparseMatrix; -import org.apache.mahout.math.Vector; - -public class ClosestCentroidBenchmark { - private final VectorBenchmarks mark; - - public ClosestCentroidBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark(DistanceMeasure measure) throws IOException { - SparseMatrix clusterDistances = new SparseMatrix(mark.numClusters, mark.numClusters); - for (int i = 0; i < mark.numClusters; i++) { - for (int j = 0; j < mark.numClusters; j++) { - double distance = Double.POSITIVE_INFINITY; - if (i != j) { - distance = measure.distance(mark.clusters[i], mark.clusters[j]); - } - clusterDistances.setQuick(i, j, distance); - } - } - - long distanceCalculations = 0; - TimingStatistics stats = new TimingStatistics(); - for (int l = 0; l < mark.loop; l++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - for (int i = 0; i < mark.numVectors; i++) { - Vector vector = mark.vectors[1][mark.vIndex(i)]; - double minDistance = Double.MAX_VALUE; - for (int k = 0; k < mark.numClusters; k++) { - double distance = measure.distance(vector, mark.clusters[k]); - distanceCalculations++; - if (distance < minDistance) { - minDistance = distance; - } - } - } - if (call.end(mark.maxTimeUsec)) { - break; - } - } - mark.printStats(stats, measure.getClass().getName(), "Closest C w/o Elkan's trick", "distanceCalculations = " - + distanceCalculations); - - distanceCalculations = 0; - stats = new TimingStatistics(); - Random rand = RandomUtils.getRandom(); - for (int l = 0; l < mark.loop; l++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - for (int i = 0; i < mark.numVectors; i++) { - Vector vector = mark.vectors[1][mark.vIndex(i)]; - int closestCentroid = rand.nextInt(mark.numClusters); - double dist = measure.distance(vector, mark.clusters[closestCentroid]); - distanceCalculations++; - for (int k = 0; k < mark.numClusters; k++) { - if (closestCentroid != k) { - double centroidDist = clusterDistances.getQuick(k, closestCentroid); - if (centroidDist < 2 * dist) { - dist = measure.distance(vector, mark.clusters[k]); - closestCentroid = k; - distanceCalculations++; - } - } - } - } - if (call.end(mark.maxTimeUsec)) { - break; - } - } - mark.printStats(stats, measure.getClass().getName(), "Closest C w/ Elkan's trick", "distanceCalculations = " - + distanceCalculations); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java deleted file mode 100644 index 25d0ad7..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/DistanceBenchmark.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; -import org.apache.mahout.common.distance.DistanceMeasure; - -public class DistanceBenchmark { - private final VectorBenchmarks mark; - - public DistanceBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark(final DistanceMeasure measure) { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[0][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[1][mark.vIndex(i)], mark.vectors[2][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[0][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return measure.distance(mark.vectors[2][mark.vIndex(i)], mark.vectors[1][mark.vIndex(randIndex())]); - } - }), measure.getClass().getName(), SEQ_FN_RAND); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java deleted file mode 100644 index fc7f911..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/DotBenchmark.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFnD; - -public class DotBenchmark { - private static final String DOT_PRODUCT = "DotProduct"; - private static final String NORM1 = "Norm1"; - private static final String NORM2 = "Norm2"; - private static final String LOG_NORMALIZE = "LogNormalize"; - private final VectorBenchmarks mark; - - public DotBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - benchmarkDot(); - benchmarkNorm1(); - benchmarkNorm2(); - benchmarkLogNormalize(); - } - - private void benchmarkLogNormalize() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[0][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[1][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - return depends(mark.vectors[2][mark.vIndex(i)].logNormalize()); - } - }), LOG_NORMALIZE, SEQ_SPARSE_VECTOR); - } - - private void benchmarkNorm1() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].norm(1); - } - }), NORM1, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].norm(1); - } - }), NORM1, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].norm(1); - } - }), NORM1, SEQ_SPARSE_VECTOR); - } - - private void benchmarkNorm2() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].norm(2); - } - }), NORM2, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].norm(2); - } - }), NORM2, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].norm(2); - } - }), NORM2, SEQ_SPARSE_VECTOR); - } - - private void benchmarkDot() { - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[0][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[1][mark.vIndex(i)].dot(mark.vectors[2][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[0][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmarkD(new BenchmarkFnD() { - @Override - public Double apply(Integer i) { - return mark.vectors[2][mark.vIndex(i)].dot(mark.vectors[1][mark.vIndex(randIndex())]); - } - }), DOT_PRODUCT, SEQ_FN_RAND); - } - - public static void main(String[] args) { - VectorBenchmarks mark = new VectorBenchmarks(1000000, 100, 1000, 10, 1); - mark.createData(); - new DotBenchmark(mark).benchmarkNorm2(); - System.out.println(mark); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java deleted file mode 100644 index 82fb693..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/MinusBenchmark.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.math.Vector; - -public class MinusBenchmark { - - private static final String MINUS = "Minus"; - private final VectorBenchmarks mark; - - public MinusBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].minus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].minus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), MINUS, SEQ_FN_RAND); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java deleted file mode 100644 index bd76e94..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/PlusBenchmark.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.math.Vector; - -public class PlusBenchmark { - - private static final String PLUS = "Plus"; - private final VectorBenchmarks mark; - - public PlusBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].plus(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].plus(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), PLUS, SEQ_FN_RAND); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java deleted file mode 100644 index cd403c2..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/SerializationBenchmark.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator; -import org.apache.mahout.math.VectorWritable; - -import java.io.IOException; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -public class SerializationBenchmark { - public static final String SERIALIZE = "Serialize"; - public static final String DESERIALIZE = "Deserialize"; - private final VectorBenchmarks mark; - - public SerializationBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() throws IOException { - serializeBenchmark(); - deserializeBenchmark(); - } - - public void serializeBenchmark() throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = FileSystem.get(conf); - - Writable one = new IntWritable(0); - VectorWritable vec = new VectorWritable(); - TimingStatistics stats = new TimingStatistics(); - - try (SequenceFile.Writer writer = - new SequenceFile.Writer(fs, conf, new Path("/tmp/dense-vector"), - IntWritable.class, VectorWritable.class)){ - for (int i = 0; i < mark.loop; i++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - vec.set(mark.vectors[0][mark.vIndex(i)]); - writer.append(one, vec); - if (call.end(mark.maxTimeUsec)) { - break; - } - } - } - mark.printStats(stats, SERIALIZE, DENSE_VECTOR); - - stats = new TimingStatistics(); - try (SequenceFile.Writer writer = - new SequenceFile.Writer(fs, conf, - new Path("/tmp/randsparse-vector"), IntWritable.class, VectorWritable.class)){ - for (int i = 0; i < mark.loop; i++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - vec.set(mark.vectors[1][mark.vIndex(i)]); - writer.append(one, vec); - if (call.end(mark.maxTimeUsec)) { - break; - } - } - } - mark.printStats(stats, SERIALIZE, RAND_SPARSE_VECTOR); - - stats = new TimingStatistics(); - try (SequenceFile.Writer writer = - new SequenceFile.Writer(fs, conf, - new Path("/tmp/seqsparse-vector"), IntWritable.class, VectorWritable.class)) { - for (int i = 0; i < mark.loop; i++) { - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - vec.set(mark.vectors[2][mark.vIndex(i)]); - writer.append(one, vec); - if (call.end(mark.maxTimeUsec)) { - break; - } - } - } - mark.printStats(stats, SERIALIZE, SEQ_SPARSE_VECTOR); - - } - - public void deserializeBenchmark() throws IOException { - doDeserializeBenchmark(DENSE_VECTOR, "/tmp/dense-vector"); - doDeserializeBenchmark(RAND_SPARSE_VECTOR, "/tmp/randsparse-vector"); - doDeserializeBenchmark(SEQ_SPARSE_VECTOR, "/tmp/seqsparse-vector"); - } - - private void doDeserializeBenchmark(String name, String pathString) throws IOException { - TimingStatistics stats = new TimingStatistics(); - TimingStatistics.Call call = stats.newCall(mark.leadTimeUsec); - SequenceFileValueIterator<Writable> iterator = new SequenceFileValueIterator<>(new Path(pathString), true, - new Configuration()); - while (iterator.hasNext()) { - iterator.next(); - call.end(); - call = stats.newCall(mark.leadTimeUsec); - } - iterator.close(); - mark.printStats(stats, DESERIALIZE, name); - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java b/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java deleted file mode 100644 index bf81228..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/TimesBenchmark.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.DENSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_FN_SEQ; -import static org.apache.mahout.benchmark.VectorBenchmarks.RAND_SPARSE_VECTOR; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_DENSE; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_FN_RAND; -import static org.apache.mahout.benchmark.VectorBenchmarks.SEQ_SPARSE_VECTOR; - -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.math.Vector; - -public class TimesBenchmark { - - private static final String TIMES = "Times"; - private final VectorBenchmarks mark; - - public TimesBenchmark(VectorBenchmarks mark) { - this.mark = mark; - } - - public void benchmark() { - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, DENSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, RAND_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, SEQ_SPARSE_VECTOR); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, DENSE_FN_RAND); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[0][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, DENSE_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, RAND_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[1][mark.vIndex(i)].times(mark.vectors[2][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, RAND_FN_SEQ); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[0][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, SEQ_FN_DENSE); - - mark.printStats(mark.getRunner().benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - Vector v = mark.vectors[2][mark.vIndex(i)].times(mark.vectors[1][mark.vIndex(randIndex())]); - return depends(v); - } - }), TIMES, SEQ_FN_RAND); - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java b/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java deleted file mode 100644 index a076322..0000000 --- a/integration/src/main/java/org/apache/mahout/benchmark/VectorBenchmarks.java +++ /dev/null @@ -1,497 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.benchmark; - -import org.apache.commons.cli2.CommandLine; -import org.apache.commons.cli2.Group; -import org.apache.commons.cli2.Option; -import org.apache.commons.cli2.OptionException; -import org.apache.commons.cli2.builder.ArgumentBuilder; -import org.apache.commons.cli2.builder.DefaultOptionBuilder; -import org.apache.commons.cli2.builder.GroupBuilder; -import org.apache.commons.cli2.commandline.Parser; -import org.apache.commons.lang3.StringUtils; -import org.apache.mahout.benchmark.BenchmarkRunner.BenchmarkFn; -import org.apache.mahout.common.CommandLineUtil; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.common.TimingStatistics; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.common.distance.ChebyshevDistanceMeasure; -import org.apache.mahout.common.distance.CosineDistanceMeasure; -import org.apache.mahout.common.distance.EuclideanDistanceMeasure; -import org.apache.mahout.common.distance.ManhattanDistanceMeasure; -import org.apache.mahout.common.distance.MinkowskiDistanceMeasure; -import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure; -import org.apache.mahout.common.distance.TanimotoDistanceMeasure; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.text.DecimalFormat; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Random; -import java.util.concurrent.TimeUnit; -import java.util.regex.Pattern; - -public class VectorBenchmarks { - private static final int MAX_TIME_MS = 5000; - private static final int LEAD_TIME_MS = 15000; - public static final String CLUSTERS = "Clusters"; - public static final String CREATE_INCREMENTALLY = "Create (incrementally)"; - public static final String CREATE_COPY = "Create (copy)"; - - public static final String DENSE_FN_SEQ = "Dense.fn(Seq)"; - public static final String RAND_FN_DENSE = "Rand.fn(Dense)"; - public static final String SEQ_FN_RAND = "Seq.fn(Rand)"; - public static final String RAND_FN_SEQ = "Rand.fn(Seq)"; - public static final String SEQ_FN_DENSE = "Seq.fn(Dense)"; - public static final String DENSE_FN_RAND = "Dense.fn(Rand)"; - public static final String SEQ_SPARSE_VECTOR = "SeqSparseVector"; - public static final String RAND_SPARSE_VECTOR = "RandSparseVector"; - public static final String DENSE_VECTOR = "DenseVector"; - - private static final Logger log = LoggerFactory.getLogger(VectorBenchmarks.class); - private static final Pattern TAB_NEWLINE_PATTERN = Pattern.compile("[\n\t]"); - private static final String[] EMPTY = new String[0]; - private static final DecimalFormat DF = new DecimalFormat("#.##"); - - /* package private */ - final Vector[][] vectors; - final Vector[] clusters; - final int cardinality; - final int numNonZeros; - final int numVectors; - final int numClusters; - final int loop = Integer.MAX_VALUE; - final int opsPerUnit; - final long maxTimeUsec; - final long leadTimeUsec; - - private final List<Vector> randomVectors = new ArrayList<>(); - private final List<int[]> randomVectorIndices = new ArrayList<>(); - private final List<double[]> randomVectorValues = new ArrayList<>(); - private final Map<String, Integer> implType = new HashMap<>(); - private final Map<String, List<String[]>> statsMap = new HashMap<>(); - private final BenchmarkRunner runner; - private final Random r = RandomUtils.getRandom(); - - public VectorBenchmarks(int cardinality, int numNonZeros, int numVectors, int numClusters, - int opsPerUnit) { - runner = new BenchmarkRunner(LEAD_TIME_MS, MAX_TIME_MS); - maxTimeUsec = TimeUnit.MILLISECONDS.toNanos(MAX_TIME_MS); - leadTimeUsec = TimeUnit.MILLISECONDS.toNanos(LEAD_TIME_MS); - - this.cardinality = cardinality; - this.numNonZeros = numNonZeros; - this.numVectors = numVectors; - this.numClusters = numClusters; - this.opsPerUnit = opsPerUnit; - - setUpVectors(cardinality, numNonZeros, numVectors); - - vectors = new Vector[3][numVectors]; - clusters = new Vector[numClusters]; - } - - private void setUpVectors(int cardinality, int numNonZeros, int numVectors) { - for (int i = 0; i < numVectors; i++) { - Vector v = new SequentialAccessSparseVector(cardinality, numNonZeros); // sparsity! - BitSet featureSpace = new BitSet(cardinality); - int[] indexes = new int[numNonZeros]; - double[] values = new double[numNonZeros]; - int j = 0; - while (j < numNonZeros) { - double value = r.nextGaussian(); - int index = r.nextInt(cardinality); - if (!featureSpace.get(index) && value != 0) { - featureSpace.set(index); - indexes[j] = index; - values[j++] = value; - v.set(index, value); - } - } - randomVectorIndices.add(indexes); - randomVectorValues.add(values); - randomVectors.add(v); - } - } - - void printStats(TimingStatistics stats, String benchmarkName, String implName, String content) { - printStats(stats, benchmarkName, implName, content, 1); - } - - void printStats(TimingStatistics stats, String benchmarkName, String implName) { - printStats(stats, benchmarkName, implName, "", 1); - } - - private void printStats(TimingStatistics stats, String benchmarkName, String implName, - String content, int multiplier) { - float speed = multiplier * stats.getNCalls() * (numNonZeros * 1000.0f * 12 / stats.getSumTime()); - float opsPerSec = stats.getNCalls() * 1000000000.0f / stats.getSumTime(); - log.info("{} {} \n{} {} \nOps = {} Units/sec\nIOps = {} MBytes/sec", benchmarkName, - implName, content, stats.toString(), DF.format(opsPerSec), DF.format(speed)); - - if (!implType.containsKey(implName)) { - implType.put(implName, implType.size()); - } - int implId = implType.get(implName); - if (!statsMap.containsKey(benchmarkName)) { - statsMap.put(benchmarkName, new ArrayList<String[]>()); - } - List<String[]> implStats = statsMap.get(benchmarkName); - while (implStats.size() < implId + 1) { - implStats.add(EMPTY); - } - implStats.set( - implId, - TAB_NEWLINE_PATTERN.split(stats + "\tSpeed = " + DF.format(opsPerSec) + " /sec\tRate = " - + DF.format(speed) + " MB/s")); - } - - public void createData() { - for (int i = 0; i < Math.max(numVectors, numClusters); ++i) { - vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i))); - vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); - vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i))); - if (numClusters > 0) { - clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); - } - } - } - - public void createBenchmark() { - printStats(runner.benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - vectors[0][vIndex(i)] = new DenseVector(randomVectors.get(vIndex(i))); - return depends(vectors[0][vIndex(i)]); - } - }), CREATE_COPY, DENSE_VECTOR); - - printStats(runner.benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - vectors[1][vIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); - return depends(vectors[1][vIndex(i)]); - } - }), CREATE_COPY, RAND_SPARSE_VECTOR); - - printStats(runner.benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - vectors[2][vIndex(i)] = new SequentialAccessSparseVector(randomVectors.get(vIndex(i))); - return depends(vectors[2][vIndex(i)]); - } - }), CREATE_COPY, SEQ_SPARSE_VECTOR); - - if (numClusters > 0) { - printStats(runner.benchmark(new BenchmarkFn() { - @Override - public Boolean apply(Integer i) { - clusters[cIndex(i)] = new RandomAccessSparseVector(randomVectors.get(vIndex(i))); - return depends(clusters[cIndex(i)]); - } - }), CREATE_COPY, CLUSTERS); - } - } - - private boolean buildVectorIncrementally(TimingStatistics stats, int randomIndex, Vector v, boolean useSetQuick) { - int[] indexes = randomVectorIndices.get(randomIndex); - double[] values = randomVectorValues.get(randomIndex); - List<Integer> randomOrder = new ArrayList<>(); - for (int i = 0; i < indexes.length; i++) { - randomOrder.add(i); - } - Collections.shuffle(randomOrder); - int[] permutation = new int[randomOrder.size()]; - for (int i = 0; i < randomOrder.size(); i++) { - permutation[i] = randomOrder.get(i); - } - - TimingStatistics.Call call = stats.newCall(leadTimeUsec); - if (useSetQuick) { - for (int i : permutation) { - v.setQuick(indexes[i], values[i]); - } - } else { - for (int i : permutation) { - v.set(indexes[i], values[i]); - } - } - return call.end(maxTimeUsec); - } - - public void incrementalCreateBenchmark() { - TimingStatistics stats = new TimingStatistics(); - for (int i = 0; i < loop; i++) { - vectors[0][vIndex(i)] = new DenseVector(cardinality); - if (buildVectorIncrementally(stats, vIndex(i), vectors[0][vIndex(i)], false)) { - break; - } - } - printStats(stats, CREATE_INCREMENTALLY, DENSE_VECTOR); - - stats = new TimingStatistics(); - for (int i = 0; i < loop; i++) { - vectors[1][vIndex(i)] = new RandomAccessSparseVector(cardinality); - if (buildVectorIncrementally(stats, vIndex(i), vectors[1][vIndex(i)], false)) { - break; - } - } - printStats(stats, CREATE_INCREMENTALLY, RAND_SPARSE_VECTOR); - - stats = new TimingStatistics(); - for (int i = 0; i < loop; i++) { - vectors[2][vIndex(i)] = new SequentialAccessSparseVector(cardinality); - if (buildVectorIncrementally(stats, vIndex(i), vectors[2][vIndex(i)], false)) { - break; - } - } - printStats(stats, CREATE_INCREMENTALLY, SEQ_SPARSE_VECTOR); - - if (numClusters > 0) { - stats = new TimingStatistics(); - for (int i = 0; i < loop; i++) { - clusters[cIndex(i)] = new RandomAccessSparseVector(cardinality); - if (buildVectorIncrementally(stats, vIndex(i), clusters[cIndex(i)], false)) { - break; - } - } - printStats(stats, CREATE_INCREMENTALLY, CLUSTERS); - } - } - - public int vIndex(int i) { - return i % numVectors; - } - - public int cIndex(int i) { - return i % numClusters; - } - - public static void main(String[] args) throws IOException { - DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); - ArgumentBuilder abuilder = new ArgumentBuilder(); - GroupBuilder gbuilder = new GroupBuilder(); - - Option vectorSizeOpt = obuilder - .withLongName("vectorSize") - .withRequired(false) - .withArgument(abuilder.withName("vs").withDefault(1000000).create()) - .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create(); - Option numNonZeroOpt = obuilder - .withLongName("numNonZero") - .withRequired(false) - .withArgument(abuilder.withName("nz").withDefault(1000).create()) - .withDescription("Size of the vector. Default: 1000").withShortName("nz").create(); - Option numVectorsOpt = obuilder - .withLongName("numVectors") - .withRequired(false) - .withArgument(abuilder.withName("nv").withDefault(25).create()) - .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create(); - Option numClustersOpt = obuilder - .withLongName("numClusters") - .withRequired(false) - .withArgument(abuilder.withName("nc").withDefault(0).create()) - .withDescription("Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0") - .withShortName("nc").create(); - Option numOpsOpt = obuilder - .withLongName("numOps") - .withRequired(false) - .withArgument(abuilder.withName("numOps").withDefault(10).create()) - .withDescription( - "Number of operations to do per timer. " - + "E.g In distance measure, the distance is calculated numOps times" - + " and the total time is measured. Default: 10").withShortName("no").create(); - - Option helpOpt = DefaultOptionCreator.helpOption(); - - Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt) - .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt).create(); - - try { - Parser parser = new Parser(); - parser.setGroup(group); - CommandLine cmdLine = parser.parse(args); - - if (cmdLine.hasOption(helpOpt)) { - CommandLineUtil.printHelpWithGenericOptions(group); - return; - } - - int cardinality = 1000000; - if (cmdLine.hasOption(vectorSizeOpt)) { - cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt)); - - } - - int numClusters = 0; - if (cmdLine.hasOption(numClustersOpt)) { - numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt)); - } - - int numNonZero = 1000; - if (cmdLine.hasOption(numNonZeroOpt)) { - numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt)); - } - - int numVectors = 25; - if (cmdLine.hasOption(numVectorsOpt)) { - numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt)); - - } - - int numOps = 10; - if (cmdLine.hasOption(numOpsOpt)) { - numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt)); - - } - VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps); - runBenchmark(mark); - - // log.info("\n{}", mark); - log.info("\n{}", mark.asCsvString()); - } catch (OptionException e) { - CommandLineUtil.printHelp(group); - } - } - - private static void runBenchmark(VectorBenchmarks mark) throws IOException { - // Required to set up data. - mark.createData(); - - mark.createBenchmark(); - if (mark.cardinality < 200000) { - // Too slow. - mark.incrementalCreateBenchmark(); - } - - new CloneBenchmark(mark).benchmark(); - new DotBenchmark(mark).benchmark(); - new PlusBenchmark(mark).benchmark(); - new MinusBenchmark(mark).benchmark(); - new TimesBenchmark(mark).benchmark(); - new SerializationBenchmark(mark).benchmark(); - - DistanceBenchmark distanceBenchmark = new DistanceBenchmark(mark); - distanceBenchmark.benchmark(new CosineDistanceMeasure()); - distanceBenchmark.benchmark(new SquaredEuclideanDistanceMeasure()); - distanceBenchmark.benchmark(new EuclideanDistanceMeasure()); - distanceBenchmark.benchmark(new ManhattanDistanceMeasure()); - distanceBenchmark.benchmark(new TanimotoDistanceMeasure()); - distanceBenchmark.benchmark(new ChebyshevDistanceMeasure()); - distanceBenchmark.benchmark(new MinkowskiDistanceMeasure()); - - if (mark.numClusters > 0) { - ClosestCentroidBenchmark centroidBenchmark = new ClosestCentroidBenchmark(mark); - centroidBenchmark.benchmark(new CosineDistanceMeasure()); - centroidBenchmark.benchmark(new SquaredEuclideanDistanceMeasure()); - centroidBenchmark.benchmark(new EuclideanDistanceMeasure()); - centroidBenchmark.benchmark(new ManhattanDistanceMeasure()); - centroidBenchmark.benchmark(new TanimotoDistanceMeasure()); - centroidBenchmark.benchmark(new ChebyshevDistanceMeasure()); - centroidBenchmark.benchmark(new MinkowskiDistanceMeasure()); - } - } - - private String asCsvString() { - List<String> keys = new ArrayList<>(statsMap.keySet()); - Collections.sort(keys); - Map<Integer,String> implMap = new HashMap<>(); - for (Entry<String,Integer> e : implType.entrySet()) { - implMap.put(e.getValue(), e.getKey()); - } - - StringBuilder sb = new StringBuilder(1000); - for (String benchmarkName : keys) { - int i = 0; - for (String[] stats : statsMap.get(benchmarkName)) { - if (stats.length < 8) { - continue; - } - sb.append(benchmarkName).append(','); - sb.append(implMap.get(i++)).append(','); - sb.append(stats[7].trim().split("=|/")[1].trim()); - sb.append('\n'); - } - } - sb.append('\n'); - return sb.toString(); - } - - @Override - public String toString() { - int pad = 24; - StringBuilder sb = new StringBuilder(1000); - sb.append(StringUtils.rightPad("BenchMarks", pad)); - for (int i = 0; i < implType.size(); i++) { - for (Entry<String,Integer> e : implType.entrySet()) { - if (e.getValue() == i) { - sb.append(StringUtils.rightPad(e.getKey(), pad).substring(0, pad)); - break; - } - } - } - sb.append('\n'); - List<String> keys = new ArrayList<>(statsMap.keySet()); - Collections.sort(keys); - for (String benchmarkName : keys) { - List<String[]> implTokenizedStats = statsMap.get(benchmarkName); - int maxStats = 0; - for (String[] stat : implTokenizedStats) { - maxStats = Math.max(maxStats, stat.length); - } - - for (int i = 0; i < maxStats; i++) { - boolean printedName = false; - for (String[] stats : implTokenizedStats) { - if (i == 0 && !printedName) { - sb.append(StringUtils.rightPad(benchmarkName, pad)); - printedName = true; - } else if (!printedName) { - printedName = true; - sb.append(StringUtils.rightPad("", pad)); - } - if (stats.length > i) { - sb.append(StringUtils.rightPad(stats[i], pad)); - } else { - sb.append(StringUtils.rightPad("", pad)); - } - - } - sb.append('\n'); - } - sb.append('\n'); - } - return sb.toString(); - } - - public BenchmarkRunner getRunner() { - return runner; - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/e0573de3/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java b/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java deleted file mode 100644 index b220993..0000000 --- a/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/cassandra/CassandraDataModel.java +++ /dev/null @@ -1,465 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf.taste.impl.model.cassandra; - -import com.google.common.base.Preconditions; -import me.prettyprint.cassandra.model.HColumnImpl; -import me.prettyprint.cassandra.serializers.BytesArraySerializer; -import me.prettyprint.cassandra.serializers.FloatSerializer; -import me.prettyprint.cassandra.serializers.LongSerializer; -import me.prettyprint.cassandra.service.OperationType; -import me.prettyprint.hector.api.Cluster; -import me.prettyprint.hector.api.ConsistencyLevelPolicy; -import me.prettyprint.hector.api.HConsistencyLevel; -import me.prettyprint.hector.api.Keyspace; -import me.prettyprint.hector.api.beans.ColumnSlice; -import me.prettyprint.hector.api.beans.HColumn; -import me.prettyprint.hector.api.factory.HFactory; -import me.prettyprint.hector.api.mutation.Mutator; -import me.prettyprint.hector.api.query.ColumnQuery; -import me.prettyprint.hector.api.query.CountQuery; -import me.prettyprint.hector.api.query.SliceQuery; -import org.apache.mahout.cf.taste.common.NoSuchItemException; -import org.apache.mahout.cf.taste.common.NoSuchUserException; -import org.apache.mahout.cf.taste.common.Refreshable; -import org.apache.mahout.cf.taste.common.TasteException; -import org.apache.mahout.cf.taste.impl.common.Cache; -import org.apache.mahout.cf.taste.impl.common.FastIDSet; -import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; -import org.apache.mahout.cf.taste.impl.common.Retriever; -import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray; -import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; -import org.apache.mahout.cf.taste.model.DataModel; -import org.apache.mahout.cf.taste.model.PreferenceArray; - -import java.io.Closeable; -import java.util.Collection; -import java.util.List; -import java.util.concurrent.atomic.AtomicReference; - -/** - * <p>A {@link DataModel} based on a Cassandra keyspace. By default it uses keyspace "recommender" but this - * can be configured. Create the keyspace before using this class; this can be done on the Cassandra command - * line with a command linke {@code create keyspace recommender;}.</p> - * - * <p>Within the keyspace, this model uses four column families:</p> - * - * <p>First, it uses a column family called "users". This is keyed by the user ID as an 8-byte long. - * It contains a column for every preference the user expresses. The column name is item ID, again as - * an 8-byte long, and value is a floating point value represnted as an IEEE 32-bit floating poitn value.</p> - * - * <p>It uses an analogous column family called "items" for the same data, but keyed by item ID rather - * than user ID. In this column family, column names are user IDs instead.</p> - * - * <p>It uses a column family called "userIDs" as well, with an identical schema. It has one row under key - * 0. IT contains a column for every user ID in th emodel. It has no values.</p> - * - * <p>Finally it also uses an analogous column family "itemIDs" containing item IDs.</p> - * - * <p>Each of these four column families needs to be created ahead of time. Again the - * Cassandra CLI can be used to do so, with commands like {@code create column family users;}.</p> - * - * <p>Note that this thread uses a long-lived Cassandra client which will run until terminated. You - * must {@link #close()} this implementation when done or the JVM will not terminate.</p> - * - * <p>This implementation still relies heavily on reading data into memory and caching, - * as it remains too data-intensive to be effective even against Cassandra. It will take some time to - * "warm up" as the first few requests will block loading user and item data into caches. This is still going - * to send a great deal of query traffic to Cassandra. It would be advisable to employ caching wrapper - * classes in your implementation, like {@link org.apache.mahout.cf.taste.impl.recommender.CachingRecommender} - * or {@link org.apache.mahout.cf.taste.impl.similarity.CachingItemSimilarity}.</p> - */ -public final class CassandraDataModel implements DataModel, Closeable { - - /** Default Cassandra host. Default: localhost */ - private static final String DEFAULT_HOST = "localhost"; - - /** Default Cassandra port. Default: 9160 */ - private static final int DEFAULT_PORT = 9160; - - /** Default Cassandra keyspace. Default: recommender */ - private static final String DEFAULT_KEYSPACE = "recommender"; - - static final String USERS_CF = "users"; - static final String ITEMS_CF = "items"; - static final String USER_IDS_CF = "userIDs"; - static final String ITEM_IDS_CF = "itemIDs"; - private static final long ID_ROW_KEY = 0L; - private static final byte[] EMPTY = new byte[0]; - - private final Cluster cluster; - private final Keyspace keyspace; - private final Cache<Long,PreferenceArray> userCache; - private final Cache<Long,PreferenceArray> itemCache; - private final Cache<Long,FastIDSet> itemIDsFromUserCache; - private final Cache<Long,FastIDSet> userIDsFromItemCache; - private final AtomicReference<Integer> userCountCache; - private final AtomicReference<Integer> itemCountCache; - - /** - * Uses the standard Cassandra host and port (localhost:9160), and keyspace name ("recommender"). - */ - public CassandraDataModel() { - this(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_KEYSPACE); - } - - /** - * @param host Cassandra server host name - * @param port Cassandra server port - * @param keyspaceName name of Cassandra keyspace to use - */ - public CassandraDataModel(String host, int port, String keyspaceName) { - - Preconditions.checkNotNull(host); - Preconditions.checkArgument(port > 0, "port must be greater then 0!"); - Preconditions.checkNotNull(keyspaceName); - - cluster = HFactory.getOrCreateCluster(CassandraDataModel.class.getSimpleName(), host + ':' + port); - keyspace = HFactory.createKeyspace(keyspaceName, cluster); - keyspace.setConsistencyLevelPolicy(new OneConsistencyLevelPolicy()); - - userCache = new Cache<>(new UserPrefArrayRetriever(), 1 << 20); - itemCache = new Cache<>(new ItemPrefArrayRetriever(), 1 << 20); - itemIDsFromUserCache = new Cache<>(new ItemIDsFromUserRetriever(), 1 << 20); - userIDsFromItemCache = new Cache<>(new UserIDsFromItemRetriever(), 1 << 20); - userCountCache = new AtomicReference<>(null); - itemCountCache = new AtomicReference<>(null); - } - - @Override - public LongPrimitiveIterator getUserIDs() { - SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(USER_IDS_CF); - query.setKey(ID_ROW_KEY); - FastIDSet userIDs = new FastIDSet(); - for (HColumn<Long,?> userIDColumn : query.execute().get().getColumns()) { - userIDs.add(userIDColumn.getName()); - } - return userIDs.iterator(); - } - - @Override - public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { - return userCache.get(userID); - } - - @Override - public FastIDSet getItemIDsFromUser(long userID) throws TasteException { - return itemIDsFromUserCache.get(userID); - } - - @Override - public LongPrimitiveIterator getItemIDs() { - SliceQuery<Long,Long,?> query = buildNoValueSliceQuery(ITEM_IDS_CF); - query.setKey(ID_ROW_KEY); - FastIDSet itemIDs = new FastIDSet(); - for (HColumn<Long,?> itemIDColumn : query.execute().get().getColumns()) { - itemIDs.add(itemIDColumn.getName()); - } - return itemIDs.iterator(); - } - - @Override - public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { - return itemCache.get(itemID); - } - - @Override - public Float getPreferenceValue(long userID, long itemID) { - ColumnQuery<Long,Long,Float> query = - HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get()); - query.setColumnFamily(USERS_CF); - query.setKey(userID); - query.setName(itemID); - HColumn<Long,Float> column = query.execute().get(); - return column == null ? null : column.getValue(); - } - - @Override - public Long getPreferenceTime(long userID, long itemID) { - ColumnQuery<Long,Long,?> query = - HFactory.createColumnQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get()); - query.setColumnFamily(USERS_CF); - query.setKey(userID); - query.setName(itemID); - HColumn<Long,?> result = query.execute().get(); - return result == null ? null : result.getClock(); - } - - @Override - public int getNumItems() { - Integer itemCount = itemCountCache.get(); - if (itemCount == null) { - CountQuery<Long,Long> countQuery = - HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get()); - countQuery.setKey(ID_ROW_KEY); - countQuery.setColumnFamily(ITEM_IDS_CF); - countQuery.setRange(null, null, Integer.MAX_VALUE); - itemCount = countQuery.execute().get(); - itemCountCache.set(itemCount); - } - return itemCount; - } - - @Override - public int getNumUsers() { - Integer userCount = userCountCache.get(); - if (userCount == null) { - CountQuery<Long,Long> countQuery = - HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get()); - countQuery.setKey(ID_ROW_KEY); - countQuery.setColumnFamily(USER_IDS_CF); - countQuery.setRange(null, null, Integer.MAX_VALUE); - userCount = countQuery.execute().get(); - userCountCache.set(userCount); - } - return userCount; - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { - /* - CountQuery<Long,Long> query = HFactory.createCountQuery(keyspace, LongSerializer.get(), LongSerializer.get()); - query.setColumnFamily(ITEMS_CF); - query.setKey(itemID); - query.setRange(null, null, Integer.MAX_VALUE); - return query.execute().get(); - */ - return userIDsFromItemCache.get(itemID).size(); - } - - @Override - public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { - FastIDSet userIDs1 = userIDsFromItemCache.get(itemID1); - FastIDSet userIDs2 = userIDsFromItemCache.get(itemID2); - return userIDs1.size() < userIDs2.size() - ? userIDs2.intersectionSize(userIDs1) - : userIDs1.intersectionSize(userIDs2); - } - - @Override - public void setPreference(long userID, long itemID, float value) { - - if (Float.isNaN(value)) { - value = 1.0f; - } - - long now = System.currentTimeMillis(); - - Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get()); - - HColumn<Long,Float> itemForUsers = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get()); - itemForUsers.setName(itemID); - itemForUsers.setClock(now); - itemForUsers.setValue(value); - mutator.addInsertion(userID, USERS_CF, itemForUsers); - - HColumn<Long,Float> userForItems = new HColumnImpl<>(LongSerializer.get(), FloatSerializer.get()); - userForItems.setName(userID); - userForItems.setClock(now); - userForItems.setValue(value); - mutator.addInsertion(itemID, ITEMS_CF, userForItems); - - HColumn<Long,byte[]> userIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get()); - userIDs.setName(userID); - userIDs.setClock(now); - userIDs.setValue(EMPTY); - mutator.addInsertion(ID_ROW_KEY, USER_IDS_CF, userIDs); - - HColumn<Long,byte[]> itemIDs = new HColumnImpl<>(LongSerializer.get(), BytesArraySerializer.get()); - itemIDs.setName(itemID); - itemIDs.setClock(now); - itemIDs.setValue(EMPTY); - mutator.addInsertion(ID_ROW_KEY, ITEM_IDS_CF, itemIDs); - - mutator.execute(); - } - - @Override - public void removePreference(long userID, long itemID) { - Mutator<Long> mutator = HFactory.createMutator(keyspace, LongSerializer.get()); - mutator.addDeletion(userID, USERS_CF, itemID, LongSerializer.get()); - mutator.addDeletion(itemID, ITEMS_CF, userID, LongSerializer.get()); - mutator.execute(); - // Not deleting from userIDs, itemIDs though - } - - /** - * @return true - */ - @Override - public boolean hasPreferenceValues() { - return true; - } - - /** - * @return Float#NaN - */ - @Override - public float getMaxPreference() { - return Float.NaN; - } - - /** - * @return Float#NaN - */ - @Override - public float getMinPreference() { - return Float.NaN; - } - - @Override - public void refresh(Collection<Refreshable> alreadyRefreshed) { - userCache.clear(); - itemCache.clear(); - userIDsFromItemCache.clear(); - itemIDsFromUserCache.clear(); - userCountCache.set(null); - itemCountCache.set(null); - } - - @Override - public String toString() { - return "CassandraDataModel[" + keyspace + ']'; - } - - @Override - public void close() { - HFactory.shutdownCluster(cluster); - } - - - private SliceQuery<Long,Long,byte[]> buildNoValueSliceQuery(String cf) { - SliceQuery<Long,Long,byte[]> query = - HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), BytesArraySerializer.get()); - query.setColumnFamily(cf); - query.setRange(null, null, false, Integer.MAX_VALUE); - return query; - } - - private SliceQuery<Long,Long,Float> buildValueSliceQuery(String cf) { - SliceQuery<Long,Long,Float> query = - HFactory.createSliceQuery(keyspace, LongSerializer.get(), LongSerializer.get(), FloatSerializer.get()); - query.setColumnFamily(cf); - query.setRange(null, null, false, Integer.MAX_VALUE); - return query; - } - - - private static final class OneConsistencyLevelPolicy implements ConsistencyLevelPolicy { - @Override - public HConsistencyLevel get(OperationType op) { - return HConsistencyLevel.ONE; - } - - @Override - public HConsistencyLevel get(OperationType op, String cfName) { - return HConsistencyLevel.ONE; - } - } - - private final class UserPrefArrayRetriever implements Retriever<Long, PreferenceArray> { - @Override - public PreferenceArray get(Long userID) throws TasteException { - SliceQuery<Long,Long,Float> query = buildValueSliceQuery(USERS_CF); - query.setKey(userID); - - ColumnSlice<Long,Float> result = query.execute().get(); - if (result == null) { - throw new NoSuchUserException(userID); - } - List<HColumn<Long,Float>> itemIDColumns = result.getColumns(); - if (itemIDColumns.isEmpty()) { - throw new NoSuchUserException(userID); - } - int size = itemIDColumns.size(); - PreferenceArray prefs = new GenericUserPreferenceArray(size); - prefs.setUserID(0, userID); - for (int i = 0; i < size; i++) { - HColumn<Long,Float> itemIDColumn = itemIDColumns.get(i); - prefs.setItemID(i, itemIDColumn.getName()); - prefs.setValue(i, itemIDColumn.getValue()); - } - return prefs; - } - } - - private final class ItemPrefArrayRetriever implements Retriever<Long, PreferenceArray> { - @Override - public PreferenceArray get(Long itemID) throws TasteException { - SliceQuery<Long,Long,Float> query = buildValueSliceQuery(ITEMS_CF); - query.setKey(itemID); - ColumnSlice<Long,Float> result = query.execute().get(); - if (result == null) { - throw new NoSuchItemException(itemID); - } - List<HColumn<Long,Float>> userIDColumns = result.getColumns(); - if (userIDColumns.isEmpty()) { - throw new NoSuchItemException(itemID); - } - int size = userIDColumns.size(); - PreferenceArray prefs = new GenericItemPreferenceArray(size); - prefs.setItemID(0, itemID); - for (int i = 0; i < size; i++) { - HColumn<Long,Float> userIDColumn = userIDColumns.get(i); - prefs.setUserID(i, userIDColumn.getName()); - prefs.setValue(i, userIDColumn.getValue()); - } - return prefs; - } - } - - private final class UserIDsFromItemRetriever implements Retriever<Long, FastIDSet> { - @Override - public FastIDSet get(Long itemID) throws TasteException { - SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(ITEMS_CF); - query.setKey(itemID); - ColumnSlice<Long,byte[]> result = query.execute().get(); - if (result == null) { - throw new NoSuchItemException(itemID); - } - List<HColumn<Long,byte[]>> columns = result.getColumns(); - FastIDSet userIDs = new FastIDSet(columns.size()); - for (HColumn<Long,?> userIDColumn : columns) { - userIDs.add(userIDColumn.getName()); - } - return userIDs; - } - } - - private final class ItemIDsFromUserRetriever implements Retriever<Long, FastIDSet> { - @Override - public FastIDSet get(Long userID) throws TasteException { - SliceQuery<Long,Long,byte[]> query = buildNoValueSliceQuery(USERS_CF); - query.setKey(userID); - FastIDSet itemIDs = new FastIDSet(); - ColumnSlice<Long,byte[]> result = query.execute().get(); - if (result == null) { - throw new NoSuchUserException(userID); - } - List<HColumn<Long,byte[]>> columns = result.getColumns(); - if (columns.isEmpty()) { - throw new NoSuchUserException(userID); - } - for (HColumn<Long,?> itemIDColumn : columns) { - itemIDs.add(itemIDColumn.getName()); - } - return itemIDs; - } - } - -}
