Repository: mahout Updated Branches: refs/heads/master 0853c069f -> b988c493b
http://git-wip-us.apache.org/repos/asf/mahout/blob/b988c493/mr/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java ---------------------------------------------------------------------- diff --git a/mr/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java b/mr/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java new file mode 100644 index 0000000..c8a8c51 --- /dev/null +++ b/mr/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.hadoop.similarity.cooccurrence.measures; + +import org.apache.mahout.common.ClassUtils; +import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.SequentialAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.junit.Test; + +public class VectorSimilarityMeasuresTest extends MahoutTestCase { + + static double distributedSimilarity(double[] one, + double[] two, + Class<? extends VectorSimilarityMeasure> similarityMeasureClass) { + double rand = computeSimilarity(one, two, similarityMeasureClass, new RandomAccessSparseVector(one.length)); + double seq = computeSimilarity(one, two, similarityMeasureClass, new SequentialAccessSparseVector(one.length)); + double dense = computeSimilarity(one, two, similarityMeasureClass, new DenseVector(one.length)); + assertEquals(seq, rand, 1.0e-10); + assertEquals(seq, dense, 1.0e-10); + assertEquals(dense, rand, 1.0e-10); + return seq; + } + + private static double computeSimilarity(double[] one, double[] two, + Class<? extends VectorSimilarityMeasure> similarityMeasureClass, + Vector like) { + VectorSimilarityMeasure similarityMeasure = ClassUtils.instantiateAs(similarityMeasureClass, + VectorSimilarityMeasure.class); + Vector oneNormalized = similarityMeasure.normalize(asVector(one, like)); + Vector twoNormalized = similarityMeasure.normalize(asVector(two, like)); + + double normOne = similarityMeasure.norm(oneNormalized); + double normTwo = similarityMeasure.norm(twoNormalized); + + double dot = 0; + for (int n = 0; n < one.length; n++) { + if (oneNormalized.get(n) != 0 && twoNormalized.get(n) != 0) { + dot += similarityMeasure.aggregate(oneNormalized.get(n), twoNormalized.get(n)); + } + } + + return similarityMeasure.similarity(dot, normOne, normTwo, one.length); + } + + static Vector asVector(double[] values, Vector like) { + Vector vector = like.like(); + for (int dim = 0; dim < values.length; dim++) { + vector.set(dim, values[dim]); + } + return vector; + } + + @Test + public void testCooccurrenceCountSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 }, + new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CooccurrenceCountSimilarity.class); + + assertEquals(5.0, similarity, 0); + } + + @Test + public void testTanimotoCoefficientSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 }, + new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, TanimotoCoefficientSimilarity.class); + + assertEquals(0.454545455, similarity, EPSILON); + } + + @Test + public void testCityblockSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 }, + new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, CityBlockSimilarity.class); + + assertEquals(0.142857143, similarity, EPSILON); + } + + @Test + public void testLoglikelihoodSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0 }, + new double[] { 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, LoglikelihoodSimilarity.class); + + assertEquals(0.03320155369284261, similarity, EPSILON); + } + + @Test + public void testCosineSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 2, 2, 0 }, + new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 1, 1 }, CosineSimilarity.class); + + assertEquals(0.769846046, similarity, EPSILON); + } + + @Test + public void testPearsonCorrelationSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 }, + new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 3 }, PearsonCorrelationSimilarity.class); + + assertEquals(0.5303300858899108, similarity, EPSILON); + } + + @Test + public void testEuclideanDistanceSimilarity() { + double similarity = distributedSimilarity( + new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 }, + new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 4 }, EuclideanDistanceSimilarity.class); + + assertEquals(0.11268865367232477, similarity, EPSILON); + } +}
