This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-mudrod.git
The following commit(s) were added to refs/heads/master by this push:
new 92bb65e SDAP-186 : Add Cosine distance as vector similarity measure (#44)
92bb65e is described below
commit 92bb65e5223ecb2851ec343edf28977577b100e8
Author: Kevin Ratnasekera <[email protected]>
AuthorDate: Wed Mar 6 22:10:11 2019 +0530
SDAP-186 : Add Cosine distance as vector similarity measure (#44)
---
.../apache/sdap/mudrod/utils/SimilarityUtil.java | 20 ++++++-
.../sdap/mudrod/utils/TestSimilarityUtils.java | 66 ++++++++++++++++++++++
2 files changed, 83 insertions(+), 3 deletions(-)
diff --git a/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java b/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
index 2fcdaf6..71179dc 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
@@ -104,6 +104,8 @@ public class SimilarityUtil {
weight = SimilarityUtil.pearsonDistance(vecA, vecB);
} else if (simType == SimilarityUtil.SIM_HELLINGER) {
weight = SimilarityUtil.hellingerDistance(vecA, vecB);
+ } else if (simType == SimilarityUtil.SIM_COSINE) {
+ weight = SimilarityUtil.cosineDistance(vecA, vecB);
}
LinkageTriple triple = new LinkageTriple();
@@ -265,13 +267,25 @@ public class SimilarityUtil {
}
/**
- * calculate similarity between vectors
+ * Calculate similarity (Cosine Distance) between vectors.
*
* @param vecA initial vector from which to calculate a similarity
* @param vecB second vector involved in similarity calculation
- * @return similarity between two vectors
+ * @return Cosine similarity between two vectors
*/
public static double cosineDistance(Vector vecA, Vector vecB) {
- return 1;
+ double[] arrA = vecA.toArray();
+ double[] arrB = vecB.toArray();
+
+ double dotProduct = 0.0;
+ double normA = 0.0;
+ double normB = 0.0;
+ for (int i = 0; i < arrA.length; i++) {
+ dotProduct += arrA[i] * arrB[i];
+ normA += Math.pow(arrA[i], 2);
+ normB += Math.pow(arrB[i], 2);
+ }
+
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
}
diff --git a/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java b/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java
new file mode 100644
index 0000000..cae1ba1
--- /dev/null
+++ b/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sdap.mudrod.utils;
+
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test cases for Similarity Utility class. {@link org.apache.sdap.mudrod.utils.SimilarityUtil}
+ */
+public class TestSimilarityUtils {
+
+ @Test
+ public final void testCosineSimilarity() {
+
+ // Vector A :[2, 2, 3, 5]
+ // Vector B :[4, 1, 2, 1]
+
+ double[] vecAValues = {2, 2, 3, 5};
+ double[] vecBValues = {4, 1, 2, 1};
+ Vector vecA = Vectors.dense(vecAValues);
+ Vector vecB = Vectors.dense(vecBValues);
+ double cosineDistance = SimilarityUtil.cosineDistance(vecA, vecB);
+ double expectedValue = 0.6908492797077574;
+ Assert.assertEquals("Cosine similarity calculation failed for 4-D vectors.",
+ new Double(expectedValue), new Double(cosineDistance));
+
+ // Vector C :[1, 2]
+ // Vector D :[4, 5]
+
+ double[] vecCValues = {1, 2};
+ double[] vecDValues = {4, 5};
+ Vector vecC = Vectors.dense(vecCValues);
+ Vector vecD = Vectors.dense(vecDValues);
+ cosineDistance = SimilarityUtil.cosineDistance(vecC, vecD);
+ expectedValue = 0.9778024140774094;
+ Assert.assertEquals("Cosine similarity calculation failed for 2-D vectors.",
+ new Double(expectedValue), new Double(cosineDistance));
+
+ // Vector E :[5, 2, 6]
+ // Vector F :[4, 5, 7]
+
+ double[] vecEValues = {5, 2, 6};
+ double[] vecFValues = {4, 5, 7};
+ Vector vecE = Vectors.dense(vecEValues);
+ Vector vecF = Vectors.dense(vecFValues);
+ cosineDistance = SimilarityUtil.cosineDistance(vecE, vecF);
+ expectedValue = 0.9413574486632833;
+ Assert.assertEquals("Cosine similarity calculation failed for 3-D vectors.",
+ new Double(expectedValue), new Double(cosineDistance));
+ }
+
+}