This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-mudrod.git


The following commit(s) were added to refs/heads/master by this push:
     new 92bb65e  SDAP-186 : Add Cosine distance as vector similarity measure (#44)
92bb65e is described below

commit 92bb65e5223ecb2851ec343edf28977577b100e8
Author: Kevin Ratnasekera <[email protected]>
AuthorDate: Wed Mar 6 22:10:11 2019 +0530

    SDAP-186 : Add Cosine distance as vector similarity measure (#44)
---
 .../apache/sdap/mudrod/utils/SimilarityUtil.java   | 20 ++++++-
 .../sdap/mudrod/utils/TestSimilarityUtils.java     | 66 ++++++++++++++++++++++
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java b/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
index 2fcdaf6..71179dc 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/utils/SimilarityUtil.java
@@ -104,6 +104,8 @@ public class SimilarityUtil {
           weight = SimilarityUtil.pearsonDistance(vecA, vecB);
         } else if (simType == SimilarityUtil.SIM_HELLINGER) {
           weight = SimilarityUtil.hellingerDistance(vecA, vecB);
+        } else if (simType == SimilarityUtil.SIM_COSINE) {
+          weight = SimilarityUtil.cosineDistance(vecA, vecB);
         }
 
         LinkageTriple triple = new LinkageTriple();
@@ -265,13 +267,25 @@ public class SimilarityUtil {
   }
 
   /**
-   * calculate similarity between vectors
+   * Calculate similarity (Cosine Distance) between vectors.
    *
    * @param vecA initial vector from which to calculate a similarity
    * @param vecB second vector involved in similarity calculation
-   * @return similarity between two vectors
+   * @return Cosine similarity between two vectors
    */
   public static double cosineDistance(Vector vecA, Vector vecB) {
-    return 1;
+    double[] arrA = vecA.toArray();
+    double[] arrB = vecB.toArray();
+
+    double dotProduct = 0.0;
+    double normA = 0.0;
+    double normB = 0.0;
+    for (int i = 0; i < arrA.length; i++) {
+      dotProduct += arrA[i] * arrB[i];
+      normA += Math.pow(arrA[i], 2);
+      normB += Math.pow(arrB[i], 2);
+    }
+
+    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
   }
 }
diff --git a/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java b/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java
new file mode 100644
index 0000000..cae1ba1
--- /dev/null
+++ b/core/src/test/java/org/apache/sdap/mudrod/utils/TestSimilarityUtils.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sdap.mudrod.utils;
+
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test cases for Similarity Utility class. {@link org.apache.sdap.mudrod.utils.SimilarityUtil}
+ */
+public class TestSimilarityUtils {
+
+  @Test
+  public final void testCosineSimilarity() {
+
+    // Vector A :[2, 2, 3, 5]
+    // Vector B :[4, 1, 2, 1]
+
+    double[] vecAValues = {2, 2, 3, 5};
+    double[] vecBValues = {4, 1, 2, 1};
+    Vector vecA = Vectors.dense(vecAValues);
+    Vector vecB = Vectors.dense(vecBValues);
+    double cosineDistance = SimilarityUtil.cosineDistance(vecA, vecB);
+    double expectedValue = 0.6908492797077574;
+    Assert.assertEquals("Cosine similarity calculation failed for 4-D vectors.",
+            new Double(expectedValue), new Double(cosineDistance));
+
+    // Vector C :[1, 2]
+    // Vector D :[4, 5]
+
+    double[] vecCValues = {1, 2};
+    double[] vecDValues = {4, 5};
+    Vector vecC = Vectors.dense(vecCValues);
+    Vector vecD = Vectors.dense(vecDValues);
+    cosineDistance = SimilarityUtil.cosineDistance(vecC, vecD);
+    expectedValue = 0.9778024140774094;
+    Assert.assertEquals("Cosine similarity calculation failed for 2-D vectors.",
+            new Double(expectedValue), new Double(cosineDistance));
+
+    // Vector E :[5, 2, 6]
+    // Vector F :[4, 5, 7]
+
+    double[] vecEValues = {5, 2, 6};
+    double[] vecFValues = {4, 5, 7};
+    Vector vecE = Vectors.dense(vecEValues);
+    Vector vecF = Vectors.dense(vecFValues);
+    cosineDistance = SimilarityUtil.cosineDistance(vecE, vecF);
+    expectedValue = 0.9413574486632833;
+    Assert.assertEquals("Cosine similarity calculation failed for 3-D vectors.",
+            new Double(expectedValue), new Double(cosineDistance));
+  }
+
+}

Reply via email to