Repository: commons-text
Updated Branches:
  refs/heads/master 8ae4ff075 -> 6872117ae


[TEXT-139] Improve JaccardSimilarity computational cost.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/0d4c9c45
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/0d4c9c45
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/0d4c9c45

Branch: refs/heads/master
Commit: 0d4c9c4593fa98909c603fc701c8116975d8d8a8
Parents: 85465e2
Author: nickwongwong <a465625...@yeah.net>
Authored: Mon Sep 10 22:01:46 2018 +0800
Committer: nickwongwong <a465625...@yeah.net>
Committed: Mon Sep 10 22:01:46 2018 +0800

----------------------------------------------------------------------
 .../text/similarity/JaccardSimilarity.java      | 27 ++++++++------------
 1 file changed, 11 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/0d4c9c45/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
index 1dc2b85..2e88dd2 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -62,27 +62,22 @@ public class JaccardSimilarity implements SimilarityScore<Double> {
      * @return index
      */
     private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) {
-        final Set<String> intersectionSet = new HashSet<>();
-        final Set<String> unionSet = new HashSet<>();
-        boolean unionFilled = false;
         final int leftLength = left.length();
         final int rightLength = right.length();
         if (leftLength == 0 || rightLength == 0) {
             return 0d;
         }
-
-        for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) {
-            unionSet.add(String.valueOf(left.charAt(leftIndex)));
-            for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) {
-                if (!unionFilled) {
-                    unionSet.add(String.valueOf(right.charAt(rightIndex)));
-                }
-                if (left.charAt(leftIndex) == right.charAt(rightIndex)) {
-                    intersectionSet.add(String.valueOf(left.charAt(leftIndex)));
-                }
-            }
-            unionFilled = true;
+        final Set<Character> leftSet = new HashSet<>();
+        for (int i = 0; i < leftLength; i++) {
+            leftSet.add(left.charAt(i));
+        }
+        final Set<Character> rightSet = new HashSet<>();
+        for (int i = 0; i < rightLength; i++) {
+            rightSet.add(right.charAt(i));
         }
-        return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size());
+        final Set<Character> unionSet = new HashSet<>(leftSet);
+        unionSet.addAll(rightSet);
+        final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size();
+        return 1.0d * intersectionSize / unionSet.size();
     }
 }

Reply via email to