Repository: commons-text Updated Branches: refs/heads/master 8ae4ff075 -> 6872117ae
[TEXT-139] Improve JaccardSimilarity computational cost. Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/0d4c9c45 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/0d4c9c45 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/0d4c9c45 Branch: refs/heads/master Commit: 0d4c9c4593fa98909c603fc701c8116975d8d8a8 Parents: 85465e2 Author: nickwongwong <a465625...@yeah.net> Authored: Mon Sep 10 22:01:46 2018 +0800 Committer: nickwongwong <a465625...@yeah.net> Committed: Mon Sep 10 22:01:46 2018 +0800 ---------------------------------------------------------------------- .../text/similarity/JaccardSimilarity.java | 27 ++++++++------------ 1 file changed, 11 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/0d4c9c45/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java index 1dc2b85..2e88dd2 100644 --- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java @@ -62,27 +62,22 @@ public class JaccardSimilarity implements SimilarityScore<Double> { * @return index */ private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) { - final Set<String> intersectionSet = new HashSet<>(); - final Set<String> unionSet = new HashSet<>(); - boolean unionFilled = false; final int leftLength = left.length(); final int rightLength = right.length(); if (leftLength == 0 || rightLength == 0) { return 0d; } - - for (int leftIndex = 0; leftIndex < leftLength; leftIndex++) { - unionSet.add(String.valueOf(left.charAt(leftIndex))); - for (int rightIndex = 0; rightIndex < rightLength; rightIndex++) { - if (!unionFilled) { - unionSet.add(String.valueOf(right.charAt(rightIndex))); - } - if (left.charAt(leftIndex) == right.charAt(rightIndex)) { - intersectionSet.add(String.valueOf(left.charAt(leftIndex))); - } - } - unionFilled = true; + final Set<Character> leftSet = new HashSet<>(); + for (int i = 0; i < leftLength; i++) { + leftSet.add(left.charAt(i)); + } + final Set<Character> rightSet = new HashSet<>(); + for (int i = 0; i < rightLength; i++) { + rightSet.add(right.charAt(i)); } - return Double.valueOf(intersectionSet.size()) / Double.valueOf(unionSet.size()); + final Set<Character> unionSet = new HashSet<>(leftSet); + unionSet.addAll(rightSet); + final int intersectionSize = leftSet.size() + rightSet.size() - unionSet.size(); + return 1.0d * intersectionSize / unionSet.size(); } }