Repository: commons-text Updated Branches: refs/heads/NEW-METRICS ff1959c84 -> 81f679dea
Use tokenizers in internal package, and add more tests Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/81f679de Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/81f679de Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/81f679de Branch: refs/heads/NEW-METRICS Commit: 81f679dea1eb4c1d90d7173547ff98f7d14110a4 Parents: ff1959c Author: Bruno P. Kinoshita <[email protected]> Authored: Fri Mar 27 03:25:51 2015 -0300 Committer: Bruno P. Kinoshita <[email protected]> Committed: Fri Mar 27 03:25:51 2015 -0300 ---------------------------------------------------------------------- .../commons/text/similarity/CosineDistance.java | 28 ++++++- .../text/similarity/CosineSimilarity.java | 77 ++++++++++++++------ .../similarity/internal/CharacterTokenizer.java | 43 +++++++++++ .../text/similarity/internal/Counter.java | 44 +++++++++++ .../similarity/internal/SimpleTokenizer.java | 50 +++++++++++++ .../similarity/internal/StringTokenizer.java | 36 +++++++++ .../text/similarity/internal/Tokenizer.java | 36 +++++++++ .../internal/WhiteSpaceTokenizer.java | 46 ++++++++++++ .../text/similarity/internal/package-info.java | 21 ++++++ .../text/similarity/CosineDistanceTest.java | 55 ++++++++++++++ .../text/similarity/CosineSimilarityTest.java | 51 ------------- 11 files changed, 414 insertions(+), 73 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/CosineDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java index ad19745..54970e4 100644 --- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java @@ -16,6 +16,32 @@ */ package org.apache.commons.text.similarity; -public class CosineDistance { +import java.util.Map; + +import org.apache.commons.text.similarity.internal.Counter; +import org.apache.commons.text.similarity.internal.SimpleTokenizer; +import org.apache.commons.text.similarity.internal.Tokenizer; + +/** + * <p> + * Calcules the cosine distance. + * </p> + */ +public class CosineDistance implements StringMetric<Double> { + + private final Tokenizer<String> tokenizer = new SimpleTokenizer(); + + private final CosineSimilarity cosineSimilarity = new CosineSimilarity(); + + @Override + public Double compare(CharSequence left, CharSequence right) { + String[] leftTokens = tokenizer.tokenize(left); + String[] rightTokens = tokenizer.tokenize(right); + + Map<String, Integer> leftVector = Counter.of(leftTokens); + Map<String, Integer> rightVector = Counter.of(rightTokens); + double similarity = cosineSimilarity.cosineSimilarity(leftVector, rightVector); + return 1.0 - similarity; + } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java index ca9d087..992e5e9 100644 --- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java @@ -16,9 +16,16 @@ */ package org.apache.commons.text.similarity; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + /** - * <p>Measures the Cosine similarity of two CharSequences. It treats the CharSequences as - * two vectors of an inner product space and compares the angle between them.</p> + * <p> + * Measures the Cosine similarity of two vectors of an inner product space and + * compares the angle between them. + * </p> * * <p> * For further explanation about the Cosine Similarity, take a look at its @@ -27,39 +34,67 @@ package org.apache.commons.text.similarity; * * @since 0.1 */ -public class CosineSimilarity implements StringMetric<Double> { +public class CosineSimilarity { - @Override - public Double compare(CharSequence left, CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("String parameters must not be null"); + /** + * Calculates the cosine similarity for two given vectors. + * + * @param leftVector left vector + * @param rightVector right vector + * @return cosine similarity between the two vectors + */ + public Double cosineSimilarity(Map<String, Integer> leftVector, Map<String, Integer> rightVector) { + if (leftVector == null || rightVector == null) { + throw new IllegalArgumentException("Vectors must not be null"); } - long dotProduct = dot(left, right); + + Set<String> intersection = getIntersection(leftVector, rightVector); + + double dotProduct = dot(leftVector, rightVector, intersection); double d1 = 0.0d; - for (int i = 0; i < left.length(); ++i) { - d1 += Math.pow(((int) left.charAt(i)), 2); + for (Integer value : leftVector.values()) { + d1 += Math.pow(value, 2); } double d2 = 0.0d; - for (int i = 0; i < right.length(); ++i) { - d2 += Math.pow(((int) right.charAt(i)), 2); + for (Integer value : rightVector.values()) { + d2 += Math.pow(value, 2); + } + double cosineSimilarity; + if (d1 <= 0.0 || d2 <= 0.0) { + cosineSimilarity = 0.0; + } else { + cosineSimilarity = (double) (dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2))); } - double cosineSimilarity = dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2)); return cosineSimilarity; } /** - * Computes the dot product of two CharSequences. It ignores remaining characters. It means - * that if a string is longer than other, then a smaller part of it will be used to compute + * Returns a set with strings common to the two given maps. + * + * @param leftVector left vector map + * @param rightVector right vector map + * @return common strings + */ + private Set<String> getIntersection(Map<String, Integer> leftVector, Map<String, Integer> rightVector) { + Set<String> intersection = new HashSet<String>(leftVector.keySet()); + intersection.retainAll(rightVector.keySet()); + return intersection; + } + + /** + * Computes the dot product of two vectors. It ignores remaining elements. It means + * that if a vector is longer than other, then a smaller part of it will be used to compute * the dot product. - * - * @param left left string - * @param right right string + * + * @param left left vector + * @param right right vector + * @param intersection common elements * @return the dot product */ - protected long dot(CharSequence left, CharSequence right) { + private double dot(Map<String, Integer> leftVector, Map<String, Integer> rightVector, Set<String> intersection) { long dotProduct = 0; - for (int i = 0; i < left.length() && i < right.length(); ++i) { - dotProduct += (((int) left.charAt(i)) * ((int) right.charAt(i))); + for (String key : intersection) { + dotProduct += leftVector.get(key) * rightVector.get(key); } return dotProduct; } http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java new file mode 100644 index 0000000..5bab5a9 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/CharacterTokenizer.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +/** + * <p> + * Returns each character in a string as a token. + * </p> + */ +public class CharacterTokenizer implements Tokenizer<Character> { + + /** + * {@inheritDoc} + * + * @throws IllegalArgumentException if the input text is blank + */ + @Override + public Character[] tokenize(CharSequence text) { + if (text == null || text.toString().trim().equals("")) { + throw new IllegalArgumentException("Invalid text"); + } + Character[] tokens = new Character[text.length()]; + for (int i = 0; i < text.length(); ++i) { + tokens[i] = text.charAt(i); + } + return tokens; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/Counter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java new file mode 100644 index 0000000..6d2a4c4 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +import java.util.HashMap; +import java.util.Map; + +/** + * Java implementation of Python's collections Counter module. + * + * @see https://docs.python.org/dev/library/collections.html#collections.Counter + */ +public class Counter { + + private Counter() {} + + public static Map<String, Integer> of(String[] tokens) { + Map<String, Integer> innerCounter = new HashMap<String, Integer>(); + for (String token : tokens) { + if (innerCounter.containsKey(token)) { + int value = innerCounter.get(token); + innerCounter.put(token, ++value); + } else { + innerCounter.put(token, 1); + } + } + return innerCounter; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java new file mode 100644 index 0000000..9df5299 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/SimpleTokenizer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * <p> + * A simple word tokenizer that utilizes regex. + * </p> + */ +public class SimpleTokenizer implements Tokenizer<String> { + + /** + * {@inheritDoc} + * + * @throws IllegalArgumentException if the input text is blank + */ + @Override + public String[] tokenize(CharSequence text) { + if (text == null || text.toString().trim().equals("")) { + throw new IllegalArgumentException("Invalid text"); + } + Pattern pattern = Pattern.compile("(\\w)+"); + Matcher matcher = pattern.matcher(text.toString()); + List<String> tokens = new ArrayList<String>(); + while (matcher.find()) { + tokens.add(matcher.group(0)); + } + return tokens.toArray(new String[0]); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java new file mode 100644 index 0000000..137f338 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/StringTokenizer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +/** + * <p> + * A string tokenizer. Can produce arrays of tokens from a given type. + * </p> + * + * @param <T> given type + */ +public interface StringTokenizer<T> { + + /** + * Returns an array of tokens. + * + * @param text input text + * @return array of tokens + */ + T[] tokenize(CharSequence text); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java new file mode 100644 index 0000000..b5b877b --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +/** + * <p> + * A tokenizer. Can produce arrays of tokens from a given type. + * </p> + * + * @param <T> given type + */ +public interface Tokenizer<T> { + + /** + * Returns an array of tokens. + * + * @param text input text + * @return array of tokens + */ + T[] tokenize(CharSequence text); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java new file mode 100644 index 0000000..a827fdf --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/WhiteSpaceTokenizer.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity.internal; + +/** + * <p> + * A simple white space tokenizer. + * </p> + */ +public class WhiteSpaceTokenizer implements Tokenizer<String> { + + private static final String SPACE_DELIMITER = " "; + + /** + * {@inheritDoc} + * + * @throws IllegalArgumentException if the input text is blank + */ + @Override + public String[] tokenize(CharSequence text) { + if (text == null || text.toString().trim().equals("")) { + throw new IllegalArgumentException("Invalid text"); + } + java.util.StringTokenizer stringTokenizer = new java.util.StringTokenizer(text.toString(), SPACE_DELIMITER); + String[] tokens = new String[stringTokenizer.countTokens()]; + for (int i = 0; i < tokens.length; ++i) { + tokens[i] = stringTokenizer.nextToken(); + } + return tokens; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/main/java/org/apache/commons/text/similarity/internal/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java new file mode 100644 index 0000000..a346c07 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Classes used internally by similarity algorithms. Internal use, backward compatibility + * is not guaranteed. + */ +package org.apache.commons.text.similarity.internal; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java new file mode 100644 index 0000000..ce33572 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/CosineDistanceTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import java.math.BigDecimal; +import java.math.RoundingMode; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}. + */ +public class CosineDistanceTest { + + private static CosineDistance cosineDistance; + + @BeforeClass + public static void setUp() { + cosineDistance = new CosineDistance(); + } + + @Test + public void testCosineSimilarity() { + assertEquals(Double.valueOf(0.5d), roundValue(cosineDistance.compare("the house", "da house"))); + assertEquals(Double.valueOf(0.0d), roundValue(cosineDistance.compare("AB", "AB"))); + assertEquals(Double.valueOf(1.0d), roundValue(cosineDistance.compare("AB", "BA"))); + assertEquals(Double.valueOf(0.08d), roundValue(cosineDistance.compare( + "the boy was from tamana shi, kumamoto ken, and the girl was from rio de janeiro, rio", + "the boy was from tamana shi, kumamoto, and the boy was from rio de janeiro, rio de janeiro"))); + } + + // --- Utility methods + + private Double roundValue(Double value) { + return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue(); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/81f679de/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java deleted file mode 100644 index aa08057..0000000 --- a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import static org.junit.Assert.assertEquals; - -import java.math.BigDecimal; -import java.math.RoundingMode; - -import org.junit.BeforeClass; -import org.junit.Test; - -/** - * Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}. - */ -public class CosineSimilarityTest { - - private static CosineSimilarity cosineSimilarity; - - @BeforeClass - public static void setUp() { - cosineSimilarity = new CosineSimilarity(); - } - - @Test - public void testCosineSimilarity() { - assertEquals(Double.valueOf(0.62d), roundValue(cosineSimilarity.compare("ABCDE", "AB"))); - assertEquals(Double.valueOf(1.00d), roundValue(cosineSimilarity.compare("AB", "AB"))); - } - - // --- Utility methods - - private Double roundValue(Double value) { - return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue(); - } - -}
