This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit ea61a39ddc6e77396ab96d03e08885c4c162cb54 Author: Gary Gregory <[email protected]> AuthorDate: Thu May 18 09:35:36 2023 -0400 Rework some org.apache.commons.text.similarity internals to use JRE's Function --- .../{Tokenizer.java => CharSequenceTokenizer.java} | 18 ++---------------- .../apache/commons/text/similarity/CosineDistance.java | 4 ++-- .../apache/commons/text/similarity/RegexTokenizer.java | 4 ++-- .../org/apache/commons/text/similarity/Tokenizer.java | 18 +++++++----------- 4 files changed, 13 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/CharSequenceTokenizer.java similarity index 74% copy from src/main/java/org/apache/commons/text/similarity/Tokenizer.java copy to src/main/java/org/apache/commons/text/similarity/CharSequenceTokenizer.java index fa8fda46..3c85c382 100644 --- a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java +++ b/src/main/java/org/apache/commons/text/similarity/CharSequenceTokenizer.java @@ -16,20 +16,6 @@ */ package org.apache.commons.text.similarity; -/** - * A tokenizer. Can produce arrays of tokens from a given type. - * - * @param <T> given type - * @since 1.0 - */ -interface Tokenizer<T> { - - /** - * Returns an array of tokens. - * - * @param text input text - * @return array of tokens - */ - T[] tokenize(CharSequence text); - +interface CharSequenceTokenizer<T> extends Tokenizer<T, CharSequence> { + // empty } diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java index 92c71f1c..807d446e 100644 --- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java @@ -37,8 +37,8 @@ public class CosineDistance implements EditDistance<Double> { @Override public Double apply(final CharSequence left, final CharSequence right) { - final CharSequence[] leftTokens = RegexTokenizer.INSTANCE.tokenize(left); - final CharSequence[] rightTokens = RegexTokenizer.INSTANCE.tokenize(right); + final CharSequence[] leftTokens = RegexTokenizer.INSTANCE.apply(left); + final CharSequence[] rightTokens = RegexTokenizer.INSTANCE.apply(right); final Map<CharSequence, Integer> leftVector = Counter.of(leftTokens); final Map<CharSequence, Integer> rightVector = Counter.of(rightTokens); diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java index be64f849..c23c43e4 100644 --- a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java +++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java @@ -34,7 +34,7 @@ import org.apache.commons.lang3.Validate; * * @since 1.0 */ -final class RegexTokenizer implements Tokenizer<CharSequence> { +final class RegexTokenizer implements CharSequenceTokenizer<CharSequence> { /** The whitespace pattern. */ private static final Pattern PATTERN = Pattern.compile("(\\w)+"); @@ -50,7 +50,7 @@ final class RegexTokenizer implements Tokenizer<CharSequence> { * @throws IllegalArgumentException if the input text is blank */ @Override - public CharSequence[] tokenize(final CharSequence text) { + public CharSequence[] apply(final CharSequence text) { Validate.isTrue(StringUtils.isNotBlank(text), "Invalid text"); final Matcher matcher = PATTERN.matcher(text); final List<String> tokens = new ArrayList<>(); diff --git a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java index fa8fda46..724e1966 100644 --- a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java +++ b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java @@ -14,22 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.commons.text.similarity; +import java.util.function.Function; + /** * A tokenizer. Can produce arrays of tokens from a given type. * - * @param <T> given type + * @param <T> The type to tokenize. + * @param <R> The return array element type. * @since 1.0 */ -interface Tokenizer<T> { - - /** - * Returns an array of tokens. - * - * @param text input text - * @return array of tokens - */ - T[] tokenize(CharSequence text); - +interface Tokenizer<T, R> extends Function<T, R[]> { + // empty }
