asfgit closed pull request #1331: DRILL-6519: Add String Distance and Phonetic
Functions
URL: https://github.com/apache/drill/pull/1331
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/exec/java-exec/pom.xml b/exec/java-exec/pom.xml
index 6d57ba37040..3e1a118097a 100644
--- a/exec/java-exec/pom.xml
+++ b/exec/java-exec/pom.xml
@@ -97,7 +97,11 @@
<artifactId>univocity-parsers</artifactId>
<version>1.3.0</version>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-text</artifactId>
+ <version>1.4</version>
+ </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math</artifactId>
diff --git
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
new file mode 100644
index 00000000000..ee26bd3ec88
--- /dev/null
+++
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.inject.Inject;
+
+public class PhoneticFunctions {
+ static final org.slf4j.Logger logger =
org.slf4j.LoggerFactory.getLogger(PhoneticFunctions.class);
+
+ private PhoneticFunctions() {
+ }
+
+ /**
+ * The Caverphone function is a phonetic matching function. This is an
algorithm created by the Caversham Project at the University of Otago. It
implements the Caverphone 1.0 algorithm.
+ * <p>
+ * <p>
+ * Usage: SELECT caverphone1( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "caverphone1", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class Caverphone1Function implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.Caverphone1().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * The Caverphone function is a phonetic matching function. This is an
algorithm created by the Caversham Project at the University of Otago. It
implements the Caverphone 2.0 algorithm.
+ * <p>
+ * Usage: SELECT caverphone2( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "caverphone2", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class Caverphone2Function implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.Caverphone2().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * Encodes a string into a Cologne Phonetic value.
+ * Implements the Kölner Phonetik (Cologne Phonetic) algorithm issued by
Hans Joachim Postel in 1969.
+ * <p>
+ * The Kölner Phonetik is a phonetic algorithm which is optimized for the
German language.
+ * It is related to the well-known soundex algorithm.
+ * <p>
+ * Usage: SELECT cologne_phonetic( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "cologne_phonetic", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class ColognePhoneticFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.ColognePhonetic().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * Encodes a string into a Daitch-Mokotoff Soundex value.
+ * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and
American Soundex algorithms,
+ * yielding greater accuracy in matching especially Slavish and Yiddish
surnames with similar pronunciation
+ * but differences in spelling.
+ * <p>
+ * The main differences compared to the other soundex variants are:
+ * coded names are 6 digits long
+ * the initial character of the name is coded
+ * rules to encoded multi-character n-grams
+ * multiple possible encodings for the same name (branching)
+ * <p>
+ * Usage: SELECT dm_soundex( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "dm_soundex", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class DaitchMokotoffFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.DaitchMokotoffSoundex().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * Match Rating Approach Phonetic Algorithm Developed by Western Airlines in
1977.
+ * Usage: SELECT match_rating_encoder( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "match_rating_encoder", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class MatchRatingFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.MatchRatingApproachEncoder().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * The New York State Identification and Intelligence System Phonetic Code,
commonly known as NYSIIS, is a phonetic algorithm devised in 1970 as part of
the New York State Identification and Intelligence System (now a part of the
New York State Division of Criminal Justice Services). It features an accuracy
increase of 2.7% over the traditional Soundex algorithm.
+ * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to
relate similar names, but can also be used as a general purpose scheme to find
word with similar phonemes.
+ * <p>
+ * Usage: SELECT nysiis(string) FROM...
+ */
+
+ @FunctionTemplate(name = "nysiis", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class NYSIISFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.Nysiis().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+ }
+
+ /**
+ * Encodes a string into a Refined Soundex value. Soundex is an encoding
used to relate similar names, but can also be used as a general purpose scheme
to find word with similar phonemes.
+ * <p>
+ * Usage: SELECT refined_soundex( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "refined_soundex", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class RefinedSoundexFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.RefinedSoundex().encode(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * Encodes a string into a Soundex value. Soundex is an encoding used to
relate similar names, but can also be used as a general purpose scheme to find
word with similar phonemes.
+ * <p>
+ * Usage: SELECT soundex( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "soundex", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class SoundexFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.Soundex().soundex(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+ }
+
+ /**
+ * Implements the Metaphone phonetic algorithm
(https://en.wikipedia.org/wiki/Metaphone),
+ * and calculates a given string's Metaphone value.
+ * <p>
+ * Usage: SELECT metaphone( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "metaphone", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class MetaphoneFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.Metaphone().metaphone(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+
+ }
+
+ /**
+ * Implements the Double Metaphone phonetic algorithm
(https://en.wikipedia.org/wiki/Metaphone),
+ * and calculates a given string's Double Metaphone value.
+ * <p>
+ * Usage: SELECT double_metaphone( string ) FROM...
+ */
+
+ @FunctionTemplate(name = "double_metaphone", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class DoubleMetaphoneFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf buffer;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start,
rawInput.end, rawInput.buffer);
+ String outputString = new
org.apache.commons.codec.language.DoubleMetaphone().doubleMetaphone(input);
+
+ out.buffer = buffer;
+ out.start = 0;
+ out.end = outputString.getBytes().length;
+ buffer.setBytes(0, outputString.getBytes());
+ }
+ }
+}
\ No newline at end of file
diff --git
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
new file mode 100644
index 00000000000..0b027694450
--- /dev/null
+++
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.Float8Holder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+public class StringDistanceFunctions {
+ static final org.slf4j.Logger logger =
org.slf4j.LoggerFactory.getLogger(StringDistanceFunctions.class);
+
+ private StringDistanceFunctions() {
+ }
+
+ /**
+ * This function calculates the cosine distance between two strings.
+ * Usage: SELECT cosine_distance( string1, string2 ) AS cosine_distance
FROM...
+ */
+
+ @FunctionTemplate(name = "cosine_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class CosineDistanceFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Workspace
+ org.apache.commons.text.similarity.CosineDistance d;
+
+ @Output
+ Float8Holder out;
+
+ @Override
+ public void setup() {
+ d = new org.apache.commons.text.similarity.CosineDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+ /**
+ * This function calculates the cosine distance between two strings.
+ * A matching algorithm that is similar to the searching algorithms
implemented in editors such
+ * as Sublime Text, TextMate, Atom and others.
+ * <p>
+ * One point is given for every matched character. Subsequent matches yield
two bonus points. A higher score
+ * indicates a higher similarity.
+ * <p>
+ * <p>
+ * Usage: SELECT fuzzy_score( string1, string2 ) AS fuzzy_score FROM...
+ */
+
+ @FunctionTemplate(name = "fuzzy_score", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class FuzzyScoreFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.FuzzyScore d;
+
+ @Override
+ public void setup() {
+ d = new
org.apache.commons.text.similarity.FuzzyScore(java.util.Locale.ENGLISH);
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.fuzzyScore(input1, input2);
+ out.value = result;
+ }
+ }
+
+ /**
+ * The hamming distance between two strings of equal length is the number of
+ * positions at which the corresponding symbols are different.
+ * <p>
+ * For further explanation about the Hamming Distance, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance.
+ * <p>
+ * Usage: SELECT hamming_distance( string1, string2 ) FROM...
+ */
+
+
+ @FunctionTemplate(name = "hamming_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class HammingDistanceFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.HammingDistance d;
+
+ @Override
+ public void setup() {
+ d = new org.apache.commons.text.similarity.HammingDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+
+ /**
+ * Measures the Jaccard distance of two sets of character sequence. Jaccard
+ * distance is the dissimilarity between two sets. It is the complementary of
+ * Jaccard similarity.
+ * <p>
+ * For further explanation about Jaccard Distance, refer
+ * https://en.wikipedia.org/wiki/Jaccard_index
+ * <p>
+ * Usage: SELECT jaccard_distance( string1, string2 ) FROM ...
+ */
+
+
+ @FunctionTemplate(name = "jaccard_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class JaccardDistanceFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.JaccardDistance d;
+
+ @Override
+ public void setup() {
+ d = new org.apache.commons.text.similarity.JaccardDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+ /**
+ * A similarity algorithm indicating the percentage of matched characters
between two character sequences.
+ * <p>
+ * The Jaro measure is the weighted sum of percentage of matched characters
+ * from each file and transposed characters. Winkler increased this measure
+ * for matching initial characters.
+ * <p>
+ * This implementation is based on the Jaro Winkler similarity algorithm
+ * from https://en.wikipedia.org/wiki/Jaro–Winkler_distance
+ * <p>
+ * Usage: SELECT jaro_distance( string1, string2 ) FROM...
+ */
+
+ @FunctionTemplate(name = "jaro_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class JaroDistanceFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.JaroWinklerDistance d;
+
+ @Override
+ public void setup() {
+ d = new org.apache.commons.text.similarity.JaroWinklerDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+ /**
+ * An algorithm for measuring the difference between two character sequences.
+ * <p>
+ * This is the number of changes needed to change one sequence into another,
+ * where each change is a single character modification (deletion, insertion
+ * or substitution).
+ * <p>
+ * Usage: SELECT levenshtein_distance( string1, string2 ) FROM...
+ */
+
+ @FunctionTemplate(name = "levenshtein_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class LevenstheinDistanceFunction implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.LevenshteinDistance d;
+
+ @Override
+ public void setup() {
+ d = new org.apache.commons.text.similarity.LevenshteinDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+ /**
+ * The Longest common subsequence algorithm returns the length of the
longest subsequence that two strings have in common.
+ * Two strings that are entirely different, return a value of 0, and two
strings that return a value of the
+ * commonly shared length implies that the strings are completely the same
in value and position.
+ * Note: Generally this algorithm is fairly inefficient, as for length m, n
of the input
+ * CharSequence's left and right respectively, the runtime of the algorithm
is O(m*n).
+ * <p>
+ * This implementation is based on the Longest Commons Substring algorithm
from https://en.wikipedia.org/wiki/Longest_common_subsequence_problem.
+ * <p>
+ * Usage: SELECT longest_common_substring_distance( string1, string2 )
FROM...
+ */
+
+ @FunctionTemplate(name = "longest_common_substring_distance", scope =
FunctionTemplate.FunctionScope.SIMPLE, nulls =
FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class LongestCommonSubstringDistanceFunction implements
DrillSimpleFunc {
+
+ @Param
+ VarCharHolder rawInput1;
+
+ @Param
+ VarCharHolder rawInput2;
+
+ @Output
+ Float8Holder out;
+
+ @Workspace
+ org.apache.commons.text.similarity.LongestCommonSubsequenceDistance d;
+
+ @Override
+ public void setup() {
+ d = new
org.apache.commons.text.similarity.LongestCommonSubsequenceDistance();
+ }
+
+ @Override
+ public void eval() {
+
+ String input1 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start,
rawInput1.end, rawInput1.buffer);
+ String input2 =
org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start,
rawInput2.end, rawInput2.buffer);
+
+ double result = d.apply(input1, input2);
+ out.value = result;
+ }
+ }
+
+}
diff --git
a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
new file mode 100644
index 00000000000..85bb135e22f
--- /dev/null
+++
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.fn.impl;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.BaseDirTestWatcher;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryResultSet;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Runs each phonetic SQL function once against a test cluster and compares the single
+ * returned VARCHAR value with the expected code.
+ */
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestPhoneticFunctions extends ClusterTest {
+
+  private QueryResultSet result;
+
+  @Rule
+  public final BaseDirTestWatcher baseDirTestWatcher = new BaseDirTestWatcher();
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder clusterBuilder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(clusterBuilder);
+  }
+
+  /** Runs {@code sql} and returns the single string value it produces. */
+  private String singleString(String sql) throws Exception {
+    return queryBuilder().sql(sql).singletonString();
+  }
+
+  @Test
+  public void testSoundex() throws Exception {
+    assertEquals("J500", singleString("select soundex('jaime') as soundex from (values(1))"));
+  }
+
+  @Test
+  public void testCaverphone1() throws Exception {
+    assertEquals("YM1111",
+        singleString("SELECT caverphone1('jaime') as caverphone FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testCaverphone2() throws Exception {
+    assertEquals("STF1111111",
+        singleString("SELECT caverphone2('steve') as caverphone FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testCologne() throws Exception {
+    assertEquals("823", singleString("SELECT cologne_phonetic('steve') AS CP FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testMatchRatingEncoder() throws Exception {
+    assertEquals("BSTN",
+        singleString("SELECT match_rating_encoder('Boston') AS MR FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testNYSIIS() throws Exception {
+    assertEquals("BASTAN", singleString("SELECT nysiis('Boston') AS ny FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testRefinedSoundex() throws Exception {
+    assertEquals("B103608",
+        singleString("SELECT refined_soundex('Boston') AS rs FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testMetaphone() throws Exception {
+    assertEquals("FNKS", singleString("SELECT metaphone('Phoenix') AS meta FROM (VALUES(1))"));
+  }
+
+  @Test
+  public void testDoubleMetaphone() throws Exception {
+    assertEquals("FNKS",
+        singleString("SELECT double_metaphone('Phoenix') AS meta FROM (VALUES(1))"));
+  }
+}
diff --git
a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
new file mode 100644
index 00000000000..915c0623168
--- /dev/null
+++
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.fn.impl;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Runs string distance SQL functions once against a test cluster, always comparing
+ * 'Big car' with 'red car', and checks the single returned double value.
+ */
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestStringDistanceFunctions extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder clusterBuilder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(clusterBuilder);
+  }
+
+  /** Runs {@code sql} and returns the single double value it produces. */
+  private double singleDouble(String sql) throws Exception {
+    return queryBuilder().sql(sql).singletonDouble();
+  }
+
+  @Test
+  public void testCosineDistance() throws Exception {
+    double distance = singleDouble(
+        "select cosine_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))");
+    assertEquals(0.5000000000000001, distance, 0.0);
+  }
+
+  @Test
+  public void testHammingDistance() throws Exception {
+    double distance = singleDouble(
+        "select hamming_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))");
+    assertEquals(3.0, distance, 0.0);
+  }
+
+  @Test
+  public void testJaccardDistance() throws Exception {
+    double distance = singleDouble(
+        "select jaccard_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))");
+    assertEquals(0.56, distance, 0.0);
+  }
+
+  @Test
+  public void testJaroDistance() throws Exception {
+    double distance = singleDouble(
+        "select jaro_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))");
+    assertEquals(0.7142857142857143, distance, 0.0);
+  }
+
+  @Test
+  public void testLevenshteinDistance() throws Exception {
+    double distance = singleDouble(
+        "select levenshtein_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))");
+    assertEquals(3.0, distance, 0.0);
+  }
+}
\ No newline at end of file
diff --git
a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
index 0f86955abf3..ff0e166134c 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
@@ -391,6 +391,27 @@ public long singletonLong() throws RpcException {
return value;
}
+  /**
+   * Run the query that is expected to return (at least) one row
+   * with the only (or first) column returning a double value.
+   * The double value cannot be null.
+   *
+   * @return the value of the first column of the first row
+   * @throws RpcException if anything goes wrong
+   * @throws IllegalStateException if the query produced no row set
+   */
+  public double singletonDouble() throws RpcException {
+    RowSet rowSet = rowSet();
+    if (rowSet == null) {
+      throw new IllegalStateException("No rows returned");
+    }
+    try {
+      RowSetReader reader = rowSet.reader();
+      reader.next();
+      return reader.scalar(0).getDouble();
+    } finally {
+      // Clear the row set on every path, including when reading the value throws.
+      rowSet.clear();
+    }
+  }
+
/**
* Run the query that is expected to return (at least) one row
* with the only (or first) column returning a int value.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services