This is an automated email from the ASF dual-hosted git repository. arina pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit 2ea603f8c8396015006885365921027e5b3e7392 Author: Charles S. Givre <cgi...@gmail.com> AuthorDate: Thu Jul 5 11:45:52 2018 -0400 DRILL-6519: Add String Distance and Phonetic Functions closes #1331 --- exec/java-exec/pom.xml | 6 +- .../drill/exec/expr/fn/impl/PhoneticFunctions.java | 407 +++++++++++++++++++++ .../exec/expr/fn/impl/StringDistanceFunctions.java | 329 +++++++++++++++++ .../drill/exec/fn/impl/TestPhoneticFunctions.java | 119 ++++++ .../exec/fn/impl/TestStringDistanceFunctions.java | 80 ++++ .../java/org/apache/drill/test/QueryBuilder.java | 21 ++ exec/jdbc-all/pom.xml | 2 +- 7 files changed, 962 insertions(+), 2 deletions(-) diff --git a/exec/java-exec/pom.xml b/exec/java-exec/pom.xml index 6d57ba3..3e1a118 100644 --- a/exec/java-exec/pom.xml +++ b/exec/java-exec/pom.xml @@ -97,7 +97,11 @@ <artifactId>univocity-parsers</artifactId> <version>1.3.0</version> </dependency> - + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-text</artifactId> + <version>1.4</version> + </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-math</artifactId> diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java new file mode 100644 index 0000000..ee26bd3 --- /dev/null +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.expr.fn.impl; + +import io.netty.buffer.DrillBuf; +import org.apache.drill.exec.expr.DrillSimpleFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.expr.holders.VarCharHolder; + +import javax.inject.Inject; + +public class PhoneticFunctions { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PhoneticFunctions.class); + + private PhoneticFunctions() { + } + + /** + * The Caverphone function is a phonetic matching function. This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 1.0 algorithm. + * <p> + * <p> + * Usage: SELECT caverphone1( string ) FROM... 
+ */ + + @FunctionTemplate(name = "caverphone1", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class Caverphone1Function implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.Caverphone1().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * The Caverphone function is a phonetic matching function. This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 2.0 algorithm. + * <p> + * Usage: SELECT caverphone2( string ) FROM... + */ + + @FunctionTemplate(name = "caverphone2", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class Caverphone2Function implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.Caverphone2().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * Encodes a string into a Cologne Phonetic value. + * Implements the Kölner Phonetik (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969. 
+ * <p> + * The Kölner Phonetik is a phonetic algorithm which is optimized for the German language. + * It is related to the well-known soundex algorithm. + * <p> + * Usage: SELECT cologne_phonetic( string ) FROM... + */ + + @FunctionTemplate(name = "cologne_phonetic", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class ColognePhoneticFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.ColognePhonetic().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * Encodes a string into a Daitch-Mokotoff Soundex value. + * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, + * yielding greater accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation + * but differences in spelling. + * <p> + * The main differences compared to the other soundex variants are: + * coded names are 6 digits long + * the initial character of the name is coded + * rules to encode multi-character n-grams + * multiple possible encodings for the same name (branching) + * <p> + * Usage: SELECT dm_soundex( string ) FROM...
+ */ + + @FunctionTemplate(name = "dm_soundex", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class DaitchMokotoffFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.DaitchMokotoffSoundex().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * Match Rating Approach Phonetic Algorithm Developed by Western Airlines in 1977. + * Usage: SELECT match_rating_encoder( string ) FROM... + */ + + @FunctionTemplate(name = "match_rating_encoder", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class MatchRatingFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.MatchRatingApproachEncoder().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * The New York State Identification and Intelligence System Phonetic Code, commonly known as NYSIIS, is a phonetic algorithm devised in 1970 as part of the New York State Identification and Intelligence System (now a part of the New York State Division of 
Criminal Justice Services). It features an accuracy increase of 2.7% over the traditional Soundex algorithm. + * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a general purpose scheme to find words with similar phonemes. + * <p> + * Usage: SELECT nysiis(string) FROM... + */ + + @FunctionTemplate(name = "nysiis", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class NYSIISFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.Nysiis().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + } + + /** + * Encodes a string into a Refined Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find words with similar phonemes. + * <p> + * Usage: SELECT refined_soundex( string ) FROM...
+ */ + + @FunctionTemplate(name = "refined_soundex", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class RefinedSoundexFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.RefinedSoundex().encode(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find words with similar phonemes. + * <p> + * Usage: SELECT soundex( string ) FROM... + */ + + @FunctionTemplate(name = "soundex", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class SoundexFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.Soundex().soundex(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + } + + /** + * Implements the Metaphone phonetic algorithm (https://en.wikipedia.org/wiki/Metaphone), + * and calculates a given string's Metaphone value. + * <p> + * Usage: SELECT metaphone( string ) FROM...
+ */ + + @FunctionTemplate(name = "metaphone", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class MetaphoneFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.Metaphone().metaphone(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + + } + + /** + * Implements the Double Metaphone phonetic algorithm (https://en.wikipedia.org/wiki/Metaphone), + * and calculates a given string's Double Metaphone value. + * <p> + * Usage: SELECT double_metaphone( string ) FROM... + */ + + @FunctionTemplate(name = "double_metaphone", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class DoubleMetaphoneFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput; + + @Output + VarCharHolder out; + + @Inject + DrillBuf buffer; + + @Override + public void setup() { + } + + @Override + public void eval() { + + String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer); + String outputString = new org.apache.commons.codec.language.DoubleMetaphone().doubleMetaphone(input); + + out.buffer = buffer; + out.start = 0; + out.end = outputString.getBytes().length; + buffer.setBytes(0, outputString.getBytes()); + } + } +} \ No newline at end of file diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java new file mode 100644 index 0000000..0b02769 --- /dev/null +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.expr.fn.impl; + +import org.apache.drill.exec.expr.DrillSimpleFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.expr.annotations.Workspace; +import org.apache.drill.exec.expr.holders.Float8Holder; +import org.apache.drill.exec.expr.holders.VarCharHolder; + +public class StringDistanceFunctions { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StringDistanceFunctions.class); + + private StringDistanceFunctions() { + } + + /** + * This function calculates the cosine distance between two strings. + * Usage: SELECT cosine_distance( string1, string2 ) AS cosine_distance FROM... 
+ */ + + @FunctionTemplate(name = "cosine_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class CosineDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Workspace + org.apache.commons.text.similarity.CosineDistance d; + + @Output + Float8Holder out; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.CosineDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + + double result = d.apply(input1, input2); + out.value = result; + } + } + + /** + * This function calculates the fuzzy score between two strings. + * It uses a matching algorithm that is similar to the searching algorithms implemented in editors such + * as Sublime Text, TextMate, Atom and others. + * <p> + * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score + * indicates a higher similarity. + * <p> + * <p> + * Usage: SELECT fuzzy_score( string1, string2 ) AS fuzzy_score FROM...
+ */ + + @FunctionTemplate(name = "fuzzy_score", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class FuzzyScoreFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.FuzzyScore d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.FuzzyScore(java.util.Locale.ENGLISH); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.fuzzyScore(input1, input2); + out.value = result; + } + } + + /** + * The hamming distance between two strings of equal length is the number of + * positions at which the corresponding symbols are different. + * <p> + * For further explanation about the Hamming Distance, take a look at its + * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance. + * <p> + * Usage: SELECT hamming_distance( string1, string2 ) FROM... 
+ */ + + + @FunctionTemplate(name = "hamming_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class HammingDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.HammingDistance d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.HammingDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.apply(input1, input2); + out.value = result; + } + } + + + /** + * Measures the Jaccard distance of two sets of character sequence. Jaccard + * distance is the dissimilarity between two sets. It is the complementary of + * Jaccard similarity. + * <p> + * For further explanation about Jaccard Distance, refer + * https://en.wikipedia.org/wiki/Jaccard_index + * <p> + * Usage: SELECT jaccard_distance( string1, string2 ) FROM ... 
+ */ + + + @FunctionTemplate(name = "jaccard_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class JaccardDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.JaccardDistance d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.JaccardDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.apply(input1, input2); + out.value = result; + } + } + + /** + * A similarity algorithm indicating the percentage of matched characters between two character sequences. + * <p> + * The Jaro measure is the weighted sum of percentage of matched characters + * from each file and transposed characters. Winkler increased this measure + * for matching initial characters. + * <p> + * This implementation is based on the Jaro Winkler similarity algorithm + * from https://en.wikipedia.org/wiki/Jaro–Winkler_distance + * <p> + * Usage: SELECT jaro_distance( string1, string2 ) FROM... 
+ */ + + @FunctionTemplate(name = "jaro_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class JaroDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.JaroWinklerDistance d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.JaroWinklerDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.apply(input1, input2); + out.value = result; + } + } + + /** + * An algorithm for measuring the difference between two character sequences. + * <p> + * This is the number of changes needed to change one sequence into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * <p> + * Usage: SELECT levenshtein_distance( string1, string2 ) FROM... 
+ */ + + @FunctionTemplate(name = "levenshtein_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class LevenstheinDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.LevenshteinDistance d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.LevenshteinDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.apply(input1, input2); + out.value = result; + } + } + + /** + * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in common. + * Two strings that are entirely different return a value of 0, and two strings that return a value of the + * commonly shared length implies that the strings are completely the same in value and position. + * Note: Generally this algorithm is fairly inefficient, as for length m, n of the input + * CharSequences left and right respectively, the runtime of the algorithm is O(m*n). + * <p> + * This implementation is based on the Longest Common Subsequence algorithm from https://en.wikipedia.org/wiki/Longest_common_subsequence_problem. + * <p> + * Usage: SELECT longest_common_substring_distance( string1, string2 ) FROM...
+ */ + + @FunctionTemplate(name = "longest_common_substring_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + public static class LongestCommonSubstringDistanceFunction implements DrillSimpleFunc { + + @Param + VarCharHolder rawInput1; + + @Param + VarCharHolder rawInput2; + + @Output + Float8Holder out; + + @Workspace + org.apache.commons.text.similarity.LongestCommonSubsequenceDistance d; + + @Override + public void setup() { + d = new org.apache.commons.text.similarity.LongestCommonSubsequenceDistance(); + } + + @Override + public void eval() { + + String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer); + String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer); + + double result = d.apply(input1, input2); + out.value = result; + } + } + +} diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java new file mode 100644 index 0000000..85bb135 --- /dev/null +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.fn.impl; + +import org.apache.drill.categories.SqlFunctionTest; +import org.apache.drill.categories.UnlikelyTest; +import org.apache.drill.test.BaseDirTestWatcher; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterFixtureBuilder; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryResultSet; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import static org.junit.Assert.assertEquals; + +@Category({UnlikelyTest.class, SqlFunctionTest.class}) +public class TestPhoneticFunctions extends ClusterTest { + + private QueryResultSet result; + + @Rule + public final BaseDirTestWatcher baseDirTestWatcher = new BaseDirTestWatcher(); + + @BeforeClass + public static void setup() throws Exception { + ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher); + startCluster(builder); + } + + @Test + public void testSoundex() throws Exception { + String result = queryBuilder() + .sql("select soundex('jaime') as soundex from (values(1))") + .singletonString(); + assertEquals("J500", result); + } + + @Test + public void testCaverphone1() throws Exception { + String result = queryBuilder() + .sql("SELECT caverphone1('jaime') as caverphone FROM (VALUES(1))") + .singletonString(); + assertEquals("YM1111", result); + } + + @Test + public void testCaverphone2() throws Exception { + String result = queryBuilder() + .sql("SELECT caverphone2('steve') as caverphone FROM (VALUES(1))") + 
.singletonString(); + assertEquals("STF1111111", result); + } + + @Test + public void testCologne() throws Exception { + String result = queryBuilder() + .sql("SELECT cologne_phonetic('steve') AS CP FROM (VALUES(1))") + .singletonString(); + assertEquals("823", result); + } + + @Test + public void testMatchRatingEncoder() throws Exception { + String result = queryBuilder() + .sql("SELECT match_rating_encoder('Boston') AS MR FROM (VALUES(1))") + .singletonString(); + assertEquals("BSTN", result); + } + + @Test + public void testNYSIIS() throws Exception { + String result = queryBuilder() + .sql("SELECT nysiis('Boston') AS ny FROM (VALUES(1))") + .singletonString(); + assertEquals("BASTAN", result); + } + + @Test + public void testRefinedSoundex() throws Exception { + String result = queryBuilder() + .sql("SELECT refined_soundex('Boston') AS rs FROM (VALUES(1))") + .singletonString(); + assertEquals("B103608", result); + } + + @Test + public void testMetaphone() throws Exception { + String result = queryBuilder() + .sql("SELECT metaphone('Phoenix') AS meta FROM (VALUES(1))") + .singletonString(); + assertEquals("FNKS", result); + } + + @Test + public void testDoubleMetaphone() throws Exception { + String result = queryBuilder() + .sql("SELECT double_metaphone('Phoenix') AS meta FROM (VALUES(1))") + .singletonString(); + assertEquals("FNKS", result); + } +} diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java new file mode 100644 index 0000000..915c062 --- /dev/null +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.fn.impl; + +import org.apache.drill.categories.SqlFunctionTest; +import org.apache.drill.categories.UnlikelyTest; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterFixtureBuilder; +import org.apache.drill.test.ClusterTest; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import static org.junit.Assert.assertEquals; + +@Category({UnlikelyTest.class, SqlFunctionTest.class}) +public class TestStringDistanceFunctions extends ClusterTest { + + @BeforeClass + public static void setup() throws Exception { + ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher); + startCluster(builder); + } + + @Test + public void testCosineDistance() throws Exception { + double result = queryBuilder() + .sql("select cosine_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))") + .singletonDouble(); + assertEquals(0.5000000000000001, result, 0.0); + } + + @Test + public void testHammingDistance() throws Exception { + double result = queryBuilder() + .sql("select hamming_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))") + .singletonDouble(); + assertEquals(3.0, result, 0.0); + } + + @Test + public void testJaccardDistance() throws Exception { + double result = queryBuilder() + .sql("select jaccard_distance( 'Big car', 'red car' ) as distance FROM 
(VALUES(1))") + .singletonDouble(); + assertEquals(0.56, result, 0.0); + } + + @Test + public void testJaroDistance() throws Exception { + double result = queryBuilder() + .sql("select jaro_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))") + .singletonDouble(); + assertEquals(0.7142857142857143, result, 0.0); + } + + @Test + public void testLevenshteinDistance() throws Exception { + double result = queryBuilder() + .sql("select levenshtein_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))") + .singletonDouble(); + assertEquals(3.0, result, 0.0); + } +} \ No newline at end of file diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java index 0f86955..ff0e166 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java +++ b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java @@ -393,6 +393,27 @@ public class QueryBuilder { /** * Run the query that is expected to return (at least) one row + * with the only (or first) column returning a double value. + * The double value cannot be null. + * + * @return the value of the first column of the first row + * @throws RpcException if anything goes wrong + */ + + public double singletonDouble() throws RpcException { + RowSet rowSet = rowSet(); + if (rowSet == null) { + throw new IllegalStateException("No rows returned"); + } + RowSetReader reader = rowSet.reader(); + reader.next(); + double value = reader.scalar(0).getDouble(); + rowSet.clear(); + return value; + } + + /** + * Run the query that is expected to return (at least) one row * with the only (or first) column returning a int value. * The int value cannot be null. 
* diff --git a/exec/jdbc-all/pom.xml b/exec/jdbc-all/pom.xml index f3595be..f7af511 100644 --- a/exec/jdbc-all/pom.xml +++ b/exec/jdbc-all/pom.xml @@ -566,7 +566,7 @@ This is likely due to you adding new dependencies to a java-exec and not updating the excludes in this module. This is important as it minimizes the size of the dependency of Drill application users. </message> - <maxsize>33000000</maxsize> + <maxsize>34000000</maxsize> <minsize>15000000</minsize> <files> <file>${project.build.directory}/drill-jdbc-all-${project.version}.jar</file>