This is an automated email from the ASF dual-hosted git repository.

arina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git

commit 2ea603f8c8396015006885365921027e5b3e7392
Author: Charles S. Givre <cgi...@gmail.com>
AuthorDate: Thu Jul 5 11:45:52 2018 -0400

    DRILL-6519: Add String Distance and Phonetic Functions
    
    closes #1331
---
 exec/java-exec/pom.xml                             |   6 +-
 .../drill/exec/expr/fn/impl/PhoneticFunctions.java | 407 +++++++++++++++++++++
 .../exec/expr/fn/impl/StringDistanceFunctions.java | 329 +++++++++++++++++
 .../drill/exec/fn/impl/TestPhoneticFunctions.java  | 119 ++++++
 .../exec/fn/impl/TestStringDistanceFunctions.java  |  80 ++++
 .../java/org/apache/drill/test/QueryBuilder.java   |  21 ++
 exec/jdbc-all/pom.xml                              |   2 +-
 7 files changed, 962 insertions(+), 2 deletions(-)

diff --git a/exec/java-exec/pom.xml b/exec/java-exec/pom.xml
index 6d57ba3..3e1a118 100644
--- a/exec/java-exec/pom.xml
+++ b/exec/java-exec/pom.xml
@@ -97,7 +97,11 @@
       <artifactId>univocity-parsers</artifactId>
       <version>1.3.0</version>
     </dependency>
-
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-text</artifactId>
+      <version>1.4</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-math</artifactId>
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
new file mode 100644
index 0000000..ee26bd3
--- /dev/null
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/PhoneticFunctions.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.expr.fn.impl;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+import javax.inject.Inject;
+
+public class PhoneticFunctions {
+  static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(PhoneticFunctions.class);
+
+  // Private constructor: this class is only a holder for the nested function classes.
+  private PhoneticFunctions() {
+  }
+
+  /**
+   * SQL function {@code caverphone1}: encodes a string with the Caverphone 1.0 phonetic
+   * matching algorithm, created by the Caversham Project at the University of Otago.
+   * <p>
+   * Usage:  SELECT caverphone1( string ) FROM...
+   */
+  @FunctionTemplate(name = "caverphone1",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class Caverphone1Function implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.Caverphone1().encode(input);
+
+      // Encode once and explicitly as UTF-8 (the input was decoded as UTF-8) instead of
+      // calling getBytes() twice with the platform default charset.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Make sure the buffer can hold the whole value before writing. reallocIfNeeded may
+      // hand back a different buffer, so re-assign it before publishing it through out.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code caverphone2}: encodes a string with the Caverphone 2.0 phonetic
+   * matching algorithm, created by the Caversham Project at the University of Otago.
+   * <p>
+   * Usage: SELECT caverphone2( string ) FROM...
+   */
+  @FunctionTemplate(name = "caverphone2",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class Caverphone2Function implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.Caverphone2().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code cologne_phonetic}: encodes a string into a Cologne Phonetic value.
+   * Implements the Kölner Phonetik (Cologne Phonetic) algorithm issued by Hans Joachim Postel
+   * in 1969.
+   * <p>
+   * The Kölner Phonetik is a phonetic algorithm which is optimized for the German language.
+   * It is related to the well-known soundex algorithm.
+   * <p>
+   * Usage:  SELECT cologne_phonetic( string ) FROM...
+   */
+  @FunctionTemplate(name = "cologne_phonetic",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class ColognePhoneticFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.ColognePhonetic().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // The code length is not capped here, so grow the buffer to fit before writing and
+      // keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code dm_soundex}: encodes a string into a Daitch-Mokotoff Soundex value.
+   * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex
+   * algorithms, yielding greater accuracy in matching especially Slavic and Yiddish surnames
+   * with similar pronunciation but differences in spelling.
+   * <p>
+   * The main differences compared to the other soundex variants are:
+   * coded names are 6 digits long,
+   * the initial character of the name is coded,
+   * rules to encode multi-character n-grams,
+   * multiple possible encodings for the same name (branching).
+   * <p>
+   * Usage:  SELECT dm_soundex( string ) FROM...
+   */
+  @FunctionTemplate(name = "dm_soundex",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class DaitchMokotoffFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.DaitchMokotoffSoundex().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code match_rating_encoder}: encodes a string with the Match Rating
+   * Approach phonetic algorithm, developed by Western Airlines in 1977.
+   * <p>
+   * Usage:  SELECT match_rating_encoder( string ) FROM...
+   */
+  @FunctionTemplate(name = "match_rating_encoder",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MatchRatingFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.MatchRatingApproachEncoder().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code nysiis}: encodes a string into a NYSIIS value.
+   * <p>
+   * The New York State Identification and Intelligence System Phonetic Code, commonly known
+   * as NYSIIS, is a phonetic algorithm devised in 1970 as part of the New York State
+   * Identification and Intelligence System (now a part of the New York State Division of
+   * Criminal Justice Services). It features an accuracy increase of 2.7% over the traditional
+   * Soundex algorithm. NYSIIS is an encoding used to relate similar names, but can also be
+   * used as a general purpose scheme to find words with similar phonemes.
+   * <p>
+   * Usage: SELECT nysiis(string) FROM...
+   */
+  @FunctionTemplate(name = "nysiis",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class NYSIISFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.Nysiis().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code refined_soundex}: encodes a string into a Refined Soundex value.
+   * Soundex is an encoding used to relate similar names, but can also be used as a general
+   * purpose scheme to find words with similar phonemes.
+   * <p>
+   * Usage:  SELECT refined_soundex( string ) FROM...
+   */
+  @FunctionTemplate(name = "refined_soundex",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class RefinedSoundexFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.RefinedSoundex().encode(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // The code length is not capped here, so grow the buffer to fit before writing and
+      // keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code soundex}: encodes a string into a Soundex value. Soundex is an
+   * encoding used to relate similar names, but can also be used as a general purpose scheme
+   * to find words with similar phonemes.
+   * <p>
+   * Usage:  SELECT soundex( string ) FROM...
+   */
+  @FunctionTemplate(name = "soundex",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class SoundexFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.Soundex().soundex(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code metaphone}: calculates a given string's Metaphone value using the
+   * Metaphone phonetic algorithm (https://en.wikipedia.org/wiki/Metaphone).
+   * <p>
+   * Usage: SELECT metaphone( string ) FROM...
+   */
+  @FunctionTemplate(name = "metaphone",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class MetaphoneFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.Metaphone().metaphone(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+
+  /**
+   * SQL function {@code double_metaphone}: calculates a given string's Double Metaphone
+   * value using the Double Metaphone phonetic algorithm
+   * (https://en.wikipedia.org/wiki/Metaphone).
+   * <p>
+   * Usage: SELECT double_metaphone( string ) FROM...
+   */
+  @FunctionTemplate(name = "double_metaphone",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class DoubleMetaphoneFunction implements DrillSimpleFunc {
+
+    /** Text to encode; it is decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput;
+
+    /** Receives the encoded value. */
+    @Output
+    VarCharHolder out;
+
+    /** Injected scratch buffer that backs {@link #out}. */
+    @Inject
+    DrillBuf buffer;
+
+    @Override
+    public void setup() {
+      // Nothing to initialise.
+    }
+
+    @Override
+    public void eval() {
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput.start, rawInput.end, rawInput.buffer);
+      String outputString = new org.apache.commons.codec.language.DoubleMetaphone().doubleMetaphone(input);
+
+      // Single, explicit UTF-8 encoding rather than two platform-default getBytes() calls.
+      byte[] outputBytes = outputString.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+
+      // Ensure capacity before writing; keep the (possibly replaced) buffer reference.
+      buffer = buffer.reallocIfNeeded(outputBytes.length);
+      buffer.setBytes(0, outputBytes);
+
+      out.buffer = buffer;
+      out.start = 0;
+      out.end = outputBytes.length;
+    }
+  }
+}
\ No newline at end of file
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
new file mode 100644
index 0000000..0b02769
--- /dev/null
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.expr.fn.impl;
+
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.Float8Holder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+public class StringDistanceFunctions {
+  static final org.slf4j.Logger logger = 
org.slf4j.LoggerFactory.getLogger(StringDistanceFunctions.class);
+
+  // Private constructor: this class is only a holder for the nested function classes.
+  private StringDistanceFunctions() {
+  }
+
+  /**
+   * SQL function {@code cosine_distance}: the cosine distance between two strings, computed
+   * by Commons Text's {@code CosineDistance}.
+   * <p>
+   * Usage:  SELECT cosine_distance( string1, string2 ) AS cosine_distance FROM...
+   */
+  @FunctionTemplate(name = "cosine_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class CosineDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Distance implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.CosineDistance d;
+
+    /** Receives the computed distance. */
+    @Output
+    Float8Holder out;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.CosineDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+  /**
+   * SQL function {@code fuzzy_score}: a fuzzy matching score between two strings, computed
+   * by Commons Text's {@code FuzzyScore} with the English locale.
+   * <p>
+   * A matching algorithm that is similar to the searching algorithms implemented in editors
+   * such as Sublime Text, TextMate, Atom and others.
+   * <p>
+   * One point is given for every matched character. Subsequent matches yield two bonus
+   * points. A higher score indicates a higher similarity.
+   * <p>
+   * Usage:  SELECT fuzzy_score( string1, string2 ) AS fuzzy_score FROM...
+   */
+  @FunctionTemplate(name = "fuzzy_score",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class FuzzyScoreFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed score. */
+    @Output
+    Float8Holder out;
+
+    /** Scorer, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.FuzzyScore d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.FuzzyScore(java.util.Locale.ENGLISH);
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double score = d.fuzzyScore(firstText, secondText);
+      out.value = score;
+    }
+  }
+
+  /**
+   * SQL function {@code hamming_distance}: the Hamming distance between two strings,
+   * computed by Commons Text's {@code HammingDistance}.
+   * <p>
+   * The Hamming distance between two strings of equal length is the number of positions at
+   * which the corresponding symbols are different. See
+   * http://en.wikipedia.org/wiki/Hamming_distance for further explanation.
+   * <p>
+   * Usage:  SELECT hamming_distance( string1, string2 ) FROM...
+   */
+  @FunctionTemplate(name = "hamming_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class HammingDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed distance. */
+    @Output
+    Float8Holder out;
+
+    /** Distance implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.HammingDistance d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.HammingDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+
+  /**
+   * SQL function {@code jaccard_distance}: the Jaccard distance between two character
+   * sequences, computed by Commons Text's {@code JaccardDistance}.
+   * <p>
+   * Jaccard distance is the dissimilarity between two sets; it is the complement of Jaccard
+   * similarity. See https://en.wikipedia.org/wiki/Jaccard_index for further explanation.
+   * <p>
+   * Usage:  SELECT jaccard_distance( string1, string2 ) FROM ...
+   */
+  @FunctionTemplate(name = "jaccard_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class JaccardDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed distance. */
+    @Output
+    Float8Holder out;
+
+    /** Distance implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.JaccardDistance d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.JaccardDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+  /**
+   * SQL function {@code jaro_distance}: a measure of the matched characters between two
+   * character sequences, computed by Commons Text's {@code JaroWinklerDistance}.
+   * <p>
+   * The Jaro measure is the weighted sum of percentage of matched characters from each
+   * input and transposed characters. Winkler increased this measure for matching initial
+   * characters.
+   * <p>
+   * Background: https://en.wikipedia.org/wiki/Jaro–Winkler_distance
+   * <p>
+   * Usage: SELECT jaro_distance( string1, string2 ) FROM...
+   */
+  @FunctionTemplate(name = "jaro_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class JaroDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed value. */
+    @Output
+    Float8Holder out;
+
+    /** Jaro-Winkler implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.JaroWinklerDistance d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.JaroWinklerDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+  /**
+   * SQL function {@code levenshtein_distance}: the Levenshtein distance between two
+   * character sequences, computed by Commons Text's {@code LevenshteinDistance}.
+   * <p>
+   * This is the number of changes needed to change one sequence into another, where each
+   * change is a single character modification (deletion, insertion or substitution).
+   * <p>
+   * Usage: SELECT levenshtein_distance( string1, string2 ) FROM...
+   */
+  @FunctionTemplate(name = "levenshtein_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class LevenstheinDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed distance. */
+    @Output
+    Float8Holder out;
+
+    /** Distance implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.LevenshteinDistance d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.LevenshteinDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+  /**
+   * SQL function {@code longest_common_substring_distance}: a distance between two strings,
+   * computed by Commons Text's {@code LongestCommonSubsequenceDistance}.
+   * <p>
+   * Despite the function name, the implementation delegates to a longest common
+   * <em>subsequence</em> class, not a substring one. The value returned is whatever that
+   * class's {@code apply} reports.
+   * NOTE(review): presumably a distance derived from the subsequence length rather than the
+   * length itself - confirm against the Commons Text Javadoc.
+   * <p>
+   * Note: Generally this algorithm is fairly inefficient, as for length m, n of the input
+   * CharSequence's left and right respectively, the runtime of the algorithm is O(m*n).
+   * <p>
+   * Background: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem.
+   * <p>
+   * Usage:  SELECT longest_common_substring_distance( string1, string2 ) FROM...
+   */
+  @FunctionTemplate(name = "longest_common_substring_distance",
+      scope = FunctionTemplate.FunctionScope.SIMPLE,
+      nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class LongestCommonSubstringDistanceFunction implements DrillSimpleFunc {
+
+    /** First string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput1;
+
+    /** Second string, decoded as UTF-8 in {@link #eval()}. */
+    @Param
+    VarCharHolder rawInput2;
+
+    /** Receives the computed distance. */
+    @Output
+    Float8Holder out;
+
+    /** Distance implementation, created in {@link #setup()}. */
+    @Workspace
+    org.apache.commons.text.similarity.LongestCommonSubsequenceDistance d;
+
+    @Override
+    public void setup() {
+      d = new org.apache.commons.text.similarity.LongestCommonSubsequenceDistance();
+    }
+
+    @Override
+    public void eval() {
+      String firstText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
+      String secondText = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers
+          .toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
+
+      double distance = d.apply(firstText, secondText);
+      out.value = distance;
+    }
+  }
+
+}
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
new file mode 100644
index 0000000..85bb135
--- /dev/null
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestPhoneticFunctions.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.fn.impl;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.BaseDirTestWatcher;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryResultSet;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Runs each phonetic SQL function once against a literal input on a test cluster and
+ * compares the single returned string with the expected code.
+ */
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestPhoneticFunctions extends ClusterTest {
+
+  private QueryResultSet result;
+
+  @Rule
+  public final BaseDirTestWatcher baseDirTestWatcher = new BaseDirTestWatcher();
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder fixtureBuilder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(fixtureBuilder);
+  }
+
+  @Test
+  public void testSoundex() throws Exception {
+    String sql = "select soundex('jaime') as soundex from (values(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("J500", actual);
+  }
+
+  @Test
+  public void testCaverphone1() throws Exception {
+    String sql = "SELECT caverphone1('jaime') as caverphone FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("YM1111", actual);
+  }
+
+  @Test
+  public void testCaverphone2() throws Exception {
+    String sql = "SELECT caverphone2('steve') as caverphone FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("STF1111111", actual);
+  }
+
+  @Test
+  public void testCologne() throws Exception {
+    String sql = "SELECT cologne_phonetic('steve') AS CP FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("823", actual);
+  }
+
+  @Test
+  public void testMatchRatingEncoder() throws Exception {
+    String sql = "SELECT match_rating_encoder('Boston') AS MR FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("BSTN", actual);
+  }
+
+  @Test
+  public void testNYSIIS() throws Exception {
+    String sql = "SELECT nysiis('Boston') AS ny FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("BASTAN", actual);
+  }
+
+  @Test
+  public void testRefinedSoundex() throws Exception {
+    String sql = "SELECT refined_soundex('Boston') AS rs FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("B103608", actual);
+  }
+
+  @Test
+  public void testMetaphone() throws Exception {
+    String sql = "SELECT metaphone('Phoenix') AS meta FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("FNKS", actual);
+  }
+
+  @Test
+  public void testDoubleMetaphone() throws Exception {
+    String sql = "SELECT double_metaphone('Phoenix') AS meta FROM (VALUES(1))";
+    String actual = queryBuilder().sql(sql).singletonString();
+    assertEquals("FNKS", actual);
+  }
+}
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
new file mode 100644
index 0000000..915c062
--- /dev/null
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestStringDistanceFunctions.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.fn.impl;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import static org.junit.Assert.assertEquals;
+
+/** Runs each string-distance SQL function on 'Big car' vs 'red car' and checks the exact result. */
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestStringDistanceFunctions extends ClusterTest {
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(builder);
+  }
+
+  @Test
+  public void testCosineDistance() throws Exception {
+    // The delta of 0.0 means the returned double must equal the expected value exactly.
+    String sql = "select cosine_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))";
+    double result = queryBuilder().sql(sql).singletonDouble();
+    assertEquals(0.5000000000000001, result, 0.0);
+  }
+
+  @Test
+  public void testHammingDistance() throws Exception {
+    // The two equal-length inputs differ only in their first three characters.
+    String sql = "select hamming_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))";
+    double result = queryBuilder().sql(sql).singletonDouble();
+    assertEquals(3.0, result, 0.0);
+  }
+
+  @Test
+  public void testJaccardDistance() throws Exception {
+    // The delta of 0.0 means the returned double must equal the expected value exactly.
+    String sql = "select jaccard_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))";
+    double result = queryBuilder().sql(sql).singletonDouble();
+    assertEquals(0.56, result, 0.0);
+  }
+
+  @Test
+  public void testJaroDistance() throws Exception {
+    // The delta of 0.0 means the returned double must equal the expected value exactly.
+    String sql = "select jaro_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))";
+    double result = queryBuilder().sql(sql).singletonDouble();
+    assertEquals(0.7142857142857143, result, 0.0);
+  }
+
+  @Test
+  public void testLevenshteinDistance() throws Exception {
+    // Substituting the first three characters ('Big' -> 'red') turns one input into the other.
+    String sql = "select levenshtein_distance( 'Big car', 'red car' ) as distance FROM (VALUES(1))";
+    double result = queryBuilder().sql(sql).singletonDouble();
+    assertEquals(3.0, result, 0.0);
+  }
+}
\ No newline at end of file
diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
index 0f86955..ff0e166 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/QueryBuilder.java
@@ -393,6 +393,27 @@ public class QueryBuilder {
 
   /**
    * Run the query that is expected to return (at least) one row
+   * with the only (or first) column returning a double value.
+   * The double value cannot be null.
+   *
+   * @return the value of the first column of the first row
+   * @throws RpcException if anything goes wrong
+   */
+
+  public double singletonDouble() throws RpcException {
+    final RowSet results = rowSet();
+    if (results == null) {
+      throw new IllegalStateException("No rows returned");
+    }
+    final RowSetReader rowReader = results.reader();
+    rowReader.next(); // advance to the first row before reading column 0
+    final double firstValue = rowReader.scalar(0).getDouble();
+    results.clear(); // the value is already copied out, so the row set can be cleared
+    return firstValue;
+  }
+
+  /**
+   * Run the query that is expected to return (at least) one row
    * with the only (or first) column returning a int value.
    * The int value cannot be null.
    *
diff --git a/exec/jdbc-all/pom.xml b/exec/jdbc-all/pom.xml
index f3595be..f7af511 100644
--- a/exec/jdbc-all/pom.xml
+++ b/exec/jdbc-all/pom.xml
@@ -566,7 +566,7 @@
                           This is likely due to you adding new dependencies to a java-exec and not updating the excludes in this module. This is important as it minimizes the size of the dependency of Drill application users.
 
                         </message>
-                        <maxsize>33000000</maxsize>
+                        <maxsize>34000000</maxsize>
                         <minsize>15000000</minsize>
                         <files>
                           <file>${project.build.directory}/drill-jdbc-all-${project.version}.jar</file>

Reply via email to