Repository: spark Updated Branches: refs/heads/master 3fc0cb920 -> 4d5a6e7b6
[SPARK-8271][SQL]string function: soundex This PR brings SQL function soundex(), see https://issues.apache.org/jira/browse/HIVE-9738 It's based on #7115 , thanks to HuJiayin Author: HuJiayin <[email protected]> Author: Davies Liu <[email protected]> Closes #7812 from davies/soundex and squashes the following commits: fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex a4bd6d8 [Davies Liu] fix soundex 2538908 [HuJiayin] add codegen soundex d15d329 [HuJiayin] add back ut ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark e2dec2c [HuJiayin] support soundex rebase code Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4d5a6e7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4d5a6e7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4d5a6e7b Branch: refs/heads/master Commit: 4d5a6e7b60b315968973e2298eeee5eb174ec721 Parents: 3fc0cb9 Author: HuJiayin <[email protected]> Authored: Fri Jul 31 16:05:26 2015 -0700 Committer: Reynold Xin <[email protected]> Committed: Fri Jul 31 16:05:26 2015 -0700 ---------------------------------------------------------------------- python/pyspark/sql/functions.py | 17 +++++++ .../catalyst/analysis/FunctionRegistry.scala | 1 + .../catalyst/expressions/stringOperations.scala | 16 ++++++ .../expressions/StringExpressionsSuite.scala | 28 +++++++++++ .../scala/org/apache/spark/sql/functions.scala | 8 +++ .../apache/spark/sql/StringFunctionsSuite.scala | 9 ++++ .../apache/spark/unsafe/types/UTF8String.java | 53 ++++++++++++++++++++ .../spark/unsafe/types/UTF8StringSuite.java | 48 ++++++++++++++++++ 8 files changed, 180 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/python/pyspark/sql/functions.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/functions.py 
b/python/pyspark/sql/functions.py index 8024a8d..bb9926c 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -63,6 +63,8 @@ __all__ += [ 'year', 'quarter', 'month', 'hour', 'minute', 'second', 'dayofmonth', 'dayofyear', 'weekofyear'] +__all__ += ['soundex'] + def _create_function(name, doc=""): """ Create a function for aggregator by name""" @@ -922,6 +924,7 @@ def trunc(date, format): def size(col): """ Collection function: returns the length of the array or map stored in the column. + :param col: name of column or expression >>> df = sqlContext.createDataFrame([([1, 2, 3],),([1],),([],)], ['data']) @@ -932,6 +935,20 @@ def size(col): return Column(sc._jvm.functions.size(_to_java_column(col))) +@since(1.5) +@ignore_unicode_prefix +def soundex(col): + """ + Returns the SoundEx encoding for a string + + >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name']) + >>> df.select(soundex(df.name).alias("soundex")).collect() + [Row(soundex=u'P362'), Row(soundex=u'U612')] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.soundex(_to_java_column(col))) + + class UserDefinedFunction(object): """ User defined function in Python http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 1bf7204..3f61a9a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -194,6 +194,7 @@ object FunctionRegistry { expression[StringRepeat]("repeat"), expression[StringReverse]("reverse"), 
expression[StringTrimRight]("rtrim"), + expression[SoundEx]("soundex"), expression[StringSpace]("space"), expression[StringSplit]("split"), expression[Substring]("substr"), http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 684eac1..160e72f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -719,6 +719,22 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres } /** + * A function that return soundex code of the given string expression. + */ +case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def dataType: DataType = StringType + + override def inputTypes: Seq[DataType] = Seq(StringType) + + override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex() + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + defineCodeGen(ctx, ev, c => s"$c.soundex()") + } +} + +/** * Returns the numeric value of the first character of str. 
*/ case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 3ecd0d3..fb72fe1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -347,6 +347,34 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:on } + test("soundex unit test") { + checkEvaluation(SoundEx(Literal("ZIN")), "Z500") + checkEvaluation(SoundEx(Literal("SU")), "S000") + checkEvaluation(SoundEx(Literal("")), "") + checkEvaluation(SoundEx(Literal.create(null, StringType)), null) + + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
+ checkEvaluation(SoundEx(Literal("测试")), "测试") + checkEvaluation(SoundEx(Literal("Tschüss")), "T220") + // scalastyle:on + checkEvaluation(SoundEx(Literal("zZ")), "Z000", create_row("s8")) + checkEvaluation(SoundEx(Literal("RAGSSEEESSSVEEWE")), "R221") + checkEvaluation(SoundEx(Literal("Ashcraft")), "A261") + checkEvaluation(SoundEx(Literal("Aswcraft")), "A261") + checkEvaluation(SoundEx(Literal("Tymczak")), "T522") + checkEvaluation(SoundEx(Literal("Pfister")), "P236") + checkEvaluation(SoundEx(Literal("Miller")), "M460") + checkEvaluation(SoundEx(Literal("Peterson")), "P362") + checkEvaluation(SoundEx(Literal("Peters")), "P362") + checkEvaluation(SoundEx(Literal("Auerbach")), "A612") + checkEvaluation(SoundEx(Literal("Uhrbach")), "U612") + checkEvaluation(SoundEx(Literal("Moskowitz")), "M232") + checkEvaluation(SoundEx(Literal("Moskovitz")), "M213") + checkEvaluation(SoundEx(Literal("relyheewsgeessg")), "R422") + checkEvaluation(SoundEx(Literal("!!")), "!!") + } + test("TRIM/LTRIM/RTRIM") { val s = 'a.string.at(0) checkEvaluation(StringTrim(Literal(" aa ")), "aa", create_row(" abdef ")) http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 5d82a5e..89ffa9c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1903,6 +1903,14 @@ object functions { } /** + * * Return the soundex code for the specified expression. + * + * @group string_funcs + * @since 1.5.0 + */ + def soundex(e: Column): Column = SoundEx(e.expr) + + /** * Splits str around pattern (pattern is a regular expression). * NOTE: pattern is a string represent the regular expression. 
* http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 8e0ea76..b7f073c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -142,6 +142,15 @@ class StringFunctionsSuite extends QueryTest { Row("aa123cc")) } + test("soundex function") { + val df = Seq(("MARY", "SU")).toDF("l", "r") + checkAnswer( + df.select(soundex($"l"), soundex($"r")), Row("M600", "S000")) + + checkAnswer( + df.selectExpr("SoundEx(l)", "SoundEx(r)"), Row("M600", "S000")) + } + test("string instr function") { val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c") http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java ---------------------------------------------------------------------- diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c38953f..9d4998f 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -680,4 +680,57 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable { } return result; } + + /** + * Soundex mapping table + */ + private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7', + '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'}; + + /** + * Encodes a string into a Soundex value. 
Soundex is an encoding used to relate similar names, + * but can also be used as a general purpose scheme to find word with similar phonemes. + * https://en.wikipedia.org/wiki/Soundex + */ + public UTF8String soundex() { + if (numBytes == 0) { + return EMPTY_UTF8; + } + + byte b = getByte(0); + if ('a' <= b && b <= 'z') { + b -= 32; + } else if (b < 'A' || 'Z' < b) { + // first character must be a letter + return this; + } + byte sx[] = {'0', '0', '0', '0'}; + sx[0] = b; + int sxi = 1; + int idx = b - 'A'; + byte lastCode = US_ENGLISH_MAPPING[idx]; + + for (int i = 1; i < numBytes; i++) { + b = getByte(i); + if ('a' <= b && b <= 'z') { + b -= 32; + } else if (b < 'A' || 'Z' < b) { + // not a letter, skip it + lastCode = '0'; + continue; + } + idx = b - 'A'; + byte code = US_ENGLISH_MAPPING[idx]; + if (code == '7') { + // ignore it + } else { + if (code != '0' && code != lastCode) { + sx[sxi++] = code; + if (sxi > 3) break; + } + lastCode = code; + } + } + return UTF8String.fromBytes(sx); + } } http://git-wip-us.apache.org/repos/asf/spark/blob/4d5a6e7b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java ---------------------------------------------------------------------- diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index f2cc19c..c565210 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -341,4 +341,52 @@ public class UTF8StringSuite { assertEquals(fromString(" "), blankString(3)); assertEquals(fromString(""), blankString(0)); } + + @Test + public void soundex() { + assertEquals(fromString("Robert").soundex(), fromString("R163")); + assertEquals(fromString("Rupert").soundex(), fromString("R163")); + assertEquals(fromString("Rubin").soundex(), fromString("R150")); + assertEquals(fromString("Ashcraft").soundex(), 
fromString("A261")); + assertEquals(fromString("Ashcroft").soundex(), fromString("A261")); + assertEquals(fromString("Burroughs").soundex(), fromString("B620")); + assertEquals(fromString("Burrows").soundex(), fromString("B620")); + assertEquals(fromString("Ekzampul").soundex(), fromString("E251")); + assertEquals(fromString("Example").soundex(), fromString("E251")); + assertEquals(fromString("Ellery").soundex(), fromString("E460")); + assertEquals(fromString("Euler").soundex(), fromString("E460")); + assertEquals(fromString("Ghosh").soundex(), fromString("G200")); + assertEquals(fromString("Gauss").soundex(), fromString("G200")); + assertEquals(fromString("Gutierrez").soundex(), fromString("G362")); + assertEquals(fromString("Heilbronn").soundex(), fromString("H416")); + assertEquals(fromString("Hilbert").soundex(), fromString("H416")); + assertEquals(fromString("Jackson").soundex(), fromString("J250")); + assertEquals(fromString("Kant").soundex(), fromString("K530")); + assertEquals(fromString("Knuth").soundex(), fromString("K530")); + assertEquals(fromString("Lee").soundex(), fromString("L000")); + assertEquals(fromString("Lukasiewicz").soundex(), fromString("L222")); + assertEquals(fromString("Lissajous").soundex(), fromString("L222")); + assertEquals(fromString("Ladd").soundex(), fromString("L300")); + assertEquals(fromString("Lloyd").soundex(), fromString("L300")); + assertEquals(fromString("Moses").soundex(), fromString("M220")); + assertEquals(fromString("O'Hara").soundex(), fromString("O600")); + assertEquals(fromString("Pfister").soundex(), fromString("P236")); + assertEquals(fromString("Rubin").soundex(), fromString("R150")); + assertEquals(fromString("Robert").soundex(), fromString("R163")); + assertEquals(fromString("Rupert").soundex(), fromString("R163")); + assertEquals(fromString("Soundex").soundex(), fromString("S532")); + assertEquals(fromString("Sownteks").soundex(), fromString("S532")); + assertEquals(fromString("Tymczak").soundex(), 
fromString("T522")); + assertEquals(fromString("VanDeusen").soundex(), fromString("V532")); + assertEquals(fromString("Washington").soundex(), fromString("W252")); + assertEquals(fromString("Wheaton").soundex(), fromString("W350")); + + assertEquals(fromString("a").soundex(), fromString("A000")); + assertEquals(fromString("ab").soundex(), fromString("A100")); + assertEquals(fromString("abc").soundex(), fromString("A120")); + assertEquals(fromString("abcd").soundex(), fromString("A123")); + assertEquals(fromString("").soundex(), fromString("")); + assertEquals(fromString("123").soundex(), fromString("123")); + assertEquals(fromString("世界千世").soundex(), fromString("世界千世")); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
