This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 710120a [SPARK-37508][SQL] Add CONTAINS() string function 710120a is described below commit 710120a499d6082bcec6b65ad1f8dbe4789f4bd9 Author: Angerszhuuuu <angers....@gmail.com> AuthorDate: Wed Dec 1 12:57:22 2021 +0300 [SPARK-37508][SQL] Add CONTAINS() string function ### What changes were proposed in this pull request? Add `CONTAINS` string function. | function| arguments | Returns | |-------|-------|-------| | CONTAINS( left , right) | left: String, right: String | Returns a BOOLEAN. The value is True if right is found inside left. Returns NULL if either input expression is NULL. Otherwise, returns False.| ### Why are the changes needed? contains() is a common, convenient function supported by a number of database systems: - https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#contains_substr - https://docs.snowflake.com/en/sql-reference/functions/contains.html Supporting this function makes migration from other systems to Spark SQL easier. ### Does this PR introduce _any_ user-facing change? Yes. Users can now call `contains(left, right)`: | Left | Right | Return | |----------|:-------------:|------:| | null | "Spark SQL" | null | | "Spark SQL" | null | null | | null | null | null | | "Spark SQL" | "Spark" | true | | "Spark SQL" | "k SQL" | true | | "Spark SQL" | "SPARK" | false | ### How was this patch tested? Added unit tests (UT). Closes #34761 from AngersZhuuuu/SPARK-37508. 
Authored-by: Angerszhuuuu <angers....@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../catalyst/expressions/stringExpressions.scala | 17 ++++++++ .../expressions/StringExpressionsSuite.scala | 9 ++++ .../sql-functions/sql-expression-schema.md | 3 +- .../sql-tests/inputs/string-functions.sql | 10 ++++- .../results/ansi/string-functions.sql.out | 50 +++++++++++++++++++++- .../sql-tests/results/string-functions.sql.out | 50 +++++++++++++++++++++- 7 files changed, 136 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 0668460..b2788f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -455,6 +455,7 @@ object FunctionRegistry { expression[Ascii]("ascii"), expression[Chr]("char", true), expression[Chr]("chr"), + expression[Contains]("contains"), expression[Base64]("base64"), expression[BitLength]("bit_length"), expression[Length]("char_length", true), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 2b997da..959c834 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -465,6 +465,23 @@ abstract class StringPredicate extends BinaryExpression /** * A function that returns true if the string `left` contains the string `right`. */ +@ExpressionDescription( + usage = """ + _FUNC_(expr1, expr2) - Returns a boolean value if expr2 is found inside expr1. 
+ Returns NULL if either input expression is NULL. + """, + examples = """ + Examples: + > SELECT _FUNC_('Spark SQL', 'Spark'); + true + > SELECT _FUNC_('Spark SQL', 'SPARK'); + false + > SELECT _FUNC_('Spark SQL', null); + NULL + """, + since = "3.3.0", + group = "string_funcs" +) case class Contains(left: Expression, right: Expression) extends StringPredicate { override def compare(l: UTF8String, r: UTF8String): Boolean = l.contains(r) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 823ce77..443a94b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -1019,4 +1019,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } } + + test("SPARK-37508: Support contains string expression") { + checkEvaluation(Contains(Literal("aa"), Literal.create(null, StringType)), null) + checkEvaluation(Contains(Literal.create(null, StringType), Literal("aa")), null) + checkEvaluation(Contains(Literal("Spark SQL"), Literal("Spark")), true) + checkEvaluation(Contains(Literal("Spark SQL"), Literal("SPARK")), false) + checkEvaluation(Contains(Literal("Spark SQL"), Literal("SQL")), true) + checkEvaluation(Contains(Literal("Spark SQL"), Literal("k S")), true) + } } diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 821f566..6a4d615 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ <!-- Automatically generated by ExpressionsSchemaSuite 
--> ## Summary - - Number of queries: 367 + - Number of queries: 368 - Number of expressions that missing example: 12 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint ## Schema of Built-in Functions @@ -75,6 +75,7 @@ | org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT coalesce(NULL, 1, NULL) | struct<coalesce(NULL, 1, NULL):int> | | org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT concat('Spark', 'SQL') | struct<concat(Spark, SQL):string> | | org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT concat_ws(' ', 'Spark', 'SQL') | struct<concat_ws( , Spark, SQL):string> | +| org.apache.spark.sql.catalyst.expressions.Contains | contains | SELECT contains('Spark SQL', 'Spark') | struct<contains(Spark SQL, Spark):boolean> | | org.apache.spark.sql.catalyst.expressions.Conv | conv | SELECT conv('100', 2, 10) | struct<conv(100, 2, 10):string> | | org.apache.spark.sql.catalyst.expressions.Cos | cos | SELECT cos(0) | struct<COS(0):double> | | org.apache.spark.sql.catalyst.expressions.Cosh | cosh | SELECT cosh(0) | struct<COSH(0):double> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 61a5a31..f271084 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -104,4 +104,12 @@ select decode(1, 1, 'Southlake'); select decode(2, 1, 'Southlake'); select decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); -select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle'); \ No newline at end of file +select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle'); + +-- contains 
+SELECT CONTAINS(null, 'Spark'); +SELECT CONTAINS('Spark SQL', null); +SELECT CONTAINS(null, null); +SELECT CONTAINS('Spark SQL', 'Spark'); +SELECT CONTAINS('Spark SQL', 'SQL'); +SELECT CONTAINS('Spark SQL', 'SPARK'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 45d4038..a81a34b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 78 +-- Number of queries: 84 -- !query @@ -632,3 +632,51 @@ select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattl struct<decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle):string> -- !query output NULL + + +-- !query +SELECT CONTAINS(null, 'Spark') +-- !query schema +struct<contains(NULL, Spark):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS('Spark SQL', null) +-- !query schema +struct<contains(Spark SQL, NULL):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS(null, null) +-- !query schema +struct<contains(NULL, NULL):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS('Spark SQL', 'Spark') +-- !query schema +struct<contains(Spark SQL, Spark):boolean> +-- !query output +true + + +-- !query +SELECT CONTAINS('Spark SQL', 'SQL') +-- !query schema +struct<contains(Spark SQL, SQL):boolean> +-- !query output +true + + +-- !query +SELECT CONTAINS('Spark SQL', 'SPARK') +-- !query schema +struct<contains(Spark SQL, SPARK):boolean> +-- !query output +false diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 9249f94..d452df8 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 78 +-- Number of queries: 84 -- !query @@ -628,3 +628,51 @@ select decode(6, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattl struct<decode(6, 1, Southlake, 2, San Francisco, 3, New Jersey, 4, Seattle):string> -- !query output NULL + + +-- !query +SELECT CONTAINS(null, 'Spark') +-- !query schema +struct<contains(NULL, Spark):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS('Spark SQL', null) +-- !query schema +struct<contains(Spark SQL, NULL):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS(null, null) +-- !query schema +struct<contains(NULL, NULL):boolean> +-- !query output +NULL + + +-- !query +SELECT CONTAINS('Spark SQL', 'Spark') +-- !query schema +struct<contains(Spark SQL, Spark):boolean> +-- !query output +true + + +-- !query +SELECT CONTAINS('Spark SQL', 'SQL') +-- !query schema +struct<contains(Spark SQL, SQL):boolean> +-- !query output +true + + +-- !query +SELECT CONTAINS('Spark SQL', 'SPARK') +-- !query schema +struct<contains(Spark SQL, SPARK):boolean> +-- !query output +false --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org