This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 4503efcaadd [SPARK-43944][CONNECT][PYTHON] Add string functions to Scala and Python - part 2 4503efcaadd is described below commit 4503efcaadd838cabad806558250abed9c2ace86 Author: panbingkun <pbk1...@gmail.com> AuthorDate: Sun Jun 18 14:45:59 2023 +0800 [SPARK-43944][CONNECT][PYTHON] Add string functions to Scala and Python - part 2 ### What changes were proposed in this pull request? Add following functions: - replace - split_part - substr - parse_url - printf - url_decode - url_encode - position - endswith - startswith to: - Scala API - Python API - Spark Connect Scala Client - Spark Connect Python Client ### Why are the changes needed? for parity ### Does this PR introduce _any_ user-facing change? Yes, new functions. ### How was this patch tested? - Add New UT. - Pass GA. Closes #41594 from panbingkun/SPARK-43944. Authored-by: panbingkun <pbk1...@gmail.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../scala/org/apache/spark/sql/functions.scala | 152 +++++++++++ .../apache/spark/sql/PlanGenerationTestSuite.scala | 58 +++- .../explain-results/function_endswith.explain | 2 + .../explain-results/function_parse_url.explain | 2 + .../function_parse_url_with_key.explain | 2 + .../explain-results/function_position.explain | 2 + .../function_position_with_start.explain | 2 + .../explain-results/function_printf.explain | 2 + .../explain-results/function_replace.explain | 2 + .../function_replace_with_specified_string.explain | 2 + .../explain-results/function_split_part.explain | 2 + .../explain-results/function_startswith.explain | 2 + .../explain-results/function_substr.explain | 2 + .../function_substr_with_len.explain | 2 + .../explain-results/function_to_number.explain | 2 +- .../explain-results/function_url_decode.explain | 2 + .../explain-results/function_url_encode.explain | 2 + ...ction_to_number.json => function_endswith.json} | 6 +- ...umber.proto.bin => 
function_endswith.proto.bin} | Bin 188 -> 184 bytes ...tion_to_number.json => function_parse_url.json} | 6 +- ...mber.proto.bin => function_parse_url.proto.bin} | Bin 188 -> 185 bytes ...umber.json => function_parse_url_with_key.json} | 10 +- ...o.bin => function_parse_url_with_key.proto.bin} | Bin 188 -> 192 bytes ...ction_to_number.json => function_position.json} | 6 +- ...umber.proto.bin => function_position.proto.bin} | Bin 188 -> 184 bytes ...mber.json => function_position_with_start.json} | 10 +- ....bin => function_position_with_start.proto.bin} | Bin 188 -> 191 bytes ...unction_to_number.json => function_printf.json} | 10 +- ..._number.proto.bin => function_printf.proto.bin} | Bin 188 -> 196 bytes ...nction_to_number.json => function_replace.json} | 6 +- ...number.proto.bin => function_replace.proto.bin} | Bin 188 -> 183 bytes ...=> function_replace_with_specified_string.json} | 10 +- ...nction_replace_with_specified_string.proto.bin} | Bin 188 -> 190 bytes ...ion_to_number.json => function_split_part.json} | 10 +- ...ber.proto.bin => function_split_part.proto.bin} | Bin 188 -> 193 bytes ...ion_to_number.json => function_startswith.json} | 6 +- ...ber.proto.bin => function_startswith.proto.bin} | Bin 188 -> 186 bytes ...unction_to_number.json => function_substr.json} | 6 +- ..._number.proto.bin => function_substr.proto.bin} | Bin 188 -> 182 bytes ...o_number.json => function_substr_with_len.json} | 10 +- ...roto.bin => function_substr_with_len.proto.bin} | Bin 188 -> 189 bytes .../query-tests/queries/function_to_number.json | 2 +- .../queries/function_to_number.proto.bin | Bin 188 -> 190 bytes ...ion_to_number.json => function_url_decode.json} | 6 +- ...ber.proto.bin => function_url_decode.proto.bin} | Bin 188 -> 179 bytes ...ion_to_number.json => function_url_encode.json} | 6 +- ...ber.proto.bin => function_url_encode.proto.bin} | Bin 188 -> 179 bytes .../source/reference/pyspark.sql/functions.rst | 10 + python/pyspark/sql/connect/functions.py | 90 +++++++ 
python/pyspark/sql/functions.py | 297 +++++++++++++++++++++ .../scala/org/apache/spark/sql/functions.scala | 175 ++++++++++++ .../apache/spark/sql/DataFrameFunctionsSuite.scala | 2 +- .../apache/spark/sql/StringFunctionsSuite.scala | 133 ++++++++- 53 files changed, 1006 insertions(+), 51 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 61783746c56..9c2a5b96182 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -3819,6 +3819,158 @@ object functions { */ def to_number(e: Column, format: Column): Column = Column.fn("to_number", e, format) + /** + * Replaces all occurrences of `search` with `replace`. + * + * @param src + * A column of string to be replaced + * @param search + * A column of string, If `search` is not found in `src`, `src` is returned unchanged. + * @param replace + * A column of string, If `replace` is not specified or is an empty string, nothing replaces + * the string that is removed from `src`. + * @group string_funcs + * @since 3.5.0 + */ + def replace(src: Column, search: Column, replace: Column): Column = + Column.fn("replace", src, search, replace) + + /** + * Replaces all occurrences of `search` with `replace`. + * + * @param src + * A column of string to be replaced + * @param search + * A column of string, If `search` is not found in `src`, `src` is returned unchanged. + * @group string_funcs + * @since 3.5.0 + */ + def replace(src: Column, search: Column): Column = Column.fn("replace", src, search) + + /** + * Splits `str` by delimiter and returns the requested part of the split (1-based). If any input is + * null, returns null. If `partNum` is out of range of split parts, returns empty string. If + * `partNum` is 0, throws an error.
If `partNum` is negative, the parts are counted backward + * from the end of the string. If the `delimiter` is an empty string, the `str` is not split. + * + * @group string_funcs + * @since 3.5.0 + */ + def split_part(str: Column, delimiter: Column, partNum: Column): Column = + Column.fn("split_part", str, delimiter, partNum) + + /** + * Returns the substring of `str` that starts at `pos` and is of length `len`, or the slice of + * byte array that starts at `pos` and is of length `len`. + * + * @group string_funcs + * @since 3.5.0 + */ + def substr(str: Column, pos: Column, len: Column): Column = + Column.fn("substr", str, pos, len) + + /** + * Returns the substring of `str` that starts at `pos`, or the slice of byte array that starts + * at `pos`. + * + * @group string_funcs + * @since 3.5.0 + */ + def substr(str: Column, pos: Column): Column = Column.fn("substr", str, pos) + + /** + * Extracts a part from a URL. + * + * @group string_funcs + * @since 3.5.0 + */ + def parse_url(url: Column, partToExtract: Column, key: Column): Column = + Column.fn("parse_url", url, partToExtract, key) + + /** + * Extracts a part from a URL. + * + * @group string_funcs + * @since 3.5.0 + */ + def parse_url(url: Column, partToExtract: Column): Column = + Column.fn("parse_url", url, partToExtract) + + /** + * Formats the arguments in printf-style and returns the result as a string column. + * + * @group string_funcs + * @since 3.5.0 + */ + def printf(format: Column, arguments: Column*): Column = + Column.fn("format_string", lit(format) +: arguments: _*) + + /** + * Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding + * scheme. + * + * @group string_funcs + * @since 3.5.0 + */ + def url_decode(str: Column): Column = Column.fn("url_decode", str) + + /** + * Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding + * scheme. 
+ * + * @group string_funcs + * @since 3.5.0 + */ + def url_encode(str: Column): Column = Column.fn("url_encode", str) + + /** + * Returns the position of the first occurrence of `substr` in `str` after position `start`. The + * given `start` and return value are 1-based. + * + * @group string_funcs + * @since 3.5.0 + */ + def position(substr: Column, str: Column, start: Column): Column = + Column.fn("position", substr, str, start) + + /** + * Returns the position of the first occurrence of `substr` in `str` after position `1`. The + * return value is 1-based. + * + * @group string_funcs + * @since 3.5.0 + */ + def position(substr: Column, str: Column): Column = + Column.fn("position", substr, str) + + /** + * Returns a boolean. The value is True if str ends with suffix. Returns NULL if either input + * expression is NULL. Otherwise, returns False. Both str and suffix must be of STRING type. + * + * @note + * Only STRING type is supported in this function, while `endswith` in SQL supports both + * STRING and BINARY. + * + * @group string_funcs + * @since 3.5.0 + */ + def endswith(str: Column, suffix: Column): Column = + Column.fn("endswith", str, suffix) + + /** + * Returns a boolean. The value is True if str starts with prefix. Returns NULL if either input + * expression is NULL. Otherwise, returns False. Both str and prefix must be of STRING type. + * + * @note + * Only STRING type is supported in this function, while `startswith` in SQL supports both + * STRING and BINARY.
+ * + * @group string_funcs + * @since 3.5.0 + */ + def startswith(str: Column, prefix: Column): Column = + Column.fn("startswith", str, prefix) + ////////////////////////////////////////////////////////////////////////////////////////////// // DateTime functions ////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 8e40a29c3d5..7633cd7d0c0 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -2391,7 +2391,63 @@ class PlanGenerationTestSuite } functionTest("to_number") { - fn.to_char(fn.col("g"), lit("$99.99")) + fn.to_number(fn.col("g"), lit("$99.99")) + } + + functionTest("replace") { + fn.replace(fn.col("g"), fn.col("g")) + } + + functionTest("replace with specified string") { + fn.replace(fn.col("g"), fn.col("g"), fn.col("g")) + } + + functionTest("split_part") { + fn.split_part(fn.col("g"), fn.col("g"), fn.col("a")) + } + + functionTest("substr") { + fn.substr(fn.col("g"), fn.col("a")) + } + + functionTest("substr with len") { + fn.substr(fn.col("g"), fn.col("a"), fn.col("a")) + } + + functionTest("parse_url") { + fn.parse_url(fn.col("g"), fn.col("g")) + } + + functionTest("parse_url with key") { + fn.parse_url(fn.col("g"), fn.col("g"), fn.col("g")) + } + + functionTest("printf") { + fn.printf(fn.col("g"), fn.col("a"), fn.col("g")) + } + + functionTest("url_decode") { + fn.url_decode(fn.col("g")) + } + + functionTest("url_encode") { + fn.url_encode(fn.col("g")) + } + + functionTest("position") { + fn.position(fn.col("g"), fn.col("g")) + } + + functionTest("position with start") { + fn.position(fn.col("g"), fn.col("g"), fn.col("a")) + } + + 
functionTest("endswith") { + fn.endswith(fn.col("g"), fn.col("g")) + } + + functionTest("startswith") { + fn.startswith(fn.col("g"), fn.col("g")) } functionTest("to_timestamp_ltz") { diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_endswith.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_endswith.explain new file mode 100644 index 00000000000..f78ed7492a5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_endswith.explain @@ -0,0 +1,2 @@ +Project [EndsWith(g#0, g#0) AS endswith(g, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url.explain new file mode 100644 index 00000000000..3c874b5c8b6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url.explain @@ -0,0 +1,2 @@ +Project [parse_url(g#0, g#0, false) AS parse_url(g, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url_with_key.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url_with_key.explain new file mode 100644 index 00000000000..eba1c5c814f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_parse_url_with_key.explain @@ -0,0 +1,2 @@ +Project [parse_url(g#0, g#0, g#0, false) AS parse_url(g, g, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_position.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_position.explain new file 
mode 100644 index 00000000000..b0ec42dfc58 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_position.explain @@ -0,0 +1,2 @@ +Project [position(g#0, g#0, 1) AS position(g, g, 1)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_position_with_start.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_position_with_start.explain new file mode 100644 index 00000000000..c17e658faff --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_position_with_start.explain @@ -0,0 +1,2 @@ +Project [position(g#0, g#0, a#0) AS position(g, g, a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_printf.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_printf.explain new file mode 100644 index 00000000000..10409df0070 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_printf.explain @@ -0,0 +1,2 @@ +Project [format_string(g#0, a#0, g#0) AS format_string(g, a, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace.explain new file mode 100644 index 00000000000..b521eedaff1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace.explain @@ -0,0 +1,2 @@ +Project [replace(g#0, g#0, ) AS replace(g, g, )#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace_with_specified_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace_with_specified_string.explain new file mode 100644 index 00000000000..1f5609d75ec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_replace_with_specified_string.explain @@ -0,0 +1,2 @@ +Project [replace(g#0, g#0, g#0) AS replace(g, g, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_part.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_part.explain new file mode 100644 index 00000000000..486b1a4538c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_part.explain @@ -0,0 +1,2 @@ +Project [element_at(stringsplitsql(g#0, g#0), a#0, Some(), false) AS split_part(g, g, a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_startswith.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_startswith.explain new file mode 100644 index 00000000000..4b7c2d6f28f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_startswith.explain @@ -0,0 +1,2 @@ +Project [StartsWith(g#0, g#0) AS startswith(g, g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr.explain new file mode 100644 index 00000000000..434836cb8d8 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr.explain @@ -0,0 +1,2 @@ +Project [substr(g#0, a#0, 2147483647) AS substr(g, a, 2147483647)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr_with_len.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr_with_len.explain new file mode 100644 index 00000000000..98f51716e34 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substr_with_len.explain @@ -0,0 +1,2 @@ +Project [substr(g#0, a#0, a#0) AS substr(g, a, a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_number.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_number.explain index 79ece963928..37a68489ba8 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_number.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_number.explain @@ -1,2 +1,2 @@ -Project [to_char(cast(g#0 as decimal(38,18)), $99.99) AS to_char(g, $99.99)#0] +Project [to_number(g#0, $99.99) AS to_number(g, $99.99)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain new file mode 100644 index 00000000000..36b21e27c10 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_decode.explain @@ -0,0 +1,2 @@ +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.UrlCodec$, StringType, decode, g#0, UTF-8, StringType, 
true, true, true) AS url_decode(g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain new file mode 100644 index 00000000000..70a0f628fc9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_url_encode.explain @@ -0,0 +1,2 @@ +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.UrlCodec$, StringType, encode, g#0, UTF-8, StringType, true, true, true) AS url_encode(g)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_endswith.json similarity index 84% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_endswith.json index a39682de10f..1f7943f5116 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_endswith.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "endswith", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to 
connector/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin index 86ab9d23572..2dfef1c6d86 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_endswith.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.json similarity index 83% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.json index a39682de10f..e03b86c21eb 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "parse_url", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin index 86ab9d23572..56917289c1e 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json index a39682de10f..bd627911ef2 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "parse_url", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin similarity index 68% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin index 86ab9d23572..231622cbd8a 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_parse_url_with_key.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_position.json similarity index 84% copy from 
connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_position.json index a39682de10f..7b005e2bb82 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_position.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "position", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin index 86ab9d23572..34b7e301fe9 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_position.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json index a39682de10f..2cd04992d1d 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "position", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin similarity index 69% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin index 86ab9d23572..b34eaf80f88 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_position_with_start.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_printf.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_printf.json index a39682de10f..dc7ca880c4b 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_printf.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "format_string", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - 
"literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin similarity index 67% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin index 86ab9d23572..7ebdda6cac1 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_printf.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_replace.json similarity index 84% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_replace.json index a39682de10f..2f6df6833f3 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_replace.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "replace", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin similarity index 70% copy from 
connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin index 86ab9d23572..0564f7ed575 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_replace.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json index a39682de10f..2e91450552c 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "replace", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin similarity index 69% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin index 86ab9d23572..136a6b31821 100644 
Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_replace_with_specified_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_split_part.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_split_part.json index a39682de10f..81ced1555d3 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_split_part.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "split_part", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin similarity index 68% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin index 86ab9d23572..2c1948f20dc 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_split_part.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_startswith.json similarity index 83% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_startswith.json index a39682de10f..ce2b0ac658c 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_startswith.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "startswith", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "g" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin index 86ab9d23572..2f09e8095f5 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_startswith.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substr.json similarity index 84% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to 
connector/connect/common/src/test/resources/query-tests/queries/function_substr.json index a39682de10f..ef6d225821c 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substr.json @@ -13,14 +13,14 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "substr", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "a" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin index 86ab9d23572..934201c4333 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_substr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json similarity index 73% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json index a39682de10f..d8492899d69 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.json @@ -13,14 +13,18 @@ }, "expressions": [{ "unresolvedFunction": { - 
"functionName": "to_char", + "functionName": "substr", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } }, { - "literal": { - "string": "$99.99" + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" } }] } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin similarity index 69% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin index 86ab9d23572..0fab03c0250 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_substr_with_len.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json index a39682de10f..abb71e80a76 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json @@ -13,7 +13,7 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "to_number", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin index 86ab9d23572..189c73553c5 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.json similarity index 82% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.json index a39682de10f..d4cdeeb6c48 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.json @@ -13,15 +13,11 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "url_decode", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }, { - "literal": { - "string": "$99.99" - } }] } }] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin index 86ab9d23572..e347e73c3ae 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_url_decode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.json similarity index 82% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json copy to 
connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.json index a39682de10f..5d221e0fea6 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.json @@ -13,15 +13,11 @@ }, "expressions": [{ "unresolvedFunction": { - "functionName": "to_char", + "functionName": "url_encode", "arguments": [{ "unresolvedAttribute": { "unparsedIdentifier": "g" } - }, { - "literal": { - "string": "$99.99" - } }] } }] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin similarity index 70% copy from connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin copy to connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin index 86ab9d23572..9313fb82498 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_to_number.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_url_encode.proto.bin differ diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index ede67262df5..66b2ad149dc 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -357,6 +357,7 @@ String Functions concat_ws decode encode + endswith format_number format_string initcap @@ -368,6 +369,9 @@ String Functions lpad ltrim octet_length + parse_url + position + printf rlike regexp regexp_like @@ -377,12 +381,16 @@ String Functions regexp_replace regexp_substr regexp_instr + replace unbase64 rpad repeat rtrim soundex split + split_part + startswith + substr substring substring_index overlay @@ -393,6 +401,8 @@ String Functions 
translate trim upper + url_decode + url_encode Bitwise Functions diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py index 4edbc54a556..84a44baccdc 100644 --- a/python/pyspark/sql/connect/functions.py +++ b/python/pyspark/sql/connect/functions.py @@ -2400,6 +2400,96 @@ def to_number(col: "ColumnOrName", format: "ColumnOrName") -> Column: to_number.__doc__ = pysparkfuncs.to_number.__doc__ +def replace( + src: "ColumnOrName", search: "ColumnOrName", replace: Optional["ColumnOrName"] = None +) -> Column: + if replace is not None: + return _invoke_function_over_columns("replace", src, search, replace) + else: + return _invoke_function_over_columns("replace", src, search) + + +replace.__doc__ = pysparkfuncs.replace.__doc__ + + +def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnOrName") -> Column: + return _invoke_function_over_columns("split_part", src, delimiter, partNum) + + +split_part.__doc__ = pysparkfuncs.split_part.__doc__ + + +def substr( + str: "ColumnOrName", pos: "ColumnOrName", len: Optional["ColumnOrName"] = None +) -> Column: + if len is not None: + return _invoke_function_over_columns("substr", str, pos, len) + else: + return _invoke_function_over_columns("substr", str, pos) + + +substr.__doc__ = pysparkfuncs.substr.__doc__ + + +def parse_url( + url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None +) -> Column: + if key is not None: + return _invoke_function_over_columns("parse_url", url, partToExtract, key) + else: + return _invoke_function_over_columns("parse_url", url, partToExtract) + + +parse_url.__doc__ = pysparkfuncs.parse_url.__doc__ + + +def printf(format: "ColumnOrName", *cols: "ColumnOrName") -> Column: + return _invoke_function("printf", lit(format), *[_to_col(c) for c in cols]) + + +printf.__doc__ = pysparkfuncs.printf.__doc__ + + +def url_decode(str: "ColumnOrName") -> Column: + return _invoke_function_over_columns("url_decode", str) + 
+ +url_decode.__doc__ = pysparkfuncs.url_decode.__doc__ + + +def url_encode(str: "ColumnOrName") -> Column: + return _invoke_function_over_columns("url_encode", str) + + +url_encode.__doc__ = pysparkfuncs.url_encode.__doc__ + + +def position( + substr: "ColumnOrName", str: "ColumnOrName", start: Optional["ColumnOrName"] = None +) -> Column: + if start is not None: + return _invoke_function_over_columns("position", substr, str, start) + else: + return _invoke_function_over_columns("position", substr, str) + + +position.__doc__ = pysparkfuncs.position.__doc__ + + +def endswith(str: "ColumnOrName", suffix: "ColumnOrName") -> Column: + return _invoke_function_over_columns("endswith", str, suffix) + + +endswith.__doc__ = pysparkfuncs.endswith.__doc__ + + +def startswith(str: "ColumnOrName", prefix: "ColumnOrName") -> Column: + return _invoke_function_over_columns("startswith", str, prefix) + + +startswith.__doc__ = pysparkfuncs.startswith.__doc__ + + # Date/Timestamp functions # TODO(SPARK-41455): Resolve dtypes inconsistencies for: # to_timestamp, from_utc_timestamp, to_utc_timestamp, diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e9b6ee8fa40..0d8f69daabb 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -9091,6 +9091,7 @@ def to_binary(col: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> C return _invoke_function_over_columns("to_binary", col) +@try_remote_functions def to_char(col: "ColumnOrName", format: "ColumnOrName") -> Column: """ Convert `col` to a string based on the `format`. @@ -9130,6 +9131,7 @@ def to_char(col: "ColumnOrName", format: "ColumnOrName") -> Column: return _invoke_function_over_columns("to_char", col, format) +@try_remote_functions def to_number(col: "ColumnOrName", format: "ColumnOrName") -> Column: """ Convert string 'col' to a number based on the string format 'format'. 
@@ -9170,6 +9172,301 @@ def to_number(col: "ColumnOrName", format: "ColumnOrName") -> Column: return _invoke_function_over_columns("to_number", col, format) +@try_remote_functions +def replace( + src: "ColumnOrName", search: "ColumnOrName", replace: Optional["ColumnOrName"] = None +) -> Column: + """ + Replaces all occurrences of `search` with `replace`. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + src : :class:`~pyspark.sql.Column` or str + A column of string to be replaced. + search : :class:`~pyspark.sql.Column` or str + A column of string, If `search` is not found in `src`, `src` is returned unchanged. + replace : :class:`~pyspark.sql.Column` or str, optional + A column of string, If `replace` is not specified or is an empty string, + nothing replaces the string that is removed from `src`. + + Examples + -------- + >>> df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"]) + >>> df.select(replace(df.a, df.b, df.c).alias('r')).collect() + [Row(r='ABCDEF')] + + >>> df.select(replace(df.a, df.b).alias('r')).collect() + [Row(r='ABC')] + """ + if replace is not None: + return _invoke_function_over_columns("replace", src, search, replace) + else: + return _invoke_function_over_columns("replace", src, search) + + +@try_remote_functions +def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnOrName") -> Column: + """ + Splits `src` by delimiter and return requested part of the split (1-based). + If any input is null, returns null. If `partNum` is out of range of split parts, + returns empty string. If `partNum` is 0, throws an error. If `partNum` is negative, + the parts are counted backward from the end of the string. + If the `delimiter` is an empty string, the `src` is not split. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + src : :class:`~pyspark.sql.Column` or str + A column of string to be split.
+ delimiter : :class:`~pyspark.sql.Column` or str + A column of string, the delimiter used for split. + partNum : :class:`~pyspark.sql.Column` or str + A column of string, requested part of the split (1-based). + + Examples + -------- + >>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"]) + >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect() + [Row(r='13')] + """ + return _invoke_function_over_columns("split_part", src, delimiter, partNum) + + +@try_remote_functions +def substr( + str: "ColumnOrName", pos: "ColumnOrName", len: Optional["ColumnOrName"] = None +) -> Column: + """ + Returns the substring of `str` that starts at `pos` and is of length `len`, + or the slice of byte array that starts at `pos` and is of length `len`. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + A column of string. + pos : :class:`~pyspark.sql.Column` or str + A column of string, the substring of `str` that starts at `pos`. + len : :class:`~pyspark.sql.Column` or str, optional + A column of string, the substring of `str` is of length `len`. + + Examples + -------- + >>> df = spark.createDataFrame([("Spark SQL", 5, 1,)], ["a", "b", "c"]) + >>> df.select(substr(df.a, df.b, df.c).alias('r')).collect() + [Row(r='k')] + + >>> df.select(substr(df.a, df.b).alias('r')).collect() + [Row(r='k SQL')] + """ + if len is not None: + return _invoke_function_over_columns("substr", str, pos, len) + else: + return _invoke_function_over_columns("substr", str, pos) + + +@try_remote_functions +def parse_url( + url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None +) -> Column: + """ + Extracts a part from a URL. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + url : :class:`~pyspark.sql.Column` or str + A column of string. + partToExtract : :class:`~pyspark.sql.Column` or str + A column of string, the path.
+ key : :class:`~pyspark.sql.Column` or str, optional + A column of string, the key. + + Examples + -------- + >>> df = spark.createDataFrame( + ... [("http://spark.apache.org/path?query=1", "QUERY", "query",)], + ... ["a", "b", "c"] + ... ) + >>> df.select(parse_url(df.a, df.b, df.c).alias('r')).collect() + [Row(r='1')] + + >>> df.select(parse_url(df.a, df.b).alias('r')).collect() + [Row(r='query=1')] + """ + if key is not None: + return _invoke_function_over_columns("parse_url", url, partToExtract, key) + else: + return _invoke_function_over_columns("parse_url", url, partToExtract) + + +@try_remote_functions +def printf(format: "ColumnOrName", *cols: "ColumnOrName") -> Column: + """ + Formats the arguments in printf-style and returns the result as a string column. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + format : :class:`~pyspark.sql.Column` or str + string that can contain embedded format tags and used as result column's value + cols : :class:`~pyspark.sql.Column` or str + column names or :class:`~pyspark.sql.Column`\\s to be used in formatting + + Examples + -------- + >>> df = spark.createDataFrame([("aa%d%s", 123, "cc",)], ["a", "b", "c"]) + >>> df.select(printf(df.a, df.b, df.c).alias('r')).collect() + [Row(r='aa123cc')] + """ + sc = get_active_spark_context() + return _invoke_function("printf", _to_java_column(format), _to_seq(sc, cols, _to_java_column)) + + +@try_remote_functions +def url_decode(str: "ColumnOrName") -> Column: + """ + Decodes a `str` in 'application/x-www-form-urlencoded' format + using a specific encoding scheme. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + A column of string to decode. 
+ + Examples + -------- + >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["a"]) + >>> df.select(url_decode(df.a).alias('r')).collect() + [Row(r='https://spark.apache.org')] + """ + return _invoke_function_over_columns("url_decode", str) + + +@try_remote_functions +def url_encode(str: "ColumnOrName") -> Column: + """ + Translates a string into 'application/x-www-form-urlencoded' format + using a specific encoding scheme. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + A column of string to encode. + + Examples + -------- + >>> df = spark.createDataFrame([("https://spark.apache.org",)], ["a"]) + >>> df.select(url_encode(df.a).alias('r')).collect() + [Row(r='https%3A%2F%2Fspark.apache.org')] + """ + return _invoke_function_over_columns("url_encode", str) + + +@try_remote_functions +def position( + substr: "ColumnOrName", str: "ColumnOrName", start: Optional["ColumnOrName"] = None +) -> Column: + """ + Returns the position of the first occurrence of `substr` in `str` after position `start`. + The given `start` and return value are 1-based. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + substr : :class:`~pyspark.sql.Column` or str + A column of string, substring. + str : :class:`~pyspark.sql.Column` or str + A column of string. + start : :class:`~pyspark.sql.Column` or str, optional + A column of string, start position. + + Examples + -------- + >>> df = spark.createDataFrame([("bar", "foobarbar", 5,)], ["a", "b", "c"]) + >>> df.select(position(df.a, df.b, df.c).alias('r')).collect() + [Row(r=7)] + + >>> df.select(position(df.a, df.b).alias('r')).collect() + [Row(r=4)] + """ + if start is not None: + return _invoke_function_over_columns("position", substr, str, start) + else: + return _invoke_function_over_columns("position", substr, str) + + +@try_remote_functions +def endswith(str: "ColumnOrName", suffix: "ColumnOrName") -> Column: + """ + Returns a boolean. 
The value is True if str ends with suffix. + Returns NULL if either input expression is NULL. Otherwise, returns False. + Both str or suffix must be of STRING or BINARY type. + + .. versionadded:: 3.5.0 + + Notes + ----- + Only STRING type is supported in this function, + while `endswith` in SQL supports both STRING and BINARY. + + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + A column of string. + suffix : :class:`~pyspark.sql.Column` or str + A column of string, the suffix. + + Examples + -------- + >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"]) + >>> df.select(endswith(df.a, df.b).alias('r')).collect() + [Row(r=False)] + """ + return _invoke_function_over_columns("endswith", str, suffix) + + +@try_remote_functions +def startswith(str: "ColumnOrName", prefix: "ColumnOrName") -> Column: + """ + Returns a boolean. The value is True if str starts with prefix. + Returns NULL if either input expression is NULL. Otherwise, returns False. + Both str or prefix must be of STRING or BINARY type. + + .. versionadded:: 3.5.0 + + Notes + ----- + Only STRING type is supported in this function, + while `startswith` in SQL supports both STRING and BINARY. + + Parameters + ---------- + str : :class:`~pyspark.sql.Column` or str + A column of string. + prefix : :class:`~pyspark.sql.Column` or str + A column of string, the prefix.
+ + Examples + -------- + >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"]) + >>> df.select(startswith(df.a, df.b).alias('r')).collect() + [Row(r=True)] + """ + return _invoke_function_over_columns("startswith", str, prefix) + + # ---------------------- Collection functions ------------------------------ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index a76044ac98e..81a57368a8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3900,6 +3900,181 @@ object functions { ToNumber(e.expr, format.expr) } + /** + * Replaces all occurrences of `search` with `replace`. + * + * @param src + * A column of string to be replaced + * @param search + * A column of string, If `search` is not found in `src`, `src` is returned unchanged. + * @param replace + * A column of string, If `replace` is not specified or is an empty string, nothing replaces + * the string that is removed from `src`. + * + * @group string_funcs + * @since 3.5.0 + */ + def replace(src: Column, search: Column, replace: Column): Column = withExpr { + StringReplace(src.expr, search.expr, replace.expr) + } + + /** + * Replaces all occurrences of `search` with `replace`. + * + * @param src + * A column of string to be replaced + * @param search + * A column of string, If `search` is not found in `src`, `src` is returned unchanged. + * + * @group string_funcs + * @since 3.5.0 + */ + def replace(src: Column, search: Column): Column = withExpr { + new StringReplace(src.expr, search.expr) + } + + /** + * Splits `str` by delimiter and return requested part of the split (1-based). + * If any input is null, returns null. if `partNum` is out of range of split parts, + * returns empty string. If `partNum` is 0, throws an error. If `partNum` is negative, + * the parts are counted backward from the end of the string.
+ * If the `delimiter` is an empty string, the `str` is not split. + * + * @group string_funcs + * @since 3.5.0 + */ + def split_part(str: Column, delimiter: Column, partNum: Column): Column = withExpr { + SplitPart(str.expr, delimiter.expr, partNum.expr) + } + + /** + * Returns the substring of `str` that starts at `pos` and is of length `len`, + * or the slice of byte array that starts at `pos` and is of length `len`. + * + * @group string_funcs + * @since 3.5.0 + */ + def substr(str: Column, pos: Column, len: Column): Column = withExpr { + Substring(str.expr, pos.expr, len.expr) + } + + /** + * Returns the substring of `str` that starts at `pos`, + * or the slice of byte array that starts at `pos`. + * + * @group string_funcs + * @since 3.5.0 + */ + def substr(str: Column, pos: Column): Column = withExpr { + new Substring(str.expr, pos.expr) + } + + /** + * Extracts a part from a URL. + * + * @group string_funcs + * @since 3.5.0 + */ + def parse_url(url: Column, partToExtract: Column, key: Column): Column = withExpr { + ParseUrl(Seq(url.expr, partToExtract.expr, key.expr)) + } + + /** + * Extracts a part from a URL. + * + * @group string_funcs + * @since 3.5.0 + */ + def parse_url(url: Column, partToExtract: Column): Column = withExpr { + ParseUrl(Seq(url.expr, partToExtract.expr)) + } + + /** + * Formats the arguments in printf-style and returns the result as a string column. + * + * @group string_funcs + * @since 3.5.0 + */ + def printf(format: Column, arguments: Column*): Column = withExpr { + FormatString((lit(format) +: arguments).map(_.expr): _*) + } + + /** + * Decodes a `str` in 'application/x-www-form-urlencoded' format + * using a specific encoding scheme. + * + * @group string_funcs + * @since 3.5.0 + */ + def url_decode(str: Column): Column = withExpr { + UrlDecode(str.expr) + } + + /** + * Translates a string into 'application/x-www-form-urlencoded' format + * using a specific encoding scheme. 
+ * + * @group string_funcs + * @since 3.5.0 + */ + def url_encode(str: Column): Column = withExpr { + UrlEncode(str.expr) + } + + /** + * Returns the position of the first occurrence of `substr` in `str` after position `start`. + * The given `start` and return value are 1-based. + * + * @group string_funcs + * @since 3.5.0 + */ + def position(substr: Column, str: Column, start: Column): Column = withExpr { + StringLocate(substr.expr, str.expr, start.expr) + } + + /** + * Returns the position of the first occurrence of `substr` in `str` after position `1`. + * The return value is 1-based. + * + * @group string_funcs + * @since 3.5.0 + */ + def position(substr: Column, str: Column): Column = withExpr { + new StringLocate(substr.expr, str.expr) + } + + /** + * Returns a boolean. The value is True if str ends with suffix. + * Returns NULL if either input expression is NULL. Otherwise, returns False. + * Both str or suffix must be of STRING type. + * + * @note + * Only STRING type is supported in this function, while `endswith` in SQL supports both + * STRING and BINARY. + * + * @group string_funcs + * @since 3.5.0 + */ + def endswith(str: Column, suffix: Column): Column = withExpr { + EndsWith(str.expr, suffix.expr) + } + + /** + * Returns a boolean. The value is True if str starts with prefix. + * Returns NULL if either input expression is NULL. Otherwise, returns False. + * Both str or prefix must be of STRING type. + * + * @note + * Only STRING type is supported in this function, while `startswith` in SQL supports both + * STRING and BINARY.
+ * + * @group string_funcs + * @since 3.5.0 + */ + def startswith(str: Column, prefix: Column): Column = withExpr { + StartsWith(str.expr, prefix.expr) + } + ////////////////////////////////////////////////////////////////////////////////////////////// // DateTime functions ////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 30583cfd0d0..9b41720bf9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -78,7 +78,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { val excludedSqlFunctions = Set( "random", "array_agg", "char_length", "character_length", - "lcase", "position", "printf", "substr", "ucase", "day", "cardinality", "sha", + "lcase", "ucase", "day", "cardinality", "sha", // aliases for existing functions "reflect", "java_method" // Only needed in SQL ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 9b45f645af6..21ab4899a75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -862,7 +862,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { } test("to_number") { - val df = Seq("$78.12").toDF("a") + val df = Seq("$78.12").toDF("a") checkAnswer( df.selectExpr("to_number(a, '$99.99')"), Seq(Row(78.12)) @@ -872,4 +872,135 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(78.12)) ) } + + test("replace") { + val df = Seq(("ABCabc", "abc", "DEF")).toDF("a", "b", "c") + + checkAnswer( + df.selectExpr("replace(a, b, c)"), + 
Seq(Row("ABCDEF")) + ) + checkAnswer( + df.select(replace(col("a"), col("b"), col("c"))), + Seq(Row("ABCDEF")) + ) + + checkAnswer( + df.selectExpr("replace(a, b)"), + Seq(Row("ABC")) + ) + checkAnswer( + df.select(replace(col("a"), col("b"))), + Seq(Row("ABC")) + ) + } + + test("split_part") { + val df = Seq(("11.12.13", ".", 3)).toDF("a", "b", "c") + checkAnswer( + df.selectExpr("split_part(a, b, c)"), + Seq(Row("13")) + ) + checkAnswer( + df.select(split_part(col("a"), col("b"), col("c"))), + Seq(Row("13")) + ) + } + + test("substr") { + val df = Seq(("Spark SQL", 5, 1)).toDF("a", "b", "c") + checkAnswer( + df.selectExpr("substr(a, b, c)"), + Seq(Row("k")) + ) + checkAnswer( + df.select(substr(col("a"), col("b"), col("c"))), + Seq(Row("k")) + ) + + checkAnswer( + df.selectExpr("substr(a, b)"), + Seq(Row("k SQL")) + ) + checkAnswer( + df.select(substr(col("a"), col("b"))), + Seq(Row("k SQL")) + ) + } + + test("parse_url") { + val df = Seq(("http://spark.apache.org/path?query=1", "QUERY", "query")).toDF("a", "b", "c") + + checkAnswer( + df.selectExpr("parse_url(a, b, c)"), + Seq(Row("1")) + ) + checkAnswer( + df.select(parse_url(col("a"), col("b"), col("c"))), + Seq(Row("1")) + ) + + checkAnswer( + df.selectExpr("parse_url(a, b)"), + Seq(Row("query=1")) + ) + checkAnswer( + df.select(parse_url(col("a"), col("b"))), + Seq(Row("query=1")) + ) + } + + test("printf") { + val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c") + checkAnswer( + df.selectExpr("printf(a, b, c)"), + Row("aa123cc")) + checkAnswer( + df.select(printf(col("a"), col("b"), col("c"))), + Row("aa123cc")) + } + + test("url_decode") { + val df = Seq("https%3A%2F%2Fspark.apache.org").toDF("a") + checkAnswer( + df.selectExpr("url_decode(a)"), + Row("https://spark.apache.org")) + checkAnswer( + df.select(url_decode(col("a"))), + Row("https://spark.apache.org")) + } + + test("url_encode") { + val df = Seq("https://spark.apache.org").toDF("a") + checkAnswer( + df.selectExpr("url_encode(a)"), + 
Row("https%3A%2F%2Fspark.apache.org")) + checkAnswer( + df.select(url_encode(col("a"))), + Row("https%3A%2F%2Fspark.apache.org")) + } + + test("position") { + val df = Seq(("bar", "foobarbar", 5)).toDF("a", "b", "c") + + checkAnswer(df.selectExpr("position(a, b)"), Row(4)) + checkAnswer(df.select(position(col("a"), col("b"))), Row(4)) + + checkAnswer(df.selectExpr("position(a, b, c)"), Row(7)) + checkAnswer(df.select(position(col("a"), col("b"), col("c"))), Row(7)) + } + + test("endswith") { + val df = Seq(("Spark SQL", "Spark")).toDF("a", "b") + + checkAnswer(df.selectExpr("endswith(a, b)"), Row(false)) + checkAnswer(df.select(endswith(col("a"), col("b"))), Row(false)) + } + + test("startswith") { + val df = Seq(("Spark SQL", "Spark")).toDF("a", "b") + + checkAnswer(df.selectExpr("startswith(a, b)"), Row(true)) + checkAnswer(df.select(startswith(col("a"), col("b"))), Row(true)) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org