Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/22227#discussion_r212783481
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
---
@@ -232,30 +232,41 @@ case class RLike(left: Expression, right: Expression)
extends StringRegexExpress
* Splits str around pat (pattern is a regular expression).
*/
@ExpressionDescription(
- usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match
`regex`.",
+ usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences
that match `regex`." +
+ "The `limit` parameter controls the number of times the pattern is
applied and " +
+ "therefore affects the length of the resulting array. If the limit n
is " +
+ "greater than zero then the pattern will be applied at most n - 1
times, " +
+ "the array's length will be no greater than n, and the array's last
entry " +
+ "will contain all input beyond the last matched delimiter. If n is " +
+ "non-positive then the pattern will be applied as many times as " +
+ "possible and the array can have any length. If n is zero then the " +
+ "pattern will be applied as many times as possible, the array can " +
+ "have any length, and trailing empty strings will be discarded.",
examples = """
Examples:
- > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
+ > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', -1);
["one","two","three",""]
+| > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
+ | ["one","twoBthreeC"]
""")
-case class StringSplit(str: Expression, pattern: Expression)
- extends BinaryExpression with ImplicitCastInputTypes {
+case class StringSplit(str: Expression, pattern: Expression, limit:
Expression)
+ extends TernaryExpression with ImplicitCastInputTypes {
- override def left: Expression = str
- override def right: Expression = pattern
override def dataType: DataType = ArrayType(StringType)
- override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+ override def inputTypes: Seq[DataType] = Seq(StringType, StringType,
IntegerType)
+ override def children: Seq[Expression] = str :: pattern :: limit :: Nil
- override def nullSafeEval(string: Any, regex: Any): Any = {
- val strings =
string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
+ override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
--- End diff --
I think we still need to validate `limit`. According to the Presto
documentation, `limit` must be a positive number; -1 is only used as the
default value when no `limit` parameter is given.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]