Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/22227#discussion_r214562525
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
---
@@ -229,36 +229,58 @@ case class RLike(left: Expression, right: Expression)
extends StringRegexExpress
/**
- * Splits str around pat (pattern is a regular expression).
+ * Splits str around matches of the given regex.
*/
@ExpressionDescription(
- usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match
`regex`.",
+ usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences
that match `regex`" +
+ " and returns an array of at most `limit`",
+ arguments = """
+ Arguments:
+ * str - a string expression to split.
+ * regex - a string representing a regular expression. The regex
string should be a
+ Java regular expression.
+ * limit - an integer expression which controls the number of times
the regex is applied.
+
+ limit > 0: The resulting array's length will not be more than
`limit`,
+ and the resulting array's last entry will contain all
input
+ beyond the last matched regex.
+ limit <= 0: `regex` will be applied as many times as possible, and
+ the resulting array can be of any size.
+ """,
examples = """
Examples:
> SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
["one","two","three",""]
+ > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', -1);
+ ["one","two","three",""]
+ > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
+ ["one","twoBthreeC"]
""")
-case class StringSplit(str: Expression, pattern: Expression)
- extends BinaryExpression with ImplicitCastInputTypes {
+case class StringSplit(str: Expression, regex: Expression, limit:
Expression)
+ extends TernaryExpression with ImplicitCastInputTypes {
- override def left: Expression = str
- override def right: Expression = pattern
override def dataType: DataType = ArrayType(StringType)
- override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+ override def inputTypes: Seq[DataType] = Seq(StringType, StringType,
IntegerType)
+ override def children: Seq[Expression] = str :: regex :: limit :: Nil
+
+ def this(exp: Expression, regex: Expression) = this(exp, regex,
Literal(-1));
- override def nullSafeEval(string: Any, regex: Any): Any = {
- val strings =
string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
+ override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
+ val strings = string.asInstanceOf[UTF8String].split(
+ regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
new GenericArrayData(strings.asInstanceOf[Array[Any]])
}
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val arrayClass = classOf[GenericArrayData].getName
- nullSafeCodeGen(ctx, ev, (str, pattern) =>
+ nullSafeCodeGen(ctx, ev, (str, regex, limit) => {
// Array in java is covariant, so we don't need to cast UTF8String[]
to Object[].
- s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
+ s"""${ev.value} = new
$arrayClass($str.split($regex,$limit));""".stripMargin
+ })
}
override def prettyName: String = "split"
+
--- End diff --
Not a big deal, but let's revert this unrelated newline change.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]