GitHub user ueshin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22227#discussion_r214497013
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ---
    @@ -229,36 +229,74 @@ case class RLike(left: Expression, right: Expression) 
extends StringRegexExpress
     
     
     /**
    - * Splits str around pat (pattern is a regular expression).
    + * Splits str around matches of the given regex.
      */
     @ExpressionDescription(
    -  usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match 
`regex`.",
    +  usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences 
that match `regex`" +
    +    " and returns an array of at most `limit`",
    +  arguments = """
    +    Arguments:
    +      * str - a string expression to split.
    +      * regex - a string representing a regular expression. The regex 
string should be a
    +        Java regular expression.
    +      * limit - an integer expression which controls the number of times 
the regex is applied.
    +
    +        limit > 0: The resulting array's length will not be more than 
`limit`,
    +                   and the resulting array's last entry will contain all 
input
    +                   beyond the last matched regex.
    +        limit <= 0: `regex` will be applied as many times as possible, and
    +                    the resulting array can be of any size.
    +  """,
       examples = """
         Examples:
           > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
            ["one","two","three",""]
    +      > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', -1);
    +       ["one","two","three",""]
    +      > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
    +       ["one","twoBthreeC"]
       """)
    -case class StringSplit(str: Expression, pattern: Expression)
    -  extends BinaryExpression with ImplicitCastInputTypes {
    +case class StringSplit(str: Expression, regex: Expression, limit: 
Expression)
    +  extends TernaryExpression with ImplicitCastInputTypes {
     
    -  override def left: Expression = str
    -  override def right: Expression = pattern
       override def dataType: DataType = ArrayType(StringType)
    -  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
    +  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, 
IntegerType)
    +  override def children: Seq[Expression] = str :: regex :: limit :: Nil
    +
    +  def this(exp: Expression, regex: Expression) = this(exp, regex, 
Literal(-1));
     
    -  override def nullSafeEval(string: Any, regex: Any): Any = {
    -    val strings = 
string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
    +  override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
    +    val strings = string.asInstanceOf[UTF8String].split(
    +      regex.asInstanceOf[UTF8String], 
maybeFallbackLimitValue(limit.asInstanceOf[Int]))
         new GenericArrayData(strings.asInstanceOf[Array[Any]])
       }
     
       override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
         val arrayClass = classOf[GenericArrayData].getName
    -    nullSafeCodeGen(ctx, ev, (str, pattern) =>
    +    nullSafeCodeGen(ctx, ev, (str, regex, limit) => {
           // Array in java is covariant, so we don't need to cast UTF8String[] 
to Object[].
    -      s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
    +      s"""${ev.value} = new $arrayClass($str.split(
    +         $regex,${handleCodeGenLimitFallback(limit)}));""".stripMargin
    +    })
       }
     
       override def prettyName: String = "split"
    +
    +  /**
    +   * Java String's split method supports "ignore empty string" behavior 
when the limit is 0.
    +   * To avoid this, we fall back to -1 when the limit is 0. Otherwise, 
this is a noop.
    +   */
    +  def maybeFallbackLimitValue(limit: Int): Int = {
    --- End diff --
    
    +1, and please add a `limit = 0` case to `UTF8StringSuite`.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to