[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-08-01 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463962090



##
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
##
@@ -322,6 +322,48 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   RegExpExtract(Literal("\"quote"), Literal("\"quote"), Literal(1)) :: Nil)
   }
 
+  test("RegexExtractAll") {
+val row1 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 1)
+val row2 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 2)
+val row3 = create_row("100-200,300-400,500-600", "(\\d+).*", 1)
+val row4 = create_row("100-200,300-400,500-600", "([a-z])", 1)
+val row5 = create_row(null, "([a-z])", 1)
+val row6 = create_row("100-200,300-400,500-600", null, 1)
+val row7 = create_row("100-200,300-400,500-600", "([a-z])", null)
+
+val s = 's.string.at(0)
+val p = 'p.string.at(1)
+val r = 'r.int.at(2)
+
+val expr = RegExpExtractAll(s, p, r)
+checkEvaluation(expr, Seq("100", "300", "500"), row1)
+checkEvaluation(expr, Seq("200", "400", "600"), row2)
+checkEvaluation(expr, Seq("100"), row3)
+checkEvaluation(expr, Seq(), row4)
+checkEvaluation(expr, null, row5)
+checkEvaluation(expr, null, row6)
+checkEvaluation(expr, null, row7)
+
+val expr1 = new RegExpExtractAll(s, p)
+checkEvaluation(expr1, Seq("100", "300", "500"), row1)
+
+val nonNullExpr = RegExpExtractAll(Literal("100-200,300-400,500-600"),
+  Literal("(\\d+)-(\\d+)"), Literal(1))
+checkEvaluation(nonNullExpr, Seq("100", "300", "500"), row1)
+
+// invalid group index
+val row8 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 3)
+val row9 = create_row("100-200,300-400,500-600", "(\\d+).*", 2)
+val row10 = create_row("100-200,300-400,500-600", "\\d+", 1)

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-31 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463668736



##
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
##
@@ -322,6 +322,48 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   RegExpExtract(Literal("\"quote"), Literal("\"quote"), Literal(1)) :: Nil)
   }
 
+  test("RegexExtractAll") {
+val row1 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 1)

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463070305



##
File path: 
sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
##
@@ -154,8 +154,38 @@ class StringFunctionsSuite extends QueryTest with 
SharedSparkSession {
   Row("300", "100") :: Row("400", "100") :: Row("400-400", "100") :: Nil)
   }
 
+  test("string regex_extract_all") {
+val df = Seq(
+  ("100-200,300-400", "(\\d+)-(\\d+)"),
+  ("101-201,301-401", "(\\d+)-(\\d+)"),
+  ("102-202,302-402", "(\\d+)")).toDF("a", "b")
+
+checkAnswer(
+  df.select(
+regexp_extract_all($"a", "(\\d+)-(\\d+)", 1),
+regexp_extract_all($"a", "(\\d+)-(\\d+)", 2)),
+  Row(Seq("100", "300"), Seq("200", "400")) ::
+Row(Seq("101", "301"), Seq("201", "401")) ::
+Row(Seq("102", "302"), Seq("202", "402")) :: Nil)
+
+// for testing the mutable state of the expression in code gen.
+// This is a hack way to enable the codegen, thus the codegen is enabled by 
default,
+// it will still use the interpretProjection if projection followed by a 
LocalRelation,
+// hence we add a filter operator.
+// See the optimizer rule `ConvertToLocalRelation`

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463062732



##
File path: sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
##
@@ -8,4 +8,16 @@ SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)');
 SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0);
 SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1);
 SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 3);
 SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', -1);
+
+-- regexp_extract_all
+SELECT regexp_extract_all('1a 2b 14m', '\\d+');
+SELECT regexp_extract_all('1a 2b 14m', '\\d+', 0);
+SELECT regexp_extract_all('1a 2b 14m', '\\d+', 1);
+SELECT regexp_extract_all('1a 2b 14m', '\\d+', 2);
+SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)');
+SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 0);
+SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 1);
+SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 2);
+SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 3);

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463061966



##
File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
##
@@ -2486,6 +2488,19 @@ object functions {
 RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
   }
 
+  /**
+   * Extract all specific groups matched by a Java regex, from the specified 
string column.
+   * If the regex did not match, or the specified group did not match, return 
an empty array.
+   * If the specified group index exceeds the group count of regex, an 
IllegalArgumentException
+   * will be thrown.
+   *
+   * @group string_funcs
+   * @since 3.0.0

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463061057



##
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
##
@@ -322,6 +322,48 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   RegExpExtract(Literal("\"quote"), Literal("\"quote"), Literal(1)) :: Nil)
   }
 
+  test("RegexExtractAll") {
+val row1 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 1)
+val row2 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 2)
+val row3 = create_row("100-200,300-400,500-600", "(\\d+).*", 1)
+val row4 = create_row("100-200,300-400,500-600", "([a-z])", 1)
+val row5 = create_row(null, "([a-z])", 1)
+val row6 = create_row("100-200,300-400,500-600", null, 1)
+val row7 = create_row("100-200,300-400,500-600", "([a-z])", null)
+
+val s = 's.string.at(0)
+val p = 'p.string.at(1)
+val r = 'r.int.at(2)
+
+val expr = RegExpExtractAll(s, p, r)
+checkEvaluation(expr, Seq("100", "300", "500"), row1)
+checkEvaluation(expr, Seq("200", "400", "600"), row2)
+checkEvaluation(expr, Seq("100"), row3)
+checkEvaluation(expr, Seq(), row4)
+checkEvaluation(expr, null, row5)
+checkEvaluation(expr, null, row6)
+checkEvaluation(expr, null, row7)
+
+val expr1 = new RegExpExtractAll(s, p)
+checkEvaluation(expr1, Seq("100", "300", "500"), row1)
+
+val nonNullExpr = RegExpExtractAll(Literal("100-200,300-400,500-600"),
+  Literal("(\\d+)-(\\d+)"), Literal(1))
+checkEvaluation(nonNullExpr, Seq("100", "300", "500"), row1)
+
+// invalid group index
+val row8 = create_row("100-200,300-400,500-600", "(\\d+)-(\\d+)", 3)
+val row9 = create_row("100-200,300-400,500-600", "(\\d+).*", 2)
+val row10 = create_row("100-200,300-400,500-600", "\\d+", 1)

Review comment:
   
https://github.com/apache/spark/blob/5250f988a32a8b5599f6038702da7e31dcbaccd8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L418





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463060500



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -518,3 +546,106 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str, regexp[, idx]) - Extract all strings in the `str` that match 
the `regexp`
+expression and correspond to the regex group index.
+  """,
+  arguments = """
+Arguments:
+  * str - a string expression.
+  * regexp - a string representing a regular expression. The regex string 
should be a
+  Java regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an integer expression representing the group index. The 
regex may contain
+  multiple groups. `idx` indicates which regex group to extract. The 
group index should
+  be non-negative. If `idx` is not specified, the default group index 
value is 1. The
+  `idx` parameter is the Java regex Matcher group() method index. See
+  docs/api/java/util/regex/Matcher.html for more information on the 
`idx` or Java regex
+  group() method.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends RegExpExtractBase {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+val m = getLastMatcher(s, p)
+val matchResults = new ArrayBuffer[UTF8String]()
+while(m.find) {
+  val mr: MatchResult = m.toMatchResult
+  val index = r.asInstanceOf[Int]
+  RegExpExtractBase.checkGroupIndex(mr.groupCount, index)
+  val group = mr.group(index)
+  if (group == null) { // Pattern matched, but not optional group

Review comment:
   I just reference 
https://github.com/apache/spark/blob/5250f988a32a8b5599f6038702da7e31dcbaccd8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L497





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463059355



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -518,3 +546,106 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str, regexp[, idx]) - Extract all strings in the `str` that match 
the `regexp`
+expression and correspond to the regex group index.
+  """,
+  arguments = """
+Arguments:
+  * str - a string expression.
+  * regexp - a string representing a regular expression. The regex string 
should be a
+  Java regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an integer expression representing the group index. The 
regex may contain
+  multiple groups. `idx` indicates which regex group to extract. The 
group index should
+  be non-negative. If `idx` is not specified, the default group index 
value is 1. The
+  `idx` parameter is the Java regex Matcher group() method index. See
+  docs/api/java/util/regex/Matcher.html for more information on the 
`idx` or Java regex
+  group() method.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463058892



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -492,7 +521,6 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 } else {
   ""
 }
-

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463051470



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -421,20 +423,59 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase
+  extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
-  usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  usage = """
+_FUNC_(str, regexp[, idx]) - Extract the first string in the `str` that 
matches the `regexp`
+expression and corresponds to the regex group index.
+  """,
   arguments = """
 Arguments:
   * str - a string expression.
-  * regexp - a string representing a regular expression.
-  The regex string should be a Java regular expression.
-  * idx - an integer expression that representing the group index. The 
group index should be
-  non-negative. If `idx` is not specified, the default group index 
value is 1.
+  * regexp - a string representing a regular expression. The regex string 
should be a
+  Java regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an integer expression representing the group index. The 
regex may contain
+  multiple groups. `idx` indicates which regex group to extract. The 
group index should
+  be non-negative. If `idx` is not specified, the default group index 
value is 1. The
+  `idx` parameter is the Java regex Matcher group() method index. See
+  docs/api/java/util/regex/Matcher.html for more information on the 
`idx` or Java regex

Review comment:
   Because 
https://github.com/apache/spark/blob/5250f988a32a8b5599f6038702da7e31dcbaccd8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L488





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463051326



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -421,20 +423,59 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase
+  extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
-  usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  usage = """
+_FUNC_(str, regexp[, idx]) - Extract the first string in the `str` that 
matches the `regexp`
+expression and corresponds to the regex group index.
+  """,
   arguments = """
 Arguments:
   * str - a string expression.
-  * regexp - a string representing a regular expression.
-  The regex string should be a Java regular expression.
-  * idx - an integer expression that representing the group index. The 
group index should be
-  non-negative. If `idx` is not specified, the default group index 
value is 1.
+  * regexp - a string representing a regular expression. The regex string 
should be a
+  Java regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an integer expression representing the group index. The 
regex may contain
+  multiple groups. `idx` indicates which regex group to extract. The 
group index should
+  be non-negative. If `idx` is not specified, the default group index 
value is 1. The

Review comment:
   Because 
https://github.com/apache/spark/blob/5250f988a32a8b5599f6038702da7e31dcbaccd8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L488





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-07-30 Thread GitBox


beliefer commented on a change in pull request #27507:
URL: https://github.com/apache/spark/pull/27507#discussion_r463047788



##
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
##
@@ -421,20 +423,59 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase
+  extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {

Review comment:
   OK





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-13 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378890548
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,98 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-13 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378890420
 
 

 ##
 File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
 ##
 @@ -2383,6 +2385,19 @@ object functions {
 RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
   }
 
+  /**
+   * Extract all specific groups matched by a Java regex, from the specified 
string column.
+   * If the regex did not match, or the specified group did not match, return 
an empty array.
+   * if the specified group index exceeds the group count of regex, an 
IllegalArgumentException
+   * will be thrown.
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378645357
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,99 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all groups that match 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression of the input string.
+  * regexp - a string expression of the regex string.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an int expression of the regex group index. The regex may 
contain multiple
+  groups. `idx` indicates which regex group to extract.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends RegExpExtractBase {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+val m = getLastMatcher(s, p)
+val matchResults = new ArrayBuffer[UTF8String]()
+val mr: MatchResult = m.toMatchResult
+while(m.find) {
+  val mr: MatchResult = m.toMatchResult
+  val index = r.asInstanceOf[Int]
+  RegExpExtractBase.checkGroupIndex(mr.groupCount, index)
+  val group = mr.group(index)
+  if (group == null) { // Pattern matched, but not optional group
 
 Review comment:
   OK.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378623722
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,99 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression of the input string.
+  * regexp - a string expression of the regex string.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an int expression of the regex group index. The regex maybe 
contains multiple
+  groups. `idx` indicates which regex group to extract.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends RegExpExtractBase {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+val m = getLastMatcher(s, p)
+val matchResults = new ArrayBuffer[UTF8String]()
+val mr: MatchResult = m.toMatchResult
+while(m.find) {
+  val mr: MatchResult = m.toMatchResult
+  val index = r.asInstanceOf[Int]
+  RegExpExtractBase.checkGroupIndex(mr.groupCount, index)
+  val group = mr.group(index)
+  if (group == null) { // Pattern matched, but not optional group
 
 Review comment:
   This code was borrowed from `RegExpExtract`.
   I removed this code branch and ran all the test cases, and found this 
branch to be unreachable.
   I will remove this code.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378623722
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,99 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression of the input string.
+  * regexp - a string expression of the regex string.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an int expression of the regex group index. The regex maybe 
contains multiple
+  groups. `idx` indicates which regex group to extract.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends RegExpExtractBase {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+val m = getLastMatcher(s, p)
+val matchResults = new ArrayBuffer[UTF8String]()
+val mr: MatchResult = m.toMatchResult
+while(m.find) {
+  val mr: MatchResult = m.toMatchResult
+  val index = r.asInstanceOf[Int]
+  RegExpExtractBase.checkGroupIndex(mr.groupCount, index)
+  val group = mr.group(index)
+  if (group == null) { // Pattern matched, but not optional group
 
 Review comment:
   This code was borrowed from `RegExpExtract`.
   I removed this code branch and ran all the test cases, and found this 
branch to be unreachable.
   I will remove this code.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378612746
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,99 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression of the input string.
+  * regexp - a string expression of the regex string.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an int expression of the regex group index. The regex maybe 
contains multiple
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378612641
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +535,99 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression of the input string.
+  * regexp - a string expression of the regex string.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - an int expression of the regex group index. The regex maybe 
contains multiple
+  groups. `idx` indicates which regex group to extract.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
+   ["100","300"]
+  """,
+  since = "3.0.0")
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends RegExpExtractBase {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+val m = getLastMatcher(s, p)
+val matchResults = new ArrayBuffer[UTF8String]()
+val mr: MatchResult = m.toMatchResult
 
 Review comment:
   I will remove it.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378297287
 
 

 ##
 File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
 ##
 @@ -2383,6 +2383,17 @@ object functions {
 RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
   }
 
+  /**
+   * Extract all specific group matched by a Java regex, from the specified 
string column.
+   * If the regex did not match, or the specified group did not match, an 
empty array is returned.
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378290232
 
 

 ##
 File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
 ##
 @@ -2383,6 +2383,17 @@ object functions {
 RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
   }
 
+  /**
+   * Extract all specific group matched by a Java regex, from the specified 
string column.
+   * If the regex did not match, or the specified group did not match, an 
empty array is returned.
 
 Review comment:
   2. It should throw an IllegalArgumentException. 
   https://github.com/apache/spark/pull/27508
   the behavior of Hive is :
   `FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments ‘2’: 
org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method 
public java.lang.String 
org.apache.hadoop.hive.ql.udf.UDFRegExpExtract.evaluate(java.lang.String,java.lang.String,java.lang.Integer)
 on object org.apache.hadoop.hive.ql.udf.UDFRegExpExtract@2cf5e0f0 of class 
org.apache.hadoop.hive.ql.udf.UDFRegExpExtract with arguments 
{x=a3&x=18abc&x=2&y=3&x=4:java.lang.String, x=([0-9]+)[a-z]:java.lang.String, 
2:java.lang.Integer} of size 3`


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378282100
 
 

 ##
 File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
 ##
 @@ -2383,6 +2383,17 @@ object functions {
 RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
   }
 
+  /**
+   * Extract all specific group matched by a Java regex, from the specified 
string column.
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378277156
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -419,39 +421,104 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val classNameRegExpExtractBase = 
classOf[RegExpExtractBase].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
 
 Review comment:
   OK.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378275154
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -508,3 +568,96 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) group identified by a Java regex.
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378274860
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -419,39 +421,104 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val classNameRegExpExtractBase = 
classOf[RegExpExtractBase].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
+doNullSafeCodeGen(
+  ctx,
+  ev,
+  classNamePattern,
+  classNameRegExpExtractBase,
+  matcher,
+  matchResult,
+  termLastRegex,
+  termPattern,
+  setEvNotNull)
+  }
+
+  def doNullSafeCodeGen(
+  ctx: CodegenContext,
+  ev: ExprCode,
+  classNamePattern: String,
+  classNameRegExpExtractBase: String,
+  matcher: String,
+  matchResult: String,
+  termLastRegex: String,
+  termPattern: String,
+  setEvNotNull: String): ExprCode
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
   usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression
+  * regexp - a string expression. The regex string should be a Java 
regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - a int expression. The regex maybe contains multiple groups. 
`idx` represents the
+  index of regex group.
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378274404
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -419,39 +421,104 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val classNameRegExpExtractBase = 
classOf[RegExpExtractBase].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
+doNullSafeCodeGen(
+  ctx,
+  ev,
+  classNamePattern,
+  classNameRegExpExtractBase,
+  matcher,
+  matchResult,
+  termLastRegex,
+  termPattern,
+  setEvNotNull)
+  }
+
+  def doNullSafeCodeGen(
+  ctx: CodegenContext,
+  ev: ExprCode,
+  classNamePattern: String,
+  classNameRegExpExtractBase: String,
+  matcher: String,
+  matchResult: String,
+  termLastRegex: String,
+  termPattern: String,
+  setEvNotNull: String): ExprCode
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
   usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression
+  * regexp - a string expression. The regex string should be a Java 
regular expression.
+
+  Since Spark 2.0, string literals (including regex patterns) are 
unescaped in our SQL
+  parser. For example, to match "\abc", a regular expression for 
`regexp` can be
+  "^\\abc$".
+
+  There is a SQL config 'spark.sql.parser.escapedStringLiterals' that 
can be used to
+  fallback to the Spark 1.6 behavior regarding string literal parsing. 
For example,
+  if the config is enabled, the `regexp` that can match "\abc" is 
"^\abc$".
+  * idx - a int expression. The regex maybe contains multiple groups. 
`idx` represents the
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378273983
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -419,39 +421,104 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val classNameRegExpExtractBase = 
classOf[RegExpExtractBase].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
+doNullSafeCodeGen(
+  ctx,
+  ev,
+  classNamePattern,
+  classNameRegExpExtractBase,
+  matcher,
+  matchResult,
+  termLastRegex,
+  termPattern,
+  setEvNotNull)
+  }
+
+  def doNullSafeCodeGen(
+  ctx: CodegenContext,
+  ev: ExprCode,
+  classNamePattern: String,
+  classNameRegExpExtractBase: String,
+  matcher: String,
+  matchResult: String,
+  termLastRegex: String,
+  termPattern: String,
+  setEvNotNull: String): ExprCode
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
   usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression
+  * regexp - a string expression. The regex string should be a Java 
regular expression.
 
 Review comment:
   OK. This just references the comment of `RLIKE`.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-12 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378273173
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -419,39 +421,104 @@ object RegExpExtract {
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val classNameRegExpExtractBase = 
classOf[RegExpExtractBase].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
+doNullSafeCodeGen(
+  ctx,
+  ev,
+  classNamePattern,
+  classNameRegExpExtractBase,
+  matcher,
+  matchResult,
+  termLastRegex,
+  termPattern,
+  setEvNotNull)
+  }
+
+  def doNullSafeCodeGen(
+  ctx: CodegenContext,
+  ev: ExprCode,
+  classNamePattern: String,
+  classNameRegExpExtractBase: String,
+  matcher: String,
+  matchResult: String,
+  termLastRegex: String,
+  termPattern: String,
+  setEvNotNull: String): ExprCode
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
  */
 @ExpressionDescription(
   usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches 
`regexp`.",
+  arguments = """
+Arguments:
+  * str - a string expression
 
 Review comment:
   OK


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-11 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378009621
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -537,3 +557,71 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) group identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable 
status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all group that matches 
`regexp`.",
 
 Review comment:
   OK.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] Support regexp function regexp_extract_all

2020-02-11 Thread GitBox
beliefer commented on a change in pull request #27507: [SPARK-24884][SQL] 
Support regexp function regexp_extract_all
URL: https://github.com/apache/spark/pull/27507#discussion_r378002552
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ##
 @@ -452,6 +454,46 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
   }
 }
 
+abstract class RegExpExtractBase extends TernaryExpression with 
ImplicitCastInputTypes {
+  def subject: Expression
+  def regexp: Expression
+  def idx: Expression
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, 
IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+
+  protected def getLastMatcher(s: Any, p: Any): Matcher = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+pattern.matcher(s.toString)
+  }
+
+  protected def getGenCodeVals(ctx: CodegenContext, ev: ExprCode) = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val setEvNotNull = if (nullable) {
+  s"${ev.isNull} = false;"
+} else {
+  ""
+}
+
+(classNamePattern, matcher, matchResult, termLastRegex, termPattern, 
setEvNotNull)
 
 Review comment:
   OK. Good idea.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org