This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 731aa2cdf8a [SPARK-39107][SQL] Account for empty string input in regex
replace
731aa2cdf8a is described below
commit 731aa2cdf8a78835621fbf3de2d3492b27711d1a
Author: Lorenzo Martini <[email protected]>
AuthorDate: Mon May 9 19:44:19 2022 -0500
[SPARK-39107][SQL] Account for empty string input in regex replace
### What changes were proposed in this pull request?
When trying to perform a regex replace, account for the possibility of
having empty strings as input.
### Why are the changes needed?
https://github.com/apache/spark/pull/29891 was merged to address
https://issues.apache.org/jira/browse/SPARK-30796 and introduced a bug that
would not allow regex matching on empty strings, as it would account for
position within substring but not consider the case where input string has
length 0 (empty string)
From https://issues.apache.org/jira/browse/SPARK-39107 there is a change in
behavior between spark versions.
3.0.2
```
scala> val df = spark.sql("SELECT '' AS col")
df: org.apache.spark.sql.DataFrame = [col: string]
scala> df.withColumn("replaced", regexp_replace(col("col"), "^$",
"<empty>")).show
+---+--------+
|col|replaced|
+---+--------+
| | <empty>|
+---+--------+
```
3.1.2
```
scala> val df = spark.sql("SELECT '' AS col")
df: org.apache.spark.sql.DataFrame = [col: string]
scala> df.withColumn("replaced", regexp_replace(col("col"), "^$",
"<empty>")).show
+---+--------+
|col|replaced|
+---+--------+
| | |
+---+--------+
```
The 3.0.2 outcome is the expected and correct one
### Does this PR introduce _any_ user-facing change?
Yes compared to spark 3.2.1, as it brings back the correct behavior when
trying to regex match empty strings, as shown in the example above.
### How was this patch tested?
Added special casing test in `RegexpExpressionsSuite.RegexReplace` with
empty string replacement.
Closes #36457 from LorenzoMartini/lmartini/fix-empty-string-replace.
Authored-by: Lorenzo Martini <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
---
.../org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 4 ++--
.../spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala | 3 +++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index bfaaba51446..01763f082d6 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -642,7 +642,7 @@ case class RegExpReplace(subject: Expression, regexp:
Expression, rep: Expressio
}
val source = s.toString()
val position = i.asInstanceOf[Int] - 1
- if (position < source.length) {
+ if (position == 0 || position < source.length) {
val m = pattern.matcher(source)
m.region(position, source.length)
result.delete(0, result.length())
@@ -696,7 +696,7 @@ case class RegExpReplace(subject: Expression, regexp:
Expression, rep: Expressio
}
String $source = $subject.toString();
int $position = $pos - 1;
- if ($position < $source.length()) {
+ if ($position == 0 || $position < $source.length()) {
$classNameStringBuffer $termResult = new $classNameStringBuffer();
java.util.regex.Matcher $matcher = $termPattern.matcher($source);
$matcher.region($position, $source.length());
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index 082e6e4194c..5b007b87915 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -293,6 +293,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
val row4 = create_row(null, "(\\d+)", "###")
val row5 = create_row("100-200", null, "###")
val row6 = create_row("100-200", "(-)", null)
+ val row7 = create_row("", "^$", "<empty string>")
val s = $"s".string.at(0)
val p = $"p".string.at(1)
@@ -305,6 +306,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
checkEvaluation(expr, null, row4)
checkEvaluation(expr, null, row5)
checkEvaluation(expr, null, row6)
+ checkEvaluation(expr, "<empty string>", row7)
// test position
val exprWithPos = RegExpReplace(s, p, r, 4)
checkEvaluation(exprWithPos, "100-num", row1)
@@ -313,6 +315,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
checkEvaluation(exprWithPos, null, row4)
checkEvaluation(exprWithPos, null, row5)
checkEvaluation(exprWithPos, null, row6)
+ checkEvaluation(exprWithPos, "", row7)
val exprWithLargePos = RegExpReplace(s, p, r, 7)
checkEvaluation(exprWithLargePos, "100-20num", row1)
checkEvaluation(exprWithLargePos, "100-20###", row2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]