This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new da81d8ecb802 [SPARK-48584][SQL] Perf improvement for unescapePathName
da81d8ecb802 is described below
commit da81d8ecb80226fa5fb2b6e50048f05d67fb5904
Author: Kent Yao <[email protected]>
AuthorDate: Wed Jun 12 16:39:49 2024 +0800
[SPARK-48584][SQL] Perf improvement for unescapePathName
### What changes were proposed in this pull request?
This PR improves perf for unescapePathName with algorithms briefly
described as:
- If a path contains no '%' or contains '%' at `position > path.length-2`,
we return the original identity instead of creating a new StringBuilder to
append char by char
- Otherwise, we loop with 2 indices, `plaintextStartIdx` which starts from
0 and then points to the next char after resolving `%xx`, and `plaintextEndIdx`
which points to the next `'%'`. `plaintextStartIdx` moves to `plaintextEndIdx +
3` if `%xx` is valid, or moves to `plaintextEndIdx + 1` if `%xx` is invalid.
- Instead of using Integer.parseInt with error capture, we identify the
high and low characters manually.
### Why are the changes needed?
performance improvement for hotspots
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- new tests in ExternalCatalogUtilsSuite
- Benchmark results (9-11x faster)
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #46938 from yaooqinn/SPARK-48584.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../EscapePathBenchmark-jdk21-results.txt | 16 ++++++-
.../benchmarks/EscapePathBenchmark-results.txt | 16 ++++++-
.../catalyst/catalog/ExternalCatalogUtils.scala | 52 +++++++++++++---------
.../spark/sql/catalyst/EscapePathBenchmark.scala | 52 +++++++++++++++++++++-
.../catalog/ExternalCatalogUtilsSuite.scala | 26 ++++++++++-
5 files changed, 135 insertions(+), 27 deletions(-)
diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
index 4fffb9bfd49a..3d16c874e8c9 100644
--- a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
+++ b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Escape Tests: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Legacy 7128 7146
8 0.1 7127.9 1.0X
-New 790 795
5 1.3 789.7 9.0X
+Legacy 6996 7009
9 0.1 6996.5 1.0X
+New 771 776
3 1.3 770.7 9.1X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy 5127 5137
6 0.2 5127.3 1.0X
+New 579 583
4 1.7 579.3 8.9X
diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
index 32e44f6e19ef..7cfa134652c2 100644
--- a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
+++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux
6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Escape Tests: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Legacy 6719 6726
6 0.1 6719.3 1.0X
-New 735 744
21 1.4 735.3 9.1X
+Legacy 6966 6978
12 0.1 6965.9 1.0X
+New 725 730
4 1.4 725.4 9.6X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy 6665 6677
11 0.2 6664.6 1.0X
+New 602 606
2 1.7 602.1 11.1X
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
index 1dc2a760a8af..e7d91e7f41cb 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell
import org.apache.spark.sql.catalyst.analysis.Resolver
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference,
BasePredicate, BoundReference, Expression, Predicate}
+import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.internal.SQLConf
@@ -97,31 +98,40 @@ object ExternalCatalogUtils {
}
def unescapePathName(path: String): String = {
- val sb = new StringBuilder
- var i = 0
-
- while (i < path.length) {
- val c = path.charAt(i)
- if (c == '%' && i + 2 < path.length) {
- val code: Int = try {
- Integer.parseInt(path.substring(i + 1, i + 3), 16)
- } catch {
- case _: Exception => -1
- }
- if (code >= 0) {
- sb.append(code.asInstanceOf[Char])
- i += 3
+ if (path == null || path.isEmpty) {
+ return path
+ }
+ var plaintextEndIdx = path.indexOf('%')
+ val length = path.length
+ if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > length) {
+ // fast path, no %xx encoding found then return the string identity
+ path
+ } else {
+ val sb = new java.lang.StringBuilder(length)
+ var plaintextStartIdx = 0
+ while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
+ if (plaintextEndIdx > plaintextStartIdx) sb.append(path,
plaintextStartIdx, plaintextEndIdx)
+ val high = path.charAt(plaintextEndIdx + 1)
+ if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
+ val low = path.charAt(plaintextEndIdx + 2)
+ if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
+ sb.append((unhexDigits(high) << 4 |
unhexDigits(low)).asInstanceOf[Char])
+ plaintextStartIdx = plaintextEndIdx + 3
+ } else {
+ sb.append('%')
+ plaintextStartIdx = plaintextEndIdx + 1
+ }
} else {
- sb.append(c)
- i += 1
+ sb.append('%')
+ plaintextStartIdx = plaintextEndIdx + 1
}
- } else {
- sb.append(c)
- i += 1
+ plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
+ }
+ if (plaintextStartIdx < length) {
+ sb.append(path, plaintextStartIdx, length)
}
+ sb.toString
}
-
- sb.toString()
}
def generatePartitionPath(
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
index ce277456e0be..4cbffff184cd 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
@@ -20,7 +20,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
/**
- * Benchmark for path escaping
+ * Benchmark for path escaping/unescaping
* To run this benchmark:
* {{{
* 1. without sbt:
@@ -53,6 +53,30 @@ object EscapePathBenchmark extends BenchmarkBase {
}
benchmark.run()
}
+
+ runBenchmark("Unescape") {
+ val benchmark = new Benchmark("Unescape Tests", N, 10, output = output)
+ val paths = Seq(
+ "https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+ "https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+ "https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+ "https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+ "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
+ "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
+ "https://issues.apache.org/jira/browse%2FSPARK-48551",
+ "https://issues.apache.org/jira/browse%2SPARK-48551",
+ "https://issues.apache.org/jira/browse/SPARK-48551")
+ benchmark.addCase("Legacy") { _ =>
+ (1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy))
+ }
+
+ benchmark.addCase("New") { _ =>
+ (1 to N).foreach(_ => {
+ paths.foreach(ExternalCatalogUtils.unescapePathName)
+ })
+ }
+ benchmark.run()
+ }
}
/**
@@ -71,4 +95,30 @@ object EscapePathBenchmark extends BenchmarkBase {
builder.toString()
}
+
+ def unescapePathNameLegacy(path: String): String = {
+ val sb = new StringBuilder
+ var i = 0
+ while (i < path.length) {
+ val c = path.charAt(i)
+ if (c == '%' && i + 2 < path.length) {
+ val code: Int = try {
+ Integer.parseInt(path.substring(i + 1, i + 3), 16)
+ } catch {
+ case _: Exception => -1
+ }
+ if (code >= 0) {
+ sb.append(code.asInstanceOf[Char])
+ i += 3
+ } else {
+ sb.append(c)
+ i += 1
+ }
+ } else {
+ sb.append(c)
+ i += 1
+ }
+ }
+ sb.toString()
+ }
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
index 843ffc037d6c..f0aee94d2a61 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
@@ -18,7 +18,7 @@
package org.apache.spark.sql.catalyst.catalog
import org.apache.spark.SparkFunSuite
-import
org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
+import
org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName,
unescapePathName}
class ExternalCatalogUtilsSuite extends SparkFunSuite {
@@ -39,4 +39,28 @@ class ExternalCatalogUtilsSuite extends SparkFunSuite {
assert(escapePathName("a,b") === "a,b")
assert(escapePathName("a/b") === "a%2Fb")
}
+
+ test("SPARK-48551: unescapePathName") {
+
ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach
{ c =>
+ // Check parity with old conversion technique:
+ assert(unescapePathName("%" + f"$c%02X") === c.toString,
+ s"wrong unescaping for $c")
+ }
+ assert(unescapePathName(null) === null)
+ assert(unescapePathName("") === "")
+ assert(unescapePathName(" ") === " ")
+ assert(unescapePathName("%0A") === "\n")
+ assert(unescapePathName("a b") === "a b")
+ assert(unescapePathName("a%3Ab") === "a:b")
+ assert(unescapePathName("%3Aab") === ":ab")
+ assert(unescapePathName("ab%3A") === "ab:")
+ assert(unescapePathName("a%25b") === "a%b")
+ assert(unescapePathName("a,b") === "a,b")
+ assert(unescapePathName("a%2Fb") === "a/b")
+ assert(unescapePathName("a%2") === "a%2")
+ assert(unescapePathName("a%F ") === "a%F ")
+ // scalastyle:off nonascii
+ assert(unescapePathName("a\u00FF") === "a\u00FF")
+ // scalastyle:on nonascii
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]