This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new da81d8ecb802 [SPARK-48584][SQL] Perf improvement for unescapePathName
da81d8ecb802 is described below

commit da81d8ecb80226fa5fb2b6e50048f05d67fb5904
Author: Kent Yao <[email protected]>
AuthorDate: Wed Jun 12 16:39:49 2024 +0800

    [SPARK-48584][SQL] Perf improvement for unescapePathName
    
    ### What changes were proposed in this pull request?
    
    This PR improves perf for unescapePathName with algorithms briefly 
described as:
    - If a path contains no '%' or contains '%' at `position > path.length-2`, 
we return the original identity instead of creating a new StringBuilder to 
append char by char
    - Otherwise, we loop with 2 indices, `plaintextStartIdx` which starts from 
0 and then points to the next char after resolving `%xx`, and `plaintextEndIdx` 
which points to the next `'%'`. `plaintextStartIdx` moves to `plaintextEndIdx + 
3` if `%xx` is valid, or moves to `plaintextEndIdx + 1` if `%xx` is invalid.
    - Instead of using Integer.parseInt with error capture, we identify the 
high and low characters manually.
    
    ### Why are the changes needed?
    
    performance improvement for hotspots
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    - new tests in ExternalCatalogUtilsSuite
    - Benchmark results (9-11x faster)
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46938 from yaooqinn/SPARK-48584.
    
    Authored-by: Kent Yao <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../EscapePathBenchmark-jdk21-results.txt          | 16 ++++++-
 .../benchmarks/EscapePathBenchmark-results.txt     | 16 ++++++-
 .../catalyst/catalog/ExternalCatalogUtils.scala    | 52 +++++++++++++---------
 .../spark/sql/catalyst/EscapePathBenchmark.scala   | 52 +++++++++++++++++++++-
 .../catalog/ExternalCatalogUtilsSuite.scala        | 26 ++++++++++-
 5 files changed, 135 insertions(+), 27 deletions(-)

diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt 
b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
index 4fffb9bfd49a..3d16c874e8c9 100644
--- a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
+++ b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 Escape Tests:                             Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Legacy                                             7128           7146         
  8          0.1        7127.9       1.0X
-New                                                 790            795         
  5          1.3         789.7       9.0X
+Legacy                                             6996           7009         
  9          0.1        6996.5       1.0X
+New                                                 771            776         
  3          1.3         770.7       9.1X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy                                             5127           5137         
  6          0.2        5127.3       1.0X
+New                                                 579            583         
  4          1.7         579.3       8.9X
 
 
diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt 
b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
index 32e44f6e19ef..7cfa134652c2 100644
--- a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
+++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 
6.5.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 Escape Tests:                             Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Legacy                                             6719           6726         
  6          0.1        6719.3       1.0X
-New                                                 735            744         
 21          1.4         735.3       9.1X
+Legacy                                             6966           6978         
 12          0.1        6965.9       1.0X
+New                                                 725            730         
  4          1.4         725.4       9.6X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests:                           Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy                                             6665           6677         
 11          0.2        6664.6       1.0X
+New                                                 602            606         
  2          1.7         602.1      11.1X
 
 
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
index 1dc2a760a8af..e7d91e7f41cb 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell
 import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, 
BasePredicate, BoundReference, Expression, Predicate}
+import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits
 import org.apache.spark.sql.catalyst.util.CharVarcharUtils
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.internal.SQLConf
@@ -97,31 +98,40 @@ object ExternalCatalogUtils {
   }
 
   def unescapePathName(path: String): String = {
-    val sb = new StringBuilder
-    var i = 0
-
-    while (i < path.length) {
-      val c = path.charAt(i)
-      if (c == '%' && i + 2 < path.length) {
-        val code: Int = try {
-          Integer.parseInt(path.substring(i + 1, i + 3), 16)
-        } catch {
-          case _: Exception => -1
-        }
-        if (code >= 0) {
-          sb.append(code.asInstanceOf[Char])
-          i += 3
+    if (path == null || path.isEmpty) {
+      return path
+    }
+    var plaintextEndIdx = path.indexOf('%')
+    val length = path.length
+    if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > length) {
+      // fast path, no %xx encoding found then return the string identity
+      path
+    } else {
+      val sb = new java.lang.StringBuilder(length)
+      var plaintextStartIdx = 0
+      while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
+        if (plaintextEndIdx > plaintextStartIdx) sb.append(path, 
plaintextStartIdx, plaintextEndIdx)
+        val high = path.charAt(plaintextEndIdx + 1)
+        if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
+          val low = path.charAt(plaintextEndIdx + 2)
+          if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
+            sb.append((unhexDigits(high) << 4 | 
unhexDigits(low)).asInstanceOf[Char])
+            plaintextStartIdx = plaintextEndIdx + 3
+          } else {
+            sb.append('%')
+            plaintextStartIdx = plaintextEndIdx + 1
+          }
         } else {
-          sb.append(c)
-          i += 1
+          sb.append('%')
+          plaintextStartIdx = plaintextEndIdx + 1
         }
-      } else {
-        sb.append(c)
-        i += 1
+        plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
+      }
+      if (plaintextStartIdx < length) {
+        sb.append(path, plaintextStartIdx, length)
       }
+      sb.toString
     }
-
-    sb.toString()
   }
 
   def generatePartitionPath(
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
index ce277456e0be..4cbffff184cd 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
@@ -20,7 +20,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 
 /**
- * Benchmark for path escaping
+ * Benchmark for path escaping/unescaping
  * To run this benchmark:
  * {{{
  *   1. without sbt:
@@ -53,6 +53,30 @@ object EscapePathBenchmark extends BenchmarkBase {
       }
       benchmark.run()
     }
+
+    runBenchmark("Unescape") {
+      val benchmark = new Benchmark("Unescape Tests", N, 10, output = output)
+      val paths = Seq(
+        "https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551";,
+        "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551";,
+        "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551";,
+        "https://issues.apache.org/jira/browse%2FSPARK-48551";,
+        "https://issues.apache.org/jira/browse%2SPARK-48551";,
+        "https://issues.apache.org/jira/browse/SPARK-48551";)
+      benchmark.addCase("Legacy") { _ =>
+        (1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy))
+      }
+
+      benchmark.addCase("New") { _ =>
+        (1 to N).foreach(_ => {
+          paths.foreach(ExternalCatalogUtils.unescapePathName)
+        })
+      }
+      benchmark.run()
+    }
   }
 
   /**
@@ -71,4 +95,30 @@ object EscapePathBenchmark extends BenchmarkBase {
 
     builder.toString()
   }
+
+  def unescapePathNameLegacy(path: String): String = {
+    val sb = new StringBuilder
+    var i = 0
+    while (i < path.length) {
+      val c = path.charAt(i)
+      if (c == '%' && i + 2 < path.length) {
+        val code: Int = try {
+          Integer.parseInt(path.substring(i + 1, i + 3), 16)
+        } catch {
+          case _: Exception => -1
+        }
+        if (code >= 0) {
+          sb.append(code.asInstanceOf[Char])
+          i += 3
+        } else {
+          sb.append(c)
+          i += 1
+        }
+      } else {
+        sb.append(c)
+        i += 1
+      }
+    }
+    sb.toString()
+  }
 }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
index 843ffc037d6c..f0aee94d2a61 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.catalog
 
 import org.apache.spark.SparkFunSuite
-import 
org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
+import 
org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName, 
unescapePathName}
 
 class ExternalCatalogUtilsSuite extends SparkFunSuite {
 
@@ -39,4 +39,28 @@ class ExternalCatalogUtilsSuite extends SparkFunSuite {
     assert(escapePathName("a,b") === "a,b")
     assert(escapePathName("a/b") === "a%2Fb")
   }
+
+  test("SPARK-48551: unescapePathName") {
+    
ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach
 { c =>
+      // Check parity with old conversion technique:
+      assert(unescapePathName("%" + f"$c%02X") === c.toString,
+        s"wrong unescaping for $c")
+    }
+    assert(unescapePathName(null) === null)
+    assert(unescapePathName("") === "")
+    assert(unescapePathName(" ") === " ")
+    assert(unescapePathName("%0A") === "\n")
+    assert(unescapePathName("a b") === "a b")
+    assert(unescapePathName("a%3Ab") === "a:b")
+    assert(unescapePathName("%3Aab") === ":ab")
+    assert(unescapePathName("ab%3A") === "ab:")
+    assert(unescapePathName("a%25b") === "a%b")
+    assert(unescapePathName("a,b") === "a,b")
+    assert(unescapePathName("a%2Fb") === "a/b")
+    assert(unescapePathName("a%2") === "a%2")
+    assert(unescapePathName("a%F ") === "a%F ")
+    // scalastyle:off nonascii
+    assert(unescapePathName("a\u00FF") === "a\u00FF")
+    // scalastyle:on nonascii
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to