This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c16624c77ad [SPARK-48627][SQL] Perf improvement for binary to 
HEX_DISCRETE string
0c16624c77ad is described below

commit 0c16624c77ad311a3d076c4cfb4451b1f19f8a9b
Author: Kent Yao <[email protected]>
AuthorDate: Mon Jun 17 13:42:46 2024 +0800

    [SPARK-48627][SQL] Perf improvement for binary to HEX_DISCRETE string
    
    ### What changes were proposed in this pull request?
    
    By replacing `String.format` with `java.util.HexFormat`, we can achieve a 
nearly 200x performance improvement.
    
    The SparkStringUtils.getHexString is widely used by
    - the Spark Thrift Server to convert binary to string when sending results 
to clients
    - the Spark SQL shell for display
    - the Spark Shell when calling `show`
    - the Spark Connect scala client when stringifying binaries in arrow vectors
    
    ```
    +OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5
    +Apple M2 Max
    +Cardinality 100000:                       Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    +------------------------------------------------------------------------------------------------------------------------
    +Spark                                             42210          43595        1207          0.0      422102.9       1.0X
    +Java                                                238            243           2          0.4        2381.9     177.2X
    ```
    
    ### Why are the changes needed?
    
    perf improvement
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    By the results of the existing binary*.sql tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46984 from yaooqinn/SPARK-48627.
    
    Authored-by: Kent Yao <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../scala/org/apache/spark/sql/catalyst/util/StringUtils.scala    | 8 +++++++-
 .../scala/org/apache/spark/sql/catalyst/util/StringUtils.scala    | 6 ------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index aa8826dd48b6..edb1ee371b15 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -16,6 +16,7 @@
  */
 package org.apache.spark.sql.catalyst.util
 
+import java.util.HexFormat
 import java.util.concurrent.atomic.AtomicBoolean
 
 import org.apache.spark.internal.Logging
@@ -101,11 +102,16 @@ object SparkStringUtils extends Logging {
     truncatedString(seq, "", sep, "", maxFields)
   }
 
+  private final lazy val SPACE_DELIMITED_UPPERCASE_HEX =
+    HexFormat.of().withDelimiter(" ").withUpperCase()
+
   /**
    * Returns a pretty string of the byte array which prints each byte as a hex 
digit and add spaces
    * between them. For example, [1A C0].
    */
-  def getHexString(bytes: Array[Byte]): String = 
bytes.map("%02X".format(_)).mkString("[", " ", "]")
+  def getHexString(bytes: Array[Byte]): String = {
+    s"[${SPACE_DELIMITED_UPPERCASE_HEX.formatHex(bytes)}]"
+  }
 
   def sideBySide(left: String, right: String): Seq[String] = {
     sideBySide(left.split("\n").toImmutableArraySeq, 
right.split("\n").toImmutableArraySeq)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index 2fecd9a23759..e2a5319cbe1a 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -66,12 +66,6 @@ object StringUtils extends Logging {
     "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match 
new lines
   }
 
-  /**
-   * Returns a pretty string of the byte array which prints each byte as a hex 
digit and add spaces
-   * between them. For example, [1A C0].
-   */
-  def getHexString(bytes: Array[Byte]): String = 
bytes.map("%02X".format(_)).mkString("[", " ", "]")
-
   private[this] val trueStrings =
     Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to