This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0c16624c77ad [SPARK-48627][SQL] Perf improvement for binary to
HEX_DISCRETE string
0c16624c77ad is described below
commit 0c16624c77ad311a3d076c4cfb4451b1f19f8a9b
Author: Kent Yao <[email protected]>
AuthorDate: Mon Jun 17 13:42:46 2024 +0800
[SPARK-48627][SQL] Perf improvement for binary to HEX_DISCRETE string
### What changes were proposed in this pull request?
By replacing `String.format` with `java.util.HexFormat`, we can achieve a
nearly 200x performance improvement.
The SparkStringUtils.getHexString is widely used by
- the Spark Thrift Server to convert binary to string when sending results
to clients
- the Spark SQL shell for display
- the Spark Shell when calling `show`
- the Spark Connect scala client when stringifying binaries in arrow vectors
```
+OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5
+Apple M2 Max
+Cardinality 100000:    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Spark                          42210          43595        1207          0.0      422102.9       1.0X
+Java                             238            243           2          0.4        2381.9     177.2X
```
### Why are the changes needed?
perf improvement
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
By existing binary*.sql's results
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #46984 from yaooqinn/SPARK-48627.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../scala/org/apache/spark/sql/catalyst/util/StringUtils.scala | 8 +++++++-
.../scala/org/apache/spark/sql/catalyst/util/StringUtils.scala | 6 ------
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index aa8826dd48b6..edb1ee371b15 100644
---
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -16,6 +16,7 @@
*/
package org.apache.spark.sql.catalyst.util
+import java.util.HexFormat
import java.util.concurrent.atomic.AtomicBoolean
import org.apache.spark.internal.Logging
@@ -101,11 +102,16 @@ object SparkStringUtils extends Logging {
truncatedString(seq, "", sep, "", maxFields)
}
+ private final lazy val SPACE_DELIMITED_UPPERCASE_HEX =
+ HexFormat.of().withDelimiter(" ").withUpperCase()
+
/**
* Returns a pretty string of the byte array which prints each byte as a hex
digit and add spaces
* between them. For example, [1A C0].
*/
- def getHexString(bytes: Array[Byte]): String =
bytes.map("%02X".format(_)).mkString("[", " ", "]")
+ def getHexString(bytes: Array[Byte]): String = {
+ s"[${SPACE_DELIMITED_UPPERCASE_HEX.formatHex(bytes)}]"
+ }
def sideBySide(left: String, right: String): Seq[String] = {
sideBySide(left.split("\n").toImmutableArraySeq,
right.split("\n").toImmutableArraySeq)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index 2fecd9a23759..e2a5319cbe1a 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -66,12 +66,6 @@ object StringUtils extends Logging {
"(?s)" + out.result() // (?s) enables dotall mode, causing "." to match
new lines
}
- /**
- * Returns a pretty string of the byte array which prints each byte as a hex
digit and add spaces
- * between them. For example, [1A C0].
- */
- def getHexString(bytes: Array[Byte]): String =
bytes.map("%02X".format(_)).mkString("[", " ", "]")
-
private[this] val trueStrings =
Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]