This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8c6d312 [SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast
skip the check of aligning with unaligned platform
8c6d312 is described below
commit 8c6d3123086cf4def7e8be61214dfc9286578169
Author: ulysses-you <[email protected]>
AuthorDate: Wed Jan 5 09:30:05 2022 -0600
[SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast skip the check
of aligning with unaligned platform
### What changes were proposed in this pull request?
The method `arrayEquals` in `ByteArrayMethods` is a critical function which
is used in `UTF8String.equals`, `indexOf`, `find`, etc.
SPARK-16962 added the complexity of an alignment check. It would be better to
quickly skip that check if the platform supports unaligned access.
### Why are the changes needed?
Improve the performance.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Pass CI. Run the benchmark using
[unaligned-benchmark](https://github.com/ulysses-you/spark/commit/d14d4bfcfeddcf90ccfe7cc3f6cda426d6d6b7e5),
and here is the benchmark result:
[JDK8](https://github.com/ulysses-you/spark/actions/runs/1639852573)
```
================================================================================================
byte array equals
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Byte Array equals fast 1322 2222
NaN 121.0 8.3 1.0X
Byte Array equals 3378 3381
3 47.4 21.1 0.4X
```
[JDK11](https://github.com/ulysses-you/spark/actions/runs/1639853330)
```
================================================================================================
byte array equals
================================================================================================
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Byte Array equals fast 1860 1891
15 86.0 11.6 1.0X
Byte Array equals 2913 2921
8 54.9 18.2 0.6X
```
[JDK17](https://github.com/ulysses-you/spark/actions/runs/1639853938)
```
================================================================================================
byte array equals
================================================================================================
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Byte Array equals fast 1543 1602
39 103.7 9.6 1.0X
Byte Array equals 3027 3029
1 52.9 18.9 0.5X
```
Closes #35078 from ulysses-you/SPARK-37796.
Authored-by: ulysses-you <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
---
.../spark/unsafe/array/ByteArrayMethods.java | 2 +-
.../ByteArrayBenchmark-jdk11-results.txt | 10 ++++
.../ByteArrayBenchmark-jdk17-results.txt | 10 ++++
sql/core/benchmarks/ByteArrayBenchmark-results.txt | 10 ++++
.../execution/benchmark/ByteArrayBenchmark.scala | 66 +++++++++++++++++-----
5 files changed, 83 insertions(+), 15 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
index f3a59e3..5a7e32b 100644
---
a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
+++
b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
@@ -61,7 +61,7 @@ public class ByteArrayMethods {
int i = 0;
// check if stars align and we can get both offsets to be aligned
- if ((leftOffset % 8) == (rightOffset % 8)) {
+ if (!unaligned && ((leftOffset % 8) == (rightOffset % 8))) {
while ((leftOffset + i) % 8 != 0 && i < length) {
if (Platform.getByte(leftBase, leftOffset + i) !=
Platform.getByte(rightBase, rightOffset + i)) {
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
index 0bdab8d..aafe6e6 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms)
Avg Time(ms) Stdev(m
2-7 byte 548 564
9 119.5 8.4 0.9X
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals 1860 1891
15 86.0 11.6 1.0X
+
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
index b5e0428..33af4c2 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms)
Avg Time(ms) Stdev(m
2-7 byte 454 454
0 144.3 6.9 0.9X
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals 1543 1602
39 103.7 9.6 1.0X
+
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-results.txt
b/sql/core/benchmarks/ByteArrayBenchmark-results.txt
index cf2a6d4..ae1054b 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms)
Avg Time(ms) Stdev(m
2-7 byte 402 403
0 162.8 6.1 1.0X
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Byte Array equals: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals 1322 2222
NaN 121.0 8.3 1.0X
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
index f8b1e27..9901684 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
@@ -20,10 +20,11 @@ package org.apache.spark.sql.execution.benchmark
import scala.util.Random
import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
-import org.apache.spark.unsafe.types.ByteArray
+import org.apache.spark.unsafe.array.ByteArrayMethods
+import org.apache.spark.unsafe.types.{ByteArray, UTF8String}
/**
- * Benchmark to measure performance for byte array comparisons.
+ * Benchmark to measure performance for byte array operators.
* {{{
* To run this benchmark:
* 1. without sbt:
@@ -34,21 +35,21 @@ import org.apache.spark.unsafe.types.ByteArray
* }}}
*/
object ByteArrayBenchmark extends BenchmarkBase {
+ private val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ private val randomChar = new Random(0)
- def byteArrayComparisons(iters: Long): Unit = {
- val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- val random = new Random(0)
- def randomBytes(min: Int, max: Int): Array[Byte] = {
- val len = random.nextInt(max - min) + min
- val bytes = new Array[Byte](len)
- var i = 0
- while (i < len) {
- bytes(i) = chars.charAt(random.nextInt(chars.length())).toByte
- i += 1
- }
- bytes
+ def randomBytes(min: Int, max: Int): Array[Byte] = {
+ val len = randomChar.nextInt(max - min) + min
+ val bytes = new Array[Byte](len)
+ var i = 0
+ while (i < len) {
+ bytes(i) = chars.charAt(randomChar.nextInt(chars.length())).toByte
+ i += 1
}
+ bytes
+ }
+ def byteArrayComparisons(iters: Long): Unit = {
val count = 16 * 1000
val dataTiny = Seq.fill(count)(randomBytes(2, 7)).toArray
val dataSmall = Seq.fill(count)(randomBytes(8, 16)).toArray
@@ -78,9 +79,46 @@ object ByteArrayBenchmark extends BenchmarkBase {
benchmark.run()
}
+ def byteArrayEquals(iters: Long): Unit = {
+ def binaryEquals(inputs: Array[BinaryEqualInfo]) = { _: Int =>
+ var res = false
+ for (_ <- 0L until iters) {
+ inputs.foreach { input =>
+ res = ByteArrayMethods.arrayEquals(
+ input.s1.getBaseObject, input.s1.getBaseOffset,
+ input.s2.getBaseObject, input.s2.getBaseOffset + input.deltaOffset,
+ input.len)
+ }
+ }
+ }
+ val count = 16 * 1000
+ val rand = new Random(0)
+ val inputs = (0 until count).map { _ =>
+ val s1 = UTF8String.fromBytes(randomBytes(1, 16))
+ val s2 = UTF8String.fromBytes(randomBytes(1, 16))
+ val len = s1.numBytes().min(s2.numBytes())
+ val deltaOffset = rand.nextInt(len)
+ BinaryEqualInfo(s1, s2, deltaOffset, len)
+ }.toArray
+
+ val benchmark = new Benchmark("Byte Array equals", count * iters, 25,
output = output)
+ benchmark.addCase("Byte Array equals")(binaryEquals(inputs))
+ benchmark.run()
+ }
+
+ case class BinaryEqualInfo(
+ s1: UTF8String,
+ s2: UTF8String,
+ deltaOffset: Int,
+ len: Int)
+
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
runBenchmark("byte array comparisons") {
byteArrayComparisons(1024 * 4)
}
+
+ runBenchmark("byte array equals") {
+ byteArrayEquals(1000 * 10)
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]