This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d81e55e1ff9 [SPARK-41858][SQL] Fix ORC reader perf regression due to
DEFAULT value feature
d81e55e1ff9 is described below
commit d81e55e1ff998c624fa80c5660d7724701b4df23
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Tue Jan 3 10:40:44 2023 -0800
[SPARK-41858][SQL] Fix ORC reader perf regression due to DEFAULT value
feature
### What changes were proposed in this pull request?
This PR is a partial, logical revert of SPARK-39862
(https://github.com/apache/spark/pull/37280) to fix the large ORC reader perf
regression (3x slower).
SPARK-39862 should be revisited with a fix that does not introduce a perf regression.
### Why are the changes needed?
During Apache Spark 3.4.0 preparation, SPARK-41782 identified a perf
regression.
- https://github.com/apache/spark/pull/39301#discussion_r1059239575
### Does this PR introduce _any_ user-facing change?
After this PR, the ORC reader perf regression is removed. However, the bug in
the DEFAULT value feature will remain. This should be handled separately.
### How was this patch tested?
Pass the CI.
Closes #39362 from dongjoon-hyun/SPARK-41858.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../execution/datasources/orc/OrcDeserializer.scala | 21 +++++++++++----------
.../org/apache/spark/sql/sources/InsertSuite.scala | 9 +++++++--
2 files changed, 18 insertions(+), 12 deletions(-)
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index 5276f5c6d7b..5b207a04ada 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -57,14 +57,7 @@ class OrcDeserializer(
} else {
new RowUpdater(resultRow)
}
- val writer: (Int, WritableComparable[_]) => Unit =
- (ordinal, value) =>
- if (value == null) {
- rowUpdater.setNullAt(ordinal)
- } else {
- val writerFunc = newWriter(f.dataType, rowUpdater)
- writerFunc(ordinal, value)
- }
+ val writer = newWriter(f.dataType, rowUpdater)
(value: WritableComparable[_]) => writer(index, value)
}
}.toArray
@@ -75,7 +68,11 @@ class OrcDeserializer(
while (targetColumnIndex < fieldWriters.length) {
if (fieldWriters(targetColumnIndex) != null) {
val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
- fieldWriters(targetColumnIndex)(value)
+ if (value == null) {
+ resultRow.setNullAt(targetColumnIndex)
+ } else {
+ fieldWriters(targetColumnIndex)(value)
+ }
}
targetColumnIndex += 1
}
@@ -88,7 +85,11 @@ class OrcDeserializer(
while (targetColumnIndex < fieldWriters.length) {
if (fieldWriters(targetColumnIndex) != null) {
val value = orcValues(requestedColIds(targetColumnIndex))
- fieldWriters(targetColumnIndex)(value)
+ if (value == null) {
+ resultRow.setNullAt(targetColumnIndex)
+ } else {
+ fieldWriters(targetColumnIndex)(value)
+ }
}
targetColumnIndex += 1
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index dd37c93871e..7c4a39d6ff4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1679,7 +1679,8 @@ class InsertSuite extends DataSourceTest with
SharedSparkSession {
Config(
None),
Config(
- Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false")))),
+ Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false"),
+ insertNullsToStorage = false))),
TestCase(
dataSource = "parquet",
Seq(
@@ -1943,7 +1944,11 @@ class InsertSuite extends DataSourceTest with
SharedSparkSession {
Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl"))),
Seq(Map(true -> "xyz"))),
Row(2,
- null,
+ if (config.dataSource != "orc") {
+ null
+ } else {
+ Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl")))
+ },
Seq(Map(true -> "xyz"))),
Row(3,
Row(Seq(Row(3, 4)), Seq(Map(false -> "mno", true -> "pqr"))),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]