This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new ef02dbddbd21 [SPARK-47085][SQL][3.4] reduce the complexity of
toTRowSet from n^2 to n
ef02dbddbd21 is described below
commit ef02dbddbd21ac8b0c9a8f0b6bcae1f406e07fb0
Author: Izek Greenfield <[email protected]>
AuthorDate: Wed Feb 21 08:12:35 2024 -0800
[SPARK-47085][SQL][3.4] reduce the complexity of toTRowSet from n^2 to n
### What changes were proposed in this pull request?
reduce the time complexity of RowSetUtils.toTRowSet from O(n^2) to O(n)
### Why are the changes needed?
Indexed access (`rows(i)`) into the result `Seq` — a `List` at the call site — is O(n) per element, making the conversion loop O(n^2) overall and causing performance issues when fetching large result sets.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Tests + test manually on AWS EMR
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45164 from igreenfield/branch-3.4.
Authored-by: Izek Greenfield <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../apache/spark/sql/hive/thriftserver/RowSetUtils.scala | 14 ++++----------
.../hive/thriftserver/SparkExecuteStatementOperation.scala | 2 +-
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala
index 9625021f392c..047f0612898d 100644
---
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala
+++
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala
@@ -52,11 +52,7 @@ object RowSetUtils {
rows: Seq[Row],
schema: Array[DataType],
timeFormatters: TimeFormatters): TRowSet = {
- var i = 0
- val rowSize = rows.length
- val tRows = new java.util.ArrayList[TRow](rowSize)
- while (i < rowSize) {
- val row = rows(i)
+ val tRows = rows.map { row =>
val tRow = new TRow()
var j = 0
val columnSize = row.length
@@ -65,9 +61,8 @@ object RowSetUtils {
tRow.addToColVals(columnValue)
j += 1
}
- i += 1
- tRows.add(tRow)
- }
+ tRow
+ }.asJava
new TRowSet(startRowOffSet, tRows)
}
@@ -159,8 +154,7 @@ object RowSetUtils {
val size = rows.length
val ret = new java.util.ArrayList[T](size)
var idx = 0
- while (idx < size) {
- val row = rows(idx)
+ rows.foreach { row =>
if (row.isNullAt(ordinal)) {
nulls.set(idx, true)
ret.add(idx, defaultVal)
diff --git
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index c41e92e618ba..5b94cdd6518a 100644
---
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -113,7 +113,7 @@ private[hive] class SparkExecuteStatementOperation(
val offset = iter.getPosition
val rows = iter.take(maxRows).toList
log.debug(s"Returning result set with ${rows.length} rows from offsets " +
- s"[${iter.getFetchStart}, ${offset}) with $statementId")
+ s"[${iter.getFetchStart}, ${iter.getPosition}) with $statementId")
RowSetUtils.toTRowSet(offset, rows, dataTypes, getProtocolVersion,
getTimeFormatters)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]