This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new ef02dbddbd21 [SPARK-47085][SQL][3.4] reduce the complexity of toTRowSet from n^2 to n ef02dbddbd21 is described below commit ef02dbddbd21ac8b0c9a8f0b6bcae1f406e07fb0 Author: Izek Greenfield <izek.greenfi...@adenza.com> AuthorDate: Wed Feb 21 08:12:35 2024 -0800 [SPARK-47085][SQL][3.4] reduce the complexity of toTRowSet from n^2 to n ### What changes were proposed in this pull request? Reduce the complexity of RowSetUtils.toTRowSet from O(n^2) to O(n) in the number of rows by iterating over the rows directly (map/foreach) instead of looking up each row by index. ### Why are the changes needed? The rows are passed in as a List (the caller builds them with iter.take(maxRows).toList), and indexed access such as rows(i) on a List takes time proportional to i, so the old index-based while loops were quadratic in the number of rows. This causes performance issues. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Tests, plus manual testing on AWS EMR. ### Was this patch authored or co-authored using generative AI tooling? No Closes #45164 from igreenfield/branch-3.4. Authored-by: Izek Greenfield <izek.greenfi...@adenza.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../apache/spark/sql/hive/thriftserver/RowSetUtils.scala | 14 ++++---------- .../hive/thriftserver/SparkExecuteStatementOperation.scala | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala index 9625021f392c..047f0612898d 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/RowSetUtils.scala @@ -52,11 +52,7 @@ object RowSetUtils { rows: Seq[Row], schema: Array[DataType], timeFormatters: TimeFormatters): TRowSet = { - var i = 0 - val rowSize = rows.length - val tRows = new java.util.ArrayList[TRow](rowSize) - while (i < rowSize) { - val row = rows(i) + val tRows = rows.map { row => val tRow = new TRow() var j = 0 val columnSize = row.length @@ -65,9 +61,8 @@ object RowSetUtils { tRow.addToColVals(columnValue) 
j += 1 } - i += 1 - tRows.add(tRow) - } + tRow + }.asJava new TRowSet(startRowOffSet, tRows) } @@ -159,8 +154,7 @@ object RowSetUtils { val size = rows.length val ret = new java.util.ArrayList[T](size) var idx = 0 - while (idx < size) { - val row = rows(idx) + rows.foreach { row => if (row.isNullAt(ordinal)) { nulls.set(idx, true) ret.add(idx, defaultVal) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index c41e92e618ba..5b94cdd6518a 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -113,7 +113,7 @@ private[hive] class SparkExecuteStatementOperation( val offset = iter.getPosition val rows = iter.take(maxRows).toList log.debug(s"Returning result set with ${rows.length} rows from offsets " + - s"[${iter.getFetchStart}, ${offset}) with $statementId") + s"[${iter.getFetchStart}, ${iter.getPosition}) with $statementId") RowSetUtils.toTRowSet(offset, rows, dataTypes, getProtocolVersion, getTimeFormatters) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org