This is an automated email from the ASF dual-hosted git repository.
coderfender pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new d7147dbb16 bug: no column projection should still persist row count (#4444)
d7147dbb16 is described below
commit d7147dbb16c024d5cd3c9b6569c06e198395b244
Author: Bhargava Vadlamani <[email protected]>
AuthorDate: Thu May 28 11:39:59 2026 -0500
bug: no column projection should still persist row count (#4444)
---
.github/workflows/pr_build_linux.yml | 1 +
.github/workflows/pr_build_macos.yml | 1 +
.../org/apache/spark/sql/comet/util/Utils.scala | 9 ++++
.../apache/spark/sql/comet/util/UtilsSuite.scala | 54 ++++++++++++++++++++++
4 files changed, 65 insertions(+)
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index f7d6c1a73d..0e4988e368 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -383,6 +383,7 @@ jobs:
org.apache.spark.sql.comet.CometDppFallbackRepro3949Suite
org.apache.spark.sql.comet.CometShuffleFallbackStickinessSuite
org.apache.spark.sql.comet.CometDecimalArithmeticViewSuite
+ org.apache.spark.sql.comet.util.UtilsSuite
org.apache.comet.objectstore.NativeConfigSuite
org.apache.spark.sql.CometToPrettyStringSuite
org.apache.spark.sql.CometCollationSuite
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index 7af77ca2c9..5101f5290c 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -223,6 +223,7 @@ jobs:
org.apache.spark.sql.comet.CometDppFallbackRepro3949Suite
org.apache.spark.sql.comet.CometShuffleFallbackStickinessSuite
org.apache.spark.sql.comet.CometDecimalArithmeticViewSuite
+ org.apache.spark.sql.comet.util.UtilsSuite
org.apache.comet.objectstore.NativeConfigSuite
org.apache.spark.sql.CometToPrettyStringSuite
org.apache.spark.sql.CometCollationSuite
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
index 783367c054..0343983e11 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
@@ -224,6 +224,10 @@ object Utils extends CometTypeShim with Logging {
val (fieldVectors, batchProviderOpt) = getBatchFieldVectors(batch)
val root = new VectorSchemaRoot(fieldVectors.asJava)
+ if (fieldVectors.isEmpty) {
+ // VSR cannot infer rowCount without field vectors
+ root.setRowCount(batch.numRows())
+ }
val provider = batchProviderOpt.getOrElse(dictionaryProvider)
val writer = new ArrowStreamWriter(root, provider,
Channels.newChannel(out))
@@ -336,6 +340,11 @@ object Utils extends CometTypeShim with Logging {
return (Array.empty, 0L, 0L)
}
+ if (targetRoot.getSchema.getFields.isEmpty) {
+ // VSRAppender does not update rowCount with no columns
+ targetRoot.setRowCount(totalRows.toInt)
+ }
+
assert(
targetRoot.getRowCount.toLong == totalRows,
s"Row count mismatch after coalesce: ${targetRoot.getRowCount} != $totalRows")
diff --git a/spark/src/test/scala/org/apache/spark/sql/comet/util/UtilsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/comet/util/UtilsSuite.scala
new file mode 100644
index 0000000000..a79b862793
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/comet/util/UtilsSuite.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.comet.util
+
+import org.apache.spark.sql.CometTestBase
+import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
+
+class UtilsSuite extends CometTestBase {
+
+ test("serializeBatches preserves row count for a zero-column batch") {
+ val numRows = 5
+ val batch = new ColumnarBatch(Array.empty[ColumnVector], numRows)
+
+ val (rowCount, buf) = Utils.serializeBatches(Iterator(batch)).next()
+ assert(rowCount == numRows)
+
+ val decoded = Utils.decodeBatches(buf, "test").toSeq
+ assert(decoded.map(_.numRows()).sum == numRows)
+ }
+
+ test("coalesceBroadcastBatches preserves row count across zero-column inputs") {
+ val numRows = 5
+ val numBatches = 3
+ val batches =
+ (0 until numBatches).map(_ => new ColumnarBatch(Array.empty[ColumnVector], numRows))
+
+ val bufs = Utils.serializeBatches(batches.iterator).map(_._2).toSeq.iterator
+ val (coalesced, batchCount, totalRows) = Utils.coalesceBroadcastBatches(bufs)
+
+ val expected = numRows.toLong * numBatches
+ assert(batchCount == numBatches)
+ assert(totalRows == expected)
+
+ val decoded = coalesced.iterator.flatMap(b => Utils.decodeBatches(b, "test")).toSeq
+ assert(decoded.map(_.numRows()).sum == expected)
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]