This is an automated email from the ASF dual-hosted git repository.
yaooqinn pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 33c61b9065 [VL] Compute real nullCount for flat-encoded unsupported
types in cached-batch partition stats (#12176)
33c61b9065 is described below
commit 33c61b9065fc5a6ed20a4fb89df4e5d76fbed183
Author: Kent Yao <[email protected]>
AuthorDate: Fri May 29 12:06:12 2026 +0800
[VL] Compute real nullCount for flat-encoded unsupported types in
cached-batch partition stats (#12176)
VeloxColumnarBatchSerializer::computeStats did not accrue nullCount in the
`default:` arm of the flat-encoding TypeKind switch. For flat-encoded
unsupported TypeKinds (VARBINARY, OPAQUE, UNKNOWN) with V2 partition
stats enabled, stats.nullCount was 0 regardless of real null count, so
Spark IsNull pruning silently dropped null-bearing partitions.
Mirror the non-flat path's countNullsAny call. New e2e test in
ColumnarCachedBatchE2ESuite.
Generated-by: Claude claude-opus-4.7
---
.../sql/execution/ColumnarCachedBatchE2ESuite.scala | 21 +++++++++++++++++++++
.../serializer/VeloxColumnarBatchSerializer.cc | 2 ++
2 files changed, 23 insertions(+)
diff --git a/backends-velox/src/test/scala/org/apache/spark/sql/execution/ColumnarCachedBatchE2ESuite.scala b/backends-velox/src/test/scala/org/apache/spark/sql/execution/ColumnarCachedBatchE2ESuite.scala
index fe261b3e3f..fee8eccb65 100644
--- a/backends-velox/src/test/scala/org/apache/spark/sql/execution/ColumnarCachedBatchE2ESuite.scala
+++ b/backends-velox/src/test/scala/org/apache/spark/sql/execution/ColumnarCachedBatchE2ESuite.scala
@@ -488,4 +488,25 @@ class ColumnarCachedBatchE2ESuite
}
}
}
+
+ test("V2 stats: IsNull on VARBINARY null-bearing partition is not pruned") {
+ withSQLConf(
+ GlutenConfig.COLUMNAR_TABLE_CACHE_PARTITION_STATS_ENABLED.key -> "true") {
+ val df = spark.range(N)
+ .select(
+ when(col("id") === lit(750L), lit(null).cast("binary"))
+ .otherwise(col("id").cast("string").cast("binary"))
+ .as("bin"))
+ .repartitionByRange(P, col("bin"))
+ .cache()
+ try {
+ df.count()
+ assert(
+ df.filter(col("bin").isNull).count() == 1L,
+ "VARBINARY null-bearing partition was silently pruned by IsNull")
+ } finally {
+ df.unpersist(blocking = true)
+ }
+ }
+ }
}
diff --git a/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc b/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc
index 50e8a96abd..6df1ab509c 100644
--- a/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc
+++ b/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc
@@ -369,6 +369,8 @@ std::vector<ColumnStats> VeloxColumnarBatchSerializer::computeStats(RowVectorPtr
break;
}
default:
+ // Mirror non-flat path: real nullCount needed for JVM IsNull pruning.
+ nullCnt = countNullsAny(child.get());
// Unsupported type -> hasLowerBound=hasUpperBound=false -> JVM buildFilter pass-through.
break;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]