This is an automated email from the ASF dual-hosted git repository.
zhangzc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 5045dc1c0c [GLUTEN-11451][CH] Fix CH BNLJ left outer build side validation (#11466)
5045dc1c0c is described below
commit 5045dc1c0c087cb7ef12c0aacb83bb99e77081f4
Author: zhanglistar <[email protected]>
AuthorDate: Tue Jan 27 09:06:44 2026 +0800
[GLUTEN-11451][CH] Fix CH BNLJ left outer build side validation (#11466)
* Fix CH BNLJ left outer build side validation
* Fix CH BNLJ left outer build side validation
* Fix UT.
* Allow full outer BNLJ in CH
Keep invalid build-side checks for left/right outer joins while permitting
full outer in CH broadcast nested loop joins so tests don't fallback.
* [CH] Guard BNLJ outer joins on empty build side
Detect empty build-side scans and prevent BNLJ for outer joins when the
build side has no partitions to avoid incorrect results.
* [CH] Restore BNLJ join type validation
Limit CH BNLJ validation to inner-like joins and keep the existing
condition-based fallback behavior.
* Fix CH BNLJ left outer build side validation
* Fix UT.
* Allow full outer BNLJ in CH
Keep invalid build-side checks for left/right outer joins while permitting
full outer in CH broadcast nested loop joins so tests don't fallback.
* [CH] Guard BNLJ outer joins on empty build side
Detect empty build-side scans and prevent BNLJ for outer joins when the
build side has no partitions to avoid incorrect results.
* [CH] Restore BNLJ join type validation
Limit CH BNLJ validation to inner-like joins and keep the existing
condition-based fallback behavior.
---
.../CHBroadcastNestedLoopJoinExecTransformer.scala | 23 ++++++++++++++++++++--
.../gluten/execution/CHFilterExecTransformer.scala | 4 ++--
.../execution/GlutenClickHouseJoinSuite.scala | 16 +++++++++++++++
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
index cd0319e847..5b4b8a3bd7 100644
--- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
+++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
@@ -21,10 +21,10 @@ import org.apache.gluten.backendsapi.BackendsApiManager
import org.apache.spark.rdd.RDD
import org.apache.spark.rpc.GlutenDriverEndpoint
import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.optimizer.{BuildRight, BuildSide}
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.{InnerLike, JoinType, LeftSemi}
-import org.apache.spark.sql.execution.{SparkPlan, SQLExecution}
+import org.apache.spark.sql.execution.{FileSourceScanExecShim, SparkPlan, SQLExecution}
import org.apache.spark.sql.execution.joins.BuildSideRelation
import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -110,7 +110,26 @@ case class CHBroadcastNestedLoopJoinExecTransformer(
BackendsApiManager.getTransformerApiInstance.packPBMessage(message)
}
+ private def isBuildSideEmpty: Boolean = buildPlan match {
+ case scan: FileSourceScanExecTransformerBase =>
+ scan.getPartitions.isEmpty
+ case scan: FileSourceScanExecShim =>
+ scan.getPartitionArray.isEmpty
+ case _ =>
+ false
+ }
+
override def validateJoinTypeAndBuildSide(): ValidationResult = {
+ if (
+ isBuildSideEmpty && (
+ (joinType == LeftOuter && buildSide == BuildRight) ||
+ (joinType == RightOuter && buildSide == BuildLeft)
+ )
+ ) {
+ return ValidationResult.failed(
+ s"Broadcast Nested Loop join is not supported for $joinType when build side is empty")
+ }
+
joinType match {
case _: InnerLike =>
case ExistenceJoin(_) =>
diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
index 19b9238cbd..398fd30265 100644
--- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
+++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
@@ -56,7 +56,7 @@ case class FilterExecTransformer(condition: Expression, child: SparkPlan)
case IsNotNull(a) => isNullIntolerant(a) && a.references.subsetOf(child.outputSet)
case _ => false
}
- notNullPreds.flatMap(_.references).distinct.map(_.exprId)
- case _ => notNullPreds.flatMap(_.references).distinct.map(_.exprId)
+ notNullPreds.flatMap(_.references).distinct.map(_.exprId).toList
+ case _ => notNullPreds.flatMap(_.references).distinct.map(_.exprId).toList
}
}
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
index eb5c35bdd7..22e13cb8be 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
@@ -242,4 +242,20 @@ class GlutenClickHouseJoinSuite extends GlutenClickHouseWholeStageTransformerSui
compareResultsAgainstVanillaSpark(leftSql2, true, { _ => })
}
+ test("left join with empty partition on build side") {
+ withTable("t1", "t2") {
+ sql("create table t1(id int, v string) using parquet")
+ sql("create table t2(id int, v string) using parquet partitioned by (day string)")
+ sql("insert into t1 values (1, 'a')")
+ sql("alter table t2 add if not exists partition (day='2026-01-01')")
+
+ val q =
+ """
+ |select * from t1
+ |left join (select * from t2 where day='2026-01-01')
+ |""".stripMargin
+ compareResultsAgainstVanillaSpark(q, true, { _ => })
+ }
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]