This is an automated email from the ASF dual-hosted git repository.

zhangzc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 5045dc1c0c [GLUTEN-11451][CH] Fix CH BNLJ left outer build side validation (#11466)
5045dc1c0c is described below

commit 5045dc1c0c087cb7ef12c0aacb83bb99e77081f4
Author: zhanglistar <[email protected]>
AuthorDate: Tue Jan 27 09:06:44 2026 +0800

    [GLUTEN-11451][CH] Fix CH BNLJ left outer build side validation (#11466)
    
    * Fix CH BNLJ left outer build side validation
    
    * Fix CH BNLJ left outer build side validation
    
    * Fix UT.
    
    * Allow full outer BNLJ in CH
    
    Keep invalid build-side checks for left/right outer joins while permitting
    full outer in CH broadcast nested loop joins so tests don't fallback.
    
    * [CH] Guard BNLJ outer joins on empty build side
    
    Detect empty build-side scans and prevent BNLJ for outer joins when the build side has no partitions to avoid incorrect results.
    
    * [CH] Restore BNLJ join type validation
    
    Limit CH BNLJ validation to inner-like joins and keep the existing condition-based fallback behavior.
    
    * Fix CH BNLJ left outer build side validation
    
    * Fix UT.
    
    * Allow full outer BNLJ in CH
    
    Keep invalid build-side checks for left/right outer joins while permitting
    full outer in CH broadcast nested loop joins so tests don't fallback.
    
    * [CH] Guard BNLJ outer joins on empty build side
    
    Detect empty build-side scans and prevent BNLJ for outer joins when the build side has no partitions to avoid incorrect results.
    
    * [CH] Restore BNLJ join type validation
    
    Limit CH BNLJ validation to inner-like joins and keep the existing condition-based fallback behavior.
---
 .../CHBroadcastNestedLoopJoinExecTransformer.scala | 23 ++++++++++++++++++++--
 .../gluten/execution/CHFilterExecTransformer.scala |  4 ++--
 .../execution/GlutenClickHouseJoinSuite.scala      | 16 +++++++++++++++
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
index cd0319e847..5b4b8a3bd7 100644
--- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
+++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala
@@ -21,10 +21,10 @@ import org.apache.gluten.backendsapi.BackendsApiManager
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rpc.GlutenDriverEndpoint
 import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.catalyst.optimizer.{BuildRight, BuildSide}
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.{InnerLike, JoinType, LeftSemi}
-import org.apache.spark.sql.execution.{SparkPlan, SQLExecution}
+import org.apache.spark.sql.execution.{FileSourceScanExecShim, SparkPlan, SQLExecution}
 import org.apache.spark.sql.execution.joins.BuildSideRelation
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
@@ -110,7 +110,26 @@ case class CHBroadcastNestedLoopJoinExecTransformer(
     BackendsApiManager.getTransformerApiInstance.packPBMessage(message)
   }
 
+  private def isBuildSideEmpty: Boolean = buildPlan match {
+    case scan: FileSourceScanExecTransformerBase =>
+      scan.getPartitions.isEmpty
+    case scan: FileSourceScanExecShim =>
+      scan.getPartitionArray.isEmpty
+    case _ =>
+      false
+  }
+
   override def validateJoinTypeAndBuildSide(): ValidationResult = {
+    if (
+      isBuildSideEmpty && (
+        (joinType == LeftOuter && buildSide == BuildRight) ||
+          (joinType == RightOuter && buildSide == BuildLeft)
+      )
+    ) {
+      return ValidationResult.failed(
+        s"Broadcast Nested Loop join is not supported for $joinType when build side is empty")
+    }
+
     joinType match {
       case _: InnerLike =>
       case ExistenceJoin(_) =>
diff --git 
a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
 
b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
index 19b9238cbd..398fd30265 100644
--- 
a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
+++ 
b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHFilterExecTransformer.scala
@@ -56,7 +56,7 @@ case class FilterExecTransformer(condition: Expression, child: SparkPlan)
         case IsNotNull(a) => isNullIntolerant(a) && a.references.subsetOf(child.outputSet)
         case _ => false
       }
-      notNullPreds.flatMap(_.references).distinct.map(_.exprId)
-    case _ => notNullPreds.flatMap(_.references).distinct.map(_.exprId)
+      notNullPreds.flatMap(_.references).distinct.map(_.exprId).toList
+    case _ => notNullPreds.flatMap(_.references).distinct.map(_.exprId).toList
   }
 }
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
index eb5c35bdd7..22e13cb8be 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala
@@ -242,4 +242,20 @@ class GlutenClickHouseJoinSuite extends GlutenClickHouseWholeStageTransformerSui
     compareResultsAgainstVanillaSpark(leftSql2, true, { _ => })
   }
 
+  test("left join with empty partition on build side") {
+    withTable("t1", "t2") {
+      sql("create table t1(id int, v string) using parquet")
+      sql("create table t2(id int, v string) using parquet partitioned by (day string)")
+      sql("insert into t1 values (1, 'a')")
+      sql("alter table t2 add if not exists partition (day='2026-01-01')")
+
+      val q =
+        """
+          |select * from t1
+          |left join (select * from t2 where day='2026-01-01')
+          |""".stripMargin
+      compareResultsAgainstVanillaSpark(q, true, { _ => })
+    }
+  }
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to