(hive) branch master updated: HIVE-28911: Improve SEARCH expansion to exploit <> operator (#6503)

zabetak Thu, 11 Jun 2026 00:24:09 -0700

This is an automated email from the ASF dual-hosted git repository.

zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 45bdea4f8bb HIVE-28911: Improve SEARCH expansion to exploit <> operator (#6503)
45bdea4f8bb is described below

commit 45bdea4f8bb62873ab2b2f4a8f7afc4c780d4aa3
Author: Ruben Quesada Lopez <[email protected]>
AuthorDate: Thu Jun 11 08:23:57 2026 +0100

    HIVE-28911: Improve SEARCH expansion to exploit <> operator (#6503)
---
 ...elete_iceberg_copy_on_write_unpartitioned.q.out |  4 +-
 .../positive/llap/iceberg_bucket_map_join_7.q.out  | 80 +++++++++++-----------
 .../update_iceberg_copy_on_write_partitioned.q.out |  4 +-
 ...pdate_iceberg_copy_on_write_unpartitioned.q.out |  4 +-
 .../ql/optimizer/calcite/SearchTransformer.java    | 44 ++++++++----
 .../calcite/translator/ExprNodeConverter.java      |  7 ++
 .../hadoop/hive/ql/plan/ExprNodeDescUtils.java     | 28 +++++++-
 .../stats/TestFilterSelectivityEstimator.java      | 11 +++
 .../clientpositive/llap/folder_predicate.q.out     |  4 +-
 .../llap/orc_predicate_pushdown.q.out              |  6 +-
 .../llap/parquet_predicate_pushdown.q.out          |  6 +-
 .../clientpositive/llap/vector_between_in.q.out    | 12 ++--
 12 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out
index 707be189e49..9a5350c2e0f 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out
@@ -48,10 +48,10 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: tbl_ice
-                  filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or 
(((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22)) 
or (b) IN ('four', 'one') or (a = 22)) (type: boolean)
+                  filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or 
((b <> 'four') and (b <> 'one') and (a <> 22)) or (b) IN ('four', 'one') or (a 
= 22)) (type: boolean)
                   Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Filter Operator
-                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 
22))) and FILE__PATH is not null) (type: boolean)
+                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) 
(type: boolean)
                     Statistics: Num rows: 7 Data size: 672 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: a (type: int), b (type: string), c (type: 
int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), 
FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION 
(type: string)
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out
index e32e34094e8..6701fbaf410 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out
@@ -150,27 +150,27 @@ Stage-0
       File Output Operator [FS_61]
         Limit [LIM_60] (rows=20 width=447)
           Number of rows:20
-          Select Operator [SEL_59] (rows=473 width=447)
+          Select Operator [SEL_59] (rows=791 width=447)
             Output:["_col0","_col1","_col2","_col3","_col4"]
           <-Map 1 [SIMPLE_EDGE] vectorized, llap
             SHUFFLE [RS_58]
-              Top N Key Operator [TNK_57] (rows=473 width=447)
+              Top N Key Operator [TNK_57] (rows=791 width=447)
                 keys:_col0,top n:20
-                Map Join Operator [MAPJOIN_56] (rows=473 width=447)
+                Map Join Operator [MAPJOIN_56] (rows=791 width=447)
                   BucketMapJoin:true,Conds:SEL_55._col0, _col1=RS_53._col0, 
_col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
                 <-Map 3 [CUSTOM_EDGE] vectorized, llap
                   MULTICAST [RS_53]
                     PartitionCols:_col0, _col1
-                    Select Operator [SEL_52] (rows=387 width=178)
+                    Select Operator [SEL_52] (rows=500 width=178)
                       Output:["_col0","_col1"]
-                      Filter Operator [FIL_51] (rows=387 width=178)
-                        predicate:(((key < '0') or ((key > '0') and (key < 
'100')) or (key > '100')) and value is not null)
+                      Filter Operator [FIL_51] (rows=500 width=178)
+                        predicate:((key <> '0') and (key <> '100') and value 
is not null)
                         TableScan [TS_3] (rows=500 width=178)
                           
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
-                <-Select Operator [SEL_55] (rows=387 width=269)
+                <-Select Operator [SEL_55] (rows=500 width=269)
                     Output:["_col0","_col1","_col2"]
-                    Filter Operator [FIL_54] (rows=387 width=269)
-                      predicate:(((key1 < '0') or ((key1 > '0') and (key1 < 
'100')) or (key1 > '100')) and key2 is not null)
+                    Filter Operator [FIL_54] (rows=500 width=269)
+                      predicate:((key1 <> '0') and (key1 <> '100') and key2 is 
not null)
                       TableScan [TS_0] (rows=500 width=269)
                         
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num 
Buckets:8,Grouping Partition 
Columns:["key1","key2"],Output:["key1","key2","value"]
 
@@ -346,27 +346,27 @@ Stage-0
       File Output Operator [FS_41]
         Limit [LIM_40] (rows=20 width=447)
           Number of rows:20
-          Select Operator [SEL_39] (rows=473 width=447)
+          Select Operator [SEL_39] (rows=791 width=447)
             Output:["_col0","_col1","_col2","_col3","_col4"]
           <-Map 1 [SIMPLE_EDGE] vectorized, llap
             SHUFFLE [RS_38]
-              Top N Key Operator [TNK_37] (rows=473 width=447)
+              Top N Key Operator [TNK_37] (rows=791 width=447)
                 keys:_col0,top n:20
-                Map Join Operator [MAPJOIN_36] (rows=473 width=447)
+                Map Join Operator [MAPJOIN_36] (rows=791 width=447)
                   
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
                 <-Map 3 [CUSTOM_EDGE] vectorized, llap
                   MULTICAST [RS_33]
                     PartitionCols:_col0
-                    Select Operator [SEL_32] (rows=387 width=178)
+                    Select Operator [SEL_32] (rows=500 width=178)
                       Output:["_col0","_col1"]
-                      Filter Operator [FIL_31] (rows=387 width=178)
-                        predicate:((key < '0') or (key > '100') or ((key > 
'0') and (key < '100')))
+                      Filter Operator [FIL_31] (rows=500 width=178)
+                        predicate:((key <> '0') and (key <> '100'))
                         TableScan [TS_3] (rows=500 width=178)
                           
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
-                <-Select Operator [SEL_35] (rows=387 width=269)
+                <-Select Operator [SEL_35] (rows=500 width=269)
                     Output:["_col0","_col1","_col2"]
-                    Filter Operator [FIL_34] (rows=387 width=269)
-                      predicate:((key1 < '0') or (key1 > '100') or ((key1 > 
'0') and (key1 < '100')))
+                    Filter Operator [FIL_34] (rows=500 width=269)
+                      predicate:((key1 <> '0') and (key1 <> '100'))
                       TableScan [TS_0] (rows=500 width=269)
                         
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num 
Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]
 
@@ -435,40 +435,40 @@ POSTHOOK: Input: default@srcbucket_big
 Plan optimized by CBO.
 
 Vertex dependency in root stage
-Map 2 <- Map 1 (BROADCAST_EDGE)
-Reducer 3 <- Map 2 (SIMPLE_EDGE)
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
 
 Stage-0
   Fetch Operator
     limit:20
     Stage-1
-      Reducer 3 vectorized, llap
+      Reducer 2 vectorized, llap
       File Output Operator [FS_41]
         Limit [LIM_40] (rows=20 width=447)
           Number of rows:20
-          Select Operator [SEL_39] (rows=612 width=447)
+          Select Operator [SEL_39] (rows=791 width=447)
             Output:["_col0","_col1","_col2","_col3","_col4"]
-          <-Map 2 [SIMPLE_EDGE] vectorized, llap
+          <-Map 1 [SIMPLE_EDGE] vectorized, llap
             SHUFFLE [RS_38]
-              Top N Key Operator [TNK_37] (rows=612 width=447)
+              Top N Key Operator [TNK_37] (rows=791 width=447)
                 keys:_col0,top n:20
-                Map Join Operator [MAPJOIN_36] (rows=612 width=447)
-                  
Conds:RS_33._col0=SEL_35._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
-                <-Map 1 [BROADCAST_EDGE] vectorized, llap
-                  BROADCAST [RS_33]
+                Map Join Operator [MAPJOIN_36] (rows=791 width=447)
+                  
BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
+                <-Map 3 [CUSTOM_EDGE] vectorized, llap
+                  MULTICAST [RS_33]
                     PartitionCols:_col0
-                    Select Operator [SEL_32] (rows=387 width=269)
-                      Output:["_col0","_col1","_col2"]
-                      Filter Operator [FIL_31] (rows=387 width=269)
-                        predicate:(((key2 < 'val_0') or ((key2 > 'val_0') and 
(key2 < 'val_100')) or (key2 > 'val_100')) and key1 is not null)
-                        TableScan [TS_0] (rows=500 width=269)
-                          
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"]
-                <-Select Operator [SEL_35] (rows=500 width=178)
-                    Output:["_col0","_col1"]
-                    Filter Operator [FIL_34] (rows=500 width=178)
-                      predicate:key is not null
-                      TableScan [TS_3] (rows=500 width=178)
-                        
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
+                    Select Operator [SEL_32] (rows=500 width=178)
+                      Output:["_col0","_col1"]
+                      Filter Operator [FIL_31] (rows=500 width=178)
+                        predicate:key is not null
+                        TableScan [TS_3] (rows=500 width=178)
+                          
default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
+                <-Select Operator [SEL_35] (rows=500 width=269)
+                    Output:["_col0","_col1","_col2"]
+                    Filter Operator [FIL_34] (rows=500 width=269)
+                      predicate:((key2 <> 'val_0') and (key2 <> 'val_100') and 
key1 is not null)
+                      TableScan [TS_0] (rows=500 width=269)
+                        
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num 
Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"]
 
 PREHOOK: query: SELECT *
 FROM srcbucket_big a
diff --git a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out
index 5d4e328faf2..d0ba154e146 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out
@@ -71,10 +71,10 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: tbl_ice
-                  filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or 
(((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 
22))) (type: boolean)
+                  filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or 
((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean)
                   Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE 
Column stats: PARTIAL
                   Filter Operator
-                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 
22))) and FILE__PATH is not null) (type: boolean)
+                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) 
(type: boolean)
                     Statistics: Num rows: 1 Data size: 84 Basic stats: 
COMPLETE Column stats: PARTIAL
                     Select Operator
                       expressions: a (type: int), b (type: string), c (type: 
int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), 
FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION 
(type: string)
diff --git a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out
index 6a149603f73..150fa60ce16 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out
@@ -71,7 +71,7 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: tbl_ice
-                  filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN 
('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b 
< 'one')) or (b > 'one')) and (a <> 22))) (type: boolean)
+                  filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN 
('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a 
<> 22))) (type: boolean)
                   Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Filter Operator
                     predicate: ((a = 22) or (b) IN ('four', 'one')) (type: 
boolean)
@@ -93,7 +93,7 @@ STAGE PLANS:
                       Map-reduce partition columns: FILE__PATH (type: string)
                       Statistics: Num rows: 4 Data size: 368 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Filter Operator
-                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 
22))) and FILE__PATH is not null) (type: boolean)
+                    predicate: ((((b) IN ('four', 'one') or (a = 22)) is null 
or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) 
(type: boolean)
                     Statistics: Num rows: 7 Data size: 672 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: a (type: int), b (type: string), c (type: 
int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), 
FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION 
(type: string)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java
index 8ea25a91a0b..565479734b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java
@@ -72,30 +72,44 @@ public SearchTransformer(RexBuilder rexBuilder, RexCall search, final RexUnknown
     this.unknownContext = unknownContext;
   }
 
+  /**
+   * Transforms the SEARCH expression into an equivalent RexNode expression.
+   * Warning: when called from a shuttle, callers of this method should consider flattening AND/OR expressions
+   * afterward, to get the same result as applying {@link SearchTransformer.Shuttle}.
+   */
   public RexNode transform() {
     PerfLogger perfLogger = SessionState.getPerfLogger();
     perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.SEARCH_TRANSFORMER);
 
-    RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
-    RangeSets.forEach(sarg.rangeSet, consumer);
-
     List<RexNode> orList = new ArrayList<>();
     if (sarg.nullAs == RexUnknownAs.TRUE && unknownContext != RexUnknownAs.TRUE) {
       orList.add(rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref));
     }
-    switch (consumer.inLiterals.size()) {
-    case 0:
-      break;
-    case 1:
-      orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
-      break;
-    default:
-      List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
-      operands.add(ref);
-      operands.addAll(consumer.inLiterals);
-      orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));
+
+    if (sarg.isComplementedPoints()) {
+      // Generate 'ref <> value1 AND ... AND ref <> valueN'
+      List<RexNode> list = sarg.rangeSet.complement().asRanges().stream().map(
+          range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref,
+              rexBuilder.makeLiteral(range.lowerEndpoint(), operandType, true, true))).toList();
+      orList.add(RexUtil.composeConjunction(rexBuilder, list));
+    } else {
+      RangeConverter<C> consumer = new RangeConverter<>(rexBuilder, operandType, ref);
+      RangeSets.forEach(sarg.rangeSet, consumer);
+
+      switch (consumer.inLiterals.size()) {
+      case 0:
+        break;
+      case 1:
+        orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0)));
+        break;
+      default:
+        List<RexNode> operands = new ArrayList<>(consumer.inLiterals.size() + 1);
+        operands.add(ref);
+        operands.addAll(consumer.inLiterals);
+        orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands));
+      }
+      orList.addAll(consumer.nodes);
     }
-    orList.addAll(consumer.nodes);
     RexNode x = RexUtil.composeDisjunction(rexBuilder, orList);
 
     if (sarg.nullAs == RexUnknownAs.FALSE && unknownContext != RexUnknownAs.FALSE) {
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java
index b582c62997e..2098f29a7a6 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/ExprNodeConverter.java
@@ -82,6 +82,7 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
 import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
@@ -214,6 +215,12 @@ public ExprNodeDesc visitCall(RexCall call) {
         && SqlTypeUtil.equalSansNullability(dTFactory, call.getType(),
             call.operands.get(0).getType())) {
       return args.get(0);
+    } else if (call.isA(SqlKind.AND)) {
+      // Make sure AND is flattened (we may have nested ANDs due to SearchTransformer conversion above)
+      return ExprNodeDescUtils.and(args);
+    }  else if (call.isA(SqlKind.OR)) {
+      // Make sure OR is flattened (we may have nested ORs due to SearchTransformer conversion above)
+      return ExprNodeDescUtils.or(args);
     } else {
       GenericUDF hiveUdf = SqlFunctionConverter.getHiveUDF(call.getOperator(), call.getType(),
           args.size());
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
index 541ce20f518..34d5f0ba0b3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
@@ -64,6 +64,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.function.Predicate;
 
 
 public class ExprNodeDescUtils {
@@ -243,6 +244,21 @@ public static ExprNodeGenericFuncDesc and(List<ExprNodeDesc> exps) {
     return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPAnd(), "and", flatExps);
   }
 
+  /**
+   * Creates a disjunction (OR) of the given expressions flattening nested disjunctions if possible.
+   * <pre>
+   * Input: AND(A, B), C, OR(D, OR(E, F))
+   * Output: OR(AND(A, B), C, D, E, F)
+   * </pre>
+   */
+  public static ExprNodeGenericFuncDesc or(List<ExprNodeDesc> exps) {
+    List<ExprNodeDesc> flatExps = new ArrayList<>();
+    for (ExprNodeDesc e : exps) {
+      split(e, flatExps, FunctionRegistry::isOpOr);
+    }
+    return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPOr(), "or", flatExps);
+  }
+
   /**
    * Create an expression for computing a murmur hash by recursively hashing given expressions by two:
    * <pre>
@@ -305,9 +321,17 @@ public static List<ExprNodeDesc> split(ExprNodeDesc current) {
    * split predicates by AND op
    */
   public static List<ExprNodeDesc> split(ExprNodeDesc current, List<ExprNodeDesc> splitted) {
-    if (FunctionRegistry.isOpAnd(current)) {
+    return split(current, splitted, FunctionRegistry::isOpAnd);
+  }
+
+  /**
+   * split predicates by a certain condition
+   */
+  private static List<ExprNodeDesc> split(ExprNodeDesc current, List<ExprNodeDesc> splitted,
+      Predicate<ExprNodeDesc> condition) {
+    if (condition.test(current)) {
       for (ExprNodeDesc child : current.getChildren()) {
-        split(child, splitted);
+        split(child, splitted, condition);
       }
       return splitted;
     }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
index 39c6ca8f80c..56e294a3fd0 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java
@@ -371,6 +371,17 @@ public void testBetweenSelectivityLeftEqualsRight_KO() {
     betweenSelectivity(KLL, 2, 2);
   }
 
+  @Test
+  public void testComputeNotEqualsPredicateSelectivity() {
+    RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND,
+        REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int3),
+        REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int7));
+    filter = simplify(filter);
+    Assert.assertEquals(SqlKind.SEARCH, filter.getKind());
+    FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq);
+    Assert.assertEquals(0.8095238095238095, estimator.estimateSelectivity(filter), DELTA);
+  }
+
   @Test
   public void testComputeRangePredicateSelectivityWhenNoStats() {
     RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, int3);
diff --git a/ql/src/test/results/clientpositive/llap/folder_predicate.q.out b/ql/src/test/results/clientpositive/llap/folder_predicate.q.out
index f8b2ef3663e..1e67ce4271a 100644
--- a/ql/src/test/results/clientpositive/llap/folder_predicate.q.out
+++ b/ql/src/test/results/clientpositive/llap/folder_predicate.q.out
@@ -41,9 +41,9 @@ STAGE PLANS:
       Processor Tree:
         TableScan
           alias: predicate_fold_tb
-          filterExpr: (value is null or (value < 3) or (value > 3)) (type: 
boolean)
+          filterExpr: ((value <> 3) or value is null) (type: boolean)
           Filter Operator
-            predicate: (value is null or (value < 3) or (value > 3)) (type: 
boolean)
+            predicate: ((value <> 3) or value is null) (type: boolean)
             Select Operator
               expressions: value (type: int)
               outputColumnNames: _col0
diff --git a/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out b/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out
index dcc7c103b77..cb2d50d7366 100644
--- a/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out
@@ -627,7 +627,7 @@ STAGE PLANS:
                   alias: orc_pred
                   Statistics: Num rows: 1049 Data size: 105941 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Filter Operator
-                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                     Statistics: Num rows: 262 Data size: 26462 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: t (type: tinyint), s (type: string)
@@ -695,10 +695,10 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: orc_pred
-                  filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                  filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                   Statistics: Num rows: 1049 Data size: 105941 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Filter Operator
-                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                     Statistics: Num rows: 262 Data size: 26462 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: t (type: tinyint), s (type: string)
diff --git a/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out b/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out
index d7a825b592a..4858f10aa63 100644
--- a/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out
@@ -561,7 +561,7 @@ STAGE PLANS:
                   alias: tbl_pred
                   Statistics: Num rows: 1049 Data size: 105941 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Filter Operator
-                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                     Statistics: Num rows: 262 Data size: 26462 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: t (type: tinyint), s (type: string)
@@ -629,10 +629,10 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: tbl_pred
-                  filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                  filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                   Statistics: Num rows: 1049 Data size: 105941 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Filter Operator
-                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < 
-3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) 
and (s like 'bob%') and s is not null) (type: boolean)
+                    predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 
'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: 
boolean)
                     Statistics: Num rows: 262 Data size: 26462 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       expressions: t (type: tinyint), s (type: string)
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
index 23e8a82b7a2..1edc82eeede 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
@@ -153,7 +153,7 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: decimal_date_test
-                  filterExpr: ((cdate < DATE'1969-07-14') or (cdate > 
DATE'1970-01-21') or ((cdate > DATE'1969-07-14') and (cdate < 
DATE'1969-10-26')) or ((cdate > DATE'1969-10-26') and (cdate < 
DATE'1970-01-21'))) (type: boolean)
+                  filterExpr: ((cdate <> DATE'1969-07-14') and (cdate <> 
DATE'1969-10-26') and (cdate <> DATE'1970-01-21')) (type: boolean)
                   Statistics: Num rows: 12289 Data size: 339304 Basic stats: 
COMPLETE Column stats: COMPLETE
                   TableScan Vectorization:
                       native: true
@@ -161,8 +161,8 @@ STAGE PLANS:
                     Filter Vectorization:
                         className: VectorFilterOperator
                         native: true
-                        predicateExpression: FilterExprOrExpr(children: 
FilterDateColLessDateScalar(col 3:date, val -171), 
FilterDateColGreaterDateScalar(col 3:date, val 20), FilterExprAndExpr(children: 
FilterDateColGreaterDateScalar(col 3:date, val -171), 
FilterDateColLessDateScalar(col 3:date, val -67)), FilterExprAndExpr(children: 
FilterDateColGreaterDateScalar(col 3:date, val -67), 
FilterDateColLessDateScalar(col 3:date, val 20)))
-                    predicate: ((cdate < DATE'1969-07-14') or (cdate > 
DATE'1970-01-21') or ((cdate > DATE'1969-07-14') and (cdate < 
DATE'1969-10-26')) or ((cdate > DATE'1969-10-26') and (cdate < 
DATE'1970-01-21'))) (type: boolean)
+                        predicateExpression: FilterExprAndExpr(children: 
FilterDateColNotEqualDateScalar(col 3:date, val -171), 
FilterDateColNotEqualDateScalar(col 3:date, val -67), 
FilterDateColNotEqualDateScalar(col 3:date, val 20))
+                    predicate: ((cdate <> DATE'1969-07-14') and (cdate <> 
DATE'1969-10-26') and (cdate <> DATE'1970-01-21')) (type: boolean)
                     Statistics: Num rows: 12289 Data size: 339304 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
                       Select Vectorization:
@@ -370,7 +370,7 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: decimal_date_test
-                  filterExpr: ((cdecimal1 < -3367.6517567568) or (cdecimal1 > 
2365.8945945946) or ((cdecimal1 > -3367.6517567568) and (cdecimal1 < 
881.0135135135)) or ((cdecimal1 > 881.0135135135) and (cdecimal1 < 
2365.8945945946))) (type: boolean)
+                  filterExpr: ((cdecimal1 <> -3367.6517567568) and (cdecimal1 
<> 881.0135135135) and (cdecimal1 <> 2365.8945945946)) (type: boolean)
                   Statistics: Num rows: 12289 Data size: 1027600 Basic stats: 
COMPLETE Column stats: COMPLETE
                   TableScan Vectorization:
                       native: true
@@ -378,8 +378,8 @@ STAGE PLANS:
                     Filter Vectorization:
                         className: VectorFilterOperator
                         native: true
-                        predicateExpression: FilterExprOrExpr(children: 
FilterDecimalColLessDecimalScalar(col 1:decimal(20,10), val -3367.6517567568), 
FilterDecimalColGreaterDecimalScalar(col 1:decimal(20,10), val 
2365.8945945946), FilterExprAndExpr(children: 
FilterDecimalColGreaterDecimalScalar(col 1:decimal(20,10), val 
-3367.6517567568), FilterDecimalColLessDecimalScalar(col 1:decimal(20,10), val 
881.0135135135)), FilterExprAndExpr(children: 
FilterDecimalColGreaterDecimalScalar(col 1:d [...]
-                    predicate: ((cdecimal1 < -3367.6517567568) or (cdecimal1 > 
2365.8945945946) or ((cdecimal1 > -3367.6517567568) and (cdecimal1 < 
881.0135135135)) or ((cdecimal1 > 881.0135135135) and (cdecimal1 < 
2365.8945945946))) (type: boolean)
+                        predicateExpression: FilterExprAndExpr(children: 
FilterDecimalColNotEqualDecimalScalar(col 1:decimal(20,10), val 
-3367.6517567568), FilterDecimalColNotEqualDecimalScalar(col 1:decimal(20,10), 
val 881.0135135135), FilterDecimalColNotEqualDecimalScalar(col 
1:decimal(20,10), val 2365.8945945946))
+                    predicate: ((cdecimal1 <> -3367.6517567568) and (cdecimal1 
<> 881.0135135135) and (cdecimal1 <> 2365.8945945946)) (type: boolean)
                     Statistics: Num rows: 12289 Data size: 1027600 Basic 
stats: COMPLETE Column stats: COMPLETE
                     Select Operator
                       Select Vectorization:

(hive) branch master updated: HIVE-28911: Improve SEARCH expansion to exploit <> operator (#6503)

Reply via email to