wangyum commented on a change in pull request #28575:
URL: https://github.com/apache/spark/pull/28575#discussion_r434623988



##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -1372,6 +1376,108 @@ object PushPredicateThroughJoin extends 
Rule[LogicalPlan] with PredicateHelper {
   }
 }
 
+/**
+ * Rewrites the join condition into conjunctive normal form (CNF) so that more
+ * predicates can be pushed down.
+ */
+object PushPredicateThroughJoinByCNF extends Rule[LogicalPlan] with 
PredicateHelper {
+  /**
+   * Rewrite pattern:
+   * 1. (a && b) || c --> (a || c) && (b || c)
+   * 2. a || (b && c) --> (a || b) && (a || c)
+   *
+   * To avoid generating too many predicates, we first group the filter predicates
+   * that reference the same table.
+   */
+  private def toCNF(condition: Expression, depth: Int = 0): Expression = {
+    if (depth < SQLConf.get.maxRewritingCNFDepth) {
+      condition match {
+        case or @ Or(left: And, right: And) =>
+          val lhs = 
splitConjunctivePredicates(left).groupBy(_.references.map(_.qualifier))
+          val rhs = 
splitConjunctivePredicates(right).groupBy(_.references.map(_.qualifier))
+          if (lhs.size > 1) {
+            lhs.values.map(_.reduceLeft(And)).map { c =>

Review comment:
       ```
   07:35:05.863 WARN org.apache.spark.sql.TPCDSQuerySuite: 
   === Metrics of Analyzer/Optimizer Rules ===
   Total number of runs: 7249
   Total time: 1.949092121 seconds
   
   Rule                                                                         
     Effective Time / Total Time                     Effective Runs / Total 
Runs                    
   
   org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog                 
     151465071 / 249555919                           24 / 59                    
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions    
     138746642 / 168406459                           1 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences            
     78878999 / 132411189                            3 / 59                     
                    
   org.apache.spark.sql.execution.datasources.FindDataSourceTable               
     95372289 / 99326980                             1 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.DecimalPrecision                      
     56750800 / 66170980                             2 / 59                     
                    
   org.apache.spark.sql.execution.datasources.PreprocessTableCreation           
     0 / 48910600                                    0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$ImplicitTypeCasts        
     20820860 / 44731509                             1 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ColumnPruning                        
     12919681 / 44543112                             2 / 105                    
                    
   org.apache.spark.sql.catalyst.optimizer.BooleanSimplification                
     24718048 / 43686766                             1 / 55                     
                    
   org.apache.spark.sql.execution.datasources.SchemaPruning                     
     0 / 32795196                                    0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveCatalogs                       
     0 / 30645089                                    0 / 59                     
                    
   
org.apache.spark.sql.catalyst.analysis.TypeCoercion$FunctionArgumentConversion  
  17902985 / 27578796                             2 / 59                        
                 
   org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator             
     0 / 26144312                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushDownPredicates                   
     17354780 / 25365681                             5 / 80                     
                    
   org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints          
     22137478 / 24736530                             1 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.RemoveRedundantAliases               
     0 / 24508932                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions             
     10536555 / 24417169                             2 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReorderJoin                          
     17311087 / 22391786                             1 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveMissingReferences     
     0 / 21142565                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveBinaryArithmetic      
     0 / 20704002                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations             
     15024644 / 20411277                             1 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases               
     15859386 / 18096012                             1 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReorderAssociativeOperator           
     0 / 17026758                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability            
     0 / 16870561                                    0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.PruneFilters                         
     0 / 15220754                                    0 / 80                     
                    
   org.apache.spark.sql.catalyst.optimizer.CollapseProject                      
     8979563 / 13211028                              1 / 80                     
                    
   org.apache.spark.sql.catalyst.optimizer.LikeSimplification                   
     0 / 12837312                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.ConstantFolding                      
     7099416 / 12652039                              1 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.TimeWindowing                         
     0 / 12566615                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ComputeCurrentTime                   
     0 / 12279377                                    0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.FoldablePropagation                  
     0 / 12158420                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveTimeZone                       
     7331904 / 11661433                              5 / 59                     
                    
   org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown         
     0 / 11651482                                    0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$HandleNullInputsForUDF       
     0 / 11312278                                    0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$PullOutNondeterministic      
     0 / 11111621                                    0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings           
     0 / 11032240                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.OptimizeIn                           
     0 / 10792662                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps              
     0 / 10435223                                    0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics     
     0 / 10147394                                    0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions              
     0 / 9906569                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.SimplifyBinaryComparison             
     0 / 9434962                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRandomSeed            
     0 / 8745520                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.NullPropagation                      
     0 / 8673323                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveLambdaVariables                
     0 / 8618322                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.DecimalAggregates                    
     1133548 / 8344365                               1 / 26                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$EltCoercion              
     0 / 8306604                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveInsertInto            
     0 / 8272186                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint                
     0 / 8269481                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality          
     0 / 8164425                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$InConversion             
     0 / 7999784                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$WindowFrameCoercion      
     0 / 7744630                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateSorts                       
     0 / 7664352                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNamespace             
     0 / 7333421                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery      
     0 / 7288581                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReassignLambdaVariableID             
     0 / 7094275                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers             
     0 / 6952061                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.CleanupAliases                        
     1507275 / 6818821                               1 / 29                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$DateTimeOperations       
     0 / 6621755                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.SimplifyConditionals                 
     0 / 6594834                                     0 / 55                     
                    
   org.apache.spark.sql.execution.dynamicpruning.PartitionPruning               
     0 / 6435655                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.SimplifyCasts                        
     1726589 / 6431193                               1 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$IfCoercion               
     0 / 6390305                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$GlobalAggregates             
     0 / 6364189                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$ConcatCoercion           
     0 / 6344767                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushLeftSemiLeftAntiThroughJoin      
     0 / 6300823                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubquery              
     0 / 6237735                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions    
     0 / 6221240                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$StringLiteralCoercion    
     0 / 6192720                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.CTESubstitution                       
     0 / 6150743                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.GetCurrentDatabase                   
     0 / 6148608                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveCreateNamedStruct              
     0 / 6101190                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceNullWithFalseInPredicate      
     0 / 6100257                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.ConstantPropagation                  
     0 / 6087339                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveHigherOrderFunctions           
     0 / 6067619                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractWindowExpressions     
     0 / 6061035                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast                
     0 / 6007788                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$Division                 
     0 / 5925219                                     0 / 59                     
                    
   org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions         
     0 / 5824041                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolvePivot                 
     0 / 5559831                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveJoinStrategyHints 
     0 / 5524307                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.RemoveDispensableExpressions         
     0 / 5510061                                     0 / 55                     
                    
   org.apache.spark.sql.execution.datasources.DataSourceAnalysis                
     5412159 / 5475347                               24 / 28                    
                    
   org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery             
     2055027 / 5324950                               1 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughJoinByCNF        
     0 / 5285288                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.RemoveNoopOperators                  
     0 / 4998930                                     0 / 105                    
                    
   
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOrdinalInOrderByAndGroupBy
 0 / 4914629                                     0 / 59                         
                
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder           
     0 / 4899787                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals          
     0 / 4897924                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushDownLeftSemiAntiJoin             
     0 / 4835640                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.PropagateEmptyRelation               
     0 / 4795908                                     0 / 50                     
                    
   org.apache.spark.sql.catalyst.optimizer.ObjectSerializerPruning              
     0 / 4763236                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveTableValuedFunctions           
     0 / 4607355                                     0 / 59                     
                    
   org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin              
     0 / 4562616                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$CaseWhenCoercion         
     0 / 4517751                                     0 / 59                     
                    
   org.apache.spark.sql.execution.datasources.FallBackFileSourceV2              
     0 / 4397375                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame           
     0 / 4313997                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$MapZipWithCoercion       
     0 / 4309833                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.RewriteNonCorrelatedExists           
     0 / 4253433                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$StackCoercion            
     0 / 4208225                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer          
     0 / 4145215                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggAliasInGroupBy     
     0 / 4095335                                     0 / 59                     
                    
   org.apache.spark.sql.execution.python.ExtractPythonUDFs                      
     0 / 4001274                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.PullupCorrelatedPredicates           
     0 / 3920586                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.CollapseWindow                       
     0 / 3888524                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.RewriteDistinctAggregates            
     0 / 3803019                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.CollapseRepartition                  
     0 / 3755697                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.CombineUnions                        
     0 / 3704733                                     0 / 80                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNewInstance           
     0 / 3695526                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases              
     1421178 / 3635064                               1 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateSerialization               
     0 / 3550180                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.Optimizer$OptimizeSubqueries         
     0 / 3536272                                     0 / 50                     
                    
   org.apache.spark.sql.catalyst.optimizer.CombineLimits                        
     0 / 3504846                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.TransposeWindow                      
     0 / 3499057                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushProjectionThroughUnion           
     0 / 3355877                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin                   
     0 / 3233735                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.LimitPushDown                        
     0 / 3208991                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.OptimizeLimitZero                    
     0 / 3190070                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables                
     0 / 3094747                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.RewriteIntersectAll                  
     0 / 3085769                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution          
     0 / 2979758                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAlterTableChanges     
     0 / 2948089                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.ExtractPythonUDFFromJoinCondition    
     0 / 2915414                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.EliminateView                         
     0 / 2805178                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveCoalesceHints     
     0 / 2710440                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceExpressions                   
     0 / 2535170                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateMapObjects                  
     0 / 2520470                                     0 / 25                     
                    
   org.apache.spark.sql.execution.dynamicpruning.CleanupDynamicPruningFilters   
     0 / 2519862                                     0 / 25                     
                    
   org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate          
     0 / 2495766                                     0 / 25                     
                    
   org.apache.spark.sql.execution.python.ExtractGroupingPythonUDFFromAggregate  
     0 / 2417625                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.RewriteExceptAll                     
     0 / 2301441                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubqueryColumnAliases 
     0 / 2277596                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.CombineTypedFilters                  
     0 / 2267944                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter              
     0 / 2252560                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.CombineFilters                       
     0 / 2236246                                     0 / 55                     
                    
   org.apache.spark.sql.execution.datasources.ResolveSQLOnFile                  
     0 / 2214162                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.TypeCoercion$WidenSetOperationTypes   
     0 / 2164273                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.RemoveLiteralFromGroupExpressions    
     0 / 2092068                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin         
     0 / 1952290                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceDeduplicateWithAggregate      
     0 / 1927408                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate              
     0 / 1915572                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNaturalAndUsingJoin   
     0 / 1910467                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation        
     0 / 1839082                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithAntiJoin            
     0 / 1651671                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveInlineTables                   
     0 / 1536486                                     0 / 59                     
                    
   org.apache.spark.sql.catalyst.optimizer.RemoveRepetitionFromGroupExpressions 
     0 / 1474769                                     0 / 25                     
                    
   org.apache.spark.sql.execution.datasources.PreprocessTableInsertion          
     0 / 1406859                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.analysis.EliminateUnions                       
     0 / 1327002                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.CombineConcats                       
     0 / 1217837                                     0 / 55                     
                    
   org.apache.spark.sql.catalyst.optimizer.ReplaceDistinctWithAggregate         
     0 / 1201090                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.ResolveHints$RemoveAllHints           
     0 / 1199850                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.EliminateDistinct                    
     0 / 1098290                                     0 / 25                     
                    
   org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences                 
     0 / 1028033                                     0 / 28                     
                    
   org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts               
     0 / 703053                                      0 / 50                     
                    
   org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughNonJoin          
     0 / 617000                                      0 / 25                     
                    
   org.apache.spark.sql.execution.OptimizeMetadataOnlyQuery                     
     0 / 371820                                      0 / 25                     
                    
   org.apache.spark.sql.catalyst.optimizer.CostBasedJoinReorder                 
     0 / 309745                                      0 / 25                     
                    
   
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to