wangyum commented on a change in pull request #28575: URL: https://github.com/apache/spark/pull/28575#discussion_r434623988
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -1372,6 +1376,108 @@ object PushPredicateThroughJoin extends
Rule[LogicalPlan] with PredicateHelper {
}
}
+/**
+ * Rewriting join condition to conjunctive normal form expression so that we can push
+ * more predicate.
+ */
+object PushPredicateThroughJoinByCNF extends Rule[LogicalPlan] with
PredicateHelper {
+ /**
+ * Rewrite pattern:
+ * 1. (a && b) || c --> (a || c) && (b || c)
+ * 2. a || (b && c) --> (a || b) && (a || c)
+ *
+ * To avoid generating too many predicates, we first group the filter columns from the same table.
+ */
+ private def toCNF(condition: Expression, depth: Int = 0): Expression = {
+ if (depth < SQLConf.get.maxRewritingCNFDepth) {
+ condition match {
+ case or @ Or(left: And, right: And) =>
+ val lhs =
splitConjunctivePredicates(left).groupBy(_.references.map(_.qualifier))
+ val rhs =
splitConjunctivePredicates(right).groupBy(_.references.map(_.qualifier))
+ if (lhs.size > 1) {
+ lhs.values.map(_.reduceLeft(And)).map { c =>
Review comment:
```
07:35:05.863 WARN org.apache.spark.sql.TPCDSQuerySuite:
=== Metrics of Analyzer/Optimizer Rules ===
Total number of runs: 7249
Total time: 1.949092121 seconds
Rule
Effective Time / Total Time Effective Runs / Total Runs
org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog
151465071 / 249555919 24 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions
138746642 / 168406459 1 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences
78878999 / 132411189 3 / 59
org.apache.spark.sql.execution.datasources.FindDataSourceTable
95372289 / 99326980 1 / 59
org.apache.spark.sql.catalyst.analysis.DecimalPrecision
56750800 / 66170980 2 / 59
org.apache.spark.sql.execution.datasources.PreprocessTableCreation
0 / 48910600 0 / 28
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ImplicitTypeCasts
20820860 / 44731509 1 / 59
org.apache.spark.sql.catalyst.optimizer.ColumnPruning
12919681 / 44543112 2 / 105
org.apache.spark.sql.catalyst.optimizer.BooleanSimplification
24718048 / 43686766 1 / 55
org.apache.spark.sql.execution.datasources.SchemaPruning
0 / 32795196 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveCatalogs
0 / 30645089 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$FunctionArgumentConversion
17902985 / 27578796 2 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator
0 / 26144312 0 / 59
org.apache.spark.sql.catalyst.optimizer.PushDownPredicates
17354780 / 25365681 5 / 80
org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints
22137478 / 24736530 1 / 25
org.apache.spark.sql.catalyst.optimizer.RemoveRedundantAliases
0 / 24508932 0 / 55
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions
10536555 / 24417169 2 / 59
org.apache.spark.sql.catalyst.optimizer.ReorderJoin
17311087 / 22391786 1 / 55
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveMissingReferences
0 / 21142565 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveBinaryArithmetic
0 / 20704002 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations
15024644 / 20411277 1 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases
15859386 / 18096012 1 / 59
org.apache.spark.sql.catalyst.optimizer.ReorderAssociativeOperator
0 / 17026758 0 / 55
org.apache.spark.sql.catalyst.analysis.UpdateAttributeNullability
0 / 16870561 0 / 28
org.apache.spark.sql.catalyst.optimizer.PruneFilters
0 / 15220754 0 / 80
org.apache.spark.sql.catalyst.optimizer.CollapseProject
8979563 / 13211028 1 / 80
org.apache.spark.sql.catalyst.optimizer.LikeSimplification
0 / 12837312 0 / 55
org.apache.spark.sql.catalyst.optimizer.ConstantFolding
7099416 / 12652039 1 / 55
org.apache.spark.sql.catalyst.analysis.TimeWindowing
0 / 12566615 0 / 59
org.apache.spark.sql.catalyst.optimizer.ComputeCurrentTime
0 / 12279377 0 / 25
org.apache.spark.sql.catalyst.optimizer.FoldablePropagation
0 / 12158420 0 / 55
org.apache.spark.sql.catalyst.analysis.ResolveTimeZone
7331904 / 11661433 5 / 59
org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown
0 / 11651482 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$HandleNullInputsForUDF
0 / 11312278 0 / 28
org.apache.spark.sql.catalyst.analysis.Analyzer$PullOutNondeterministic
0 / 11111621 0 / 28
org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings
0 / 11032240 0 / 59
org.apache.spark.sql.catalyst.optimizer.OptimizeIn
0 / 10792662 0 / 55
org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps
0 / 10435223 0 / 55
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics
0 / 10147394 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions
0 / 9906569 0 / 28
org.apache.spark.sql.catalyst.optimizer.SimplifyBinaryComparison
0 / 9434962 0 / 55
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRandomSeed
0 / 8745520 0 / 59
org.apache.spark.sql.catalyst.optimizer.NullPropagation
0 / 8673323 0 / 55
org.apache.spark.sql.catalyst.analysis.ResolveLambdaVariables
0 / 8618322 0 / 59
org.apache.spark.sql.catalyst.optimizer.DecimalAggregates
1133548 / 8344365 1 / 26
org.apache.spark.sql.catalyst.analysis.TypeCoercion$EltCoercion
0 / 8306604 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveInsertInto
0 / 8272186 0 / 59
org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint
0 / 8269481 0 / 25
org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality
0 / 8164425 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$InConversion
0 / 7999784 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WindowFrameCoercion
0 / 7744630 0 / 59
org.apache.spark.sql.catalyst.optimizer.EliminateSorts
0 / 7664352 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNamespace
0 / 7333421 0 / 59
org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery
0 / 7288581 0 / 55
org.apache.spark.sql.catalyst.optimizer.ReassignLambdaVariableID
0 / 7094275 0 / 25
org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers
0 / 6952061 0 / 25
org.apache.spark.sql.catalyst.analysis.CleanupAliases
1507275 / 6818821 1 / 29
org.apache.spark.sql.catalyst.analysis.TypeCoercion$DateTimeOperations
0 / 6621755 0 / 59
org.apache.spark.sql.catalyst.optimizer.SimplifyConditionals
0 / 6594834 0 / 55
org.apache.spark.sql.execution.dynamicpruning.PartitionPruning
0 / 6435655 0 / 25
org.apache.spark.sql.catalyst.optimizer.SimplifyCasts
1726589 / 6431193 1 / 55
org.apache.spark.sql.catalyst.analysis.TypeCoercion$IfCoercion
0 / 6390305 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$GlobalAggregates
0 / 6364189 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ConcatCoercion
0 / 6344767 0 / 59
org.apache.spark.sql.catalyst.optimizer.PushLeftSemiLeftAntiThroughJoin
0 / 6300823 0 / 55
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubquery
0 / 6237735 0 / 59
org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions
0 / 6221240 0 / 55
org.apache.spark.sql.catalyst.analysis.TypeCoercion$StringLiteralCoercion
0 / 6192720 0 / 59
org.apache.spark.sql.catalyst.analysis.CTESubstitution
0 / 6150743 0 / 28
org.apache.spark.sql.catalyst.optimizer.GetCurrentDatabase
0 / 6148608 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveCreateNamedStruct
0 / 6101190 0 / 59
org.apache.spark.sql.catalyst.optimizer.ReplaceNullWithFalseInPredicate
0 / 6100257 0 / 55
org.apache.spark.sql.catalyst.optimizer.ConstantPropagation
0 / 6087339 0 / 55
org.apache.spark.sql.catalyst.analysis.ResolveHigherOrderFunctions
0 / 6067619 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractWindowExpressions
0 / 6061035 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast
0 / 6007788 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$Division
0 / 5925219 0 / 59
org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
0 / 5824041 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolvePivot
0 / 5559831 0 / 59
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveJoinStrategyHints
0 / 5524307 0 / 28
org.apache.spark.sql.catalyst.optimizer.RemoveDispensableExpressions
0 / 5510061 0 / 55
org.apache.spark.sql.execution.datasources.DataSourceAnalysis
5412159 / 5475347 24 / 28
org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery
2055027 / 5324950 1 / 25
org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughJoinByCNF
0 / 5285288 0 / 25
org.apache.spark.sql.catalyst.optimizer.RemoveNoopOperators
0 / 4998930 0 / 105
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOrdinalInOrderByAndGroupBy
0 / 4914629 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder
0 / 4899787 0 / 59
org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals
0 / 4897924 0 / 28
org.apache.spark.sql.catalyst.optimizer.PushDownLeftSemiAntiJoin
0 / 4835640 0 / 55
org.apache.spark.sql.catalyst.optimizer.PropagateEmptyRelation
0 / 4795908 0 / 50
org.apache.spark.sql.catalyst.optimizer.ObjectSerializerPruning
0 / 4763236 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveTableValuedFunctions
0 / 4607355 0 / 59
org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin
0 / 4562616 0 / 28
org.apache.spark.sql.catalyst.analysis.TypeCoercion$CaseWhenCoercion
0 / 4517751 0 / 59
org.apache.spark.sql.execution.datasources.FallBackFileSourceV2
0 / 4397375 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame
0 / 4313997 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$MapZipWithCoercion
0 / 4309833 0 / 59
org.apache.spark.sql.catalyst.optimizer.RewriteNonCorrelatedExists
0 / 4253433 0 / 25
org.apache.spark.sql.catalyst.analysis.TypeCoercion$StackCoercion
0 / 4208225 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer
0 / 4145215 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggAliasInGroupBy
0 / 4095335 0 / 59
org.apache.spark.sql.execution.python.ExtractPythonUDFs
0 / 4001274 0 / 25
org.apache.spark.sql.catalyst.optimizer.PullupCorrelatedPredicates
0 / 3920586 0 / 25
org.apache.spark.sql.catalyst.optimizer.CollapseWindow
0 / 3888524 0 / 55
org.apache.spark.sql.catalyst.optimizer.RewriteDistinctAggregates
0 / 3803019 0 / 25
org.apache.spark.sql.catalyst.optimizer.CollapseRepartition
0 / 3755697 0 / 55
org.apache.spark.sql.catalyst.optimizer.CombineUnions
0 / 3704733 0 / 80
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNewInstance
0 / 3695526 0 / 59
org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
1421178 / 3635064 1 / 25
org.apache.spark.sql.catalyst.optimizer.EliminateSerialization
0 / 3550180 0 / 55
org.apache.spark.sql.catalyst.optimizer.Optimizer$OptimizeSubqueries
0 / 3536272 0 / 50
org.apache.spark.sql.catalyst.optimizer.CombineLimits
0 / 3504846 0 / 55
org.apache.spark.sql.catalyst.optimizer.TransposeWindow
0 / 3499057 0 / 55
org.apache.spark.sql.catalyst.optimizer.PushProjectionThroughUnion
0 / 3355877 0 / 55
org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin
0 / 3233735 0 / 55
org.apache.spark.sql.catalyst.optimizer.LimitPushDown
0 / 3208991 0 / 55
org.apache.spark.sql.catalyst.optimizer.OptimizeLimitZero
0 / 3190070 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveTables
0 / 3094747 0 / 59
org.apache.spark.sql.catalyst.optimizer.RewriteIntersectAll
0 / 3085769 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution
0 / 2979758 0 / 28
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAlterTableChanges
0 / 2948089 0 / 28
org.apache.spark.sql.catalyst.optimizer.ExtractPythonUDFFromJoinCondition
0 / 2915414 0 / 25
org.apache.spark.sql.catalyst.analysis.EliminateView
0 / 2805178 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveCoalesceHints
0 / 2710440 0 / 28
org.apache.spark.sql.catalyst.optimizer.ReplaceExpressions
0 / 2535170 0 / 25
org.apache.spark.sql.catalyst.optimizer.EliminateMapObjects
0 / 2520470 0 / 25
org.apache.spark.sql.execution.dynamicpruning.CleanupDynamicPruningFilters
0 / 2519862 0 / 25
org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
0 / 2495766 0 / 25
org.apache.spark.sql.execution.python.ExtractGroupingPythonUDFFromAggregate
0 / 2417625 0 / 25
org.apache.spark.sql.catalyst.optimizer.RewriteExceptAll
0 / 2301441 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubqueryColumnAliases
0 / 2277596 0 / 59
org.apache.spark.sql.catalyst.optimizer.CombineTypedFilters
0 / 2267944 0 / 25
org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter
0 / 2252560 0 / 25
org.apache.spark.sql.catalyst.optimizer.CombineFilters
0 / 2236246 0 / 55
org.apache.spark.sql.execution.datasources.ResolveSQLOnFile
0 / 2214162 0 / 59
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WidenSetOperationTypes
0 / 2164273 0 / 59
org.apache.spark.sql.catalyst.optimizer.RemoveLiteralFromGroupExpressions
0 / 2092068 0 / 25
org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin
0 / 1952290 0 / 25
org.apache.spark.sql.catalyst.optimizer.ReplaceDeduplicateWithAggregate
0 / 1927408 0 / 25
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate
0 / 1915572 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNaturalAndUsingJoin
0 / 1910467 0 / 59
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation
0 / 1839082 0 / 59
org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithAntiJoin
0 / 1651671 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveInlineTables
0 / 1536486 0 / 59
org.apache.spark.sql.catalyst.optimizer.RemoveRepetitionFromGroupExpressions
0 / 1474769 0 / 25
org.apache.spark.sql.execution.datasources.PreprocessTableInsertion
0 / 1406859 0 / 28
org.apache.spark.sql.catalyst.analysis.EliminateUnions
0 / 1327002 0 / 28
org.apache.spark.sql.catalyst.optimizer.CombineConcats
0 / 1217837 0 / 55
org.apache.spark.sql.catalyst.optimizer.ReplaceDistinctWithAggregate
0 / 1201090 0 / 25
org.apache.spark.sql.catalyst.analysis.ResolveHints$RemoveAllHints
0 / 1199850 0 / 28
org.apache.spark.sql.catalyst.optimizer.EliminateDistinct
0 / 1098290 0 / 25
org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences
0 / 1028033 0 / 28
org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts
0 / 703053 0 / 50
org.apache.spark.sql.catalyst.optimizer.PushPredicateThroughNonJoin
0 / 617000 0 / 25
org.apache.spark.sql.execution.OptimizeMetadataOnlyQuery
0 / 371820 0 / 25
org.apache.spark.sql.catalyst.optimizer.CostBasedJoinReorder
0 / 309745 0 / 25
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
