wangyum commented on code in PR #38682: URL: https://github.com/apache/spark/pull/38682#discussion_r1026470585
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala: ########## @@ -117,4 +117,23 @@ object ExprUtils extends QueryErrorsBase { TypeCheckSuccess } } + + /** + * Combine a number of boolean expressions into a balanced expression tree. These expressions are + * either combined by a logical [[And]] or a logical [[Or]]. + * + * A balanced binary tree is created because regular left recursive trees cause considerable + * performance degradations and can cause stack overflows. + */ + def reduceToExpressionTree( + patterns: Seq[Expression], + expressionCombiner: (Expression, Expression) => Expression): Expression = { + assert(patterns.size > 0) + var res = patterns + while (res.size > 1) { + res = res.sliding(2, 2).toSeq + .map(tup => if (tup.size == 2) expressionCombiner(tup.head, tup.last) else tup(0)) + } + res.head Review Comment: seems that the original implementation is better than this one? ```scala def reduceToExpressionTree1( patterns: Seq[Expression], expressionCombiner: (Expression, Expression) => Expression): Expression = { var res = patterns while (res.size > 1) { res = res.sliding(2, 2).toSeq .map(tup => if (tup.size == 2) expressionCombiner(tup.head, tup.last) else tup(0)) } res.head } def reduceToExpressionTree2( expressions: Seq[Expression], expressionCombiner: (Expression, Expression) => Expression): Expression = { def reduceToExpressionTree(low: Int, high: Int): Expression = high - low match { case 0 => expressions(low) case 1 => expressionCombiner(expressions(low), expressions(high)) case x => val mid = low + x / 2 expressionCombiner( reduceToExpressionTree(low, mid), reduceToExpressionTree(mid + 1, high)) } reduceToExpressionTree(0, expressions.size - 1) } Seq(10000, 100000, 1000000).foreach { N => val expressions = Range(1, N).map(Literal(_)) val benchmark = new Benchmark(s"Benchmark reduceToExpressionTree with $N elements", N, minNumIters = 10) benchmark.addCase("reduceToExpressionTree1") { _ => reduceToExpressionTree1(expressions, Or.apply) } benchmark.addCase("reduceToExpressionTree2") { _ => reduceToExpressionTree2(expressions, Or.apply) } benchmark.run() } ``` ``` OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Mac OS X 10.16 Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Benchmark reduceToExpressionTree with 1000000 elements: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- reduceToExpressionTree1 823 1012 212 1.2 822.5 1.0X reduceToExpressionTree2 150 183 25 6.7 149.7 5.5X ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org