peter-toth commented on code in PR #39556: URL: https://github.com/apache/spark/pull/39556#discussion_r1069696162
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/AliasAwareOutputExpression.scala: ########## @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Empty2Null, Expression, NamedExpression, SortOrder} +import org.apache.spark.sql.internal.SQLConf + +/** + * A trait that provides functionality to handle aliases in the `outputExpressions`. + */ +trait AliasAwareOutputExpression extends SQLConfHelper { + private val aliasCandidateLimit = conf.getConf(SQLConf.EXPRESSION_PROJECTION_CANDIDATE_LIMIT) + protected def outputExpressions: Seq[NamedExpression] + /** + * This method can be used to strip expression which does not affect the result, for example: + * strip the expression which is ordering Agnostic for output ordering. 
+ */ + protected def strip(expr: Expression): Expression = expr + + private lazy val aliasMap = if (aliasCandidateLimit < 1) { + Map.empty + } else { + val attrWithAliasMap = new mutable.HashMap[Expression, ArrayBuffer[Attribute]]() + + def updateAttrWithAliasMap(key: Expression, target: Attribute): Unit = { + val aliasArray = attrWithAliasMap.getOrElseUpdate( + strip(key).canonicalized, new ArrayBuffer[Attribute]()) + // pre-filter if the number of alias has bigger than candidate limit + if (aliasArray.size < aliasCandidateLimit) { + aliasArray.append(target) + } + } + + outputExpressions.foreach { + case a @ Alias(child, _) => + updateAttrWithAliasMap(child, a.toAttribute) + case _ => + } + attrWithAliasMap.toMap + } + + protected def hasAlias: Boolean = aliasMap.nonEmpty + + // Return a set of Expression which normalize the original expression to the aliased. + protected def normalizeExpression(expr: Expression): Seq[Expression] = { + val normalizedCandidates = new ArrayBuffer[Expression]() + // Stop loop if the size of candidates exceed limit + for ((origin, aliases) <- aliasMap if normalizedCandidates.size < aliasCandidateLimit) { Review Comment: I think that by looping over the `aliasMap` elements one by one, always adding new elements to `normalizedCandidates`, and then doing some filtering after the `aliasMap` loop, you might run into the same issue as described in the 3rd point of https://github.com/apache/spark/pull/38034#issuecomment-1367218779 (constraint generation) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
