dtenedor commented on code in PR #39592:
URL: https://github.com/apache/spark/pull/39592#discussion_r1081597693


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala:
##########
@@ -227,7 +227,7 @@ object LogicalPlanIntegrity {
    * this method checks if the same `ExprId` refers to attributes having the 
same data type
    * in plan output.
    */
-  def hasUniqueExprIdsForOutput(plan: LogicalPlan): Boolean = {
+  def hasUniqueExprIdsForOutput(plan: LogicalPlan): Option[String] = {

Review Comment:
   let's update the method comment to mention what the return string value 
contains, if present? Same for other updated return types.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala:
##########
@@ -247,36 +247,72 @@ object LogicalPlanIntegrity {
       ignoredExprIds.contains(exprId)
     }.groupBy(_._1).values.map(_.distinct)
 
-    groupedDataTypesByExprId.forall(_.length == 1)
+    groupedDataTypesByExprId.collectFirst {
+      case group if group.length > 1 =>
+        val exprId = group.head._1
+        val types = group.map(_._2.sql)
+        s"Multiple attributes have the same expression ID ${exprId.id} but 
different data types: " +
+          types.mkString(", ") + ". The plan tree:\n" + plan.treeString
+    }
   }
 
   /**
    * This method checks if reference `ExprId`s are not reused when assigning a 
new `ExprId`.
    * For example, it returns false if plan transformers create an alias having 
the same `ExprId`
    * with one of reference attributes, e.g., `a#1 + 1 AS a#1`.
    */
-  def checkIfSameExprIdNotReused(plan: LogicalPlan): Boolean = {
-    plan.collect { case p if p.resolved =>
-      p.expressions.forall {
-        case a: Alias =>
-          // Even if a plan is resolved, `a.references` can return unresolved 
references,
-          // e.g., in `Grouping`/`GroupingID`, so we need to filter out them 
and
-          // check if the same `exprId` in `Alias` does not exist
-          // among reference `exprId`s.
-          !a.references.filter(_.resolved).map(_.exprId).exists(_ == a.exprId)
-        case _ =>
-          true
+  def checkIfSameExprIdNotReused(plan: LogicalPlan): Option[String] = {
+    plan.collectFirst { case p if p.resolved =>
+      p.expressions.collectFirst {
+        // Even if a plan is resolved, `a.references` can return unresolved 
references,
+        // e.g., in `Grouping`/`GroupingID`, so we need to filter out them and
+        // check if the same `exprId` in `Alias` does not exist
+        // among reference `exprId`s.

Review Comment:
   join to the previous line?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala:
##########
@@ -247,36 +247,72 @@ object LogicalPlanIntegrity {
       ignoredExprIds.contains(exprId)
     }.groupBy(_._1).values.map(_.distinct)
 
-    groupedDataTypesByExprId.forall(_.length == 1)
+    groupedDataTypesByExprId.collectFirst {
+      case group if group.length > 1 =>
+        val exprId = group.head._1
+        val types = group.map(_._2.sql)
+        s"Multiple attributes have the same expression ID ${exprId.id} but 
different data types: " +
+          types.mkString(", ") + ". The plan tree:\n" + plan.treeString
+    }
   }
 
   /**
    * This method checks if reference `ExprId`s are not reused when assigning a 
new `ExprId`.
    * For example, it returns false if plan transformers create an alias having 
the same `ExprId`
    * with one of reference attributes, e.g., `a#1 + 1 AS a#1`.
    */
-  def checkIfSameExprIdNotReused(plan: LogicalPlan): Boolean = {
-    plan.collect { case p if p.resolved =>
-      p.expressions.forall {
-        case a: Alias =>
-          // Even if a plan is resolved, `a.references` can return unresolved 
references,
-          // e.g., in `Grouping`/`GroupingID`, so we need to filter out them 
and
-          // check if the same `exprId` in `Alias` does not exist
-          // among reference `exprId`s.
-          !a.references.filter(_.resolved).map(_.exprId).exists(_ == a.exprId)
-        case _ =>
-          true
+  def checkIfSameExprIdNotReused(plan: LogicalPlan): Option[String] = {
+    plan.collectFirst { case p if p.resolved =>
+      p.expressions.collectFirst {
+        // Even if a plan is resolved, `a.references` can return unresolved 
references,
+        // e.g., in `Grouping`/`GroupingID`, so we need to filter out them and
+        // check if the same `exprId` in `Alias` does not exist
+        // among reference `exprId`s.
+        case a: Alias if 
a.references.filter(_.resolved).map(_.exprId).exists(_ == a.exprId) =>
+          "An alias reuses the same expression ID as previously present in an 
attribute, " +
+            s"which is invalid: ${a.sql}. The plan tree:\n" + plan.treeString
       }
-    }.forall(identity)
+    }.flatten
   }
 
   /**
    * This method checks if the same `ExprId` refers to an unique attribute in 
a plan tree.
    * Some plan transformers (e.g., `RemoveNoopOperators`) rewrite logical
    * plans based on this assumption.
    */
-  def checkIfExprIdsAreGloballyUnique(plan: LogicalPlan): Boolean = {
-    checkIfSameExprIdNotReused(plan) && hasUniqueExprIdsForOutput(plan)
+  def validateExprIdUniqueness(plan: LogicalPlan): Option[String] = {
+    LogicalPlanIntegrity.checkIfSameExprIdNotReused(plan).orElse(
+      LogicalPlanIntegrity.hasUniqueExprIdsForOutput(plan))
+  }
+
+  /**
+   * Validate the structural integrity of an optimized plan.
+   * For example, we can check after the execution of each rule that each plan:
+   * - is still resolved
+   * - only host special expressions in supported operators
+   * - has globally-unique attribute IDs
+   * - optimized plan have same schema with previous plan.

Review Comment:
   ```suggestion
      * - has the same result schema as the previous plan
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to