cloud-fan commented on a change in pull request #32513:
URL: https://github.com/apache/spark/pull/32513#discussion_r645343392



##########
File path: sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
##########
@@ -86,7 +86,7 @@ private[sql] object Dataset {
 
   def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame =
     sparkSession.withActive {
-      val qe = sparkSession.sessionState.executePlan(logicalPlan)
+      val qe = sparkSession.sessionState.executePlan(logicalPlan, name = Some("ofRows"))

Review comment:
       why do we need the name here?

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -67,12 +67,38 @@ object HiveResult {
       command.executeCollect().map(_.getString(1))
     case other =>
       val timeFormatters = getTimeFormatters
-      val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq
-      // We need the types so we can output struct field names
-      val types = executedPlan.output.map(_.dataType)
-      // Reformat to match hive tab delimited output.
-      result.map(_.zip(types).map(e => toHiveString(e, false, timeFormatters)))
-        .map(_.mkString("\t"))
+      val commandPhysicalPlans = other collect {

Review comment:
      I think it's safer to only handle the root command here, as that's what Hive can support.
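
    A minimal, self-contained sketch of the idea, using simplified stand-in types rather than Spark's real `SparkPlan`/`CommandResultExec` classes: the command wrapper is matched only at the root, never collected from deeper in the tree.
    ```
    // Hypothetical model types, not Spark's real plan nodes.
    sealed trait Plan
    case class CommandResultExec(child: Plan) extends Plan
    case class OtherExec(name: String) extends Plan

    def formatResult(plan: Plan): String = plan match {
      // Only a root-level command wrapper gets command-specific handling.
      case CommandResultExec(_) => "command-specific output"
      // Everything else, including nested wrappers, uses the default path.
      case _ => "tab-delimited query output"
    }
    ```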

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -67,12 +67,38 @@ object HiveResult {
       command.executeCollect().map(_.getString(1))
     case other =>
       val timeFormatters = getTimeFormatters
-      val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq
-      // We need the types so we can output struct field names
-      val types = executedPlan.output.map(_.dataType)
-      // Reformat to match hive tab delimited output.
-      result.map(_.zip(types).map(e => toHiveString(e, false, timeFormatters)))
-        .map(_.mkString("\t"))
+      val commandPhysicalPlans = other collect {

Review comment:
      Basically, we just add more cases above:
   ```
   case command @ CommandResultExec(_,  _: ShowTablesExec, _) => ...
   case ...
   ```

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -67,12 +67,38 @@ object HiveResult {
       command.executeCollect().map(_.getString(1))
     case other =>
       val timeFormatters = getTimeFormatters
-      val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq
-      // We need the types so we can output struct field names
-      val types = executedPlan.output.map(_.dataType)
-      // Reformat to match hive tab delimited output.
-      result.map(_.zip(types).map(e => toHiveString(e, false, timeFormatters)))
-        .map(_.mkString("\t"))
+      val commandPhysicalPlans = other collect {

Review comment:
       Basically, we just strip the root `CommandResultExec` at the beginning
   ```
   ... = stripRootCommandResult(executedPlan) match {
     ...
   }
   ```
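
    A sketch of what that helper could look like, assuming the three-field `CommandResultExec(output, commandPhysicalPlan, rows)` shape implied by the pattern in the earlier comment; `stripRootCommandResult` is a name from the snippet above, not existing API.
    ```
    import org.apache.spark.sql.execution.{CommandResultExec, SparkPlan}

    private def stripRootCommandResult(executedPlan: SparkPlan): SparkPlan =
      executedPlan match {
        // Only the root wrapper is stripped; nested nodes stay as-is.
        case CommandResultExec(_, commandPhysicalPlan, _) => commandPhysicalPlan
        case other => other
      }
    ```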

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
##########
@@ -73,12 +76,33 @@ class QueryExecution(
     sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker)
   }
 
+  // SPARK-35378: Commands should be executed eagerly so that `sql("INSERT ...")` can trigger the
+  // table insertion immediately without a `.collect()`. We also need to eagerly execute non-root
+  // commands, because many commands return `GenericInternalRow` and can't be put in a query plan
+  // directly, otherwise the query engine may cast `GenericInternalRow` to `UnsafeRow` and fail.
+  lazy val commandExecuted: LogicalPlan = if (isExecutingCommand) {
+    analyzed.mapChildren(eagerlyExecuteCommands)
+  } else {
+    eagerlyExecuteCommands(analyzed)
+  }
+
+  private def eagerlyExecuteCommands(p: LogicalPlan) = p transformDown {
+    case c: Command =>
+      val qe = sparkSession.sessionState.executePlan(c, true)
+      CommandResult(
+        qe.analyzed.output,
+        qe.commandExecuted,
+        qe.executedPlan,
+        SQLExecution.withNewExecutionId(qe, name)(qe.executedPlan.executeCollect()))
+    case other => other
+  }
+
   lazy val withCachedData: LogicalPlan = sparkSession.withActive {
     assertAnalyzed()
     assertSupported()
    // clone the plan to avoid sharing the plan instance between different stages like analyzing,
     // optimizing and planning.
-    sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone())
+    sparkSession.sharedState.cacheManager.useCachedData(commandExecuted.clone())
   }
 
  lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) {

Review comment:
      To make sure we don't include command execution time in the optimization phase, it's better to follow `lazy val sparkPlan`:
   
   ```
   lazy val optimizedPlan: LogicalPlan = {
     assertCommandExecuted()
     executePhase(QueryPlanningTracker.OPTIMIZATION) {
       ...
     }
   }
   ```
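
    A sketch of the full shape, assuming `assertCommandExecuted()` simply forces the lazy `commandExecuted`, mirroring how the existing `assertAnalyzed()` forces `analyzed`; the helper is an assumption here, not code from this PR.
    ```
    // Assumed helper: forcing the lazy val outside the timed phase keeps
    // command execution time out of the optimization-phase metrics.
    def assertCommandExecuted(): Unit = commandExecuted

    lazy val optimizedPlan: LogicalPlan = {
      assertCommandExecuted()
      executePhase(QueryPlanningTracker.OPTIMIZATION) {
        // clone the plan to avoid sharing the plan instance between phases
        sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker)
      }
    }
    ```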

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
##########
@@ -163,7 +163,7 @@ case class ExplainCommand(
 
   // Run through the optimizer to generate the physical plan.
   override def run(sparkSession: SparkSession): Seq[Row] = try {
-    val outputString = sparkSession.sessionState.executePlan(logicalPlan).explainString(mode)
+    val outputString = sparkSession.sessionState.executePlan(logicalPlan, true).explainString(mode)

Review comment:
       The `logicalPlan` is not a command, right?

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
##########
@@ -73,12 +76,33 @@ class QueryExecution(
     sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker)
   }
 
+  // SPARK-35378: Commands should be executed eagerly so that `sql("INSERT ...")` can trigger the
+  // table insertion immediately without a `.collect()`. We also need to eagerly execute non-root
+  // commands, because many commands return `GenericInternalRow` and can't be put in a query plan
+  // directly, otherwise the query engine may cast `GenericInternalRow` to `UnsafeRow` and fail.
+  lazy val commandExecuted: LogicalPlan = if (isExecutingCommand) {
+    analyzed.mapChildren(eagerlyExecuteCommands)
+  } else {
+    eagerlyExecuteCommands(analyzed)
+  }
+
+  private def eagerlyExecuteCommands(p: LogicalPlan) = p transformDown {
+    case c: Command =>
+      val qe = sparkSession.sessionState.executePlan(c, true)
+      CommandResult(
+        qe.analyzed.output,
+        qe.commandExecuted,
+        qe.executedPlan,
+        SQLExecution.withNewExecutionId(qe, name)(qe.executedPlan.executeCollect()))

Review comment:
      nit: let's call `qe.executedPlan.executeCollect()` first to make sure AQE is done and we get the final plan:
   ```
   val qe = ...
   val result = qe.executedPlan.executeCollect()
   CommandResult(...)(result)
   ```
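
    Applied to the `eagerlyExecuteCommands` body in the diff above, the reordering might look like this (a sketch; with AQE, reading `qe.executedPlan` after the collect yields the final adaptive plan):
    ```
    case c: Command =>
      val qe = sparkSession.sessionState.executePlan(c, true)
      // Run the command first so adaptive execution finalizes the plan;
      // qe.executedPlan read afterwards is then the final physical plan.
      val result = SQLExecution.withNewExecutionId(qe, name)(qe.executedPlan.executeCollect())
      CommandResult(qe.analyzed.output, qe.commandExecuted, qe.executedPlan, result)
    ```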

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala
##########
@@ -227,9 +227,7 @@ class DataFrameCallbackSuite extends QueryTest
         spark.range(10).select($"id", $"id").write.insertInto("tab")
       }
       sparkContext.listenerBus.waitUntilEmpty()
-      assert(exceptions.length == 1)
-      assert(exceptions.head._1 == "insertInto")
-      assert(exceptions.head._2 == e)
+      assert(exceptions.length == 0)

Review comment:
       This is a breaking change? We can't see errors now?



