[spark] branch branch-3.0 updated: [SPARK-34970][3.0][SQL] Redact map-type options in the output of explain()

dongjoon Thu, 08 Apr 2021 13:36:42 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new ee264b1  [SPARK-34970][3.0][SQL] Redact map-type options in the output 
of explain()
ee264b1 is described below

commit ee264b1ef1823cf90671c9565c7420da1ee7ba5d
Author: Gengliang Wang <[email protected]>
AuthorDate: Thu Apr 8 13:35:30 2021 -0700

    [SPARK-34970][3.0][SQL] Redact map-type options in the output of explain()
    
    ### What changes were proposed in this pull request?
    
    The `explain()` method prints the arguments of tree nodes in 
logical/physical plans. The arguments could contain a map-type option that 
contains sensitive data.
    We should map-type options in the output of `explain()`. Otherwise, we will 
see sensitive data in explain output or Spark UI.
    
![image](https://user-images.githubusercontent.com/1097932/113719178-326ffb00-96a2-11eb-8a2c-28fca3e72941.png)
    
    ### Why are the changes needed?
    
    Data security.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, redact the map-type options in the output of `explain()`
    
    ### How was this patch tested?
    
    Unit tests
    
    Closes #32085 from gengliangwang/redact3.0.
    
    Authored-by: Gengliang Wang <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../apache/spark/sql/catalyst/trees/TreeNode.scala | 16 +++++++
 .../resources/sql-tests/results/describe.sql.out   |  2 +-
 .../scala/org/apache/spark/sql/ExplainSuite.scala  | 52 ++++++++++++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 4dc3627..0ec2bbc 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.trees
 import java.util.UUID
 
 import scala.collection.{mutable, Map}
+import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
 
 import org.apache.commons.lang3.ClassUtils
@@ -39,6 +40,7 @@ import 
org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 
@@ -532,6 +534,16 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] 
extends Product {
 
   private lazy val allChildren: Set[TreeNode[_]] = (children ++ 
innerChildren).toSet[TreeNode[_]]
 
+  private def redactMapString[K, V](map: Map[K, V], maxFields: Int): 
List[String] = {
+    // For security reason, redact the map value if the key is in centain 
patterns
+    val redactedMap = SQLConf.get.redactOptions(map.toMap)
+    // construct the redacted map as strings of the format "key=value"
+    val keyValuePairs = redactedMap.toSeq.map { item =>
+      item._1 + "=" + item._2
+    }
+    truncatedString(keyValuePairs, "[", ", ", "]", maxFields) :: Nil
+  }
+
   /** Returns a string representing the arguments to this node, minus any 
children */
   def argString(maxFields: Int): String = stringArgs.flatMap {
     case tn: TreeNode[_] if allChildren.contains(tn) => Nil
@@ -548,6 +560,10 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] 
extends Product {
     case None => Nil
     case Some(null) => Nil
     case Some(any) => any :: Nil
+    case map: CaseInsensitiveStringMap =>
+      redactMapString(map.asCaseSensitiveMap().asScala, maxFields)
+    case map: Map[_, _] =>
+      redactMapString(map, maxFields)
     case table: CatalogTable =>
       table.storage.serde match {
         case Some(serde) => table.identifier :: serde :: Nil
diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out 
b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
index a7de033..36118f8 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
@@ -571,7 +571,7 @@ struct<plan:string>
 -- !query output
 == Physical Plan ==
 Execute DescribeTableCommand
-   +- DescribeTableCommand `default`.`t`, Map(c -> Us, d -> 2), false
+   +- DescribeTableCommand `default`.`t`, [c=Us, d=2], false
 
 
 -- !query
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index 158d939..e8833b8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -19,8 +19,10 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.{DisableAdaptiveExecutionSuite, 
EnableAdaptiveExecutionSuite}
+import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.sources.TestOptionsSource
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.StructType
 
@@ -69,6 +71,18 @@ trait ExplainSuiteHelper extends QueryTest with 
SharedSparkSession {
   protected def checkKeywordsExistsInExplain(df: DataFrame, keywords: 
String*): Unit = {
     checkKeywordsExistsInExplain(df, ExtendedMode, keywords: _*)
   }
+
+  /**
+   * Runs the plan and makes sure the plans does not contain any of the 
keywords.
+   */
+  protected def checkKeywordsNotExistsInExplain(
+      df: DataFrame, mode: ExplainMode, keywords: String*): Unit = {
+    withNormalizedExplain(df, mode) { normalizedOutput =>
+      for (key <- keywords) {
+        assert(!normalizedOutput.contains(key))
+      }
+    }
+  }
 }
 
 class ExplainSuite extends ExplainSuiteHelper with 
DisableAdaptiveExecutionSuite {
@@ -332,6 +346,44 @@ class ExplainSuite extends ExplainSuiteHelper with 
DisableAdaptiveExecutionSuite
         Nil: _*)
   }
 
+  test("SPARK-34970: Redact Map type options in explain output") {
+    val password = "MyPassWord"
+    val token = "MyToken"
+    val value = "value"
+    val options = Map("password" -> password, "token" -> token, "key" -> value)
+    val cmd = SaveIntoDataSourceCommand(spark.range(10).logicalPlan, new 
TestOptionsSource,
+      options, SaveMode.Overwrite)
+
+    Seq(SimpleMode, ExtendedMode, FormattedMode).foreach { mode =>
+      checkKeywordsExistsInExplain(cmd, mode, value)
+    }
+    Seq(SimpleMode, ExtendedMode, CodegenMode, CostMode, 
FormattedMode).foreach { mode =>
+      checkKeywordsNotExistsInExplain(cmd, mode, password)
+      checkKeywordsNotExistsInExplain(cmd, mode, token)
+    }
+  }
+
+  test("SPARK-34970: Redact CaseInsensitiveMap type options in explain 
output") {
+    val password = "MyPassWord"
+    val token = "MyToken"
+    val value = "value"
+    val tableName = "t"
+    withTable(tableName) {
+      val df1 = spark.range(10).toDF()
+      df1.write.format("json").saveAsTable(tableName)
+      val df2 = spark.read
+        .option("key", value)
+        .option("password", password)
+        .option("token", token)
+        .table(tableName)
+
+      Seq(SimpleMode, ExtendedMode, CodegenMode, CostMode, 
FormattedMode).foreach { mode =>
+        checkKeywordsNotExistsInExplain(df2, mode, password)
+        checkKeywordsNotExistsInExplain(df2, mode, token)
+      }
+    }
+  }
+
   test("Dataset.toExplainString has mode as string") {
     val df = spark.range(10).toDF
     def assertExplainOutput(mode: ExplainMode): Unit = {

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch branch-3.0 updated: [SPARK-34970][3.0][SQL] Redact map-type options in the output of explain()

Reply via email to