[GitHub] [spark] rdblue commented on a change in pull request #25305: [SPARK-28572][SQL] Simple analyzer checks for CREATE TABLE v2

GitBox Wed, 31 Jul 2019 15:56:15 -0700

rdblue commented on a change in pull request #25305: [SPARK-28572][SQL] Simple 
analyzer checks for CREATE TABLE v2
URL: https://github.com/apache/spark/pull/25305#discussion_r309464994


 ##########
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala
 ##########
 @@ -88,4 +91,147 @@ private[spark] object SchemaUtils {
         s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", 
")}")
     }
   }
+
+  /**
+   * Returns all column names in this schema as a flat list. For example, a 
schema like:
+   *   | - a
+   *   | | - 1
+   *   | | - 2
+   *   | - b
+   *   | - c
+   *   | | - nest
+   *   |   | - 3
+   *   will get flattened to: "a", "a.1", "a.2", "b", "c", "c.nest", "c.nest.3"
+   */
+  def explodeNestedFieldNames(schema: StructType): Seq[String] = {
+    def explode(schema: StructType): Seq[Seq[String]] = {
+      def recurseIntoComplexTypes(complexType: DataType): Seq[Seq[String]] = {
+        complexType match {
+          case s: StructType => explode(s)
+          case a: ArrayType => recurseIntoComplexTypes(a.elementType)
+          case m: MapType =>
+            recurseIntoComplexTypes(m.keyType).map(Seq("key") ++ _) ++
+              recurseIntoComplexTypes(m.valueType).map(Seq("value") ++ _)
+          case _ => Nil
+        }
+      }
+
+      schema.flatMap {
+        case StructField(name, s: StructType, _, _) =>
+          Seq(Seq(name)) ++ explode(s).map(nested => Seq(name) ++ nested)
+        case StructField(name, a: ArrayType, _, _) =>
+          Seq(Seq(name)) ++ recurseIntoComplexTypes(a).map(nested => Seq(name) 
++ nested)
+        case StructField(name, m: MapType, _, _) =>
+          Seq(Seq(name)) ++ recurseIntoComplexTypes(m).map(nested => Seq(name) 
++ nested)
+        case f => Seq(f.name) :: Nil
+      }
+    }
+
+    explode(schema).map(UnresolvedAttribute.apply(_).name)
+  }
+
+  /**
+   * Checks if input column names have duplicate identifiers, even if they 
are nested. This
+   * throws an exception if the duplication exists.
+   *
+   * @param schema the schema to check for duplicates
+   * @param checkType contextual information around the check, used in an 
exception message
+   * @param isCaseSensitive Whether to be case sensitive when comparing column 
names
+   */
+  def checkV2ColumnNameDuplication(
+      schema: StructType,
+      checkType: String,
+      isCaseSensitive: Boolean): Unit = {
+    val columnNames = explodeNestedFieldNames(schema)
+    checkColumnNameDuplication(columnNames, checkType, isCaseSensitive)
+  }
+
+  /**
+   * Checks if the partitioning transforms are being duplicated or not. Throws 
an exception if
+   * duplication exists.
+   *
+   * @param transforms the partitioning transforms to check for duplicates
+   * @param checkType contextual information around the check, used in an 
exception message
+   * @param isCaseSensitive Whether to be case sensitive when comparing column 
names
+   */
+  def checkTransformDuplication(
+      transforms: Seq[Transform],
+      checkType: String,
+      isCaseSensitive: Boolean): Unit = {
+    val extractedTransforms = transforms.map {
+      case b: BucketTransform =>
+        val colNames = b.columns.map(c => 
UnresolvedAttribute(c.fieldNames()).name)
+        // We need to check that we're not duplicating columns within our 
bucketing transform
+        checkColumnNameDuplication(colNames, checkType, isCaseSensitive)
+        b.name -> colNames
+      case NamedTransform(transformName, refs) =>
+        val fieldNameParts = refs.collect { case Ref(parts) => 
UnresolvedAttribute(parts).name }
 
 Review comment:
   The extractors allow you to use `NamedReference` instead of `Ref`. The 
extractor will use `Ref` to match any reference, even if it isn't already a 
`NamedReference`. So no need to make `Ref` public.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] rdblue commented on a change in pull request #25305: [SPARK-28572][SQL] Simple analyzer checks for CREATE TABLE v2

Reply via email to