GitHub user fjh100456 commented on a diff in the pull request:
https://github.com/apache/spark/pull/20087#discussion_r162519320
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala ---
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.File
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.fs.Path
+import org.apache.orc.OrcConf.COMPRESS
+import org.apache.parquet.hadoop.ParquetOutputFormat
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.parquet.{ParquetOptions, ParquetTest}
+import org.apache.spark.sql.hive.orc.OrcFileOperator
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
+
+class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with BeforeAndAfterAll {
+ import spark.implicits._
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ (0 until maxRecordNum).toDF("a").createOrReplaceTempView("table_source")
+ }
+
+ override def afterAll(): Unit = {
+ try {
+ spark.catalog.dropTempView("table_source")
+ } finally {
+ super.afterAll()
+ }
+ }
+
+ private val maxRecordNum = 500
+
+ private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match {
+ case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key
+ case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key
+ }
+
+ private def getSparkCompressionConfName(format: String): String = format.toLowerCase match {
+ case "parquet" => SQLConf.PARQUET_COMPRESSION.key
+ case "orc" => SQLConf.ORC_COMPRESSION.key
+ }
+
+ private def getHiveCompressPropName(format: String): String = format.toLowerCase match {
+ case "parquet" => ParquetOutputFormat.COMPRESSION
+ case "orc" => COMPRESS.getAttribute
+ }
+
+ private def normalizeCodecName(format: String, name: String): String = {
+ format.toLowerCase match {
+ case "parquet" =>
ParquetOptions.shortParquetCompressionCodecNames(name).name()
+ case "orc" => OrcOptions.shortOrcCompressionCodecNames(name)
+ }
+ }
+
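+ // Reads back the compression codec(s) actually recorded in the written data files
+ // (Parquet footers / ORC file metadata), de-duplicated.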
+ private def getTableCompressionCodec(path: String, format: String): Seq[String] = {
+ val hadoopConf = spark.sessionState.newHadoopConf()
+ val codecs = format.toLowerCase match {
+ case "parquet" => for {
+ footer <- readAllFootersWithoutSummaryFiles(new Path(path),
hadoopConf)
+ block <- footer.getParquetMetadata.getBlocks.asScala
+ column <- block.getColumns.asScala
+ } yield column.getCodec.name()
+ case "orc" => new File(path).listFiles().filter{ file =>
+ file.isFile && !file.getName.endsWith(".crc") && file.getName != "_SUCCESS"
+ }.map { orcFile =>
+ OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString
+ }.toSeq
+ }
+ codecs.distinct
+ }
+
+ private def createTable(
+ rootDir: File,
+ tableName: String,
+ isPartitioned: Boolean,
+ format: String,
+ compressionCodec: Option[String]): Unit = {
+ val tblProperties = compressionCodec match {
+ case Some(prop) => s"TBLPROPERTIES('${getHiveCompressPropName(format)}'='$prop')"
+ case _ => ""
+ }
+ val partitionCreate = if (isPartitioned) "PARTITIONED BY (p string)" else ""
+ sql(
+ s"""
+ |CREATE TABLE $tableName(a int)
+ |$partitionCreate
+ |STORED AS $format
+ |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
+ |$tblProperties
+ """.stripMargin)
+ }
+
+ private def writeDataToTable(
+ tableName: String,
+ partition: Option[String]): Unit = {
+ val partitionInsert = partition.map(p => s"partition (p='$p')").mkString
+ sql(
+ s"""
+ |INSERT INTO TABLE $tableName
+ |$partitionInsert
+ |SELECT * FROM table_source
+ """.stripMargin)
+ }
+
+ private def getTableSize(path: String): Long = {
+ val dir = new File(path)
+ val files = dir.listFiles().filter(_.getName.startsWith("part-"))
+ files.map(_.length()).sum
+ }
+
+ private def getTablePartitionPath(dir: File, tableName: String, partition: Option[String]) = {
+ val partitionPath = partition.map(p => s"p=$p").mkString
+ s"${dir.getPath.stripSuffix("/")}/$tableName/$partitionPath"
+ }
+
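+ // Writes the source data with the UNCOMPRESSED codec and returns the resulting on-disk size.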
+ private def getUncompressedDataSizeByFormat(
+ format: String, isPartitioned: Boolean): Long = {
+ var totalSize = 0L
+ val tableName = s"tbl_$format"
+ val codecName = normalizeCodecName(format, "uncompressed")
+ withSQLConf(getSparkCompressionConfName(format) -> codecName) {
+ withTempDir { tmpDir =>
+ withTable(tableName) {
+ createTable(tmpDir, tableName, isPartitioned, format, Option(codecName))
+ val partition = if (isPartitioned) Some("test") else None
+ writeDataToTable(tableName, partition)
+ val path = getTablePartitionPath(tmpDir, tableName, partition)
+ totalSize = getTableSize(path)
+ }
+ }
+ }
+ assert(totalSize > 0L)
+ totalSize
+ }
+
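+ // Creates a table with the given codec, writes the source data into it, and passes the
+ // codec actually found in the files plus the table size to the assertion.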
+ private def checkCompressionCodecForTable(
+ format: String,
+ isPartitioned: Boolean,
+ compressionCodec: Option[String])
+ (assertion: (String, Long) => Unit): Unit = {
+ val tableName = s"tbl_$format$isPartitioned"
+ withTempDir { tmpDir =>
+ withTable(tableName) {
+ createTable(tmpDir, tableName, isPartitioned, format, compressionCodec)
+ val partition = if (isPartitioned) Some("test") else None
+ writeDataToTable(tableName, partition)
+ val path = getTablePartitionPath(tmpDir, tableName, partition)
+ val relCompressionCodecs = getTableCompressionCodec(path, format)
+ assert(relCompressionCodecs.length == 1)
+ val tableSize = getTableSize(path)
+ assertion(relCompressionCodecs.head, tableSize)
+ }
+ }
+ }
+
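+ // Runs the per-table check for the given table-level and session-level codecs under the
+ // given convertMetastore setting.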
+ private def checkTableCompressionCodecForCodecs(
+ format: String,
+ isPartitioned: Boolean,
+ convertMetastore: Boolean,
+ compressionCodecs: List[String],
+ tableCompressionCodecs: List[String])
+ (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
+ withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
+ tableCompressionCodecs.foreach { tableCompression =>
--- End diff --
Shouldn't we test the precedence of the compression codecs that are in effect? Here I just want to verify that precedence; a rough sketch of what I mean is below.
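
Something like this (a sketch only, not part of the patch, reusing the helpers above; asserting that the table-level codec wins is just an assumption for illustration, since the real expectation depends on the convertMetastore setting and on the behavior this PR fixes):

```scala
// Rough sketch: set a session-level codec and a different table-level codec,
// then check which one actually ends up in the written files.
test("table-level codec vs session-level codec (sketch)") {
  val sessionCodec = normalizeCodecName("parquet", "gzip")   // "GZIP"
  val tableCodec = normalizeCodecName("parquet", "snappy")   // "SNAPPY"
  withSQLConf(getSparkCompressionConfName("parquet") -> sessionCodec) {
    checkCompressionCodecForTable("parquet", isPartitioned = false,
      compressionCodec = Some(tableCodec)) { (realCodec, _) =>
      // Assumed winner: the table property. Adjust per the intended precedence rule.
      assert(realCodec == tableCodec)
    }
  }
}
```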