Github user fjh100456 commented on the issue:
https://github.com/apache/spark/pull/19218
@gatorsmile
Thank you very much. I'll fix all the problems later. Regarding the rest, I'm not
entirely sure what you mean by 'workload'. I converted the test into a benchmark over
the last two days; the code and the results are as follows:
``` scala
/**
 * Benchmarks `INSERT OVERWRITE` into Hive tables (parquet / orc) under different
 * compression codecs, driven off a fixed 22MB textfile data source that is unioned
 * with itself to reach larger target sizes.
 */
class InsertHiveTableBenchmark extends SQLTestUtils {

  // NOTE(review): hard-coded local Windows warehouse path — adjust per machine.
  lazy val spark = SparkSession.builder
    .enableHiveSupport()
    .master("local[1]")
    .appName("microbenchmark")
    .config("spark.sql.warehouse.dir", "file:/E:/study/temp/warehouse")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .getOrCreate()

  /** Runs `f` once per table name as a benchmark case, then prints the results. */
  private def runBenchmark(
      name: String, cardinality: Long, tables: List[String])(f: String => Unit): Unit = {
    val benchmark = new Benchmark(name, cardinality)
    val numIters = 5
    // foreach (not map): addCase is invoked purely for its side effect.
    tables.foreach { table =>
      benchmark.addCase(s"To $table", numIters)(_ => f(table))
    }
    benchmark.run()
  }

  private val dataSourceName: String = "projdata"

  /** Derives a unique benchmark table name from the format and compression codec. */
  private def getTableByFormatCompression(format: String, compressionCodec: String): String =
    s"${dataSourceName}_${format}_${compressionCodec}"

  /**
   * Creates (if absent) a partitioned Hive table stored as `format`, with the
   * compression codec set via table properties, and returns its name.
   */
  private def createTable(format: String, compressionCodec: String): String = {
    val tableName = getTableByFormatCompression(format, compressionCodec)
    val compressionName = getCompressionNameByFormat(format)
    // The real schema has about 74 fields; they are omitted here because the
    // column names cannot be published (company policy).
    spark.sql(
      s"""CREATE TABLE IF NOT EXISTS $tableName( /* ~74 fields omitted */ )
         |PARTITIONED BY (p_provincecode int)
         |STORED AS $format
         |TBLPROPERTIES('$compressionName'='$compressionCodec')""".stripMargin)
    tableName
  }

  /** Overwrites the fixed benchmark partition of `tableName` from `dataSource`. */
  private def insertOverwriteTable(tableName: String, dataSource: String): Unit = {
    spark.sql(
      s"""INSERT OVERWRITE TABLE $tableName
         |PARTITION(p_provincecode=510000)
         |SELECT * FROM $dataSource""".stripMargin)
  }

  /**
   * Builds (by unioning the 22MB textfile source with itself) and caches a temp
   * view of roughly `sizeInMb` MB, runs `f(viewName, rowCount)`, then cleans up.
   */
  private def withCachedTable(sizeInMb: Int)(f: (String, Long) => Unit): Unit = {
    val size = 22 // size of the textfile data source, in MB
    val times = if (sizeInMb <= size) 1 else sizeInMb / size
    val unions = (0 until times).map(_ =>
      s"select * from $dataSourceName where p_provincecode=510000")
    val cache = spark.sql(unions.mkString(" union all ")).cache()
    val tempViewName = s"${dataSourceName}_cache"
    cache.createTempView(tempViewName)
    try {
      // count() also materializes the cache before timing starts.
      val numRecords = cache.count()
      f(tempViewName, numRecords)
    } finally {
      spark.sqlContext.dropTempTable(tempViewName)
      cache.unpersist()
    }
  }

  /** Table-property key that controls compression for the given format. */
  private def getCompressionNameByFormat(format: String): String =
    format.toLowerCase match {
      case "parquet" => "parquet.compression"
      case "orc" => "orc.compress"
      case _ => throw new IllegalArgumentException(s"Invalid format $format")
    }

  /** Compression codecs to benchmark for the given format. */
  private def getCompressionCodesByFormat(format: String): List[String] =
    format.toLowerCase match {
      case "parquet" => List("UNCOMPRESSED", "SNAPPY", "GZIP")
      case "orc" => List("NONE", "SNAPPY", "ZLIB")
      case _ => throw new IllegalArgumentException(s"Invalid format $format")
    }

  /** Benchmarks INSERT OVERWRITE of ~`dataSizeInMB` MB into one table per codec. */
  private def performance(dataSizeInMB: Int, format: String): Unit = {
    withCachedTable(dataSizeInMB) { case (cacheName, numRecords) =>
      val tables = getCompressionCodesByFormat(format).map(createTable(format, _))
      withTable(tables: _*) {
        // Benchmark name fixed: "Inert" -> "Insert" (results below were
        // recorded with the old name and are kept verbatim).
        runBenchmark(s"Insert overwrite ${dataSizeInMB}MB to hive table",
            numRecords, tables) { tableName =>
          insertOverwriteTable(tableName, cacheName)
        }
      }
    }
  }

  test("Insert 22MB textfile datasource into parquet hive table") {
    performance(22, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 22MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED          2043 / 2509   0.0   45247.5   1.0X
    To projdata_parquet_SNAPPY                1684 / 2806   0.0   37291.0   1.2X
    To projdata_parquet_GZIP                  2066 / 2700   0.0   45759.0   1.0X
    */
  }

  test("Insert 220MB textfile datasource into parquet hive table") {
    performance(220, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 220MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED          9156 / 11150  0.0   20280.9   1.0X
    To projdata_parquet_SNAPPY                8949 / 17424  0.1   19823.1   1.0X
    To projdata_parquet_GZIP                 14563 / 19023  0.0   32259.1   0.6X
    */
  }

  test("Insert 1100MB textfile datasource into parquet hive table") {
    performance(1100, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 1100MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED         44395 / 53093  0.1   19667.8   1.0X
    To projdata_parquet_SNAPPY               40974 / 42340  0.1   18152.2   1.1X
    To projdata_parquet_GZIP                 57086 / 58069  0.0   25290.0   0.8X
    */
  }

  test("Insert 22MB textfile datasource into orc hive table") {
    performance(22, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 22MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_orc_NONE                      1440 / 1495   0.0   31887.3   1.0X
    To projdata_orc_SNAPPY                    1467 / 1550   0.0   32488.7   1.0X
    To projdata_orc_ZLIB                      1546 / 1649   0.0   34253.6   0.9X
    */
  }

  test("Insert 220MB textfile datasource into orc hive table") {
    performance(220, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 220MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_orc_NONE                      9108 / 10050  0.0   20174.3   1.0X
    To projdata_orc_SNAPPY                    8937 / 9052   0.1   19795.1   1.0X
    To projdata_orc_ZLIB                      9906 / 10236  0.0   21943.0   0.9X
    */
  }

  test("Insert 1100MB textfile datasource into orc hive table") {
    performance(1100, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 1100MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To lte_cm_projdata_txt_orc_NONE          47655 / 48881  0.0   21112.0   1.0X
    To lte_cm_projdata_txt_orc_SNAPPY        44012 / 45273  0.1   19498.2   1.1X
    To lte_cm_projdata_txt_orc_ZLIB          49044 / 50509  0.0   21727.2   1.0X
    */
  }
}
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]