Github user fjh100456 commented on the issue:
https://github.com/apache/spark/pull/19218
@gatorsmile
Thank you very much. I'll fix all the problems later. Regarding the rest, I'm not
entirely sure what you mean by 'workload'. I converted the test into a benchmark over
the last two days; the code and the results are as follows:
``` scala
/**
 * Benchmarks `INSERT OVERWRITE` into Hive tables (parquet / orc) under different
 * compression codecs, driven off a fixed 22MB textfile data source that is unioned
 * with itself to reach larger target sizes.
 */
class InsertHiveTableBenchmark extends SQLTestUtils {

  // NOTE(review): hard-coded local Windows warehouse path — adjust per machine.
  lazy val spark = SparkSession.builder
    .enableHiveSupport()
    .master("local[1]")
    .appName("microbenchmark")
    .config("spark.sql.warehouse.dir", "file:/E:/study/temp/warehouse")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .getOrCreate()

  /** Runs `f` once per table name as a benchmark case, then prints the results. */
  private def runBenchmark(
      name: String, cardinality: Long, tables: List[String])(f: String => Unit): Unit = {
    val benchmark = new Benchmark(name, cardinality)
    val numIters = 5
    // foreach (not map): addCase is invoked purely for its side effect.
    tables.foreach { table =>
      benchmark.addCase(s"To $table", numIters)(_ => f(table))
    }
    benchmark.run()
  }

  private val dataSourceName: String = "projdata"

  /** Derives a unique benchmark table name from the format and compression codec. */
  private def getTableByFormatCompression(format: String, compressionCodec: String): String =
    s"${dataSourceName}_${format}_${compressionCodec}"

  /**
   * Creates (if absent) a partitioned Hive table stored as `format`, with the
   * compression codec set via table properties, and returns its name.
   */
  private def createTable(format: String, compressionCodec: String): String = {
    val tableName = getTableByFormatCompression(format, compressionCodec)
    val compressionName = getCompressionNameByFormat(format)
    // The real schema has about 74 fields; they are omitted here because the
    // column names cannot be published (company policy).
    spark.sql(
      s"""CREATE TABLE IF NOT EXISTS $tableName( /* ~74 fields omitted */ )
         |PARTITIONED BY (p_provincecode int)
         |STORED AS $format
         |TBLPROPERTIES('$compressionName'='$compressionCodec')""".stripMargin)
    tableName
  }

  /** Overwrites the fixed benchmark partition of `tableName` from `dataSource`. */
  private def insertOverwriteTable(tableName: String, dataSource: String): Unit = {
    spark.sql(
      s"""INSERT OVERWRITE TABLE $tableName
         |PARTITION(p_provincecode=510000)
         |SELECT * FROM $dataSource""".stripMargin)
  }

  /**
   * Builds (by unioning the 22MB textfile source with itself) and caches a temp
   * view of roughly `sizeInMb` MB, runs `f(viewName, rowCount)`, then cleans up.
   */
  private def withCachedTable(sizeInMb: Int)(f: (String, Long) => Unit): Unit = {
    val size = 22 // size of the textfile data source, in MB
    val times = if (sizeInMb <= size) 1 else sizeInMb / size
    val unions = (0 until times).map(_ =>
      s"select * from $dataSourceName where p_provincecode=510000")
    val cache = spark.sql(unions.mkString(" union all ")).cache()
    val tempViewName = s"${dataSourceName}_cache"
    cache.createTempView(tempViewName)
    try {
      // count() also materializes the cache before timing starts.
      val numRecords = cache.count()
      f(tempViewName, numRecords)
    } finally {
      spark.sqlContext.dropTempTable(tempViewName)
      cache.unpersist()
    }
  }

  /** Table-property key that controls compression for the given format. */
  private def getCompressionNameByFormat(format: String): String =
    format.toLowerCase match {
      case "parquet" => "parquet.compression"
      case "orc" => "orc.compress"
      case _ => throw new IllegalArgumentException(s"Invalid format $format")
    }

  /** Compression codecs to benchmark for the given format. */
  private def getCompressionCodesByFormat(format: String): List[String] =
    format.toLowerCase match {
      case "parquet" => List("UNCOMPRESSED", "SNAPPY", "GZIP")
      case "orc" => List("NONE", "SNAPPY", "ZLIB")
      case _ => throw new IllegalArgumentException(s"Invalid format $format")
    }

  /** Benchmarks INSERT OVERWRITE of ~`dataSizeInMB` MB into one table per codec. */
  private def performance(dataSizeInMB: Int, format: String): Unit = {
    withCachedTable(dataSizeInMB) { case (cacheName, numRecords) =>
      val tables = getCompressionCodesByFormat(format).map(createTable(format, _))
      withTable(tables: _*) {
        // Benchmark name fixed: "Inert" -> "Insert" (results below were
        // recorded with the old name and are kept verbatim).
        runBenchmark(s"Insert overwrite ${dataSizeInMB}MB to hive table",
            numRecords, tables) { tableName =>
          insertOverwriteTable(tableName, cacheName)
        }
      }
    }
  }

  test("Insert 22MB textfile datasource into parquet hive table") {
    performance(22, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 22MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED          2043 / 2509   0.0   45247.5   1.0X
    To projdata_parquet_SNAPPY                1684 / 2806   0.0   37291.0   1.2X
    To projdata_parquet_GZIP                  2066 / 2700   0.0   45759.0   1.0X
    */
  }

  test("Insert 220MB textfile datasource into parquet hive table") {
    performance(220, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 220MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED          9156 / 11150  0.0   20280.9   1.0X
    To projdata_parquet_SNAPPY                8949 / 17424  0.1   19823.1   1.0X
    To projdata_parquet_GZIP                 14563 / 19023  0.0   32259.1   0.6X
    */
  }

  test("Insert 1100MB textfile datasource into parquet hive table") {
    performance(1100, "parquet")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 1100MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_parquet_UNCOMPRESSED         44395 / 53093  0.1   19667.8   1.0X
    To projdata_parquet_SNAPPY               40974 / 42340  0.1   18152.2   1.1X
    To projdata_parquet_GZIP                 57086 / 58069  0.0   25290.0   0.8X
    */
  }

  test("Insert 22MB textfile datasource into orc hive table") {
    performance(22, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 22MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_orc_NONE                      1440 / 1495   0.0   31887.3   1.0X
    To projdata_orc_SNAPPY                    1467 / 1550   0.0   32488.7   1.0X
    To projdata_orc_ZLIB                      1546 / 1649   0.0   34253.6   0.9X
    */
  }

  test("Insert 220MB textfile datasource into orc hive table") {
    performance(220, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 220MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To projdata_orc_NONE                      9108 / 10050  0.0   20174.3   1.0X
    To projdata_orc_SNAPPY                    8937 / 9052   0.1   19795.1   1.0X
    To projdata_orc_ZLIB                      9906 / 10236  0.0   21943.0   0.9X
    */
  }

  test("Insert 1100MB textfile datasource into orc hive table") {
    performance(1100, "orc")
    /*
    Java HotSpot(TM) 64-Bit Server VM 1.8.0_121-b13 on Windows 7 6.1
    Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
    Inert overwrite 1100MB to hive table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
    ------------------------------------------------------------------------------------------------
    To lte_cm_projdata_txt_orc_NONE          47655 / 48881  0.0   21112.0   1.0X
    To lte_cm_projdata_txt_orc_SNAPPY        44012 / 45273  0.1   19498.2   1.1X
    To lte_cm_projdata_txt_orc_ZLIB          49044 / 50509  0.0   21727.2   1.0X
    */
  }
}
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]