wangyum commented on issue #24872: [SPARK-28023][SQL] Trim the string when cast string type to Boolean/Numeric types
URL: https://github.com/apache/spark/pull/24872#issuecomment-503827016

Benchmark and benchmark result.

```scala
package org.apache.spark.sql.execution.benchmark

import org.apache.spark.benchmark.Benchmark

/**
 * Benchmark trim the string when casting string type to Boolean/Numeric types.
 * To run this benchmark:
 * {{{
 *   1. without sbt:
 *      bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar>
 *   2. build/sbt "sql/test:runMain <this class>"
 *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 *      Results will be written to "benchmarks/CastBenchmark-results.txt".
 * }}}
 */
object CastBenchmark extends SqlBasedBenchmark {

  /**
   * Writes two Parquet datasets of stringified ids (one as-is, one padded with five spaces on
   * each side), then registers one benchmark case per (dataset, trim / no trim) pair. Each case
   * reads its dataset back and casts the `str` column to every type in `types`.
   *
   * @param mainArgs command-line arguments; not used by this benchmark.
   */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    val title = "Benchmark trim the string when casting string type to Boolean/Numeric types"
    runBenchmark(title) {
      withTempPath { dir =>
        val N = 500L << 13
        val df = spark.range(N)
        val withoutWhitespace = "withoutWhitespace"
        val withWhitespace = "withWhitespace"
        val types = Seq("int", "long", "float", "double", "decimal", "boolean")

        // Place each dataset *inside* the temp path. The previous `dir + name` had no path
        // separator and therefore wrote to a sibling of `dir`.
        // NOTE(review): presumably withTempPath removes `dir` afterwards, in which case the
        // sibling directories were never cleaned up -- confirm against the helper.
        def pathOf(dataset: String): String = s"$dir/$dataset"

        df.selectExpr("cast(id as string) as str")
          .write.mode("overwrite").parquet(pathOf(withoutWhitespace))
        df.selectExpr(s"concat('${" " * 5}', id, '${" " * 5}') as str")
          .write.mode("overwrite").parquet(pathOf(withWhitespace))

        val benchmark = new Benchmark(title, N, minNumIters = 5, output = output)
        Seq(withoutWhitespace, withWhitespace).foreach { data =>
          Seq(false, true).foreach { isTrimStr =>
            val expr =
              types.map(t => s"cast(${if (isTrimStr) "trim(str)" else "str"} as $t) as c_$t")
            val name = s"$data ${if (isTrimStr) "with" else "without"} trim"
            benchmark.addCase(name) { _ =>
              // Read the dataset this case is named after. Previously every case read the
              // whitespace-free dataset, so the "withWhitespace" cases measured the wrong input.
              spark.read.parquet(pathOf(data)).selectExpr(expr: _*).collect()
            }
          }
        }
        benchmark.run()
      }
    }
  }
}
```

Before this pr(after SPARK-28066):

```
[info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_211-b12 on 
Linux 3.10.0-957.1.3.el7.x86_64 [info] Intel Core Processor (Broadwell) [info] Benchmark trim the string when casting string type to Boolean/Numeric types: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] ------------------------------------------------------------------------------------------------------------------------ [info] withoutWhitespace without trim 8696 10753 NaN 0.5 2123.1 1.0X [info] withoutWhitespace with trim 9915 10251 545 0.4 2420.6 0.9X [info] withWhitespace without trim 6128 9433 NaN 0.7 1496.0 1.4X [info] withWhitespace with trim 9666 10162 628 0.4 2359.8 0.9X ``` After this pr(after SPARK-28066): ``` [info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_211-b12 on Linux 3.10.0-957.1.3.el7.x86_64 [info] Intel Core Processor (Broadwell) [info] Benchmark trim the string when casting string type to Boolean/Numeric types: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] ------------------------------------------------------------------------------------------------------------------------ [info] withoutWhitespace without trim 7379 9063 NaN 0.6 1801.4 1.0X [info] withoutWhitespace with trim 6260 7819 1505 0.7 1528.3 1.2X [info] withWhitespace without trim 5997 8175 NaN 0.7 1464.2 1.2X [info] withWhitespace with trim 6016 8252 1143 0.7 1468.8 1.2X ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
