wangyum commented on issue #24872: [SPARK-28023][SQL] Trim the string when cast 
string type to Boolean/Numeric types
URL: https://github.com/apache/spark/pull/24872#issuecomment-503827016
 
 
   Benchmark code and benchmark results:
   ```scala
   package org.apache.spark.sql.execution.benchmark
   
   import org.apache.spark.benchmark.Benchmark
   
   /**
    * Benchmark trim the string when casting string type to Boolean/Numeric types.
    *
    * Two parquet datasets are generated from `spark.range(N)`: one holding the plain
    * string form of each id, and one where each id is padded with five spaces on both
    * sides. Each dataset is then cast to every type in `types`, once directly and once
    * through an explicit `trim(str)`.
    *
    * To run this benchmark:
    * {{{
    *   1. without sbt:
    *      bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar>
    *   2. build/sbt "sql/test:runMain <this class>"
    *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
    *      Results will be written to "benchmarks/CastBenchmark-results.txt".
    * }}}
    */
   object CastBenchmark extends SqlBasedBenchmark {

     /**
      * Writes the two input datasets and registers one benchmark case per
      * (dataset, with/without explicit trim) combination.
      *
      * @param mainArgs command-line arguments; not used by this benchmark
      */
     override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {

       val title = "Benchmark trim the string when casting string type to Boolean/Numeric types"
       runBenchmark(title) {
         withTempPath { dir =>
           val N = 500L << 13
           val df = spark.range(N)
           val withoutWhitespace = "withoutWhitespace"
           val withWhitespace = "withWhitespace"
           val types = Seq("int", "long", "float", "double", "decimal", "boolean")

           // Resolve each dataset as a child of the temp path. Plain `dir + name` would
           // concatenate without a separator and write next to the temp path instead of
           // inside it.
           def pathOf(name: String): String = new java.io.File(dir, name).getCanonicalPath

           df.selectExpr("cast(id as string) as str")
             .write.mode("overwrite").parquet(pathOf(withoutWhitespace))
           df.selectExpr(s"concat('${" " * 5}', id, '${" " * 5}') as str")
             .write.mode("overwrite").parquet(pathOf(withWhitespace))

           val benchmark = new Benchmark(title, N, minNumIters = 5, output = output)
           Seq(withoutWhitespace, withWhitespace).foreach { data =>
             Seq(false, true).foreach { isTrimStr =>
               val input = if (isTrimStr) "trim(str)" else "str"
               val expr = types.map(t => s"cast($input as $t) as c_$t")
               val name = s"$data ${if (isTrimStr) "with" else "without"} trim"
               benchmark.addCase(name) { _ =>
                 // Read the dataset this case is named after (`data`), so that the
                 // "withWhitespace" cases really exercise the padded strings.
                 spark.read.parquet(pathOf(data)).selectExpr(expr: _*).collect()
               }
             }
           }
           benchmark.run()
         }
       }
     }
   }
   ```
   Before this PR (after SPARK-28066):
   ```
   [info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_211-b12 on Linux 
3.10.0-957.1.3.el7.x86_64
   [info] Intel Core Processor (Broadwell)
   [info] Benchmark trim the string when casting string type to Boolean/Numeric 
types:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   
Relative
   [info] 
------------------------------------------------------------------------------------------------------------------------
   [info] withoutWhitespace without trim                     8696          
10753         NaN          0.5        2123.1       1.0X
   [info] withoutWhitespace with trim                        9915          
10251         545          0.4        2420.6       0.9X
   [info] withWhitespace without trim                        6128           
9433         NaN          0.7        1496.0       1.4X
   [info] withWhitespace with trim                           9666          
10162         628          0.4        2359.8       0.9X
   ```
   After this PR (after SPARK-28066):
   ```
   [info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_211-b12 on Linux 
3.10.0-957.1.3.el7.x86_64
   [info] Intel Core Processor (Broadwell)
   [info] Benchmark trim the string when casting string type to Boolean/Numeric 
types:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   
Relative
   [info] 
------------------------------------------------------------------------------------------------------------------------
   [info] withoutWhitespace without trim                     7379           
9063         NaN          0.6        1801.4       1.0X
   [info] withoutWhitespace with trim                        6260           
7819        1505          0.7        1528.3       1.2X
   [info] withWhitespace without trim                        5997           
8175         NaN          0.7        1464.2       1.2X
   [info] withWhitespace with trim                           6016           
8252        1143          0.7        1468.8       1.2X
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to