[GitHub] spark pull request: [SPARK-14680][SQL]Support all datatypes to use...

davies Wed, 20 Apr 2016 21:51:23 -0700

Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12440#discussion_r60526133
  
    --- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
 ---
    @@ -224,6 +224,127 @@ class BenchmarkWholeStageCodegen extends 
SparkFunSuite {
         */
       }
     
    +  ignore("aggregate with string key") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w string key", N)
    +    def f(): Unit = sqlContext.range(N).selectExpr("id", "cast(id & 1023 
as string) as k")
    +      .groupBy("k").count().collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w string key:             Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              3307 / 3376          6.3      
   157.7       1.0X
    +    codegen = T hashmap = F                  2364 / 2471          8.9      
   112.7       1.4X
    +    codegen = T hashmap = T                  1740 / 1841         12.0      
    83.0       1.9X
    +    */
    +  }
    +
    +  ignore("aggregate with decimal key") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w decimal key", N)
    +    def f(): Unit = sqlContext.range(N).selectExpr("id", "cast(id & 65535 
as decimal) as k")
    +      .groupBy("k").count().collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w decimal key:             Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              2756 / 2817          7.6      
   131.4       1.0X
    +    codegen = T hashmap = F                  1580 / 1647         13.3      
    75.4       1.7X
    +    codegen = T hashmap = T                   641 /  662         32.7      
    30.6       4.3X
    +    */
    +  }
    +
    +  ignore("aggregate with multiple key types") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w multiple keys", N)
    +    def f(): Unit = sqlContext.range(N)
    +      .selectExpr(
    +        "id",
    +        "(id & 1023) as k1",
    +        "cast(id & 1023 as string) as k2",
    +        "cast(id & 1023 as int) as k3",
    +        "cast(id & 1023 as double) as k4",
    +        "cast(id & 1023 as float) as k5",
    +        "id > 1023 as k6")
    +      .groupBy("k1", "k2", "k3", "k4", "k5", "k6")
    +      .sum()
    +      .collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w multiple keys:           Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              6876 / 7216          3.1      
   327.8       1.0X
    +    codegen = T hashmap = F                  5297 / 5478          4.0      
   252.6       1.3X
    +    codegen = T hashmap = T                  5395 / 5668          3.9      
   257.2       1.3X
    --- End diff --
    
    Why do we not see improvements here? Is it too many columns, or does it depend on the types?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-14680][SQL]Support all datatypes to use...

Reply via email to