Github user sameeragarwal commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12440#discussion_r60628217
  
    --- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
 ---
    @@ -224,6 +224,127 @@ class BenchmarkWholeStageCodegen extends 
SparkFunSuite {
         */
       }
     
    +  ignore("aggregate with string key") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w string key", N)
    +    def f(): Unit = sqlContext.range(N).selectExpr("id", "cast(id & 1023 
as string) as k")
    +      .groupBy("k").count().collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w string key:             Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              3307 / 3376          6.3      
   157.7       1.0X
    +    codegen = T hashmap = F                  2364 / 2471          8.9      
   112.7       1.4X
    +    codegen = T hashmap = T                  1740 / 1841         12.0      
    83.0       1.9X
    +    */
    +  }
    +
    +  ignore("aggregate with decimal key") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w decimal key", N)
    +    def f(): Unit = sqlContext.range(N).selectExpr("id", "cast(id & 65535 
as decimal) as k")
    +      .groupBy("k").count().collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w decimal key:             Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              2756 / 2817          7.6      
   131.4       1.0X
    +    codegen = T hashmap = F                  1580 / 1647         13.3      
    75.4       1.7X
    +    codegen = T hashmap = T                   641 /  662         32.7      
    30.6       4.3X
    +    */
    +  }
    +
    +  ignore("aggregate with multiple key types") {
    +    val N = 20 << 20
    +
    +    val benchmark = new Benchmark("Aggregate w multiple keys", N)
    +    def f(): Unit = sqlContext.range(N)
    +      .selectExpr(
    +        "id",
    +        "(id & 1023) as k1",
    +        "cast(id & 1023 as string) as k2",
    +        "cast(id & 1023 as int) as k3",
    +        "cast(id & 1023 as double) as k4",
    +        "cast(id & 1023 as float) as k5",
    +        "id > 1023 as k6")
    +      .groupBy("k1", "k2", "k3", "k4", "k5", "k6")
    +      .sum()
    +      .collect()
    +
    +    benchmark.addCase(s"codegen = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = F") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", 
"false")
    +      f()
    +    }
    +
    +    benchmark.addCase(s"codegen = T hashmap = T") { iter =>
    +      sqlContext.setConf("spark.sql.codegen.wholeStage", "true")
    +      sqlContext.setConf("spark.sql.codegen.aggregate.map.enabled", "true")
    +      f()
    +    }
    +
    +    benchmark.run()
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
    +    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
    +    Aggregate w multiple keys:          Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
    +    
-------------------------------------------------------------------------------------------
    +    codegen = F                              6876 / 7216          3.1      
   327.8       1.0X
    +    codegen = T hashmap = F                  5297 / 5478          4.0      
   252.6       1.3X
    +    codegen = T hashmap = T                  5395 / 5668          3.9      
   257.2       1.3X
    --- End diff --
    
    I believe it depends on types (string for instance is quite slow as one 
would expect). With the new changes, the current benchmark is now around 13% 
faster (152.8 ns/row). Furthermore, if we remove the `StringType`, it actually 
gets 2.3x faster (68.3 ns/row).
    
    ### Example 1:
    
    ```scala
    sqlContext.range(N)
          .selectExpr(
            "id",
            "(id & 1023) as k1",
            "cast(id & 1023 as string) as k2",  <--------------- [string key]
            "cast(id & 1023 as int) as k3",
            "cast(id & 1023 as double) as k4",
            "cast(id & 1023 as float) as k5",
            "id > 1023 as k6")
          .groupBy("k1", "k2", "k3", "k4", "k5", "k6")
          .sum()
          .collect()
    ```
    
        Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
        Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
        Aggregate w multiple keys:          Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
        
-------------------------------------------------------------------------------------------
        codegen = F                              5885 / 6091          3.6       
  280.6       1.0X
        codegen = T hashmap = F                  3625 / 4009          5.8       
  172.8       1.6X
        codegen = T hashmap = T                  3204 / 3271          6.5       
  152.8       1.8X
    
    ### Example 2:
    
    ```scala
      sqlContext.range(N)
          .selectExpr(
            "id",
            "(id & 1023) as k1",
            "cast(id & 1023 as long) as k2",  <--------------- [long key]
            "cast(id & 1023 as int) as k3",
            "cast(id & 1023 as double) as k4",
            "cast(id & 1023 as float) as k5",
            "id > 1023 as k6")
          .groupBy("k1", "k2", "k3", "k4", "k5", "k6")
          .sum()
          .collect()
    ```
    
        Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4
        Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
        Aggregate w multiple keys:          Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
        
-------------------------------------------------------------------------------------------
        codegen = F                              3090 / 3687          6.8       
  147.3       1.0X
        codegen = T hashmap = F                  2028 / 2142         10.3       
   96.7       1.5X
        codegen = T hashmap = T                  1433 / 1543         14.6       
   68.3       2.2X



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to