[GitHub] spark pull request #14266: [SPARK-16526][SQL] Benchmarking Performance for F...

davies Mon, 08 Aug 2016 11:32:45 -0700

Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14266#discussion_r73928683
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala ---
    @@ -1078,6 +1078,146 @@ class AggregateBenchmark extends BenchmarkBase {
         */
       }
     
    +  ignore("4 key fields, 4 value field, varying linear distinct keys") {
    +    val N = 20 << 22;
    +
    +    var timeStart: Long = 0L
    +    var timeEnd: Long = 0L
    +    var nsPerRow: Long = 0L
    +    var i = 0
    +    sparkSession.conf.set("spark.sql.codegen.wholeStage", "true")
    +    sparkSession.conf.set("spark.sql.codegen.aggregate.map.columns.max", 
"30")
    +
    +    // scalastyle:off
    +    println(Benchmark.getJVMOSInfo())
    +    println(Benchmark.getProcessorName())
    +    printf("%20s %20s %20s %20s\n", "Num. Distinct Keys", "No Fast 
Hashmap",
    +      "Vectorized", "Row-based")
    +    // scalastyle:on
    +
    +    val modes = List("skip", "vectorized", "rowbased")
    +
    +    while (i < 17) {
    +      val results = modes.map(mode => {
    +        
sparkSession.conf.set("spark.sql.codegen.aggregate.map.enforce.impl", mode)
    +        var j = 0
    +        var minTime: Long = 1000
    +        while (j < 5) {
    +          System.gc()
    +          val s = "id & " + ((1<<i)-1) + " as k"
    +          sparkSession.range(N)
    +            .selectExpr(List.range(0, 4).map(x => s + x): _*)
    +            .createOrReplaceTempView("test")
    +          timeStart = System.nanoTime
    +          sparkSession.sql("select " + List.range(0, 4).map(x => "sum(k" + 
x + ")").mkString(",") +
    +            " from test group by " + List.range(0, 4).map(x => "k" + 
x).mkString(",")).collect()
    +          timeEnd = System.nanoTime
    +          nsPerRow = (timeEnd - timeStart) / N
    +          // printf("nsPerRow i=%d j=%d mode=%10s %20s\n", i, j, mode, 
nsPerRow)
    +          if (j > 1 && minTime > nsPerRow) minTime = nsPerRow
    +          j += 1
    +        }
    +        minTime
    +      })
    +      printf("%20s %20s %20s %20s\n", (1<<i), results(0), results(1), 
results(2))
    +      i += 1
    +    }
    +    printf("Unit: ns/row\n")
    +
    +    /*
    +    Java HotSpot(TM) 64-Bit Server VM 1.8.0_91-b14 on Mac OS X 10.11.5
    +    Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
    +
    +      Num. Distinct Keys      No Fast Hashmap           Vectorized         
   Row-based
    +                       1                   33                   38         
          24
    +                       2                   58                   43         
          30
    +                       4                   58                   42         
          28
    +                       8                   57                   46         
          28
    +                      16                   56                   41         
          28
    +                      32                   55                   44         
          27
    +                      64                   56                   48         
          27
    +                     128                   58                   43         
          27
    +                     256                   60                   43         
          30
    +                     512                   61                   45         
          31
    +                    1024                   62                   44         
          31
    +                    2048                   64                   42         
          38
    +                    4096                   66                   47         
          38
    +                    8192                   70                   48         
          38
    +                   16384                   72                   48         
          42
    +                   32768                   77                   54         
          47
    +                   65536                   96                   75         
          61
    +                  131072                  115                  119         
         130
    +                  262144                  137                  162         
         185
    +    Unit: ns/row
    +    */
    +  }
    +
    +  ignore("single key field, single value field, varying linear distinct 
keys") {
    --- End diff --
    
    Should we access them in a random way?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #14266: [SPARK-16526][SQL] Benchmarking Performance for F...

Reply via email to