LuciferYang commented on PR #37940:
URL: https://github.com/apache/spark/pull/37940#issuecomment-1251869079

   I tested the following code with input sizes 
`1,5,10,20,50,100,150,200,300,400,500,1000,5000,10000,20000`
   ```
     /**
      * Registers four ways of turning `0 until collectionSize` into a `Map` from each
      * element to its index as cases of a single `Benchmark`, then runs that benchmark.
      *
      * @param valuesPerIteration how many times each case rebuilds the map inside one
      *                           invocation; also forwarded to `Benchmark` as its
      *                           second constructor argument
      * @param collectionSize     number of elements in the input range
      */
     def testZipWithIndexToMap(valuesPerIteration: Int, collectionSize: Int): Unit = {
       val bench = new Benchmark(
         s"Test zip with index to map with collectionSize = $collectionSize",
         valuesPerIteration,
         output = output)

       val input = 0 until collectionSize

       // First registered case: zip into pairs, then convert the pairs to a Map.
       bench.addCase("Use zipWithIndex + toMap") { _: Int =>
         (0L until valuesPerIteration).foreach { _ =>
           val indexed: Map[Int, Int] = input.zipWithIndex.toMap
         }
       }

       // Hands `collection.breakOut` to `zipWithIndex` as its builder factory, with the
       // target collection type fixed to Map[Int, Int].
       bench.addCase("Use zipWithIndex + collection.breakOut") { _: Int =>
         (0L until valuesPerIteration).foreach { _ =>
           val indexed: Map[Int, Int] =
             input.zipWithIndex(collection.breakOut[IndexedSeq[Int], (Int, Int), Map[Int, Int]])
         }
       }

       // Hand-written loop that feeds an immutable Map builder.
       bench.addCase("Use Manual builder") { _: Int =>
         (0L until valuesPerIteration).foreach { _ =>
           val indexed: Map[Int, Int] = zipToMapUseMapBuilder[Int](input)
         }
       }

       // Hand-written loop that grows an immutable Map one entry at a time.
       bench.addCase("Use Manual map") { _: Int =>
         (0L until valuesPerIteration).foreach { _ =>
           val indexed: Map[Int, Int] = zipWithIndexToMapUseMap[Int](input)
         }
       }

       bench.run()
     }
   
     /**
      * Maps every element of `keys` to its zero-based position in iteration order, using
      * an immutable `Map` builder that is filled by a hand-written `while` loop.
      *
      * If an element occurs more than once, the index of its last occurrence wins,
      * because later additions to the builder replace earlier ones for the same key.
      *
      * @param keys elements to index
      * @tparam K element type
      * @return map from each element to its index
      */
     private def zipToMapUseMapBuilder[K](keys: Iterable[K]): Map[K, Int] = {
       import scala.collection.immutable
       val builder = immutable.Map.newBuilder[K, Int]
       val keyIter = keys.iterator
       var idx = 0
       while (keyIter.hasNext) {
         // The double parentheses pass a single tuple to `+=`. The pair is already a
         // `(K, Int)`, so no `asInstanceOf` cast is required.
         builder += ((keyIter.next(), idx))
         idx += 1
       }
       builder.result()
     }
   
     /**
      * Maps every element of `keys` to its zero-based position in iteration order by
      * growing an immutable `Map` one entry at a time inside a hand-written `while` loop.
      *
      * If an element occurs more than once, the index of its last occurrence wins,
      * because each addition replaces any existing entry for the same key.
      *
      * @param keys elements to index
      * @tparam K element type
      * @return map from each element to its index
      */
     private def zipWithIndexToMapUseMap[K](keys: Iterable[K]): Map[K, Int] = {
       var elems: Map[K, Int] = Map.empty[K, Int]
       val keyIter = keys.iterator
       var idx = 0
       while (keyIter.hasNext) {
         // `keyIter.next()` is already typed as K, so no cast is needed.
         elems += (keyIter.next() -> idx)
         idx += 1
       }
       elems
     }
   ```
   
   The results are as follows:
   
   **Java 8**
   
   ```
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 1:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
----------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       41            
 43           3          2.5         406.8       1.0X
   Use zipWithIndex + collection.breakOut                          4            
  4           0         23.6          42.4       9.6X
   Use Manual builder                                              4            
  4           0         27.8          35.9      11.3X
   Use Manual map                                                  3            
  3           0         37.4          26.8      15.2X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 5:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
----------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                      142            
143           2          0.7        1421.2       1.0X
   Use zipWithIndex + collection.breakOut                        101            
102           1          1.0        1011.0       1.4X
   Use Manual builder                                             99            
101           2          1.0         994.0       1.4X
   Use Manual map                                                 49            
 49           1          2.1         485.6       2.9X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 10:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-----------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       166           
 170           5          0.6        1660.0       1.0X
   Use zipWithIndex + collection.breakOut                         123           
 128           5          0.8        1226.3       1.4X
   Use Manual builder                                             121           
 123           3          0.8        1207.9       1.4X
   Use Manual map                                                 102           
 104           3          1.0        1024.0       1.6X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 20:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-----------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       215           
 227          10          0.5        2151.1       1.0X
   Use zipWithIndex + collection.breakOut                         167           
 173           6          0.6        1667.0       1.3X
   Use Manual builder                                             161           
 167           6          0.6        1614.5       1.3X
   Use Manual map                                                 208           
 218          10          0.5        2082.3       1.0X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 50:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-----------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       755           
 756           1          0.1        7553.8       1.0X
   Use zipWithIndex + collection.breakOut                         652           
 654           2          0.2        6521.1       1.2X
   Use Manual builder                                             642           
 667          30          0.2        6420.7       1.2X
   Use Manual map                                                 597           
 604          12          0.2        5966.6       1.3X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 100:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       1380          
 1381           2          0.1       13799.3       1.0X
   Use zipWithIndex + collection.breakOut                         1237          
 1263          37          0.1       12365.3       1.1X
   Use Manual builder                                             1213          
 1226          19          0.1       12126.3       1.1X
   Use Manual map                                                 1283          
 1290          10          0.1       12833.9       1.1X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 150:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       1882          
 1905          33          0.1       18816.7       1.0X
   Use zipWithIndex + collection.breakOut                         1716          
 1725          13          0.1       17155.8       1.1X
   Use Manual builder                                             1731          
 1733           4          0.1       17307.2       1.1X
   Use Manual map                                                 2121          
 2138          24          0.0       21211.1       0.9X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 200:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       2271          
 2293          31          0.0       22707.3       1.0X
   Use zipWithIndex + collection.breakOut                         2124          
 2135          16          0.0       21238.1       1.1X
   Use Manual builder                                             2051          
 2055           5          0.0       20509.8       1.1X
   Use Manual map                                                 2859          
 2892          46          0.0       28592.6       0.8X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 300:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       3441          
 3475          49          0.0       34406.0       1.0X
   Use zipWithIndex + collection.breakOut                         3271          
 3302          44          0.0       32711.7       1.1X
   Use Manual builder                                             3098          
 3115          23          0.0       30981.3       1.1X
   Use Manual map                                                 4620          
 4643          32          0.0       46200.8       0.7X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 400:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       4734          
 4752          26          0.0       47340.5       1.0X
   Use zipWithIndex + collection.breakOut                         4519          
 4554          50          0.0       45187.5       1.0X
   Use Manual builder                                             4299          
 4321          30          0.0       42993.4       1.1X
   Use Manual map                                                 6030          
 6075          63          0.0       60301.8       0.8X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 500:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       5720          
 5784          91          0.0       57197.4       1.0X
   Use zipWithIndex + collection.breakOut                         5763          
 5764           2          0.0       57626.8       1.0X
   Use Manual builder                                             5242          
 5292          72          0.0       52417.1       1.1X
   Use Manual map                                                 7913          
 7943          43          0.0       79125.3       0.7X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 1000:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       15654         
 15654           1          0.0      156536.8       1.0X
   Use zipWithIndex + collection.breakOut                         15384         
 15384           0          0.0      153838.5       1.0X
   Use Manual builder                                             14604         
 14680         108          0.0      146038.0       1.1X
   Use Manual map                                                 17196         
 17206          15          0.0      171955.2       0.9X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 5000:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       82036         
 82173         194          0.0      820362.9       1.0X
   Use zipWithIndex + collection.breakOut                         82824         
 83256         610          0.0      828240.2       1.0X
   Use Manual builder                                             78756         
 78791          50          0.0      787561.0       1.0X
   Use Manual map                                                101324         
101637         443          0.0     1013241.3       0.8X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 10000:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
--------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       164053        
 164987        1322          0.0     1640526.0       1.0X
   Use zipWithIndex + collection.breakOut                         171380        
 171931         778          0.0     1713804.3       1.0X
   Use Manual builder                                             161528        
 161667         196          0.0     1615280.2       1.0X
   Use Manual map                                                 219308        
 219999         977          0.0     2193079.7       0.7X
   
   OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1019-azure
   Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
   Test zip with index to map with collectionSize = 20000:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
--------------------------------------------------------------------------------------------------------------------------------------
   Use zipWithIndex + toMap                                       378370        
 379247        1241          0.0     3783699.2       1.0X
   Use zipWithIndex + collection.breakOut                         412945        
 413050         147          0.0     4129454.8       0.9X
   Use Manual builder                                             392057        
 393046        1400          0.0     3920566.0       1.0X
   Use Manual map                                                 471860        
 471867          11          0.0     4718596.0       0.8X
   ```
   
   
   From the benchmark results:
   
   - If the input data size is <= 1000, manually building the map with a
     `while` loop and a map builder is 10%+ faster than `zip(...).toMap`.
   
   - If the input data size is > 5000, there is no significant performance gap.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to