zhengruifeng edited a comment on issue #26393: [SPARK-29751][ML] Scalers use 
Summarizer instead of MultivariateOnlineSummarizer
URL: https://github.com/apache/spark/pull/26393#issuecomment-549682239
 
 
   test code
   ```scala
   import org.apache.spark.ml.feature._
   
   scala> var df = spark.read.format("libsvm").load("/data1/Datasets/a9a/a9a")
   19/11/05 13:47:02 WARN LibSVMFileFormat: 'numFeatures' option not specified, 
determining the number of features by going though the input. If you know the 
number in advance, please specify it via 'numFeatures' option to avoid the 
extra scan.
   df: org.apache.spark.sql.DataFrame = [label: double, features: vector]       
   
   
   scala> df.persist()
   res0: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: 
double, features: vector]
   
   
   scala> df.count
   res1: Long = 32561
   
   scala> (0 until 8).foreach(_ => df = df.union(df))
   
   scala> df.count
   res3: Long = 8335616 
   
   val durations1 = (0 until 50).map{i => val tic = System.currentTimeMillis; 
val scaler = new MaxAbsScaler().setInputCol("features"); val model = 
scaler.fit(df); val toc = System.currentTimeMillis; toc - tic}
   
   durations1.takeRight(30).sum.toDouble / 30
   
   
   
   val durations2 = (0 until 50).map{i => val tic = System.currentTimeMillis; 
val scaler = new MinMaxScaler().setInputCol("features"); val model = 
scaler.fit(df); val toc = System.currentTimeMillis; toc - tic}
   
   durations2.takeRight(30).sum.toDouble / 30
   
   
   
   val durations3 = (0 until 50).map{i => val tic = System.currentTimeMillis; 
val scaler = new StandardScaler().setInputCol("features"); val model = 
scaler.fit(df); val toc = System.currentTimeMillis; toc - tic}
   
   durations3.takeRight(30).sum.toDouble / 30
   ```
   
   Results: average fitting duration in milliseconds (only the last 30 of the 50 fits are taken into account)
   
   |MaxAbsScaler(Old)| MinMaxScaler(Old) | StandardScaler(Old) | MaxAbsScaler(New) | MinMaxScaler(New) | StandardScaler(New) |
   |------|----------|------------|----------|------------|----------|
   |6768.1|6875.2|6899.9|5862.1|5880.3|5889.7|
   
   
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to