zhengruifeng commented on pull request #30009:
URL: https://github.com/apache/spark/pull/30009#issuecomment-722762550


   @mengxr  Thanks for reviewing!
   
   > Does your benchmark code count pre-processing time?
   
   Yes, pre-processing time is taken into account.
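   
   For reference, a minimal sketch of the measurement pattern (assuming the `df` loaded in the benchmark code below): the timer brackets the entire `fit` call, so any blockification / pre-processing done inside training is counted in the reported time.
   ```
   import org.apache.spark.ml.classification.LinearSVC
   
   // The timer wraps the whole fit() call, so internal pre-processing
   // (e.g. blockification) is part of the measured wall time.
   val svc = new LinearSVC().setMaxIter(100).setTol(0).setBlockSizeInMB(1.0)
   val start = System.currentTimeMillis
   val model = svc.fit(df)  // df: the Epsilon dataset loaded in the code below
   val end = System.currentTimeMillis
   println(end - start)
   ```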
   
   > Could you paste your benchmark code and environment specs?
   
   Dataset: [Epsilon](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.t.bz2)
   numInstances: 100,000; numFeatures: 2,000
   
   env: Ubuntu 18.04
   cmd: `bin/spark-shell --driver-memory=64G --conf spark.driver.maxResultSize=10g`
   
   code:
   ```
   import scala.util.Random
   
   import org.apache.spark.ml.linalg._
   import org.apache.spark.ml.classification._
   import org.apache.spark.ml.regression._
   import org.apache.spark.sql.functions._
   import org.apache.spark.storage.StorageLevel
   
   // Load the Epsilon dataset (100,000 instances, 2,000 features);
   // (label + 1) / 2 maps the {-1, 1} labels to {0, 1}.
   val df = spark.read.option("numFeatures", "2000").format("libsvm")
     .load("/data1/Datasets/epsilon/epsilon_normalized.t")
     .withColumn("aftcensor", (col("label") + 1) / 2)
     .withColumn("aftlabel", (col("label") + 2) / 2)
     .withColumn("label", (col("label") + 1) / 2)
   df.persist(StorageLevel.MEMORY_AND_DISK)
   df.count
   
   // Scatter the original 2,000 feature values into a dim-dimensional sparse
   // vector, so the same data can be benchmarked at higher dimensionality.
   def getSparseUDF(dim: Int) = {
     val rng = new Random(123)
     val newIndices = rng.shuffle(Seq.range(0, dim)).take(2000).toArray.sorted
     udf { vec: Vector =>
       Vectors.sparse(dim, newIndices, vec.toArray).compressed
     }
   }
   
   // Warm-up run.
   new LinearSVC().setMaxIter(20).fit(df)
   
   val svc = new LinearSVC().setMaxIter(100).setTol(0)
   
   // Sweep feature dimensionality and block size (in MB); the timer brackets
   // the whole fit call, so pre-processing time is included.
   for (dim <- Seq(2000, 3000, 4000, 5000, 10000, 20000, 200000);
        size <- Seq(0.0625, 0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0)) {
     Thread.sleep(60000)
     val ds = if (dim == 2000) {
       df
     } else {
       val sparseUDF = getSparseUDF(dim)
       df.withColumn("features", sparseUDF(col("features")))
     }
     val start = System.currentTimeMillis
     val model = svc.setBlockSizeInMB(size).fit(ds)
     val end = System.currentTimeMillis
     // (model uid, dim, block size in MB, elapsed ms, coefficients prefix)
     println((model.uid, dim, size, end - start, model.coefficients.toString.take(100)))
   }
   
   
   // Baseline for branch-3.0 (block size reported as -1).
   for (dim <- Seq(2000, 3000, 4000, 5000, 10000, 20000, 200000)) {
     Thread.sleep(60000)
     val ds = if (dim == 2000) {
       df
     } else {
       val sparseUDF = getSparseUDF(dim)
       df.withColumn("features", sparseUDF(col("features")))
     }
     val start = System.currentTimeMillis
     val model = svc.fit(ds)
     val end = System.currentTimeMillis
     println((model.uid, dim, -1, end - start, model.coefficients.toString.take(100)))
   }
   ```
   

