Github user viirya commented on the issue:
https://github.com/apache/spark/pull/19229
I ran the following test code to benchmark the RDD version and the DataFrame
version with this `ImputerModel` change:
import org.apache.spark.ml.feature._
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types._
import spark.implicits._
import scala.util.Random
/**
 * Builds and caches a DataFrame of 10000 rows by 100 nullable DoubleType columns
 * named c0..c99, filled with pseudo-random doubles.
 *
 * Uses `sc` and `spark` from the enclosing scope (e.g. spark-shell).
 *
 * @return the DataFrame, already cached and materialized by a `count()`
 */
def genData(): DataFrame = {
  val seed = 123L
  val n = 10000
  val m = 100
  // Create one generator per partition with a distinct seed. A single Random
  // captured by the closure is copied into every task, so each partition would
  // start from the same state and emit the same sequence of rows.
  val rows = sc.parallelize(1 to n).mapPartitionsWithIndex { (partition, ids) =>
    val random = new Random(seed + partition)
    ids.map(_ => Row(Array.fill(m)(random.nextDouble()): _*))
  }
  val struct = new StructType(Array.range(0, m, 1).map(i =>
    StructField(s"c$i", DoubleType, true)))
  val df = spark.createDataFrame(rows, struct)
  // Cache and force evaluation here so callers that time work on the result
  // do not also measure data generation.
  df.cache()
  df.count()
  df
}
// Benchmark Imputer: for each strategy and number of imputed columns k, report
// the mean wall-clock seconds of fit + transform over `runs` repetitions.
// Data generation and cache cleanup happen outside the timed regions.
for (strategy <- Seq("mean", "median"); k <- Seq(1, 10, 100)) {
  val imputer = new Imputer()
    .setStrategy(strategy)
    .setInputCols(Array.range(0, k, 1).map(i => s"c$i"))
    .setOutputCols(Array.range(0, k, 1).map(i => s"o$i"))

  // Single source of truth for the repetition count (used for the loop and the mean).
  val runs = 10
  val totalSeconds = (0 until runs).map { _ =>
    val trainDf = genData()
    val fitStart = System.nanoTime()
    val model = imputer.fit(trainDf)
    val fitNanos = System.nanoTime() - fitStart
    // genData() caches its result; release it so cached datasets do not pile up
    // across the runs. Blocking keeps the cleanup out of the next timed region.
    trainDf.unpersist(blocking = true)

    val testDf = genData()
    val transformStart = System.nanoTime()
    // count() forces the transformed DataFrame to actually be computed.
    model.transform(testDf).count()
    val transformNanos = System.nanoTime() - transformStart
    testDf.unpersist(blocking = true)

    (fitNanos + transformNanos) / 1e9
  }.sum

  println((strategy, k, totalSeconds / runs))
}
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]