zhengruifeng commented on issue #28176: [SPARK-31301][ML] Flatten the result 
dataframe of tests in testChiSquare
URL: https://github.com/apache/spark/pull/28176#issuecomment-611875587
 
 
   bin/spark-shell --driver-memory=2G
   
   Test code:
   ```
   import org.apache.spark.ml.feature.ChiSqSelector
   import org.apache.spark.ml.stat._
   
   val df = 
spark.read.format("libsvm").load("/data1/Datasets/webspam/webspam_wc_normalized_trigram.svm.10k")
   
   val chi = ChiSquareTest.test(df, "features", "label")
   // val chi = ChiSquareTest.test(df, "features", "label", true) // added in 
this PR
   chi.show
   
   val selector = new 
ChiSqSelector().setNumTopFeatures(1000).setLabelCol("label").setFeaturesCol("features")
   
   val model = selector.fit(df)
   ```
   
   The existing `ChiSquareTest.test` and `selector.fit` crash spark-shell with 
an OOM error, while the new methods in this PR work fine:
   
   
   Existing `ChiSquareTest.test`
   ```
   Driver stacktrace:
     at 
org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2094)
     at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2043)
     at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2042)
     at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
     at 
scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
     at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2042)
     at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1020)
     at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1020)
     at scala.Option.foreach(Option.scala:407)
     at 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1020)
     at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2274)
     at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2223)
     at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2212)
     at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
     at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:822)
     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2108)
     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2129)
     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2148)
     at org.apache.spark.SparkContext.runJob(SparkContext.scala:2173)
     at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
     at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
     at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
     at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
     at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
     at 
org.apache.spark.mllib.stat.test.ChiSqTest$.chiSquaredSparseFeatures(ChiSqTest.scala:148)
     at 
org.apache.spark.mllib.stat.test.ChiSqTest$.chiSquaredFeatures(ChiSqTest.scala:88)
     at org.apache.spark.mllib.stat.Statistics$.chiSqTest(Statistics.scala:192)
     at org.apache.spark.ml.stat.ChiSquareTest$.test(ChiSquareTest.scala:73)
     ... 49 elided
   Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
   ```
   
   new `ChiSquareTest.test(df, "features", "label", true)`
   ```
   scala> val chi = ChiSquareTest.test(df, "features", "label", true)
   chi: org.apache.spark.sql.DataFrame = [featureIndex: int, pValue: double ... 
2 more fields]
   
   scala> chi.show
   20/04/10 11:28:51 WARN Executor: Managed memory leak detected; size = 
843044942 bytes, TID = 49
   +------------+-------------------+----------------+------------------+       
   
   |featureIndex|             pValue|degreesOfFreedom|         statistic|
   +------------+-------------------+----------------+------------------+
   |     3184020|                1.0|               0|               0.0|
   |     6697512|0.21072480020843432|               2|3.1144045224283925|
   |     3387408|                1.0|               0|               0.0|
   |     5907828|                1.0|               0|               0.0|
   |     6582516|                1.0|               0|               0.0|
   |     3490824|                1.0|               0|               0.0|
   |     5916408|                1.0|               0|               0.0|
   |     2292732|                1.0|               0|               0.0|
   |     7447896|                1.0|               0|               0.0|
   |     2157804|                1.0|               0|               0.0|
   |     1732608|                1.0|               0|               0.0|
   |     3427284|                1.0|               0|               0.0|
   |     5284836|                1.0|               0|               0.0|
   |     3812544|                1.0|               0|               0.0|
   |     2396784|                1.0|               0|               0.0|
   |     5761524|                1.0|               0|               0.0|
   |     3138636|                1.0|               0|               0.0|
   |     6971832|                1.0|               0|               0.0|
   |     8164968|                1.0|               0|               0.0|
   |     2321808|                1.0|               0|               0.0|
   +------------+-------------------+----------------+------------------+
   only showing top 20 rows
   ```
   
   new `selector.fit(df)`
   ```
   scala> val model = selector.fit(df)
   model: org.apache.spark.ml.feature.ChiSqSelectorModel = ChiSqSelectorModel: 
uid=chiSqSelector_3b712e166c6f, numSelectedFeatures=1000
   
   scala> model.selectedFeatures
   res0: Array[Int] = Array(592236, 592238, 605261, 614717, 616224, 617577, 
619040, 663584, 670767, 670792, 670797, 670804, 670817, 670820, 670828, 670832, 
2105411, 2105903, 2105921, 2105972, 2106179, 2110000, 2111536, 2112559, 
2112621, 2112622, 2112627, 2113858, 2114061, 2115668, 2115950, 2117221, 
2122083, 2122092, 2122094, 2122340, 2122351, 2122604, 2122607, 2123375, 
2123877, 2123890, 2124132, 2124142, 2124147, 2124897, 2125157, 2125409, 
2125678, 2125938, 2126437, 2126697, 2126708, 2126949, 2126953, 2126969, 
2127457, 2127721, 2171454, 2236513, 2236515, 2236520, 2236531, 2236532, 
2236534, 2236535, 2236960, 2236990, 2237251, 2237253, 2237254, 2239522, 
2239541, 2240353, 2240827, 2241058, 2241841, 2242104, 2244106, 2244156, 
2244161, 2244162, 2244168, 2244169, 224417...
   ```
   
   Since `ChiSquareTest` and `ChiSqSelector` are used for feature selection, I 
think it is common for the input to have high dimensionality.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to