zhengruifeng commented on a change in pull request #27954: [SPARK-31185][ML] 
Implement VarianceThresholdSelector
URL: https://github.com/apache/spark/pull/27954#discussion_r395955676
 
 

 ##########
 File path: 
mllib/src/main/scala/org/apache/spark/ml/feature/VarianceThresholdSelector.scala
 ##########
 @@ -92,16 +93,10 @@ with DefaultParamsWritable {
       .select("summary.max", "summary.min", "summary.variance")
       .first()
 
-    val result = 
variances.toArray.zip(maxs.toArray).zip(mins.toArray).zipWithIndex
-    // if varianceThreshold not set, remove the features that have the same 
value in all samples.
-    val features = if (!isSet(varianceThreshold)) {
-      // use max and min to avoid numeric precision issues for constant 
features
-      result.filter { case (((vari, max), min), _) => ((max != min) && (vari 
!= 0)) }
-    } else {
-      result.filter { case (((vari, _), _), _) => !(vari < 
getVarianceThreshold) }
-    }
-
-    val indices = features.map { case (((_, _), _), index) => index }
+    val numFeatures = maxs.size
+    val indices = Array.tabulate(numFeatures) { i =>
+      (i, if (maxs(i) == mins(i)) 0.0 else variances(i))
+    } .filter(_._2 > getVarianceThreshold).map(_._1)
 
 Review comment:
   nit: there should be no space after '}' (write `}.filter(...)` rather than `} .filter(...)`)

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to