srowen commented on a change in pull request #21632: [SPARK-19591][ML][MLlib] 
Add sample weights to decision trees
URL: https://github.com/apache/spark/pull/21632#discussion_r243597203
 
 

 ##########
 File path: 
mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
 ##########
 @@ -1002,19 +1019,20 @@ private[spark] object RandomForest extends Logging 
with Serializable {
       val numSplits = metadata.numSplits(featureIndex)
 
       // get count for each distinct value except zero value
-      val partNumSamples = featureSamples.size
-      val partValueCountMap = scala.collection.mutable.Map[Double, Int]()
-      featureSamples.foreach { x =>
-        partValueCountMap(x) = partValueCountMap.getOrElse(x, 0) + 1
-      }
+      val (partValueCountMap, partNumSamples) =
+        featureSamples.foldLeft((Map.empty[Double, Double], 0.0)) {
+          case ((m, cnt), (w, x)) =>
+            (m + ((x, m.getOrElse(x, 0.0) + w)), cnt + w)
+        }
 
       // Calculate the expected number of samples for finding splits
-      val numSamples = (samplesFractionForFindSplits(metadata) * 
metadata.numExamples).toInt
+      val weightedNumSamples = samplesFractionForFindSplits(metadata) *
+        metadata.weightedNumExamples
       // add expected zero value count and get complete statistics
-      val valueCountMap: Map[Double, Int] = if (numSamples - partNumSamples > 
0) {
-        partValueCountMap.toMap + (0.0 -> (numSamples - partNumSamples))
+      val valueCountMap: Map[Double, Double] = if (weightedNumSamples - 
partNumSamples > 1e-5) {
 
 Review comment:
   Utils.EPSILON is used elsewhere; should this tolerance (1e-5) really be 
different from it, and would you need an abs() around the difference?
   This would also be simpler if the map here were mutable: you could just put 
a value for 0.0.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to