srowen commented on a change in pull request #21632: [SPARK-19591][ML][MLlib] Add sample weights to decision trees URL: https://github.com/apache/spark/pull/21632#discussion_r243597203
########## File path: mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala ########## @@ -1002,19 +1019,20 @@ private[spark] object RandomForest extends Logging with Serializable { val numSplits = metadata.numSplits(featureIndex) // get count for each distinct value except zero value - val partNumSamples = featureSamples.size - val partValueCountMap = scala.collection.mutable.Map[Double, Int]() - featureSamples.foreach { x => - partValueCountMap(x) = partValueCountMap.getOrElse(x, 0) + 1 - } + val (partValueCountMap, partNumSamples) = + featureSamples.foldLeft((Map.empty[Double, Double], 0.0)) { + case ((m, cnt), (w, x)) => + (m + ((x, m.getOrElse(x, 0.0) + w)), cnt + w) + } // Calculate the expected number of samples for finding splits - val numSamples = (samplesFractionForFindSplits(metadata) * metadata.numExamples).toInt + val weightedNumSamples = samplesFractionForFindSplits(metadata) * + metadata.weightedNumExamples // add expected zero value count and get complete statistics - val valueCountMap: Map[Double, Int] = if (numSamples - partNumSamples > 0) { - partValueCountMap.toMap + (0.0 -> (numSamples - partNumSamples)) + val valueCountMap: Map[Double, Double] = if (weightedNumSamples - partNumSamples > 1e-5) { Review comment: Utils.EPSILON is used elsewhere; should this tolerance be different and would you need an abs()? This is also easier if the map here is mutable, and you can just put a value for 0.0. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org