Github user smurching commented on a diff in the pull request:
https://github.com/apache/spark/pull/19666#discussion_r149238123
--- Diff: mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala ---
@@ -741,17 +678,43 @@ private[spark] object RandomForest extends Logging {
(splits(featureIndex)(bestFeatureSplitIndex),
bestFeatureGainStats)
} else if (binAggregates.metadata.isUnordered(featureIndex)) {
// Unordered categorical feature
- val leftChildOffset =
binAggregates.getFeatureOffset(featureIndexIdx)
- val (bestFeatureSplitIndex, bestFeatureGainStats) =
- Range(0, numSplits).map { splitIndex =>
- val leftChildStats =
binAggregates.getImpurityCalculator(leftChildOffset, splitIndex)
- val rightChildStats =
binAggregates.getParentImpurityCalculator()
- .subtract(leftChildStats)
+ val numBins = binAggregates.metadata.numBins(featureIndex)
+ val featureOffset =
binAggregates.getFeatureOffset(featureIndexIdx)
+
+ val binStatsArray = Array.tabulate(numBins) { binIndex =>
+ binAggregates.getImpurityCalculator(featureOffset, binIndex)
+ }
+ val parentStats = binAggregates.getParentImpurityCalculator()
+
+ var bestGain = Double.NegativeInfinity
+ var bestSet: BitSet = null
+ var bestLeftChildStats: ImpurityCalculator = null
+ var bestRightChildStats: ImpurityCalculator = null
+
+ traverseUnorderedSplits[ImpurityCalculator](numBins, null,
+ (stats, binIndex) => {
+ val binStats = binStatsArray(binIndex)
+ if (stats == null) {
+ binStats
+ } else {
+ stats.copy.add(binStats)
+ }
+ },
+ (set, leftChildStats) => {
+ val rightChildStats =
parentStats.copy.subtract(leftChildStats)
gainAndImpurityStats =
calculateImpurityStats(gainAndImpurityStats,
leftChildStats, rightChildStats, binAggregates.metadata)
- (splitIndex, gainAndImpurityStats)
- }.maxBy(_._2.gain)
- (splits(featureIndex)(bestFeatureSplitIndex),
bestFeatureGainStats)
+ if (gainAndImpurityStats.gain > bestGain) {
+ bestGain = gainAndImpurityStats.gain
+ bestSet = set | new BitSet(numBins) // copy set
--- End diff --
Why not use `set.copy()` here instead of OR-ing `set` with a new empty `BitSet(numBins)` to make the copy?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]