Github user manishamde commented on a diff in the pull request:
https://github.com/apache/spark/pull/2435#discussion_r18070843
--- Diff:
mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala ---
@@ -649,71 +521,62 @@ object DecisionTree extends Serializable with Logging
{
// Calculate bin aggregates.
timer.start("aggregation")
val binAggregates: DTStatsAggregator = {
- val initAgg = new DTStatsAggregator(metadata, numNodes)
+ val initAgg = if (metadata.subsamplingFeatures) {
+ new DTStatsAggregatorSubsampledFeatures(metadata,
treeToNodeToIndexInfo)
+ } else {
+ new DTStatsAggregatorFixedFeatures(metadata, numNodes)
+ }
input.treeAggregate(initAgg)(binSeqOp, DTStatsAggregator.binCombOp)
}
timer.stop("aggregation")
- // Calculate best splits for all nodes at a given level
+ // Calculate best splits for all nodes in the group
timer.start("chooseSplits")
- // On the first iteration, we need to get and return the newly created
root node.
- var newTopNode: Node = topNode
-
- // Iterate over all nodes at this level
- var nodeIndex = 0
- var internalNodeCount = 0
- while (nodeIndex < numNodes) {
- val (split: Split, stats: InformationGainStats, predict: Predict) =
- binsToBestSplit(binAggregates, nodeIndex, level, metadata, splits)
- logDebug("best split = " + split)
-
- val globalNodeIndex = globalNodeIndexOffset + nodeIndex
- // Extract info for this node at the current level.
- val isLeaf = (stats.gain <= 0) || (level == metadata.maxDepth)
- val node =
- new Node(globalNodeIndex, predict.predict, isLeaf, Some(split),
None, None, Some(stats))
- logDebug("Node = " + node)
-
- if (!isLeaf) {
- internalNodeCount += 1
- }
- if (level == 0) {
- newTopNode = node
- } else {
- // Set parent.
- val parentNode = Node.getNode(Node.parentIndex(globalNodeIndex),
topNode)
- if (Node.isLeftChild(globalNodeIndex)) {
- parentNode.leftNode = Some(node)
- } else {
- parentNode.rightNode = Some(node)
+ // Iterate over all nodes in this group.
+ nodesForGroup.foreach { case (treeIndex, nodesForTree) =>
+ nodesForTree.foreach { node =>
+ val nodeIndex = node.id
+ val nodeInfo = treeToNodeToIndexInfo(treeIndex)(nodeIndex)
+ val aggNodeIndex = nodeInfo.nodeIndexInGroup
+ val featuresForNode = nodeInfo.featureSubset
+ val (split: Split, stats: InformationGainStats, predict: Predict) =
+ binsToBestSplit(binAggregates, aggNodeIndex, splits,
featuresForNode)
+ logDebug("best split = " + split)
+
+ // Extract info for this node. Create children if not leaf.
+ val isLeaf = (stats.gain <= 0) || (Node.indexToLevel(nodeIndex) ==
metadata.maxDepth)
+ assert(node.id == nodeIndex)
+ node.predict = predict.predict
+ node.isLeaf = isLeaf
+ node.stats = Some(stats)
+ logDebug("Node = " + node)
+
+ if (!isLeaf) {
+ node.split = Some(split)
+ node.leftNode =
Some(Node.emptyNode(Node.leftChildIndex(nodeIndex)))
+ node.rightNode =
Some(Node.emptyNode(Node.rightChildIndex(nodeIndex)))
+ nodeQueue.enqueue((treeIndex, node.leftNode.get))
--- End diff --
Well, it might not be possible to do a DFS with the current
`predictNodeIndex` method, since we assume each data point contributes to
exactly one node in a tree.
Having said that, is there a corner case where nodes of the same tree on
two different levels could be part of the same group during BFS?
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]