Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/2341#discussion_r17466105
--- Diff:
mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala ---
@@ -120,81 +114,35 @@ class DecisionTree (private val strategy: Strategy)
extends Serializable with Lo
* beforehand and is not used in later levels.
*/
+ var topNode: Node = null // set on first iteration
var level = 0
var break = false
while (level <= maxDepth && !break) {
-
logDebug("#####################################")
logDebug("level = " + level)
logDebug("#####################################")
// Find best split for all nodes at a level.
timer.start("findBestSplits")
- val splitsStatsForLevel: Array[(Split, InformationGainStats,
Predict)] =
- DecisionTree.findBestSplits(treeInput, parentImpurities,
- metadata, level, nodes, splits, bins, maxLevelForSingleGroup,
timer)
+ val (tmpTopNode: Node, doneTraining: Boolean) =
DecisionTree.findBestSplits(treeInput,
+ metadata, level, topNode, splits, bins, maxLevelForSingleGroup,
timer)
timer.stop("findBestSplits")
- val levelNodeIndexOffset = Node.startIndexInLevel(level)
- for ((nodeSplitStats, index) <-
splitsStatsForLevel.view.zipWithIndex) {
- val nodeIndex = levelNodeIndexOffset + index
-
- // Extract info for this node (index) at the current level.
- timer.start("extractNodeInfo")
- val split = nodeSplitStats._1
- val stats = nodeSplitStats._2
- val predict = nodeSplitStats._3.predict
- val isLeaf = (stats.gain <= 0) || (level == strategy.maxDepth)
- val node = new Node(nodeIndex, predict, isLeaf, Some(split), None,
None, Some(stats))
- logDebug("Node = " + node)
- nodes(nodeIndex) = node
- timer.stop("extractNodeInfo")
-
- if (level != 0) {
- // Set parent.
- val parentNodeIndex = Node.parentIndex(nodeIndex)
- if (Node.isLeftChild(nodeIndex)) {
- nodes(parentNodeIndex).leftNode = Some(nodes(nodeIndex))
- } else {
- nodes(parentNodeIndex).rightNode = Some(nodes(nodeIndex))
- }
- }
- // Extract info for nodes at the next lower level.
- timer.start("extractInfoForLowerLevels")
- if (level < maxDepth) {
- val leftChildIndex = Node.leftChildIndex(nodeIndex)
- val leftImpurity = stats.leftImpurity
- logDebug("leftChildIndex = " + leftChildIndex + ", impurity = "
+ leftImpurity)
- parentImpurities(leftChildIndex) = leftImpurity
-
- val rightChildIndex = Node.rightChildIndex(nodeIndex)
- val rightImpurity = stats.rightImpurity
- logDebug("rightChildIndex = " + rightChildIndex + ", impurity =
" + rightImpurity)
- parentImpurities(rightChildIndex) = rightImpurity
- }
- timer.stop("extractInfoForLowerLevels")
- logDebug("final best split = " + split)
+ if (level == 0) {
+ topNode = tmpTopNode
}
- require(Node.maxNodesInLevel(level) == splitsStatsForLevel.length)
- // Check whether all the nodes at the current level at leaves.
- val allLeaf = splitsStatsForLevel.forall(_._2.gain <= 0)
- logDebug("all leaf = " + allLeaf)
- if (allLeaf) {
- break = true // no more tree construction
- } else {
- level += 1
+ if (doneTraining) {
+ break = true
--- End diff --
Shall we remove `break` and only use `doneTraining`?
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]