[ https://issues.apache.org/jira/browse/SPARK-21638?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Peng Meng updated SPARK-21638: ------------------------------ Description: When training an RF model, there are many warning messages like this: {quote}WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.{quote} This warning message is unnecessary and the data it reports is not accurate. This is because of the following code: {code:java} while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) { val (treeIndex, node) = nodeStack.top // Choose subset of features for node (if subsampling). val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) { Some(SamplingUtils.reservoirSampleAndCount(Range(0, metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1) } else { None } // Check if enough memory remains to add this node to the group. val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { nodeStack.pop() mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) += node mutableTreeToNodeToIndexInfo .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id) = new NodeIndexInfo(numNodesInGroup, featureSubset) } numNodesInGroup += 1 *//we not add the node to mutableNodesForGroup, but we add memUsage here.* memUsage += nodeMemUsage } if (memUsage > maxMemoryUsage) { // If maxMemoryUsage is 0, we should still allow splitting 1 node. logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" + s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This allows splitting" + s" $numNodesInGroup nodes in this iteration.") } {code} To avoid this unnecessary warning, we should change the code like this: {code:java} while (nodeStack.nonEmpty) { val (treeIndex, node) = nodeStack.top // Choose subset of features for node (if subsampling). 
val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) { Some(SamplingUtils.reservoirSampleAndCount(Range(0, metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1) } else { None } // Check if enough memory remains to add this node to the group. val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { nodeStack.pop() mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) += node mutableTreeToNodeToIndexInfo .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id) = new NodeIndexInfo(numNodesInGroup, featureSubset) numNodesInGroup += 1 //we not add the node to mutableNodesForGroup, but we add memUsage here. memUsage += nodeMemUsage } else { break } } {code} was: When train RF model, there is many warning message like this: {quote}WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.{quote} This warning message is unnecessary and the data is not accuracy. This is because {code:java} while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) { val (treeIndex, node) = nodeStack.top // Choose subset of features for node (if subsampling). val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) { Some(SamplingUtils.reservoirSampleAndCount(Range(0, metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1) } else { None } // Check if enough memory remains to add this node to the group. 
val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { nodeStack.pop() mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) += node mutableTreeToNodeToIndexInfo .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id) = new NodeIndexInfo(numNodesInGroup, featureSubset) } numNodesInGroup += 1 *//we not add the node to mutableNodesForGroup, but we add memUsage here.* memUsage += nodeMemUsage } if (memUsage > maxMemoryUsage) { // If maxMemoryUsage is 0, we should still allow splitting 1 node. logWarning(s"Tree learning is using approximately $memUsage bytes per iteration, which" + s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This allows splitting" + s" $numNodesInGroup nodes in this iteration.") } {code} To avoid this unnecessary warning, we should change the code like this: {code:java} while (nodeStack.nonEmpty) { val (treeIndex, node) = nodeStack.top // Choose subset of features for node (if subsampling). val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) { Some(SamplingUtils.reservoirSampleAndCount(Range(0, metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1) } else { None } // Check if enough memory remains to add this node to the group. val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { nodeStack.pop() mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) += node mutableTreeToNodeToIndexInfo .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id) = new NodeIndexInfo(numNodesInGroup, featureSubset) numNodesInGroup += 1 //we not add the node to mutableNodesForGroup, but we add memUsage here. 
memUsage += nodeMemUsage } else { break } } {code} > Warning message of RF is not accurate > ------------------------------------- > > Key: SPARK-21638 > URL: https://issues.apache.org/jira/browse/SPARK-21638 > Project: Spark > Issue Type: Bug > Components: ML > Affects Versions: 2.3.0 > Environment: > Reporter: Peng Meng > Priority: Minor > > When training an RF model, there are many warning messages like this: > {quote}WARN RandomForest: Tree learning is using approximately 268492800 > bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. > This allows splitting 2622 nodes in this iteration.{quote} > This warning message is unnecessary and the data it reports is not accurate. > This is because of the following code: > {code:java} > while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) { > val (treeIndex, node) = nodeStack.top > // Choose subset of features for node (if subsampling). > val featureSubset: Option[Array[Int]] = if > (metadata.subsamplingFeatures) { > Some(SamplingUtils.reservoirSampleAndCount(Range(0, > metadata.numFeatures).iterator, metadata.numFeaturesPerNode, > rng.nextLong())._1) > } else { > None > } > // Check if enough memory remains to add this node to the group. > val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, > featureSubset) * 8L > if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { > nodeStack.pop() > mutableNodesForGroup.getOrElseUpdate(treeIndex, new > mutable.ArrayBuffer[LearningNode]()) += > node > mutableTreeToNodeToIndexInfo > .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, > NodeIndexInfo]())(node.id) > = new NodeIndexInfo(numNodesInGroup, featureSubset) > } > numNodesInGroup += 1 *//we not add the node to mutableNodesForGroup, > but we add memUsage here.* > memUsage += nodeMemUsage > } > if (memUsage > maxMemoryUsage) { > // If maxMemoryUsage is 0, we should still allow splitting 1 node. 
> logWarning(s"Tree learning is using approximately $memUsage bytes per > iteration, which" + > s" exceeds requested limit maxMemoryUsage=$maxMemoryUsage. This > allows splitting" + > s" $numNodesInGroup nodes in this iteration.") > } > {code} > To avoid this unnecessary warning, we should change the code like this: > {code:java} > while (nodeStack.nonEmpty) { > val (treeIndex, node) = nodeStack.top > // Choose subset of features for node (if subsampling). > val featureSubset: Option[Array[Int]] = if > (metadata.subsamplingFeatures) { > Some(SamplingUtils.reservoirSampleAndCount(Range(0, > metadata.numFeatures).iterator, metadata.numFeaturesPerNode, > rng.nextLong())._1) > } else { > None > } > // Check if enough memory remains to add this node to the group. > val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, > featureSubset) * 8L > if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) { > nodeStack.pop() > mutableNodesForGroup.getOrElseUpdate(treeIndex, new > mutable.ArrayBuffer[LearningNode]()) += > node > mutableTreeToNodeToIndexInfo > .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, > NodeIndexInfo]())(node.id) > = new NodeIndexInfo(numNodesInGroup, featureSubset) > numNodesInGroup += 1 //we not add the node to > mutableNodesForGroup, but we add memUsage here. > memUsage += nodeMemUsage > } else { > break > } > } > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org