[GitHub] spark pull request: [SPARK-3366][MLLIB]Compute best splits distrib...

jkbradley Wed, 01 Oct 2014 11:15:13 -0700

Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/2595#discussion_r18297137
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala 
---
    @@ -68,90 +93,72 @@ private[tree] abstract class DTStatsAggregator(
        *                           (node, feature, left/right child) offset 
from
        *                           [[getLeftRightNodeFeatureOffsets]].
        */
    -  def getImpurityCalculator(nodeFeatureOffset: Int, binIndex: Int): 
ImpurityCalculator = {
    -    impurityAggregator.getCalculator(allStats, nodeFeatureOffset + 
binIndex * statsSize)
    +  def getImpurityCalculator(featureOffset: Int, binIndex: Int): 
ImpurityCalculator = {
    +    impurityAggregator.getCalculator(allStats, featureOffset + binIndex * 
statsSize)
       }
     
       /**
    -   * Update the stats for a given (node, feature, bin) for ordered 
features, using the given label.
    +   * Update the stats for a given (feature, bin) for ordered features, 
using the given label.
        */
    -  def update(
    -      nodeIndex: Int,
    -      featureIndex: Int,
    -      binIndex: Int,
    -      label: Double,
    -      instanceWeight: Double): Unit = {
    -    val i = getNodeFeatureOffset(nodeIndex, featureIndex) + binIndex * 
statsSize
    +  def update(featureIndex: Int, binIndex: Int, label: Double, 
instanceWeight: Double): Unit = {
    +    val i = featureOffsets(featureIndex) + binIndex * statsSize
         impurityAggregator.update(allStats, i, label, instanceWeight)
       }
     
       /**
    -   * Pre-compute node offset for use with [[nodeUpdate]].
    -   */
    -  def getNodeOffset(nodeIndex: Int): Int
    -
    -  /**
        * Faster version of [[update]].
    -   * Update the stats for a given (node, feature, bin) for ordered 
features, using the given label.
    -   * @param nodeOffset  Pre-computed node offset from [[getNodeOffset]].
    +   * Update the stats for a given (feature, bin), using the given label.
    +   * @param nodeFeatureOffset  For ordered features, this is a 
pre-computed feature offset
    +   *                           from [[getNodeFeatureOffset]].
    +   *                           For unordered features, this is a 
pre-computed
    +   *                           (feature, left/right child) offset from
    +   *                           [[getLeftRightNodeFeatureOffsets]].
        */
    -  def nodeUpdate(
    -      nodeOffset: Int,
    -      nodeIndex: Int,
    -      featureIndex: Int,
    +  def nodeFeatureUpdate(
    +      nodeFeatureOffset: Int,
           binIndex: Int,
           label: Double,
    -      instanceWeight: Double): Unit
    +      instanceWeight: Double): Unit = {
    +    impurityAggregator.update(allStats, nodeFeatureOffset + binIndex * 
statsSize,
    +      label, instanceWeight)
    +  }
     
       /**
    -   * Pre-compute (node, feature) offset for use with [[nodeFeatureUpdate]].
    +   * Pre-compute feature offset for use with [[nodeFeatureUpdate]].
        * For ordered features only.
        */
    -  def getNodeFeatureOffset(nodeIndex: Int, featureIndex: Int): Int
    +  def getFeatureOffset(featureIndex: Int): Int = {
    +    require(!isUnordered(featureIndex),
    +      s"DTStatsAggregator.getNodeFeatureOffset is for ordered features 
only, but was called" +
    +        s" for unordered feature $featureIndex.")
    +    featureOffsets(featureIndex)
    +  }
     
       /**
    -   * Pre-compute (node, feature) offset for use with [[nodeFeatureUpdate]].
    +   * Pre-compute feature offset for use with [[nodeFeatureUpdate]].
        * For unordered features only.
        */
    -  def getLeftRightNodeFeatureOffsets(nodeIndex: Int, featureIndex: Int): 
(Int, Int) = {
    +  def getLeftRightFeatureOffsets(featureIndex: Int): (Int, Int) = {
         require(isUnordered(featureIndex),
           s"DTStatsAggregator.getLeftRightNodeFeatureOffsets is for unordered 
features only," +
             s" but was called for ordered feature $featureIndex.")
    -    val baseOffset = getNodeFeatureOffset(nodeIndex, featureIndex)
    -    (baseOffset, baseOffset + (metadata.numBins(featureIndex) >> 1) * 
statsSize)
    +    val baseOffset = featureOffsets(featureIndex)
    +    (baseOffset, baseOffset + (numBins(featureIndex) >> 1) * statsSize)
       }
     
       /**
    -   * Faster version of [[update]].
    -   * Update the stats for a given (node, feature, bin), using the given 
label.
    -   * @param nodeFeatureOffset  For ordered features, this is a 
pre-computed (node, feature) offset
    +   * For a given feature, merge the stats for two bins.
    +   * @param nodeFeatureOffset  For ordered features, this is a 
pre-computed feature offset
    --- End diff --
    
    The doc comments still use the old names: "nodeFeatureOffset", "getNodeFeatureOffset", 
"getLeftRightNodeFeatureOffsets"



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-3366][MLLIB]Compute best splits distrib...

Reply via email to