[1/2] spark git commit: [SPARK-4486][MLLIB] Improve GradientBoosting APIs and doc

meng Thu, 20 Nov 2014 00:49:33 -0800

Repository: spark
Updated Branches:
  refs/heads/master e216ffaea -> 15cacc812



http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index effb7b8..8972c22 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.tree
 
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.model.WeightedEnsembleModel
+import org.apache.spark.mllib.tree.model.TreeEnsembleModel
 import org.apache.spark.util.StatCounter
 
 import scala.collection.mutable
@@ -48,7 +48,7 @@ object EnsembleTestHelper {
   }
 
   def validateClassifier(
-      model: WeightedEnsembleModel,
+      model: TreeEnsembleModel,
       input: Seq[LabeledPoint],
       requiredAccuracy: Double) {
     val predictions = input.map(x => model.predict(x.features))
@@ -60,17 +60,27 @@ object EnsembleTestHelper {
       s"validateClassifier calculated accuracy $accuracy but required 
$requiredAccuracy.")
   }
 
+  /**
+   * Validates a tree ensemble model for regression.
+   */
   def validateRegressor(
-      model: WeightedEnsembleModel,
+      model: TreeEnsembleModel,
       input: Seq[LabeledPoint],
-      requiredMSE: Double) {
+      required: Double,
+      metricName: String = "mse") {
     val predictions = input.map(x => model.predict(x.features))
-    val squaredError = predictions.zip(input).map { case (prediction, 
expected) =>
-      val err = prediction - expected.label
-      err * err
-    }.sum
-    val mse = squaredError / input.length
-    assert(mse <= requiredMSE, s"validateRegressor calculated MSE $mse but 
required $requiredMSE.")
+    val errors = predictions.zip(input.map(_.label)).map { case (prediction, 
label) =>
+      prediction - label
+    }
+    val metric = metricName match {
+      case "mse" =>
+        errors.map(err => err * err).sum / errors.size
+      case "mae" =>
+        errors.map(math.abs).sum / errors.size
+    }
+
+    assert(metric <= required,
+      s"validateRegressor calculated $metricName $metric but required 
$required.")
   }
 
   def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): 
Array[LabeledPoint] = {

http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
new file mode 100644
index 0000000..f3f8eff
--- /dev/null
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.tree
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.tree.configuration.Algo._
+import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
+import org.apache.spark.mllib.tree.impurity.Variance
+import org.apache.spark.mllib.tree.loss.{AbsoluteError, SquaredError, LogLoss}
+
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+/**
+ * Test suite for [[GradientBoostedTrees]].
+ */
+class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
+
+  test("Regression with continuous features: SquaredError") {
+    GradientBoostedTreesSuite.testCombinations.foreach {
+      case (numIterations, learningRate, subsamplingRate) =>
+        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
+        val rdd = sc.parallelize(arr, 2)
+
+        val treeStrategy = new Strategy(algo = Regression, impurity = 
Variance, maxDepth = 2,
+          categoricalFeaturesInfo = Map.empty, subsamplingRate = 
subsamplingRate)
+        val boostingStrategy =
+          new BoostingStrategy(treeStrategy, SquaredError, numIterations, 
learningRate)
+
+        val gbt = GradientBoostedTrees.train(rdd, boostingStrategy)
+
+        assert(gbt.trees.size === numIterations)
+        EnsembleTestHelper.validateRegressor(gbt, arr, 0.03)
+
+        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
+        val dt = DecisionTree.train(remappedInput, treeStrategy)
+
+        // Make sure trees are the same.
+        assert(gbt.trees.head.toString == dt.toString)
+    }
+  }
+
+  test("Regression with continuous features: Absolute Error") {
+    GradientBoostedTreesSuite.testCombinations.foreach {
+      case (numIterations, learningRate, subsamplingRate) =>
+        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
+        val rdd = sc.parallelize(arr, 2)
+
+        val treeStrategy = new Strategy(algo = Regression, impurity = 
Variance, maxDepth = 2,
+          categoricalFeaturesInfo = Map.empty, subsamplingRate = 
subsamplingRate)
+        val boostingStrategy =
+          new BoostingStrategy(treeStrategy, AbsoluteError, numIterations, 
learningRate)
+
+        val gbt = GradientBoostedTrees.train(rdd, boostingStrategy)
+
+        assert(gbt.trees.size === numIterations)
+        EnsembleTestHelper.validateRegressor(gbt, arr, 0.85, "mae")
+
+        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
+        val dt = DecisionTree.train(remappedInput, treeStrategy)
+
+        // Make sure trees are the same.
+        assert(gbt.trees.head.toString == dt.toString)
+    }
+  }
+
+  test("Binary classification with continuous features: Log Loss") {
+    GradientBoostedTreesSuite.testCombinations.foreach {
+      case (numIterations, learningRate, subsamplingRate) =>
+        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
+        val rdd = sc.parallelize(arr, 2)
+
+        val treeStrategy = new Strategy(algo = Classification, impurity = 
Variance, maxDepth = 2,
+          numClassesForClassification = 2, categoricalFeaturesInfo = Map.empty,
+          subsamplingRate = subsamplingRate)
+        val boostingStrategy =
+          new BoostingStrategy(treeStrategy, LogLoss, numIterations, 
learningRate)
+
+        val gbt = GradientBoostedTrees.train(rdd, boostingStrategy)
+
+        assert(gbt.trees.size === numIterations)
+        EnsembleTestHelper.validateClassifier(gbt, arr, 0.9)
+
+        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
+        val ensembleStrategy = treeStrategy.copy
+        ensembleStrategy.algo = Regression
+        ensembleStrategy.impurity = Variance
+        val dt = DecisionTree.train(remappedInput, ensembleStrategy)
+
+        // Make sure trees are the same.
+        assert(gbt.trees.head.toString == dt.toString)
+    }
+  }
+
+}
+
+object GradientBoostedTreesSuite {
+
+  // Combinations for estimators, learning rates and subsamplingRate
+  val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 1.0, 
0.75), (10, 0.1, 0.75))
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala
deleted file mode 100644
index 84de401..0000000
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.mllib.tree
-
-import org.scalatest.FunSuite
-
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.configuration.Algo._
-import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
-import org.apache.spark.mllib.tree.impurity.Variance
-import org.apache.spark.mllib.tree.loss.{SquaredError, LogLoss}
-
-import org.apache.spark.mllib.util.MLlibTestSparkContext
-
-/**
- * Test suite for [[GradientBoosting]].
- */
-class GradientBoostingSuite extends FunSuite with MLlibTestSparkContext {
-
-  test("Regression with continuous features: SquaredError") {
-    GradientBoostingSuite.testCombinations.foreach {
-      case (numIterations, learningRate, subsamplingRate) =>
-        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
-        val rdd = sc.parallelize(arr)
-        val categoricalFeaturesInfo = Map.empty[Int, Int]
-
-        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
-        val treeStrategy = new Strategy(algo = Regression, impurity = 
Variance, maxDepth = 2,
-          numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo,
-          subsamplingRate = subsamplingRate)
-
-        val dt = DecisionTree.train(remappedInput, treeStrategy)
-
-        val boostingStrategy = new BoostingStrategy(Regression, numIterations, 
SquaredError,
-          learningRate, 1, treeStrategy)
-
-        val gbt = GradientBoosting.trainRegressor(rdd, boostingStrategy)
-        assert(gbt.weakHypotheses.size === numIterations)
-        val gbtTree = gbt.weakHypotheses(0)
-
-        EnsembleTestHelper.validateRegressor(gbt, arr, 0.03)
-
-        // Make sure trees are the same.
-        assert(gbtTree.toString == dt.toString)
-    }
-  }
-
-  test("Regression with continuous features: Absolute Error") {
-    GradientBoostingSuite.testCombinations.foreach {
-      case (numIterations, learningRate, subsamplingRate) =>
-        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
-        val rdd = sc.parallelize(arr)
-        val categoricalFeaturesInfo = Map.empty[Int, Int]
-
-        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
-        val treeStrategy = new Strategy(algo = Regression, impurity = 
Variance, maxDepth = 2,
-          numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo,
-          subsamplingRate = subsamplingRate)
-
-        val dt = DecisionTree.train(remappedInput, treeStrategy)
-
-        val boostingStrategy = new BoostingStrategy(Regression, numIterations, 
SquaredError,
-          learningRate, numClassesForClassification = 2, treeStrategy)
-
-        val gbt = GradientBoosting.trainRegressor(rdd, boostingStrategy)
-        assert(gbt.weakHypotheses.size === numIterations)
-        val gbtTree = gbt.weakHypotheses(0)
-
-        EnsembleTestHelper.validateRegressor(gbt, arr, 0.03)
-
-        // Make sure trees are the same.
-        assert(gbtTree.toString == dt.toString)
-    }
-  }
-
-  test("Binary classification with continuous features: Log Loss") {
-    GradientBoostingSuite.testCombinations.foreach {
-      case (numIterations, learningRate, subsamplingRate) =>
-        val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures 
= 10, 100)
-        val rdd = sc.parallelize(arr)
-        val categoricalFeaturesInfo = Map.empty[Int, Int]
-
-        val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, 
x.features))
-        val treeStrategy = new Strategy(algo = Regression, impurity = 
Variance, maxDepth = 2,
-          numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo,
-          subsamplingRate = subsamplingRate)
-
-        val dt = DecisionTree.train(remappedInput, treeStrategy)
-
-        val boostingStrategy = new BoostingStrategy(Classification, 
numIterations, LogLoss,
-          learningRate, numClassesForClassification = 2, treeStrategy)
-
-        val gbt = GradientBoosting.trainClassifier(rdd, boostingStrategy)
-        assert(gbt.weakHypotheses.size === numIterations)
-        val gbtTree = gbt.weakHypotheses(0)
-
-        EnsembleTestHelper.validateClassifier(gbt, arr, 0.9)
-
-        // Make sure trees are the same.
-        assert(gbtTree.toString == dt.toString)
-    }
-  }
-
-}
-
-object GradientBoostingSuite {
-
-  // Combinations for estimators, learning rates and subsamplingRate
-  val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 1.0, 
0.75), (10, 0.1, 0.75))
-
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
index 2734e08..90a8c2d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
@@ -41,8 +41,8 @@ class RandomForestSuite extends FunSuite with 
MLlibTestSparkContext {
 
     val rf = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.weakHypotheses.size === 1)
-    val rfTree = rf.weakHypotheses(0)
+    assert(rf.trees.size === 1)
+    val rfTree = rf.trees(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
@@ -65,7 +65,8 @@ class RandomForestSuite extends FunSuite with 
MLlibTestSparkContext {
     " comparing DecisionTree vs. RandomForest(numTrees = 1)") {
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val strategy = new Strategy(algo = Classification, impurity = Gini, 
maxDepth = 2,
-      numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo, useNodeIdCache = true)
+      numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo,
+      useNodeIdCache = true)
     binaryClassificationTestWithContinuousFeatures(strategy)
   }
 
@@ -76,8 +77,8 @@ class RandomForestSuite extends FunSuite with 
MLlibTestSparkContext {
 
     val rf = RandomForest.trainRegressor(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.weakHypotheses.size === 1)
-    val rfTree = rf.weakHypotheses(0)
+    assert(rf.trees.size === 1)
+    val rfTree = rf.trees(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
@@ -175,7 +176,8 @@ class RandomForestSuite extends FunSuite with 
MLlibTestSparkContext {
   test("Binary classification with continuous features and node Id cache: 
subsampling features") {
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val strategy = new Strategy(algo = Classification, impurity = Gini, 
maxDepth = 2,
-      numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo, useNodeIdCache = true)
+      numClassesForClassification = 2, categoricalFeaturesInfo = 
categoricalFeaturesInfo,
+      useNodeIdCache = true)
     
binaryClassificationTestWithContinuousFeaturesAndSubsampledFeatures(strategy)
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org

[1/2] spark git commit: [SPARK-4486][MLLIB] Improve GradientBoosting APIs and doc

Reply via email to