Repository: spark Updated Branches: refs/heads/master e216ffaea -> 15cacc812
http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala index effb7b8..8972c22 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.tree.model.WeightedEnsembleModel +import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable @@ -48,7 +48,7 @@ object EnsembleTestHelper { } def validateClassifier( - model: WeightedEnsembleModel, + model: TreeEnsembleModel, input: Seq[LabeledPoint], requiredAccuracy: Double) { val predictions = input.map(x => model.predict(x.features)) @@ -60,17 +60,27 @@ object EnsembleTestHelper { s"validateClassifier calculated accuracy $accuracy but required $requiredAccuracy.") } + /** + * Validates a tree ensemble model for regression. + */ def validateRegressor( - model: WeightedEnsembleModel, + model: TreeEnsembleModel, input: Seq[LabeledPoint], - requiredMSE: Double) { + required: Double, + metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) - val squaredError = predictions.zip(input).map { case (prediction, expected) => - val err = prediction - expected.label - err * err - }.sum - val mse = squaredError / input.length - assert(mse <= requiredMSE, s"validateRegressor calculated MSE $mse but required $requiredMSE.") + val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => + prediction - label + } + val metric = metricName match { + case "mse" => + errors.map(err => err * err).sum / errors.size + case "mae" => + errors.map(math.abs).sum / errors.size + } + + assert(metric <= required, + s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala new file mode 100644 index 0000000..f3f8eff --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.tree + +import org.scalatest.FunSuite + +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.tree.configuration.Algo._ +import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} +import org.apache.spark.mllib.tree.impurity.Variance +import org.apache.spark.mllib.tree.loss.{AbsoluteError, SquaredError, LogLoss} + +import org.apache.spark.mllib.util.MLlibTestSparkContext + +/** + * Test suite for [[GradientBoostedTrees]]. + */ +class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext { + + test("Regression with continuous features: SquaredError") { + GradientBoostedTreesSuite.testCombinations.foreach { + case (numIterations, learningRate, subsamplingRate) => + val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) + val rdd = sc.parallelize(arr, 2) + + val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, + categoricalFeaturesInfo = Map.empty, subsamplingRate = subsamplingRate) + val boostingStrategy = + new BoostingStrategy(treeStrategy, SquaredError, numIterations, learningRate) + + val gbt = GradientBoostedTrees.train(rdd, boostingStrategy) + + assert(gbt.trees.size === numIterations) + EnsembleTestHelper.validateRegressor(gbt, arr, 0.03) + + val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) + val dt = DecisionTree.train(remappedInput, treeStrategy) + + // Make sure trees are the same. + assert(gbt.trees.head.toString == dt.toString) + } + } + + test("Regression with continuous features: Absolute Error") { + GradientBoostedTreesSuite.testCombinations.foreach { + case (numIterations, learningRate, subsamplingRate) => + val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) + val rdd = sc.parallelize(arr, 2) + + val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, + categoricalFeaturesInfo = Map.empty, subsamplingRate = subsamplingRate) + val boostingStrategy = + new BoostingStrategy(treeStrategy, AbsoluteError, numIterations, learningRate) + + val gbt = GradientBoostedTrees.train(rdd, boostingStrategy) + + assert(gbt.trees.size === numIterations) + EnsembleTestHelper.validateRegressor(gbt, arr, 0.85, "mae") + + val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) + val dt = DecisionTree.train(remappedInput, treeStrategy) + + // Make sure trees are the same. + assert(gbt.trees.head.toString == dt.toString) + } + } + + test("Binary classification with continuous features: Log Loss") { + GradientBoostedTreesSuite.testCombinations.foreach { + case (numIterations, learningRate, subsamplingRate) => + val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) + val rdd = sc.parallelize(arr, 2) + + val treeStrategy = new Strategy(algo = Classification, impurity = Variance, maxDepth = 2, + numClassesForClassification = 2, categoricalFeaturesInfo = Map.empty, + subsamplingRate = subsamplingRate) + val boostingStrategy = + new BoostingStrategy(treeStrategy, LogLoss, numIterations, learningRate) + + val gbt = GradientBoostedTrees.train(rdd, boostingStrategy) + + assert(gbt.trees.size === numIterations) + EnsembleTestHelper.validateClassifier(gbt, arr, 0.9) + + val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) + val ensembleStrategy = treeStrategy.copy + ensembleStrategy.algo = Regression + ensembleStrategy.impurity = Variance + val dt = DecisionTree.train(remappedInput, ensembleStrategy) + + // Make sure trees are the same. + assert(gbt.trees.head.toString == dt.toString) + } + } + +} + +object GradientBoostedTreesSuite { + + // Combinations for estimators, learning rates and subsamplingRate + val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 1.0, 0.75), (10, 0.1, 0.75)) +} http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala deleted file mode 100644 index 84de401..0000000 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostingSuite.scala +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.mllib.tree - -import org.scalatest.FunSuite - -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.tree.configuration.Algo._ -import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} -import org.apache.spark.mllib.tree.impurity.Variance -import org.apache.spark.mllib.tree.loss.{SquaredError, LogLoss} - -import org.apache.spark.mllib.util.MLlibTestSparkContext - -/** - * Test suite for [[GradientBoosting]]. - */ -class GradientBoostingSuite extends FunSuite with MLlibTestSparkContext { - - test("Regression with continuous features: SquaredError") { - GradientBoostingSuite.testCombinations.foreach { - case (numIterations, learningRate, subsamplingRate) => - val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) - val rdd = sc.parallelize(arr) - val categoricalFeaturesInfo = Map.empty[Int, Int] - - val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) - val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, - numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, - subsamplingRate = subsamplingRate) - - val dt = DecisionTree.train(remappedInput, treeStrategy) - - val boostingStrategy = new BoostingStrategy(Regression, numIterations, SquaredError, - learningRate, 1, treeStrategy) - - val gbt = GradientBoosting.trainRegressor(rdd, boostingStrategy) - assert(gbt.weakHypotheses.size === numIterations) - val gbtTree = gbt.weakHypotheses(0) - - EnsembleTestHelper.validateRegressor(gbt, arr, 0.03) - - // Make sure trees are the same. - assert(gbtTree.toString == dt.toString) - } - } - - test("Regression with continuous features: Absolute Error") { - GradientBoostingSuite.testCombinations.foreach { - case (numIterations, learningRate, subsamplingRate) => - val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) - val rdd = sc.parallelize(arr) - val categoricalFeaturesInfo = Map.empty[Int, Int] - - val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) - val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, - numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, - subsamplingRate = subsamplingRate) - - val dt = DecisionTree.train(remappedInput, treeStrategy) - - val boostingStrategy = new BoostingStrategy(Regression, numIterations, SquaredError, - learningRate, numClassesForClassification = 2, treeStrategy) - - val gbt = GradientBoosting.trainRegressor(rdd, boostingStrategy) - assert(gbt.weakHypotheses.size === numIterations) - val gbtTree = gbt.weakHypotheses(0) - - EnsembleTestHelper.validateRegressor(gbt, arr, 0.03) - - // Make sure trees are the same. - assert(gbtTree.toString == dt.toString) - } - } - - test("Binary classification with continuous features: Log Loss") { - GradientBoostingSuite.testCombinations.foreach { - case (numIterations, learningRate, subsamplingRate) => - val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) - val rdd = sc.parallelize(arr) - val categoricalFeaturesInfo = Map.empty[Int, Int] - - val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) - val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, - numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, - subsamplingRate = subsamplingRate) - - val dt = DecisionTree.train(remappedInput, treeStrategy) - - val boostingStrategy = new BoostingStrategy(Classification, numIterations, LogLoss, - learningRate, numClassesForClassification = 2, treeStrategy) - - val gbt = GradientBoosting.trainClassifier(rdd, boostingStrategy) - assert(gbt.weakHypotheses.size === numIterations) - val gbtTree = gbt.weakHypotheses(0) - - EnsembleTestHelper.validateClassifier(gbt, arr, 0.9) - - // Make sure trees are the same. - assert(gbtTree.toString == dt.toString) - } - } - -} - -object GradientBoostingSuite { - - // Combinations for estimators, learning rates and subsamplingRate - val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 1.0, 0.75), (10, 0.1, 0.75)) - -} http://git-wip-us.apache.org/repos/asf/spark/blob/15cacc81/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala index 2734e08..90a8c2d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala @@ -41,8 +41,8 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext { val rf = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees, featureSubsetStrategy = "auto", seed = 123) - assert(rf.weakHypotheses.size === 1) - val rfTree = rf.weakHypotheses(0) + assert(rf.trees.size === 1) + val rfTree = rf.trees(0) val dt = DecisionTree.train(rdd, strategy) @@ -65,7 +65,8 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext { " comparing DecisionTree vs. RandomForest(numTrees = 1)") { val categoricalFeaturesInfo = Map.empty[Int, Int] val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 2, - numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, useNodeIdCache = true) + numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, + useNodeIdCache = true) binaryClassificationTestWithContinuousFeatures(strategy) } @@ -76,8 +77,8 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext { val rf = RandomForest.trainRegressor(rdd, strategy, numTrees = numTrees, featureSubsetStrategy = "auto", seed = 123) - assert(rf.weakHypotheses.size === 1) - val rfTree = rf.weakHypotheses(0) + assert(rf.trees.size === 1) + val rfTree = rf.trees(0) val dt = DecisionTree.train(rdd, strategy) @@ -175,7 +176,8 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext { test("Binary classification with continuous features and node Id cache: subsampling features") { val categoricalFeaturesInfo = Map.empty[Int, Int] val strategy = new Strategy(algo = Classification, impurity = Gini, maxDepth = 2, - numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, useNodeIdCache = true) + numClassesForClassification = 2, categoricalFeaturesInfo = categoricalFeaturesInfo, + useNodeIdCache = true) binaryClassificationTestWithContinuousFeaturesAndSubsampledFeatures(strategy) } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
