Repository: spark
Updated Branches:
  refs/heads/master e07fb6a41 -> 860219551
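For context, the changes below do two things: the BaggedPoint tests move out of RandomForestSuite into a dedicated BaggedPointSuite, and the remaining tests switch to the shared EnsembleTestHelper utilities and to the renamed ensemble accessor (the model's trees are now exposed as weakHypotheses). A minimal driver exercising the renamed accessor is sketched here; the Strategy construction is an assumption about this version's API, and the object name is illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.configuration.Strategy
import org.apache.spark.mllib.tree.impurity.Gini

// Hypothetical driver object; the trainClassifier call mirrors the tests below.
object WeakHypothesesExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("rf-example"))
    // Tiny ordered dataset in the spirit of EnsembleTestHelper.generateOrderedLabeledPoints.
    val points = (0 until 100).map { i =>
      LabeledPoint(if (i < 50) 0.0 else 1.0, Vectors.dense(i.toDouble))
    }
    val rdd = sc.parallelize(points)
    // Assumed Strategy signature for this Spark version (algo, impurity, maxDepth, ...).
    val strategy = new Strategy(Classification, Gini, maxDepth = 2,
      numClassesForClassification = 2)
    val model = RandomForest.trainClassifier(rdd, strategy, numTrees = 2,
      featureSubsetStrategy = "auto", seed = 123)
    // Renamed in this commit: previously model.trees and model.trees(0).
    assert(model.weakHypotheses.size == 2)
    println(model.weakHypotheses(0))
    sc.stop()
  }
}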
http://git-wip-us.apache.org/repos/asf/spark/blob/86021955/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
index d3eff59..10c046e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
@@ -25,45 +25,20 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.Strategy
-import org.apache.spark.mllib.tree.impl.{BaggedPoint, DecisionTreeMetadata}
+import org.apache.spark.mllib.tree.impl.DecisionTreeMetadata
 import org.apache.spark.mllib.tree.impurity.{Gini, Variance}
-import org.apache.spark.mllib.tree.model.{Node, RandomForestModel}
+import org.apache.spark.mllib.tree.model.Node
 import org.apache.spark.mllib.util.LocalSparkContext
-import org.apache.spark.util.StatCounter
 
 /**
  * Test suite for [[RandomForest]].
  */
 class RandomForestSuite extends FunSuite with LocalSparkContext {
 
-  test("BaggedPoint RDD: without subsampling") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 1)
-    val rdd = sc.parallelize(arr)
-    val baggedRDD = BaggedPoint.convertToBaggedRDDWithoutSampling(rdd)
-    baggedRDD.collect().foreach { baggedPoint =>
-      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
-    }
-  }
-
-  test("BaggedPoint RDD: with subsampling") {
-    val numSubsamples = 100
-    val (expectedMean, expectedStddev) = (1.0, 1.0)
-
-    val seeds = Array(123, 5354, 230, 349867, 23987)
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 1)
-    val rdd = sc.parallelize(arr)
-    seeds.foreach { seed =>
-      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, numSubsamples, seed = seed)
-      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
-      RandomForestSuite.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
-        expectedStddev, epsilon = 0.01)
-    }
-  }
-
   test("Binary classification with continuous features:" +
     " comparing DecisionTree vs. RandomForest(numTrees = 1)") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 50)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val numTrees = 1
@@ -73,12 +48,12 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
     val rf = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.trees.size === 1)
-    val rfTree = rf.trees(0)
+    assert(rf.weakHypotheses.size === 1)
+    val rfTree = rf.weakHypotheses(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
-    RandomForestSuite.validateClassifier(rf, arr, 0.9)
+    EnsembleTestHelper.validateClassifier(rf, arr, 0.9)
     DecisionTreeSuite.validateClassifier(dt, arr, 0.9)
 
     // Make sure trees are the same.
@@ -88,7 +63,7 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
 
   test("Regression with continuous features:" +
     " comparing DecisionTree vs. RandomForest(numTrees = 1)") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 50)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val numTrees = 1
@@ -99,12 +74,12 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
     val rf = RandomForest.trainRegressor(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.trees.size === 1)
-    val rfTree = rf.trees(0)
+    assert(rf.weakHypotheses.size === 1)
+    val rfTree = rf.weakHypotheses(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
-    RandomForestSuite.validateRegressor(rf, arr, 0.01)
+    EnsembleTestHelper.validateRegressor(rf, arr, 0.01)
     DecisionTreeSuite.validateRegressor(dt, arr, 0.01)
 
     // Make sure trees are the same.
@@ -113,7 +88,7 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
 
   test("Binary classification with continuous features: subsampling features") {
     val numFeatures = 50
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
 
@@ -187,77 +162,9 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
       numClassesForClassification = 3, categoricalFeaturesInfo = categoricalFeaturesInfo)
     val model = RandomForest.trainClassifier(input, strategy, numTrees = 2,
       featureSubsetStrategy = "sqrt", seed = 12345)
-    RandomForestSuite.validateClassifier(model, arr, 0.0)
+    EnsembleTestHelper.validateClassifier(model, arr, 1.0)
   }
 }
-object RandomForestSuite {
-
-  /**
-   * Aggregates all values in data, and tests whether the empirical mean and stddev are within
-   * epsilon of the expected values.
-   * @param data Every element of the data should be an i.i.d. sample from some distribution.
-   */
-  def testRandomArrays(
-      data: Array[Array[Double]],
-      numCols: Int,
-      expectedMean: Double,
-      expectedStddev: Double,
-      epsilon: Double) {
-    val values = new mutable.ArrayBuffer[Double]()
-    data.foreach { row =>
-      assert(row.size == numCols)
-      values ++= row
-    }
-    val stats = new StatCounter(values)
-    assert(math.abs(stats.mean - expectedMean) < epsilon)
-    assert(math.abs(stats.stdev - expectedStddev) < epsilon)
-  }
-
-  def validateClassifier(
-      model: RandomForestModel,
-      input: Seq[LabeledPoint],
-      requiredAccuracy: Double) {
-    val predictions = input.map(x => model.predict(x.features))
-    val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
-      prediction != expected.label
-    }
-    val accuracy = (input.length - numOffPredictions).toDouble / input.length
-    assert(accuracy >= requiredAccuracy,
-      s"validateClassifier calculated accuracy $accuracy but required $requiredAccuracy.")
-  }
-
-  def validateRegressor(
-      model: RandomForestModel,
-      input: Seq[LabeledPoint],
-      requiredMSE: Double) {
-    val predictions = input.map(x => model.predict(x.features))
-    val squaredError = predictions.zip(input).map { case (prediction, expected) =>
-      val err = prediction - expected.label
-      err * err
-    }.sum
-    val mse = squaredError / input.length
-    assert(mse <= requiredMSE, s"validateRegressor calculated MSE $mse but required $requiredMSE.")
-  }
-  def generateOrderedLabeledPoints(numFeatures: Int): Array[LabeledPoint] = {
-    val numInstances = 1000
-    val arr = new Array[LabeledPoint](numInstances)
-    for (i <- 0 until numInstances) {
-      val label = if (i < numInstances / 10) {
-        0.0
-      } else if (i < numInstances / 2) {
-        1.0
-      } else if (i < numInstances * 0.9) {
-        0.0
-      } else {
-        1.0
-      }
-      val features = Array.fill[Double](numFeatures)(i.toDouble)
-      arr(i) = new LabeledPoint(label, Vectors.dense(features))
-    }
-    arr
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/86021955/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
new file mode 100644
index 0000000..c0a62e0
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.tree.impl
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.tree.EnsembleTestHelper
+import org.apache.spark.mllib.util.LocalSparkContext
+
+/**
+ * Test suite for [[BaggedPoint]].
+ */
+class BaggedPointSuite extends FunSuite with LocalSparkContext {
+
+  test("BaggedPoint RDD: without subsampling") {
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false)
+    baggedRDD.collect().foreach { baggedPoint =>
+      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
+    val numSubsamples = 100
+    val (expectedMean, expectedStddev) = (1.0, 1.0)
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
+    val numSubsamples = 100
+    val subsample = 0.5
+    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
+    val numSubsamples = 100
+    val (expectedMean, expectedStddev) = (1.0, 0)
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
+    val numSubsamples = 100
+    val subsample = 0.5
+    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+}
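A note on the expected values asserted in BaggedPointSuite: sampling with replacement at rate f gives each point a bootstrap count with mean f and standard deviation sqrt(f) (the Poisson moments, which appears to be how BaggedPoint draws its weights), while sampling without replacement gives a Bernoulli(f) inclusion weight with mean f and standard deviation sqrt(f * (1 - f)); at f = 1.0 without replacement every weight is exactly 1, hence expectedStddev = 0. The standalone sketch below (not part of the commit; it needs no Spark, and all names are illustrative) reproduces these moments empirically:

import scala.util.Random

object SubsampleStats {

  // Poisson(lambda) by Knuth's inversion; adequate for the small rates used here.
  def poisson(lambda: Double, rng: Random): Int = {
    val l = math.exp(-lambda)
    var k = 0
    var p = 1.0
    do {
      k += 1
      p *= rng.nextDouble()
    } while (p > l)
    k - 1
  }

  def meanStddev(xs: Array[Double]): (Double, Double) = {
    val mean = xs.sum / xs.length
    val variance = xs.map(x => (x - mean) * (x - mean)).sum / xs.length
    (mean, math.sqrt(variance))
  }

  def main(args: Array[String]): Unit = {
    val rng = new Random(123)
    val n = 100000
    for (fraction <- Seq(1.0, 0.5)) {
      // With replacement: counts ~ Poisson(fraction) => stddev sqrt(fraction).
      val withRepl = Array.fill(n)(poisson(fraction, rng).toDouble)
      // Without replacement: weights ~ Bernoulli(fraction)
      // => stddev sqrt(fraction * (1 - fraction)).
      val withoutRepl = Array.fill(n)(if (rng.nextDouble() < fraction) 1.0 else 0.0)
      val (m1, s1) = meanStddev(withRepl)
      val (m2, s2) = meanStddev(withoutRepl)
      println(f"fraction=$fraction%.1f  with repl.:    mean=$m1%.3f stddev=$s1%.3f " +
        f"(expect $fraction%.1f, ${math.sqrt(fraction)}%.3f)")
      println(f"fraction=$fraction%.1f  without repl.: mean=$m2%.3f stddev=$s2%.3f " +
        f"(expect $fraction%.1f, ${math.sqrt(fraction * (1 - fraction))}%.3f)")
    }
  }
}

With 1000 points and 100 subsamples per seed, the suite aggregates 100000 weights per run, so the empirical mean and stddev should, in expectation, land comfortably within the epsilon = 0.01 tolerance that testRandomArrays enforces.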
