Repository: spark
Updated Branches:
  refs/heads/master e07fb6a41 -> 860219551
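For context, the changes below do two things: the BaggedPoint tests move out of RandomForestSuite into a dedicated BaggedPointSuite, and the remaining tests switch to the shared EnsembleTestHelper utilities and to the renamed ensemble accessor (the model's trees are now exposed as weakHypotheses). A minimal driver exercising the renamed accessor is sketched here; the Strategy construction is an assumption about this version's API, and the object name is illustrative:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.configuration.Strategy
import org.apache.spark.mllib.tree.impurity.Gini

// Hypothetical driver object; the trainClassifier call mirrors the tests below.
object WeakHypothesesExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("rf-example"))
    // Tiny ordered dataset in the spirit of EnsembleTestHelper.generateOrderedLabeledPoints.
    val points = (0 until 100).map { i =>
      LabeledPoint(if (i < 50) 0.0 else 1.0, Vectors.dense(i.toDouble))
    }
    val rdd = sc.parallelize(points)
    // Assumed Strategy signature for this Spark version (algo, impurity, maxDepth, ...).
    val strategy = new Strategy(Classification, Gini, maxDepth = 2,
      numClassesForClassification = 2)
    val model = RandomForest.trainClassifier(rdd, strategy, numTrees = 2,
      featureSubsetStrategy = "auto", seed = 123)
    // Renamed in this commit: previously model.trees and model.trees(0).
    assert(model.weakHypotheses.size == 2)
    println(model.weakHypotheses(0))
    sc.stop()
  }
}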
http://git-wip-us.apache.org/repos/asf/spark/blob/86021955/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
index d3eff59..10c046e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala
@@ -25,45 +25,20 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.Strategy
-import org.apache.spark.mllib.tree.impl.{BaggedPoint, DecisionTreeMetadata}
+import org.apache.spark.mllib.tree.impl.DecisionTreeMetadata
 import org.apache.spark.mllib.tree.impurity.{Gini, Variance}
-import org.apache.spark.mllib.tree.model.{Node, RandomForestModel}
+import org.apache.spark.mllib.tree.model.Node
 import org.apache.spark.mllib.util.LocalSparkContext
-import org.apache.spark.util.StatCounter
 
 /**
  * Test suite for [[RandomForest]].
  */
 class RandomForestSuite extends FunSuite with LocalSparkContext {
 
-  test("BaggedPoint RDD: without subsampling") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 1)
-    val rdd = sc.parallelize(arr)
-    val baggedRDD = BaggedPoint.convertToBaggedRDDWithoutSampling(rdd)
-    baggedRDD.collect().foreach { baggedPoint =>
-      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
-    }
-  }
-
-  test("BaggedPoint RDD: with subsampling") {
-    val numSubsamples = 100
-    val (expectedMean, expectedStddev) = (1.0, 1.0)
-
-    val seeds = Array(123, 5354, 230, 349867, 23987)
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 1)
-    val rdd = sc.parallelize(arr)
-    seeds.foreach { seed =>
-      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, numSubsamples, seed = seed)
-      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
-      RandomForestSuite.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
-        expectedStddev, epsilon = 0.01)
-    }
-  }
-
   test("Binary classification with continuous features:" +
     " comparing DecisionTree vs. RandomForest(numTrees = 1)") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 50)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val numTrees = 1
@@ -73,12 +48,12 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
     val rf = RandomForest.trainClassifier(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.trees.size === 1)
-    val rfTree = rf.trees(0)
+    assert(rf.weakHypotheses.size === 1)
+    val rfTree = rf.weakHypotheses(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
-    RandomForestSuite.validateClassifier(rf, arr, 0.9)
+    EnsembleTestHelper.validateClassifier(rf, arr, 0.9)
     DecisionTreeSuite.validateClassifier(dt, arr, 0.9)
 
     // Make sure trees are the same.
@@ -88,7 +63,7 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
 
   test("Regression with continuous features:" +
     " comparing DecisionTree vs. RandomForest(numTrees = 1)") {
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures = 50)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 50, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
     val numTrees = 1
@@ -99,12 +74,12 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
     val rf = RandomForest.trainRegressor(rdd, strategy, numTrees = numTrees,
       featureSubsetStrategy = "auto", seed = 123)
-    assert(rf.trees.size === 1)
-    val rfTree = rf.trees(0)
+    assert(rf.weakHypotheses.size === 1)
+    val rfTree = rf.weakHypotheses(0)
 
     val dt = DecisionTree.train(rdd, strategy)
 
-    RandomForestSuite.validateRegressor(rf, arr, 0.01)
+    EnsembleTestHelper.validateRegressor(rf, arr, 0.01)
     DecisionTreeSuite.validateRegressor(dt, arr, 0.01)
 
     // Make sure trees are the same.
@@ -113,7 +88,7 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
 
   test("Binary classification with continuous features: subsampling features") {
     val numFeatures = 50
-    val arr = RandomForestSuite.generateOrderedLabeledPoints(numFeatures)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures, 1000)
     val rdd = sc.parallelize(arr)
     val categoricalFeaturesInfo = Map.empty[Int, Int]
 
@@ -187,77 +162,9 @@ class RandomForestSuite extends FunSuite with LocalSparkContext {
       numClassesForClassification = 3, categoricalFeaturesInfo = categoricalFeaturesInfo)
     val model = RandomForest.trainClassifier(input, strategy, numTrees = 2,
       featureSubsetStrategy = "sqrt", seed = 12345)
-    RandomForestSuite.validateClassifier(model, arr, 0.0)
+    EnsembleTestHelper.validateClassifier(model, arr, 1.0)
   }
 }
-object RandomForestSuite {
-
-  /**
-   * Aggregates all values in data, and tests whether the empirical mean and stddev are within
-   * epsilon of the expected values.
-   * @param data Every element of the data should be an i.i.d. sample from some distribution.
-   */
-  def testRandomArrays(
-      data: Array[Array[Double]],
-      numCols: Int,
-      expectedMean: Double,
-      expectedStddev: Double,
-      epsilon: Double) {
-    val values = new mutable.ArrayBuffer[Double]()
-    data.foreach { row =>
-      assert(row.size == numCols)
-      values ++= row
-    }
-    val stats = new StatCounter(values)
-    assert(math.abs(stats.mean - expectedMean) < epsilon)
-    assert(math.abs(stats.stdev - expectedStddev) < epsilon)
-  }
-
-  def validateClassifier(
-      model: RandomForestModel,
-      input: Seq[LabeledPoint],
-      requiredAccuracy: Double) {
-    val predictions = input.map(x => model.predict(x.features))
-    val numOffPredictions = predictions.zip(input).count { case (prediction, expected) =>
-      prediction != expected.label
-    }
-    val accuracy = (input.length - numOffPredictions).toDouble / input.length
-    assert(accuracy >= requiredAccuracy,
-      s"validateClassifier calculated accuracy $accuracy but required $requiredAccuracy.")
-  }
-
-  def validateRegressor(
-      model: RandomForestModel,
-      input: Seq[LabeledPoint],
-      requiredMSE: Double) {
-    val predictions = input.map(x => model.predict(x.features))
-    val squaredError = predictions.zip(input).map { case (prediction, expected) =>
-      val err = prediction - expected.label
-      err * err
-    }.sum
-    val mse = squaredError / input.length
-    assert(mse <= requiredMSE, s"validateRegressor calculated MSE $mse but required $requiredMSE.")
-  }
-  def generateOrderedLabeledPoints(numFeatures: Int): Array[LabeledPoint] = {
-    val numInstances = 1000
-    val arr = new Array[LabeledPoint](numInstances)
-    for (i <- 0 until numInstances) {
-      val label = if (i < numInstances / 10) {
-        0.0
-      } else if (i < numInstances / 2) {
-        1.0
-      } else if (i < numInstances * 0.9) {
-        0.0
-      } else {
-        1.0
-      }
-      val features = Array.fill[Double](numFeatures)(i.toDouble)
-      arr(i) = new LabeledPoint(label, Vectors.dense(features))
-    }
-    arr
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/86021955/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
new file mode 100644
index 0000000..c0a62e0
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/impl/BaggedPointSuite.scala
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.tree.impl
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.tree.EnsembleTestHelper
+import org.apache.spark.mllib.util.LocalSparkContext
+
+/**
+ * Test suite for [[BaggedPoint]].
+ */
+class BaggedPointSuite extends FunSuite with LocalSparkContext {
+
+  test("BaggedPoint RDD: without subsampling") {
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false)
+    baggedRDD.collect().foreach { baggedPoint =>
+      assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling with replacement (fraction = 1.0)") {
+    val numSubsamples = 100
+    val (expectedMean, expectedStddev) = (1.0, 1.0)
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling with replacement (fraction = 0.5)") {
+    val numSubsamples = 100
+    val subsample = 0.5
+    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample))
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling without replacement (fraction = 1.0)") {
+    val numSubsamples = 100
+    val (expectedMean, expectedStddev) = (1.0, 0)
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+
+  test("BaggedPoint RDD: with subsampling without replacement (fraction = 0.5)") {
+    val numSubsamples = 100
+    val subsample = 0.5
+    val (expectedMean, expectedStddev) = (subsample, math.sqrt(subsample * (1 - subsample)))
+
+    val seeds = Array(123, 5354, 230, 349867, 23987)
+    val arr = EnsembleTestHelper.generateOrderedLabeledPoints(1, 1000)
+    val rdd = sc.parallelize(arr)
+    seeds.foreach { seed =>
+      val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false)
+      val subsampleCounts: Array[Array[Double]] = baggedRDD.map(_.subsampleWeights).collect()
+      EnsembleTestHelper.testRandomArrays(subsampleCounts, numSubsamples, expectedMean,
+        expectedStddev, epsilon = 0.01)
+    }
+  }
+}
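A note on the expected values asserted in BaggedPointSuite: sampling with replacement at rate f gives each point a bootstrap count with mean f and standard deviation sqrt(f) (the Poisson moments, which appears to be how BaggedPoint draws its weights), while sampling without replacement gives a Bernoulli(f) inclusion weight with mean f and standard deviation sqrt(f * (1 - f)); at f = 1.0 without replacement every weight is exactly 1, hence expectedStddev = 0. The standalone sketch below (not part of the commit; it needs no Spark, and all names are illustrative) reproduces these moments empirically:

import scala.util.Random

object SubsampleStats {

  // Poisson(lambda) by Knuth's inversion; adequate for the small rates used here.
  def poisson(lambda: Double, rng: Random): Int = {
    val l = math.exp(-lambda)
    var k = 0
    var p = 1.0
    do {
      k += 1
      p *= rng.nextDouble()
    } while (p > l)
    k - 1
  }

  def meanStddev(xs: Array[Double]): (Double, Double) = {
    val mean = xs.sum / xs.length
    val variance = xs.map(x => (x - mean) * (x - mean)).sum / xs.length
    (mean, math.sqrt(variance))
  }

  def main(args: Array[String]): Unit = {
    val rng = new Random(123)
    val n = 100000
    for (fraction <- Seq(1.0, 0.5)) {
      // With replacement: counts ~ Poisson(fraction) => stddev sqrt(fraction).
      val withRepl = Array.fill(n)(poisson(fraction, rng).toDouble)
      // Without replacement: weights ~ Bernoulli(fraction)
      // => stddev sqrt(fraction * (1 - fraction)).
      val withoutRepl = Array.fill(n)(if (rng.nextDouble() < fraction) 1.0 else 0.0)
      val (m1, s1) = meanStddev(withRepl)
      val (m2, s2) = meanStddev(withoutRepl)
      println(f"fraction=$fraction%.1f  with repl.:    mean=$m1%.3f stddev=$s1%.3f " +
        f"(expect $fraction%.1f, ${math.sqrt(fraction)}%.3f)")
      println(f"fraction=$fraction%.1f  without repl.: mean=$m2%.3f stddev=$s2%.3f " +
        f"(expect $fraction%.1f, ${math.sqrt(fraction * (1 - fraction))}%.3f)")
    }
  }
}

With 1000 points and 100 subsamples per seed, the suite aggregates 100000 weights per run, so the empirical mean and stddev should, in expectation, land comfortably within the epsilon = 0.01 tolerance that testRandomArrays enforces.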
