Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/1673#discussion_r15628659
--- Diff:
mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala ---
@@ -602,12 +609,78 @@ class DecisionTreeSuite extends FunSuite with
LocalSparkContext {
assert(bestSplit.featureType === Categorical)
}
+ test("stump with 1 continuous variable for binary classification, to
check off-by-1 error") {
+ val arr = new Array[LabeledPoint](4)
+ arr(0) = new LabeledPoint(0.0, Vectors.dense(0.0))
+ arr(1) = new LabeledPoint(1.0, Vectors.dense(1.0))
+ arr(2) = new LabeledPoint(1.0, Vectors.dense(2.0))
+ arr(3) = new LabeledPoint(1.0, Vectors.dense(3.0))
+ val input = sc.parallelize(arr)
+ val strategy = new Strategy(algo = Classification, impurity = Gini,
maxDepth = 5,
+ numClassesForClassification = 2)
+
+ val model = DecisionTree.train(input, strategy)
+ validateClassifier(model, arr, 1.0)
+ assert(model.numNodes === 3)
+ assert(model.depth === 1)
+ }
+
+ test("stump with 2 continuous variables for binary classification") {
+ val arr = new Array[LabeledPoint](4)
+ arr(0) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
+ arr(1) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 1.0))))
+ arr(2) = new LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0))))
+ arr(3) = new LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 2.0))))
+
+ val input = sc.parallelize(arr)
+ val strategy = new Strategy(algo = Classification, impurity = Gini,
maxDepth = 5,
+ numClassesForClassification = 2)
+
+ val model = DecisionTree.train(input, strategy)
+ validateClassifier(model, arr, 1.0)
+ assert(model.numNodes === 3)
+ assert(model.depth === 1)
+ assert(model.topNode.split.get.feature === 1)
+ }
+
+ test("stump with categorical variables for multiclass classification,
with just enough bins") {
+ val maxBins = math.pow(2, 3 - 1).toInt // just enough bins to allow
unordered features
+ val arr =
DecisionTreeSuite.generateCategoricalDataPointsForMulticlass()
+ val input = sc.parallelize(arr)
+ val strategy = new Strategy(algo = Classification, impurity = Gini,
maxDepth = 5,
+ numClassesForClassification = 3, categoricalFeaturesInfo = Map(0 ->
3, 1 -> 3))
+ assert(strategy.isMulticlassClassification)
+
+ val model = DecisionTree.train(input, strategy)
+ validateClassifier(model, arr, 1.0)
+ assert(model.numNodes === 3)
+ assert(model.depth === 1)
+
+ val (splits, bins) = DecisionTree.findSplitsBins(input, strategy)
+ val bestSplits = DecisionTree.findBestSplits(input, new Array(31),
strategy, 0,
+ Array[List[Filter]](), splits, bins, 10)
+
+ assert(bestSplits.length === 1)
+ val bestSplit = bestSplits(0)._1
+ assert(bestSplit.feature === 0)
+ assert(bestSplit.categories.length === 1)
+ assert(bestSplit.categories.contains(1))
+ assert(bestSplit.featureType === Categorical)
+ val gain = bestSplits(0)._2
+ assert(gain.leftImpurity == 0)
--- End diff --
Use `===` instead of `==` here so that a failing assertion reports the actual
and expected values rather than just failing (the same applies to the line below).
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---