Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/3951#discussion_r23710574
--- Diff: python/pyspark/mllib/tree.py ---
@@ -383,6 +381,137 @@ def trainRegressor(cls, data,
categoricalFeaturesInfo, numTrees, featureSubsetSt
featureSubsetStrategy, impurity, maxDepth,
maxBins, seed)
+class GradientBoostedTreesModel(TreeEnsembleModel):
+ """
+ .. note:: Experimental
+
+ Represents a gradient-boosted tree model.
+ """
+
+
+class GradientBoostedTrees(object):
+ """
+ .. note:: Experimental
+
+ Learning algorithm for a gradient boosted trees model for
classification or regression.
+ """
+
+ @classmethod
+ def _train(cls, data, algo, categoricalFeaturesInfo,
+ loss, numIterations, learningRate, maxDepth):
+ first = data.first()
+ assert isinstance(first, LabeledPoint), "the data should be RDD of
LabeledPoint"
+ model = callMLlibFunc("trainGradientBoostedTreesModel", data,
algo, categoricalFeaturesInfo,
+ loss, numIterations, learningRate, maxDepth)
+ return GradientBoostedTreesModel(model)
+
+ @classmethod
+ def trainClassifier(cls, data, categoricalFeaturesInfo,
+ loss="logLoss", numIterations=100,
learningRate=0.1, maxDepth=3):
+ """
+ Method to train a gradient-boosted trees model for classification.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels should
take values {0, 1}.
+ :param categoricalFeaturesInfo: Map storing arity of categorical
+ features. E.g., an entry (n -> k) indicates that feature
+ n is categorical with k categories indexed from 0:
+ {0, 1, ..., k-1}.
+ :param loss: Loss function used for minimization during gradient
boosting.
+ (default: "logLoss")
+ :param numIterations: Number of iterations of boosting.
+ (default: 100)
+ :param learningRate: Learning rate for shrinking the contribution
of each estimator.
+ The learning rate should be in the
interval (0, 1]
+ (default: 0.1)
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
+ leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ (default: 3)
+ :return: GradientBoostedTreesModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import GradientBoostedTrees
+ >>>
+ >>> data = [
+ ... LabeledPoint(0.0, [0.0]),
+ ... LabeledPoint(0.0, [1.0]),
+ ... LabeledPoint(1.0, [2.0]),
+ ... LabeledPoint(1.0, [3.0])
+ ... ]
+ >>>
+ >>> model =
GradientBoostedTrees.trainClassifier(sc.parallelize(data), {})
+ >>> model.numTrees()
+ 100
+ >>> model.totalNumNodes()
+ 300
+ >>> print model, # it already has newline
+ TreeEnsembleModel classifier with 100 trees
+ >>> model.predict([2.0])
+ 1.0
+ >>> model.predict([0.0])
+ 0.0
+ >>> rdd = sc.parallelize([[2.0], [0.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.0]
+ """
+ return cls._train(data, "classification", categoricalFeaturesInfo,
+ loss, numIterations, learningRate, maxDepth)
+
+ @classmethod
+ def trainRegressor(cls, data, categoricalFeaturesInfo,
+ loss="leastSquaresError", numIterations=100,
learningRate=0.1, maxDepth=3):
+ """
+ Method to train a gradient-boosted trees model for regression.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels are
+ real numbers.
+ :param categoricalFeaturesInfo: Map storing arity of categorical
+ features. E.g., an entry (n -> k) indicates that feature
+ n is categorical with k categories indexed from 0:
+ {0, 1, ..., k-1}.
+ :param loss: Loss function used for minimization during gradient
boosting.
+ (default: "leastSquaresError")
+ :param numIterations: Number of iterations of boosting.
+ (default: 100)
+ :param learningRate: Learning rate for shrinking the contribution
of each estimator.
+ The learning rate should be in the
interval (0, 1]
+ (default: 0.1)
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
+ leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ (default: 3)
+ :return: GradientBoostedTreesModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import GradientBoostedTrees
+ >>> from pyspark.mllib.linalg import SparseVector
+ >>>
+ >>> sparse_data = [
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
+ ... ]
+ >>>
+ >>> model =
GradientBoostedTrees.trainRegressor(sc.parallelize(sparse_data), {})
+ >>> model.numTrees()
+ 100
+ >>> model.totalNumNodes()
+ 102
+ >>> model.predict(SparseVector(2, {1: 1.0}))
+ 1.0
+ >>> model.predict(SparseVector(2, {0: 1.0}))
+ 0.0
+ >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.0]
+ """
+ return cls._train(data, "regression", categoricalFeaturesInfo,
--- End diff --
Same about `"regression"`.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]