Repository: spark Updated Branches: refs/heads/branch-1.6 b75f97094 -> 70b587841
[SPARK-14107][PYSPARK][ML] Add seed as named argument to GBTs in pyspark ## What changes were proposed in this pull request? GBTs in pyspark previously had seed parameters, but they could not be passed as keyword arguments through the class constructor. This patch adds seed as a keyword argument and also sets default value. ## How was this patch tested? Doc tests were updated to pass a random seed through the GBTClassifier and GBTRegressor constructors. Author: sethah <[email protected]> Closes #11944 from sethah/SPARK-14107. (cherry picked from commit 585097716c1979ea538ef733cf33225ef7be06f5) Signed-off-by: Xiangrui Meng <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70b58784 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70b58784 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70b58784 Branch: refs/heads/branch-1.6 Commit: 70b587841f5670a825d22ecdf9727bd7ceb684c3 Parents: b75f970 Author: sethah <[email protected]> Authored: Thu Mar 24 19:14:24 2016 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Thu Mar 24 19:14:32 2016 -0700 ---------------------------------------------------------------------- python/pyspark/ml/classification.py | 12 ++++++------ python/pyspark/ml/regression.py | 13 +++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/70b58784/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5599b8f..7a300e7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -468,7 +468,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") >>> si_model = stringIndexer.fit(df) >>> td = si_model.transform(df) - >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed") + >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42) >>> model = gbt.fit(td) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -491,12 +491,12 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", - maxIter=20, stepSize=0.1): + maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -507,7 +507,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -516,12 +516,12 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1): + lossType="logistic", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Classification. """ kwargs = self.setParams._input_kwargs http://git-wip-us.apache.org/repos/asf/spark/blob/70b58784/python/pyspark/ml/regression.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index a0bb8ce..b09356a 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -591,7 +591,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) + >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42) >>> model = gbt.fit(df) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -614,12 +614,12 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) @@ -629,7 +629,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, + seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -638,12 +639,12 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Regression. """ kwargs = self.setParams._input_kwargs --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
