This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new acd086f [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees acd086f is described below commit acd086f207bbd3e6d3654eb8b06900793a781f27 Author: zhengruifeng <ruife...@foxmail.com> AuthorDate: Wed Feb 27 21:11:30 2019 -0600 [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees ## What changes were proposed in this pull request? Add sample weights to decision trees ## How was this patch tested? updated testsuites Closes #23818 from zhengruifeng/py_tree_support_sample_weight. Authored-by: zhengruifeng <ruife...@foxmail.com> Signed-off-by: Sean Owen <sean.o...@databricks.com> --- .../ml/classification/DecisionTreeClassifier.scala | 1 - .../ml/regression/DecisionTreeRegressor.scala | 1 - python/pyspark/ml/classification.py | 28 +++++++++++++++------- python/pyspark/ml/regression.py | 24 +++++++++++++------ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 200ac00..cbb7e4f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -25,7 +25,6 @@ import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.param.shared.HasWeightCol import org.apache.spark.ml.tree._ import org.apache.spark.ml.tree.{DecisionTreeModel, Node, TreeClassifierParams} import org.apache.spark.ml.tree.DecisionTreeModelReadWrite._ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 5254791..f4f4e56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -26,7 +26,6 @@ import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.param.shared.HasWeightCol import org.apache.spark.ml.tree._ import org.apache.spark.ml.tree.DecisionTreeModelReadWrite._ import org.apache.spark.ml.tree.impl.RandomForest diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 134b9e0..131756b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -889,10 +889,10 @@ class TreeClassifierParams(object): @inherit_doc -class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasProbabilityCol, HasRawPredictionCol, DecisionTreeParams, - TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, - JavaMLReadable): +class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeightCol, + HasPredictionCol, HasProbabilityCol, HasRawPredictionCol, + DecisionTreeParams, TreeClassifierParams, HasCheckpointInterval, + HasSeed, JavaMLWritable, JavaMLReadable): """ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_ learning algorithm for classification. @@ -944,6 +944,18 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> model.featureImportances == model2.featureImportances True + >>> df3 = spark.createDataFrame([ + ... (1.0, 0.2, Vectors.dense(1.0)), + ... (1.0, 0.8, Vectors.dense(1.0)), + ... 
(0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) + >>> si3 = StringIndexer(inputCol="label", outputCol="indexed") + >>> si_model3 = si3.fit(df3) + >>> td3 = si_model3.transform(df3) + >>> dt3 = DecisionTreeClassifier(maxDepth=2, weightCol="weight", labelCol="indexed") + >>> model3 = dt3.fit(td3) + >>> print(model3.toDebugString) + DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes... + .. versionadded:: 1.4.0 """ @@ -952,13 +964,13 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", - seed=None): + seed=None, weightCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None) + seed=None, weightCol=None) """ super(DecisionTreeClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -975,13 +987,13 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", seed=None): + impurity="gini", seed=None, weightCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None) + seed=None, weightCol=None) Sets params for the 
DecisionTreeClassifier. """ kwargs = self._input_kwargs diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 7841de9..927cc77 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -748,9 +748,10 @@ class GBTRegressorParams(GBTParams, TreeRegressorParams): @inherit_doc -class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): +class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasWeightCol, + HasPredictionCol, DecisionTreeParams, TreeRegressorParams, + HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable, + HasVarianceCol): """ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_ learning algorithm for regression. @@ -791,6 +792,15 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> model.transform(test1).head().variance 0.0 + >>> df3 = spark.createDataFrame([ + ... (1.0, 0.2, Vectors.dense(1.0)), + ... (1.0, 0.8, Vectors.dense(1.0)), + ... (0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) + >>> dt3 = DecisionTreeRegressor(maxDepth=2, weightCol="weight", varianceCol="variance") + >>> model3 = dt3.fit(df3) + >>> print(model3.toDebugString) + DecisionTreeRegressionModel (uid=...) of depth 1 with 3 nodes... + .. 
versionadded:: 1.4.0 """ @@ -798,12 +808,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", - seed=None, varianceCol=None): + seed=None, varianceCol=None, weightCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None, varianceCol=None) + impurity="variance", seed=None, varianceCol=None, weightCol=None) """ super(DecisionTreeRegressor, self).__init__() self._java_obj = self._new_java_obj( @@ -819,12 +829,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", seed=None, varianceCol=None): + impurity="variance", seed=None, varianceCol=None, weightCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None, varianceCol=None) + impurity="variance", seed=None, varianceCol=None, weightCol=None) Sets params for the DecisionTreeRegressor. """ kwargs = self._input_kwargs --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org