[spark] branch master updated: [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees

srowen Wed, 27 Feb 2019 19:12:07 -0800

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new acd086f  [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees
acd086f is described below

commit acd086f207bbd3e6d3654eb8b06900793a781f27
Author: zhengruifeng <ruife...@foxmail.com>
AuthorDate: Wed Feb 27 21:11:30 2019 -0600

    [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees
    
    ## What changes were proposed in this pull request?
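    Add sample weights to decision trees in PySpark: `DecisionTreeClassifier` and
    `DecisionTreeRegressor` now mix in `HasWeightCol` and accept a `weightCol`
    argument in `__init__` and `setParams`. The `HasWeightCol` import is also
    removed from the Scala `DecisionTreeClassifier` and `DecisionTreeRegressor`.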
    
    ## How was this patch tested?
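    Updated the doctests in `python/pyspark/ml/classification.py` and
    `python/pyspark/ml/regression.py` to fit a tree using `weightCol`.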
    
    Closes #23818 from zhengruifeng/py_tree_support_sample_weight.
    
    Authored-by: zhengruifeng <ruife...@foxmail.com>
    Signed-off-by: Sean Owen <sean.o...@databricks.com>
---
 .../ml/classification/DecisionTreeClassifier.scala |  1 -
 .../ml/regression/DecisionTreeRegressor.scala      |  1 -
 python/pyspark/ml/classification.py                | 28 +++++++++++++++-------
 python/pyspark/ml/regression.py                    | 24 +++++++++++++------
 4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 200ac00..cbb7e4f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -25,7 +25,6 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.ml.feature.{Instance, LabeledPoint}
 import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.HasWeightCol
 import org.apache.spark.ml.tree._
 import org.apache.spark.ml.tree.{DecisionTreeModel, Node, TreeClassifierParams}
 import org.apache.spark.ml.tree.DecisionTreeModelReadWrite._
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 5254791..f4f4e56 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -26,7 +26,6 @@ import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.feature.{Instance, LabeledPoint}
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.param.shared.HasWeightCol
 import org.apache.spark.ml.tree._
 import org.apache.spark.ml.tree.DecisionTreeModelReadWrite._
 import org.apache.spark.ml.tree.impl.RandomForest
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 134b9e0..131756b 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -889,10 +889,10 @@ class TreeClassifierParams(object):
 
 
 @inherit_doc
-class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol,
-                             HasProbabilityCol, HasRawPredictionCol, 
DecisionTreeParams,
-                             TreeClassifierParams, HasCheckpointInterval, 
HasSeed, JavaMLWritable,
-                             JavaMLReadable):
+class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasWeightCol,
+                             HasPredictionCol, HasProbabilityCol, 
HasRawPredictionCol,
+                             DecisionTreeParams, TreeClassifierParams, 
HasCheckpointInterval,
+                             HasSeed, JavaMLWritable, JavaMLReadable):
     """
     `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
     learning algorithm for classification.
@@ -944,6 +944,18 @@ class DecisionTreeClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
     >>> model.featureImportances == model2.featureImportances
     True
 
+    >>> df3 = spark.createDataFrame([
+    ...     (1.0, 0.2, Vectors.dense(1.0)),
+    ...     (1.0, 0.8, Vectors.dense(1.0)),
+    ...     (0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", 
"features"])
+    >>> si3 = StringIndexer(inputCol="label", outputCol="indexed")
+    >>> si_model3 = si3.fit(df3)
+    >>> td3 = si_model3.transform(df3)
+    >>> dt3 = DecisionTreeClassifier(maxDepth=2, weightCol="weight", 
labelCol="indexed")
+    >>> model3 = dt3.fit(td3)
+    >>> print(model3.toDebugString)
+    DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes...
+
     .. versionadded:: 1.4.0
     """
 
@@ -952,13 +964,13 @@ class DecisionTreeClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
                  probabilityCol="probability", 
rawPredictionCol="rawPrediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini",
-                 seed=None):
+                 seed=None, weightCol=None):
         """
         __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                  probabilityCol="probability", 
rawPredictionCol="rawPrediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini", \
-                 seed=None)
+                 seed=None, weightCol=None)
         """
         super(DecisionTreeClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -975,13 +987,13 @@ class DecisionTreeClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
                   probabilityCol="probability", 
rawPredictionCol="rawPrediction",
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                   maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
-                  impurity="gini", seed=None):
+                  impurity="gini", seed=None, weightCol=None):
         """
         setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                   probabilityCol="probability", 
rawPredictionCol="rawPrediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                   maxMemoryInMB=256, cacheNodeIds=False, 
checkpointInterval=10, impurity="gini", \
-                  seed=None)
+                  seed=None, weightCol=None)
         Sets params for the DecisionTreeClassifier.
         """
         kwargs = self._input_kwargs
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 7841de9..927cc77 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -748,9 +748,10 @@ class GBTRegressorParams(GBTParams, TreeRegressorParams):
 
 
 @inherit_doc
-class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol,
-                            DecisionTreeParams, TreeRegressorParams, 
HasCheckpointInterval,
-                            HasSeed, JavaMLWritable, JavaMLReadable, 
HasVarianceCol):
+class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasWeightCol,
+                            HasPredictionCol, DecisionTreeParams, 
TreeRegressorParams,
+                            HasCheckpointInterval, HasSeed, JavaMLWritable, 
JavaMLReadable,
+                            HasVarianceCol):
     """
     `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
     learning algorithm for regression.
@@ -791,6 +792,15 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
     >>> model.transform(test1).head().variance
     0.0
 
+    >>> df3 = spark.createDataFrame([
+    ...     (1.0, 0.2, Vectors.dense(1.0)),
+    ...     (1.0, 0.8, Vectors.dense(1.0)),
+    ...     (0.0, 1.0, Vectors.sparse(1, [], []))], ["label", "weight", 
"features"])
+    >>> dt3 = DecisionTreeRegressor(maxDepth=2, weightCol="weight", 
varianceCol="variance")
+    >>> model3 = dt3.fit(df3)
+    >>> print(model3.toDebugString)
+    DecisionTreeRegressionModel (uid=...) of depth 1 with 3 nodes...
+
     .. versionadded:: 1.4.0
     """
 
@@ -798,12 +808,12 @@ class DecisionTreeRegressor(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPredi
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="variance",
-                 seed=None, varianceCol=None):
+                 seed=None, varianceCol=None, weightCol=None):
         """
         __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
\
-                 impurity="variance", seed=None, varianceCol=None)
+                 impurity="variance", seed=None, varianceCol=None, 
weightCol=None)
         """
         super(DecisionTreeRegressor, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -819,12 +829,12 @@ class DecisionTreeRegressor(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPredi
     def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                   maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
-                  impurity="variance", seed=None, varianceCol=None):
+                  impurity="variance", seed=None, varianceCol=None, 
weightCol=None):
         """
         setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                   maxMemoryInMB=256, cacheNodeIds=False, 
checkpointInterval=10, \
-                  impurity="variance", seed=None, varianceCol=None)
+                  impurity="variance", seed=None, varianceCol=None, 
weightCol=None)
         Sets params for the DecisionTreeRegressor.
         """
         kwargs = self._input_kwargs


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-19591][ML][PYSPARK][FOLLOWUP] Add sample weights to decision trees

Reply via email to