This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 212b327e6367 [SPARK-50876][ML][PYTHON][CONNECT] Support Tree 
Regressors on Connect
212b327e6367 is described below

commit 212b327e63678dc4e6aa2b457019301068106556
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Jan 20 13:11:33 2025 +0800

    [SPARK-50876][ML][PYTHON][CONNECT] Support Tree Regressors on Connect
    
    ### What changes were proposed in this pull request?
    Support Tree Regressors on Connect:
    
    - DecisionTreeRegressor
    - RandomForestRegressor
    - GBTRegressor
    
    ### Why are the changes needed?
    for feature parity
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, new algorithms supported on Connect
    
    ### How was this patch tested?
    added tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #49566 from zhengruifeng/ml_reg_tree.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
    (cherry picked from commit 1c4cfcb0277d673a3f53b49e74ab74452118da91)
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .../services/org.apache.spark.ml.Estimator         |   3 +
 python/pyspark/ml/tests/test_regression.py         | 201 +++++++++++++++++++++
 2 files changed, 204 insertions(+)

diff --git 
a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator 
b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
index b9a69ed55094..b4b49ef09bbb 100644
--- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
+++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
@@ -27,3 +27,6 @@ org.apache.spark.ml.classification.GBTClassifier
 
 # regression
 org.apache.spark.ml.regression.LinearRegression
+org.apache.spark.ml.regression.DecisionTreeRegressor
+org.apache.spark.ml.regression.RandomForestRegressor
+org.apache.spark.ml.regression.GBTRegressor
diff --git a/python/pyspark/ml/tests/test_regression.py 
b/python/pyspark/ml/tests/test_regression.py
index 305e2514a382..8d5d14ca5cf6 100644
--- a/python/pyspark/ml/tests/test_regression.py
+++ b/python/pyspark/ml/tests/test_regression.py
@@ -27,6 +27,12 @@ from pyspark.ml.regression import (
     LinearRegressionModel,
     LinearRegressionSummary,
     LinearRegressionTrainingSummary,
+    DecisionTreeRegressor,
+    DecisionTreeRegressionModel,
+    RandomForestRegressor,
+    RandomForestRegressionModel,
+    GBTRegressor,
+    GBTRegressionModel,
 )
 
 
@@ -159,6 +165,201 @@ class RegressionTestsMixin:
             model2 = LinearRegressionModel.load(d)
             self.assertEqual(str(model), str(model2))
 
+    def test_decision_tree_regressor(self):
+        df = self.df
+
+        dt = DecisionTreeRegressor(
+            maxDepth=2,
+            labelCol="label",
+            leafCol="leaf",
+            seed=1,
+        )
+        self.assertEqual(dt.getMaxDepth(), 2)
+        self.assertEqual(dt.getSeed(), 1)
+        self.assertEqual(dt.getLabelCol(), "label")
+        self.assertEqual(dt.getLeafCol(), "leaf")
+
+        # Estimator save & load
+        with tempfile.TemporaryDirectory(prefix="decision_tree_regressor") as 
d:
+            dt.write().overwrite().save(d)
+            dt2 = DecisionTreeRegressor.load(d)
+            self.assertEqual(str(dt), str(dt2))
+
+        model = dt.fit(df)
+        self.assertEqual(model.numFeatures, 2)
+        self.assertEqual(model.depth, 2)
+        self.assertEqual(model.numNodes, 5)
+
+        featureImportances = model.featureImportances
+        self.assertTrue(
+            np.allclose(featureImportances, [0.5756, 0.4244], atol=1e-4),
+            featureImportances,
+        )
+
+        debugString = model.toDebugString
+        self.assertTrue("depth=2, numNodes=5, numFeatures=2" in debugString, 
debugString)
+        self.assertTrue("If (feature 0 <= 1.75)" in debugString, debugString)
+
+        vec = Vectors.dense(0.0, 5.0)
+        self.assertTrue(np.allclose(model.predict(vec), 0.85, atol=1e-4))
+        self.assertEqual(model.predictLeaf(vec), 1.0)
+
+        output = model.transform(df)
+        expected_cols = [
+            "label",
+            "weight",
+            "features",
+            "prediction",
+            "leaf",
+        ]
+        self.assertEqual(output.columns, expected_cols)
+        self.assertEqual(output.count(), 4)
+
+        # Model save & load
+        with 
tempfile.TemporaryDirectory(prefix="decision_tree_regression_model") as d:
+            model.write().overwrite().save(d)
+            model2 = DecisionTreeRegressionModel.load(d)
+            self.assertEqual(str(model), str(model2))
+            self.assertEqual(model.toDebugString, model2.toDebugString)
+
+    def test_gbt_regressor(self):
+        df = self.df
+
+        gbt = GBTRegressor(
+            maxIter=3,
+            maxDepth=2,
+            labelCol="label",
+            leafCol="leaf",
+            seed=1,
+        )
+        self.assertEqual(gbt.getMaxIter(), 3)
+        self.assertEqual(gbt.getMaxDepth(), 2)
+        self.assertEqual(gbt.getSeed(), 1)
+        self.assertEqual(gbt.getLabelCol(), "label")
+        self.assertEqual(gbt.getLeafCol(), "leaf")
+
+        # Estimator save & load
+        with tempfile.TemporaryDirectory(prefix="gbt_regressor") as d:
+            gbt.write().overwrite().save(d)
+            gbt2 = GBTRegressor.load(d)
+            self.assertEqual(str(gbt), str(gbt2))
+
+        model = gbt.fit(df)
+        self.assertEqual(model.numFeatures, 2)
+        # TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
+        # model.trees
+        self.assertEqual(model.treeWeights, [1.0, 0.1, 0.1])
+        self.assertEqual(model.totalNumNodes, 15)
+
+        featureImportances = model.featureImportances
+        self.assertTrue(
+            np.allclose(featureImportances, [0.5944156994359766, 
0.4055843005640234], atol=1e-4),
+            featureImportances,
+        )
+
+        debugString = model.toDebugString
+        self.assertTrue("numTrees=3, numFeatures=2" in debugString, 
debugString)
+        self.assertTrue("If (feature 0 <= 1.75)" in debugString, debugString)
+
+        vec = Vectors.dense(0.0, 5.0)
+        self.assertTrue(np.allclose(model.predict(vec), 0.904, atol=1e-4))
+        self.assertEqual(model.predictLeaf(vec), Vectors.dense(1.0, 0.0, 0.0))
+
+        # GBT-specific method: evaluateEachIteration
+        self.assertTrue(
+            np.allclose(
+                model.evaluateEachIteration(df, "squared"),
+                [0.011250000000000003, 0.0072, 0.0046079999999999975],
+                atol=1e-4,
+            )
+        )
+        self.assertTrue(
+            np.allclose(
+                model.evaluateEachIteration(df, "absolute"),
+                [0.07500000000000007, 0.06000000000000006, 
0.048000000000000057],
+                atol=1e-4,
+            )
+        )
+
+        output = model.transform(df)
+        expected_cols = [
+            "label",
+            "weight",
+            "features",
+            "prediction",
+            "leaf",
+        ]
+        self.assertEqual(output.columns, expected_cols)
+        self.assertEqual(output.count(), 4)
+
+        # Model save & load
+        with tempfile.TemporaryDirectory(prefix="gbt_regression_model") as d:
+            model.write().overwrite().save(d)
+            model2 = GBTRegressionModel.load(d)
+            self.assertEqual(str(model), str(model2))
+            self.assertEqual(model.toDebugString, model2.toDebugString)
+
+    def test_random_forest_regressor(self):
+        df = self.df
+
+        rf = RandomForestRegressor(
+            numTrees=3,
+            maxDepth=2,
+            labelCol="label",
+            leafCol="leaf",
+            seed=1,
+        )
+        self.assertEqual(rf.getNumTrees(), 3)
+        self.assertEqual(rf.getMaxDepth(), 2)
+        self.assertEqual(rf.getSeed(), 1)
+        self.assertEqual(rf.getLabelCol(), "label")
+        self.assertEqual(rf.getLeafCol(), "leaf")
+
+        # Estimator save & load
+        with tempfile.TemporaryDirectory(prefix="random_forest_regressor") as 
d:
+            rf.write().overwrite().save(d)
+            rf2 = RandomForestRegressor.load(d)
+            self.assertEqual(str(rf), str(rf2))
+
+        model = rf.fit(df)
+        self.assertEqual(model.numFeatures, 2)
+        # TODO(SPARK-50843): Support access submodel in TreeEnsembleModel
+        # model.trees
+        self.assertEqual(model.treeWeights, [1.0, 1.0, 1.0])
+        self.assertEqual(model.totalNumNodes, 11)
+
+        featureImportances = model.featureImportances
+        self.assertTrue(
+            np.allclose(featureImportances, [0.5615222294986538, 
0.43847777050134623], atol=1e-4),
+            featureImportances,
+        )
+
+        debugString = model.toDebugString
+        self.assertTrue("numTrees=3, numFeatures=2" in debugString, 
debugString)
+        self.assertTrue("If (feature 0 <= 1.75)" in debugString, debugString)
+
+        vec = Vectors.dense(0.0, 5.0)
+        self.assertTrue(np.allclose(model.predict(vec), 0.6166666666666667, 
atol=1e-4))
+        self.assertEqual(model.predictLeaf(vec), Vectors.dense(1.0, 0.0, 1.0))
+
+        output = model.transform(df)
+        expected_cols = [
+            "label",
+            "weight",
+            "features",
+            "prediction",
+            "leaf",
+        ]
+        self.assertEqual(output.columns, expected_cols)
+        self.assertEqual(output.count(), 4)
+
+        # Model save & load
+        with 
tempfile.TemporaryDirectory(prefix="random_forest_regression_model") as d:
+            model.write().overwrite().save(d)
+            model2 = RandomForestRegressionModel.load(d)
+            self.assertEqual(str(model), str(model2))
+            self.assertEqual(model.toDebugString, model2.toDebugString)
+
 
 class RegressionTests(RegressionTestsMixin, unittest.TestCase):
     def setUp(self) -> None:


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to