Github user hhbyyh commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19020#discussion_r147316970
  
    --- Diff: mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala ---
    @@ -998,6 +1047,198 @@ class LinearRegressionSuite
           }
         }
       }
    +
    +  test("linear regression (huber loss) with intercept without 
regularization") {
    +    val trainer1 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(true).setStandardization(true)
    +    val trainer2 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(true).setStandardization(false)
    +
    +    val model1 = trainer1.fit(datasetWithOutlier)
    +    val model2 = trainer2.fit(datasetWithOutlier)
    +
    +    /*
    +      Using the following Python code to load the data and train the model 
using
    +      scikit-learn package.
    +
    +      import pandas as pd
    +      import numpy as np
    +      from sklearn.linear_model import HuberRegressor
    +      df = pd.read_csv("path", header = None)
    +      X = df[df.columns[1:3]]
    +      y = np.array(df[df.columns[0]])
    +      huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100, 
epsilon=1.35)
    +      huber.fit(X, y)
    +
    +      >>> huber.coef_
    +      array([ 4.68998007,  7.19429011])
    +      >>> huber.intercept_
    +      6.3002404351083037
    +      >>> huber.scale_
    +      0.077810159205220747
    +     */
    +    val coefficientsPy = Vectors.dense(4.68998007, 7.19429011)
    +    val interceptPy = 6.30024044
    +    val scalePy = 0.07781016
    +
    +    assert(model1.coefficients ~= coefficientsPy relTol 1E-3)
    +    assert(model1.intercept ~== interceptPy relTol 1E-3)
    +    assert(model1.scale ~== scalePy relTol 1E-3)
    +
    +    // Without regularization, with or without standardization will 
converge to the same solution.
    +    assert(model2.coefficients ~= coefficientsPy relTol 1E-3)
    +    assert(model2.intercept ~== interceptPy relTol 1E-3)
    +    assert(model2.scale ~== scalePy relTol 1E-3)
    +  }
    +
    +  test("linear regression (huber loss) without intercept without 
regularization") {
    +    val trainer1 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(false).setStandardization(true)
    +    val trainer2 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(false).setStandardization(false)
    +
    +    val model1 = trainer1.fit(datasetWithOutlier)
    +    val model2 = trainer2.fit(datasetWithOutlier)
    +
    +    /*
    +      huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, 
epsilon=1.35)
    +      huber.fit(X, y)
    +
    +      >>> huber.coef_
    +      array([ 6.71756703,  5.08873222])
    +      >>> huber.intercept_
    +      0.0
    +      >>> huber.scale_
    +      2.5560209922722317
    +     */
    +    val coefficientsPy = Vectors.dense(6.71756703, 5.08873222)
    +    val interceptPy = 0.0
    +    val scalePy = 2.55602099
    +
    +    assert(model1.coefficients ~= coefficientsPy relTol 1E-3)
    +    assert(model1.intercept === interceptPy)
    +    assert(model1.scale ~== scalePy relTol 1E-3)
    +
    +    // Without regularization, with or without standardization will 
converge to the same solution.
    +    assert(model2.coefficients ~= coefficientsPy relTol 1E-3)
    +    assert(model2.intercept === interceptPy)
    +    assert(model2.scale ~== scalePy relTol 1E-3)
    +  }
    +
    +  test("linear regression (huber loss) with intercept with L2 
regularization") {
    +    val trainer1 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(true).setRegParam(0.21).setStandardization(true)
    +    val trainer2 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(true).setRegParam(0.21).setStandardization(false)
    +
    +    val model1 = trainer1.fit(datasetWithOutlier)
    +    val model2 = trainer2.fit(datasetWithOutlier)
    +
    +    /*
    +      Since scikit-learn HuberRegressor does not support standardization,
    +      we do it manually out of the estimator.
    +
    +      xStd = np.std(X, axis=0)
    +      scaledX = X / xStd
    +      huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, 
epsilon=1.35)
    +      huber.fit(scaledX, y)
    +
    +      >>> np.array(huber.coef_ / xStd)
    +      array([ 1.97732633,  3.38816722])
    +      >>> huber.intercept_
    +      3.7527581430531227
    +      >>> huber.scale_
    +      3.787363673371801
    +     */
    +    val coefficientsPy1 = Vectors.dense(1.97732633, 3.38816722)
    +    val interceptPy1 = 3.75275814
    +    val scalePy1 = 3.78736367
    +
    +    assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2)
    +    assert(model1.intercept ~== interceptPy1 relTol 1E-2)
    +    assert(model1.scale ~== scalePy1 relTol 1E-2)
    +
    +    /*
    +      huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, 
epsilon=1.35)
    +      huber.fit(X, y)
    +
    +      >>> huber.coef_
    +      array([ 1.73346444,  3.63746999])
    +      >>> huber.intercept_
    +      4.3017134790781739
    +      >>> huber.scale_
    +      3.6472742809286793
    +     */
    +    val coefficientsPy2 = Vectors.dense(1.73346444, 3.63746999)
    +    val interceptPy2 = 4.30171347
    +    val scalePy2 = 3.64727428
    +
    +    assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3)
    +    assert(model2.intercept ~== interceptPy2 relTol 1E-3)
    +    assert(model2.scale ~== scalePy2 relTol 1E-3)
    +  }
    +
    +  test("linear regression (huber loss) without intercept with L2 
regularization") {
    +    val trainer1 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(false).setRegParam(0.21).setStandardization(true)
    +    val trainer2 = (new LinearRegression).setLoss("huber")
    +      .setFitIntercept(false).setRegParam(0.21).setStandardization(false)
    +
    +    val model1 = trainer1.fit(datasetWithOutlier)
    +    val model2 = trainer2.fit(datasetWithOutlier)
    +
    +    /*
    +      Since scikit-learn HuberRegressor does not support standardization,
    +      we do it manually out of the estimator.
    +
    +      xStd = np.std(X, axis=0)
    +      scaledX = X / xStd
    +      huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, 
epsilon=1.35)
    +      huber.fit(scaledX, y)
    +
    +      >>> np.array(huber.coef_ / xStd)
    +      array([ 2.59679008,  2.26973102])
    +      >>> huber.intercept_
    +      0.0
    +      >>> huber.scale_
    +      4.5766311924091791
    +     */
    +    val coefficientsPy1 = Vectors.dense(2.59679008, 2.26973102)
    +    val interceptPy1 = 0.0
    +    val scalePy1 = 4.57663119
    +
    +    assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2)
    +    assert(model1.intercept === interceptPy1)
    +    assert(model1.scale ~== scalePy1 relTol 1E-2)
    +
    +    /*
    +      huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, 
epsilon=1.35)
    +      huber.fit(X, y)
    +
    +      >>> huber.coef_
    +      array([ 2.28423908,  2.25196887])
    +      >>> huber.intercept_
    +      0.0
    +      >>> huber.scale_
    +      4.5979643506051753
    +     */
    +    val coefficientsPy2 = Vectors.dense(2.28423908, 2.25196887)
    +    val interceptPy2 = 0.0
    +    val scalePy2 = 4.59796435
    +
    +    assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3)
    +    assert(model2.intercept === interceptPy2)
    +    assert(model2.scale ~== scalePy2 relTol 1E-3)
    +  }
    +
    +  test("huber loss model match squared error for large m") {
    --- End diff --
    
    `m` -> `epsilon`: the Huber threshold parameter is named `epsilon` (cf. `HuberRegressor(..., epsilon=1.35)` above), so the test name should read "huber loss model match squared error for large epsilon".


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to