GitHub user dbtsai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/4259#discussion_r29098568
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala ---
    @@ -42,34 +50,122 @@ private[regression] trait LinearRegressionParams extends RegressorParams
     class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel]
       with LinearRegressionParams {
     
    -  setDefault(regParam -> 0.1, maxIter -> 100)
    -
    -  /** @group setParam */
    +  /**
    +   * Set the regularization parameter.
    +   * Default is 0.0.
    +   * @group setParam
    +   */
       def setRegParam(value: Double): this.type = set(regParam, value)
    +  setDefault(regParam -> 0.0)
    +
    +  /**
    +   * Set the ElasticNet mixing parameter.
    +   * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
    +   * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
    +   * Default is 0.0 which is an L2 penalty.
    +   * @group setParam
    +   */
    +  def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
    +  setDefault(elasticNetParam -> 0.0)
     
    -  /** @group setParam */
    +  /**
    +   * Set the maximal number of iterations.
    +   * Default is 100.
    +   * @group setParam
    +   */
       def setMaxIter(value: Int): this.type = set(maxIter, value)
    +  setDefault(maxIter -> 100)
    +
    +  /**
    +   * Set the convergence tolerance of iterations.
    +   * Smaller value will lead to higher accuracy with the cost of more iterations.
    +   * Default is 1E-9.
    +   * @group setParam
    +   */
    +  def setTol(value: Double): this.type = set(tol, value)
    +  setDefault(tol -> 1E-9)
     
       override protected def train(dataset: DataFrame, paramMap: ParamMap): LinearRegressionModel = {
    -    // Extract columns from data.  If dataset is persisted, do not persist oldDataset.
    -    val oldDataset = extractLabeledPoints(dataset, paramMap)
    +    // Extract columns from data.  If dataset is persisted, do not persist instances.
    +    val instances = extractLabeledPoints(dataset, paramMap).map {
    +      case LabeledPoint(label: Double, features: Vector) => (label, features)
    +    }
         val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
         if (handlePersistence) {
    -      oldDataset.persist(StorageLevel.MEMORY_AND_DISK)
    +      instances.persist(StorageLevel.MEMORY_AND_DISK)
         }
     
    -    // Train model
    -    val lr = new LinearRegressionWithSGD()
    -    lr.optimizer
    -      .setRegParam(paramMap(regParam))
    -      .setNumIterations(paramMap(maxIter))
    -    val model = lr.run(oldDataset)
    -    val lrm = new LinearRegressionModel(this, paramMap, model.weights, model.intercept)
    +    val (summarizer, statCounter) = instances.treeAggregate(
    +      (new MultivariateOnlineSummarizer, new StatCounter))( {
    +        case ((summarizer: MultivariateOnlineSummarizer, statCounter: StatCounter),
    +        (label: Double, features: Vector)) =>
    +          (summarizer.add(features), statCounter.merge(label))
    +      }, {
    +        case ((summarizer1: MultivariateOnlineSummarizer, statCounter1: StatCounter),
    +        (summarizer2: MultivariateOnlineSummarizer, statCounter2: StatCounter)) =>
    +          (summarizer1.merge(summarizer2), statCounter1.merge(statCounter2))
    +      })
    +
    +    val numFeatures = summarizer.mean.size
    +    val yMean = statCounter.mean
    +    val yStd = math.sqrt(statCounter.variance)
    +
    +    val featuresMean = summarizer.mean.toArray
    +    val featuresStd = summarizer.variance.toArray.map(math.sqrt)
    +
    +    // Since we implicitly do the feature scaling when we compute the cost function
    +    // to improve the convergence, the effective regParam will be changed.
    +    val effectiveRegParam = paramMap(regParam) / yStd
    +    val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam
    +    val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam
    +
    +    val costFun = new LeastSquaresCostFun(
    +      instances,
    +      yStd, yMean,
    +      featuresStd,
    +      featuresMean,
    +      effectiveL2RegParam)
    +
    +    val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
    +      new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol))
    +    } else {
    +      new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol))
    +    }
    +
    +    val initialWeights = Vectors.zeros(numFeatures)
    +    val states =
    +      optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector)
    --- End diff ---
    
    `optimizer.iterations` explicitly takes `BDV`.
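
    To make the typing point concrete, here is a minimal standalone Breeze sketch (the object name and toy quadratic cost are illustrative, not from the patch): `LBFGS[T]` is parameterized on the vector type, so `iterations`/`minimize` only accept the concrete `BDV[Double]` the optimizer was constructed for, which is why the initial weights are converted with `.toBreeze.toDenseVector`.

    ```scala
    import breeze.linalg.{DenseVector => BDV}
    import breeze.optimize.{DiffFunction, LBFGS => BreezeLBFGS}

    object LbfgsBdvSketch extends App {
      // Toy quadratic cost f(w) = ||w - 3||^2 with gradient 2 * (w - 3),
      // standing in for the patch's LeastSquaresCostFun.
      val costFun = new DiffFunction[BDV[Double]] {
        override def calculate(w: BDV[Double]): (Double, BDV[Double]) = {
          val diff = w - 3.0
          (diff dot diff, diff * 2.0)
        }
      }

      // LBFGS[T] fixes the vector type at construction time...
      val lbfgs = new BreezeLBFGS[BDV[Double]](maxIter = 100, m = 10, tolerance = 1e-9)

      // ...so the initial point must already be a concrete BDV[Double]; a generic
      // mllib Vector would not type-check, hence `.toBreeze.toDenseVector`.
      val init: BDV[Double] = BDV.zeros[Double](5)
      val w = lbfgs.minimize(costFun, init)
      println(w) // converges to approximately DenseVector(3.0, 3.0, 3.0, 3.0, 3.0)
    }
    ```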
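
    For context on the `effectiveRegParam` block earlier in the diff: because labels are standardized by `yStd` inside the cost function, the user-supplied regularization strength is rescaled, and the ElasticNet mixing parameter then splits it between the L1 term (handed to OWLQN) and the L2 term (folded into the cost function). A sketch of that arithmetic, with made-up numbers:

    ```scala
    object EffectiveRegParamSketch extends App {
      // Illustrative values, not from the patch.
      val regParam = 0.3         // user-supplied regularization strength
      val elasticNetParam = 0.25 // alpha: 0.0 = pure L2, 1.0 = pure L1
      val yStd = 2.0             // standard deviation of the labels

      // Labels are standardized inside the cost function, so the strength
      // is rescaled by yStd before alpha splits it into L1 and L2 parts.
      val effectiveRegParam = regParam / yStd                               // 0.15
      val effectiveL1RegParam = elasticNetParam * effectiveRegParam         // 0.0375, passed to OWLQN
      val effectiveL2RegParam = (1.0 - elasticNetParam) * effectiveRegParam // 0.1125, folded into the cost
      println((effectiveRegParam, effectiveL1RegParam, effectiveL2RegParam))
    }
    ```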

