Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7099#discussion_r34084507
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala ---
    @@ -212,12 +230,140 @@ class LinearRegressionModel private[ml] (
       extends RegressionModel[Vector, LinearRegressionModel]
       with LinearRegressionParams {
     
    +  private var trainingSummary: Option[LinearRegressionTrainingSummary] = 
None
    +
    +  /**
    +   * Gets results summary (e.g. residuals, mse, r-squared) of model on 
training set. An exception
    +   * is thrown if `trainingSummary == None`.
    +   */
    +  def summary: LinearRegressionTrainingSummary = trainingSummary match {
    +    case Some(summ) => summ
    +    case None =>
    +      throw new SparkException(
    +        "No training summary available for this LinearRegressionModel",
    +        new NullPointerException())
    +  }
    +
    +  private[regression] def setSummary(summary: 
LinearRegressionTrainingSummary): this.type = {
    +    this.trainingSummary = Some(summary)
    +    this
    +  }
    +
    +  def hasSummary: Boolean = trainingSummary.isDefined
    +
    +
    +  /**
    +   * Evaluates the model on a test set.
    +   * @param dataset Test dataset to evaluate model on.
    +   */
    +  def evaluate(dataset: DataFrame): LinearRegressionSummary = {
    +    val t = udf { features: Vector => predict(features) }
    +    val predictionAndObservations = dataset
    +      .select(col($(labelCol)), 
t(col($(featuresCol))).as($(predictionCol)))
    +
    +    new LinearRegressionSummary(predictionAndObservations, 
$(predictionCol), $(labelCol))
    +  }
    +
       override protected def predict(features: Vector): Double = {
         dot(features, weights) + intercept
       }
     
       override def copy(extra: ParamMap): LinearRegressionModel = {
    -    copyValues(new LinearRegressionModel(uid, weights, intercept), extra)
    +    val newModel = new LinearRegressionModel(uid, weights, intercept)
    +    if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
    +    copyValues(newModel, extra)
    +  }
    +}
    +
    +/**
    + * :: Experimental ::
    + * Linear regression training results.
    + * @param predictions predictions outputted by the model's `transform` 
method.
    + * @param objectiveTrace objective function (scaled loss + regularization) 
at each iteration.
    + */
    +@Experimental
    +class LinearRegressionTrainingSummary private[ml] (
    +    predictions: DataFrame,
    +    predictionCol: String,
    +    labelCol: String,
    +    val objectiveTrace: DataFrame)
    +  extends LinearRegressionSummary(predictions, predictionCol, labelCol) {
    +
    +  def this(
    +      predictions: DataFrame,
    +      predictionCol: String,
    +      labelCol: String,
    +      objectiveTrace: Array[Double]) = {
    +
    +    this(
    +      predictions,
    +      predictionCol,
    +      labelCol,
    +      predictions.sqlContext
    +        .createDataFrame(objectiveTrace.zipWithIndex.map(_.swap).toSeq)
    +        .toDF("iteration", "objective")
    +    )
    +  }
    +
    +  /** Number of training iterations until termination */
    +  val totalIterations = objectiveTrace.count()
    +
    +}
    +
    +/**
    + * :: Experimental ::
    + * Linear regression results evaluated on a dataset.
    + * @param predictions predictions outputted by the model's `transform` 
method.
    + */
    +@Experimental
    +class LinearRegressionSummary private[ml] (
    +    @transient val predictions: DataFrame,
    +    val predictionCol: String,
    +    val labelCol: String) extends Serializable {
    +
    +  private val metrics = new RegressionMetrics(predictions
    +    .select(predictionCol, labelCol)
    +    .map { case Row(pred: Double, label: Double) => (pred, label) }
    +  )
    +
    +  // Force evaluation of lazy RegressionMetrics.summary for proper 
serialization
    +  metrics.explainedVariance
    --- End diff --
    
    We should not force evaluation.  This will only be needed if the 
LinearRegressionSummary gets serialized and sent to workers for them to use the 
metrics, but I had planned not to support that.  Users can still extract 
metrics on the driver and then send those values to workers.  Why does 
LinearRegressionSummary need to be serializable?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to