Github user feynmanliang commented on a diff in the pull request:
https://github.com/apache/spark/pull/7099#discussion_r34085177
--- Diff:
mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala ---
@@ -212,12 +230,140 @@ class LinearRegressionModel private[ml] (
extends RegressionModel[Vector, LinearRegressionModel]
with LinearRegressionParams {
+ private var trainingSummary: Option[LinearRegressionTrainingSummary] =
None
+
+ /**
+ * Gets results summary (e.g. residuals, mse, r-squared ) of model on
training set. An exception
+ * is thrown if `trainingSummary == None`.
+ */
+ def summary: LinearRegressionTrainingSummary = trainingSummary match {
+ case Some(summ) => summ
+ case None =>
+ throw new SparkException(
+ "No training summary available for this LinearRegressionModel",
+ new NullPointerException())
+ }
+
+ private[regression] def setSummary(summary:
LinearRegressionTrainingSummary): this.type = {
+ this.trainingSummary = Some(summary)
+ this
+ }
+
+ def hasSummary: Boolean = trainingSummary.isDefined
+
+
+ /**
+ * Evaluates the model on a testset.
+ * @param dataset Test dataset to evaluate model on.
+ */
+ def evaluate(dataset: DataFrame): LinearRegressionSummary = {
+ val t = udf { features: Vector => predict(features) }
+ val predictionAndObservations = dataset
+ .select(col($(labelCol)),
t(col($(featuresCol))).as($(predictionCol)))
+
+ new LinearRegressionSummary(predictionAndObservations,
$(predictionCol), $(labelCol))
+ }
+
override protected def predict(features: Vector): Double = {
dot(features, weights) + intercept
}
override def copy(extra: ParamMap): LinearRegressionModel = {
- copyValues(new LinearRegressionModel(uid, weights, intercept), extra)
+ val newModel = new LinearRegressionModel(uid, weights, intercept)
+ if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
+ copyValues(newModel, extra)
+ }
+}
+
+/**
+ * :: Experimental ::
+ * Linear regression training results.
+ * @param predictions predictions outputted by the model's `transform`
method.
+ * @param objectiveTrace objective function (scaled loss + regularization)
at each iteration.
+ */
+@Experimental
+class LinearRegressionTrainingSummary private[ml] (
+ predictions: DataFrame,
+ predictionCol: String,
+ labelCol: String,
+ val objectiveTrace: DataFrame)
+ extends LinearRegressionSummary(predictions, predictionCol, labelCol) {
+
+ def this(
+ predictions: DataFrame,
+ predictionCol: String,
+ labelCol: String,
+ objectiveTrace: Array[Double]) = {
+
+ this(
+ predictions,
+ predictionCol,
+ labelCol,
+ predictions.sqlContext
+ .createDataFrame(objectiveTrace.zipWithIndex.map(_.swap).toSeq)
+ .toDF("iteration", "objective")
+ )
+ }
+
+ /** Number of training iterations until termination */
+ val totalIterations = objectiveTrace.count()
+
+}
+
+/**
+ * :: Experimental ::
+ * Linear regression results evaluated on a dataset.
+ * @param predictions predictions outputted by the model's `transform`
method.
+ */
+@Experimental
+class LinearRegressionSummary private[ml] (
+ @transient val predictions: DataFrame,
+ val predictionCol: String,
+ val labelCol: String) extends Serializable {
+
+ private val metrics = new RegressionMetrics(predictions
+ .select(predictionCol, labelCol)
+ .map { case Row(pred: Double, label: Double) => (pred, label) }
+ )
+
+ // Force evaluation of lazy RegressionMetrics.summary for proper
serialization
+ metrics.explainedVariance
--- End diff --
I agree with @dbtsai that not all of `LinearRegressionSummary` should be
transient, since small things (objectiveTrace, even metrics like r^2, MSE,
etc.) can be kept with the `LinearRegressionTrainingSummary` of each
`LinearRegressionModel`.
Since `LinearRegressionModel` is sent to workers, all its fields need to be
serializable.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]