Github user actuaryzhang commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16699#discussion_r100581520
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala ---
    @@ -1218,16 +1266,35 @@ class GeneralizedLinearRegressionSummary private[regression] (
        */
       @Since("2.0.0")
       lazy val nullDeviance: Double = {
    -    val w = weightCol
    -    val wtdmu: Double = if (model.getFitIntercept) {
    -      val agg = predictions.agg(sum(w.multiply(col(model.getLabelCol))), sum(w)).first()
    -      agg.getDouble(0) / agg.getDouble(1)
    +    val intercept: Double = if (!model.getFitIntercept) {
    +      0.0
         } else {
    -      link.unlink(0.0)
    +      /*
    +        Estimate intercept analytically when there is no offset, or when there is offset but
    +        the model is Gaussian family with identity link. Otherwise, fit an intercept only model.
    +       */
    +      if (!isSetOffsetCol(model) ||
    +        (isSetOffsetCol(model) && family == Gaussian && link == Identity)) {
    +        val agg = predictions.agg(sum(weight.multiply(
    +          label.minus(offset))), sum(weight)).first()
    +        link.link(agg.getDouble(0) / agg.getDouble(1))
    +      } else {
    +        // Create empty feature column and fit intercept only model using param setting from model
    +        val featureNull = "feature_" + java.util.UUID.randomUUID.toString
    +        val paramMap = model.extractParamMap()
    +        paramMap.put(model.featuresCol, featureNull)
    +        if (family.name != "tweedie") {
    +          paramMap.remove(model.variancePower)
    +        }
    +        val emptyVectorUDF = udf{ () => Vectors.zeros(0) }
    +        model.parent.fit(
    +          dataset.withColumn(featureNull, emptyVectorUDF()), paramMap
    +        ).intercept
    +      }
         }
    -    predictions.select(col(model.getLabelCol).cast(DoubleType), w).rdd.map {
    -      case Row(y: Double, weight: Double) =>
    -        family.deviance(y, wtdmu, weight)
    +    predictions.select(label, offset, weight).rdd.map {
    +      case Row(y: Double, offset: Double, weight: Double) =>
    +        family.deviance(y, link.unlink(intercept + offset), weight)
         }.sum()
       }
    --- End diff --
    
    @sethah This part is the most critical change since last time. 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to