Repository: spark Updated Branches: refs/heads/master 06b9d623e -> 83af297ac
[SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR::glm for more family and link functions ## What changes were proposed in this pull request? Expose R-like summary statistics in SparkR::glm for more family and link functions. Note: Not all values in R [summary.glm](http://stat.ethz.ch/R-manual/R-patched/library/stats/html/summary.glm.html) are exposed; we only provide the most commonly used statistics in this PR. More statistics can be added in follow-up work. ## How was this patch tested? Unit tests. SparkR Output: ``` Deviance Residuals: (Note: These are approximate quantiles with relative error <= 0.01) Min 1Q Median 3Q Max -0.95096 -0.16585 -0.00232 0.17410 0.72918 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.6765 0.23536 7.1231 4.4561e-11 Sepal_Length 0.34988 0.046301 7.5566 4.1873e-12 Species_versicolor -0.98339 0.072075 -13.644 0 Species_virginica -1.0075 0.093306 -10.798 0 (Dispersion parameter for gaussian family taken to be 0.08351462) Null deviance: 28.307 on 149 degrees of freedom Residual deviance: 12.193 on 146 degrees of freedom AIC: 59.22 Number of Fisher Scoring iterations: 1 ``` R output: ``` Deviance Residuals: Min 1Q Median 3Q Max -0.95096 -0.16522 0.00171 0.18416 0.72918 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.67650 0.23536 7.123 4.46e-11 *** Sepal.Length 0.34988 0.04630 7.557 4.19e-12 *** Speciesversicolor -0.98339 0.07207 -13.644 < 2e-16 *** Speciesvirginica -1.00751 0.09331 -10.798 < 2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for gaussian family taken to be 0.08351462) Null deviance: 28.307 on 149 degrees of freedom Residual deviance: 12.193 on 146 degrees of freedom AIC: 59.217 Number of Fisher Scoring iterations: 2 ``` cc mengxr Author: Yanbo Liang <yblia...@gmail.com> Closes #12393 from yanboliang/spark-13925. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83af297a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83af297a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83af297a Branch: refs/heads/master Commit: 83af297ac42546580983f91079f74e3a4cf25050 Parents: 06b9d62 Author: Yanbo Liang <yblia...@gmail.com> Authored: Fri Apr 15 08:23:51 2016 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Fri Apr 15 08:23:51 2016 -0700 ---------------------------------------------------------------------- R/pkg/NAMESPACE | 3 +- R/pkg/R/mllib.R | 49 ++++++++++++++++-- R/pkg/inst/tests/testthat/test_mllib.R | 49 ++++++++++++++++++ .../r/GeneralizedLinearRegressionWrapper.scala | 52 +++++++++++++++++--- 4 files changed, 143 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/NAMESPACE ---------------------------------------------------------------------- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index f48c61c..94ac7e7 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -292,7 +292,8 @@ export("as.DataFrame", "tableToDF", "tableNames", "tables", - "uncacheTable") + "uncacheTable", + "print.summary.GeneralizedLinearRegressionModel") export("structField", "structField.jobj", http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/R/mllib.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 31bca16..922a9b1 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -101,12 +101,55 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), jobj <- object@jobj features <- callJMethod(jobj, "rFeatures") coefficients <- callJMethod(jobj, "rCoefficients") - coefficients <- as.matrix(unlist(coefficients)) - colnames(coefficients) <- c("Estimate") + deviance.resid <- 
callJMethod(jobj, "rDevianceResiduals") + dispersion <- callJMethod(jobj, "rDispersion") + null.deviance <- callJMethod(jobj, "rNullDeviance") + deviance <- callJMethod(jobj, "rDeviance") + df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull") + df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom") + aic <- callJMethod(jobj, "rAic") + iter <- callJMethod(jobj, "rNumIterations") + family <- callJMethod(jobj, "rFamily") + + deviance.resid <- dataFrame(deviance.resid) + coefficients <- matrix(coefficients, ncol = 4) + colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") rownames(coefficients) <- unlist(features) - return(list(coefficients = coefficients)) + ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, + dispersion = dispersion, null.deviance = null.deviance, + deviance = deviance, df.null = df.null, df.residual = df.residual, + aic = aic, iter = iter, family = family) + class(ans) <- "summary.GeneralizedLinearRegressionModel" + return(ans) }) +#' Print the summary of GeneralizedLinearRegressionModel +#' +#' @rdname print +#' @name print.summary.GeneralizedLinearRegressionModel +#' @export +print.summary.GeneralizedLinearRegressionModel <- function(x, ...) 
{ + x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals", + c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max")) + x$deviance.resid <- zapsmall(x$deviance.resid, 5L) + cat("\nDeviance Residuals: \n") + cat("(Note: These are approximate quantiles with relative error <= 0.01)\n") + print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L) + + cat("\nCoefficients:\n") + print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L) + + cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion), + ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"), + format(unlist(x[c("null.deviance", "deviance")]), digits = 5L), + " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"), + 1L, paste, collapse = " "), sep = "") + cat("AIC: ", format(x$aic, digits = 4L), "\n\n", + "Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "") + cat("\n") + invisible(x) + } + #' Make predictions from a generalized linear model #' #' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict(). 
http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/inst/tests/testthat/test_mllib.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index a9dbd2b..47bbf7e 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -77,6 +77,55 @@ test_that("glm and predict", { expect_equal(length(predict(lm(y ~ x))), 15) }) +test_that("glm summary", { + # gaussian family + training <- suppressWarnings(createDataFrame(sqlContext, iris)) + stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training)) + + rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica"))) + expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # binomial family + df <- suppressWarnings(createDataFrame(sqlContext, iris)) + training <- df[df$Species %in% c("versicolor", "virginica"), ] + stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training, + family = binomial(link = "logit"))) + + rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] + rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, + family = binomial(link = "logit"))) + + coefs <- unlist(stats$coefficients) + rCoefs <- unlist(rStats$coefficients) + expect_true(all(abs(rCoefs - coefs) < 1e-4)) + expect_true(all( + rownames(stats$coefficients) == + c("(Intercept)", "Sepal_Length", "Sepal_Width"))) + 
expect_equal(stats$dispersion, rStats$dispersion) + expect_equal(stats$null.deviance, rStats$null.deviance) + expect_equal(stats$deviance, rStats$deviance) + expect_equal(stats$df.null, rStats$df.null) + expect_equal(stats$df.residual, rStats$df.residual) + expect_equal(stats$aic, rStats$aic) + + # Test summary works on base GLM models + baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris) + baseSummary <- summary(baseModel) + expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4) +}) + test_that("kmeans", { newIris <- iris newIris$Species <- NULL http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 475a308..f66323e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -30,19 +30,59 @@ private[r] class GeneralizedLinearRegressionWrapper private ( private val glm: GeneralizedLinearRegressionModel = pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel] + lazy val rFeatures: Array[String] = if (glm.getFitIntercept) { + Array("(Intercept)") ++ features + } else { + features + } + lazy val rCoefficients: Array[Double] = if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray + Array(glm.intercept) ++ glm.coefficients.toArray ++ + rCoefficientStandardErrors ++ rTValues ++ rPValues } else { - glm.coefficients.toArray + glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues } - lazy val rFeatures: Array[String] = if (glm.getFitIntercept) { - Array("(Intercept)") ++ features + private lazy val 
rCoefficientStandardErrors = if (glm.getFitIntercept) { + Array(glm.summary.coefficientStandardErrors.last) ++ + glm.summary.coefficientStandardErrors.dropRight(1) } else { - features + glm.summary.coefficientStandardErrors + } + + private lazy val rTValues = if (glm.getFitIntercept) { + Array(glm.summary.tValues.last) ++ glm.summary.tValues.dropRight(1) + } else { + glm.summary.tValues } - def transform(dataset: DataFrame): DataFrame = { + private lazy val rPValues = if (glm.getFitIntercept) { + Array(glm.summary.pValues.last) ++ glm.summary.pValues.dropRight(1) + } else { + glm.summary.pValues + } + + lazy val rDispersion: Double = glm.summary.dispersion + + lazy val rNullDeviance: Double = glm.summary.nullDeviance + + lazy val rDeviance: Double = glm.summary.deviance + + lazy val rResidualDegreeOfFreedomNull: Long = glm.summary.residualDegreeOfFreedomNull + + lazy val rResidualDegreeOfFreedom: Long = glm.summary.residualDegreeOfFreedom + + lazy val rAic: Double = glm.summary.aic + + lazy val rNumIterations: Int = glm.summary.numIterations + + lazy val rDevianceResiduals: DataFrame = glm.summary.residuals() + + lazy val rFamily: String = glm.getFamily + + def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType) + + def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset).drop(glm.getFeaturesCol) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org