Repository: spark
Updated Branches:
  refs/heads/master 74b47845e -> 55aa4da28


[SPARK-21622][ML][SPARKR] Support offset in SparkR GLM

## What changes were proposed in this pull request?
Support offset in SparkR GLM #16699

Author: actuaryzhang <actuaryzhan...@gmail.com>

Closes #18831 from actuaryzhang/sparkROffset.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55aa4da2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55aa4da2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55aa4da2

Branch: refs/heads/master
Commit: 55aa4da285ef0f2fa43852f2bbcffacc7ddefdbf
Parents: 74b4784
Author: actuaryzhang <actuaryzhan...@gmail.com>
Authored: Sun Aug 6 15:14:12 2017 -0700
Committer: Felix Cheung <felixche...@apache.org>
Committed: Sun Aug 6 15:14:12 2017 -0700

----------------------------------------------------------------------
 R/pkg/R/mllib_regression.R                      | 22 ++++++++++++++++----
 R/pkg/tests/fulltests/test_mllib_regression.R   |  8 +++++++
 .../r/GeneralizedLinearRegressionWrapper.scala  |  4 +++-
 3 files changed, 29 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/55aa4da2/R/pkg/R/mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 9ecd887..ebaeae9 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -76,6 +76,8 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'                               The default value is "frequencyDesc". When the ordering is set to
 #'                               "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -127,7 +129,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, 
weightCol = NULL,
                    regParam = 0.0, var.power = 0.0, link.power = 1.0 - 
var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
 
             stringIndexerOrderType <- match.arg(stringIndexerOrderType)
             if (is.character(family)) {
@@ -159,12 +162,19 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
               weightCol <- as.character(weightCol)
             }
 
+            if (!is.null(offsetCol)) {
+              offsetCol <- as.character(offsetCol)
+              if (nchar(offsetCol) == 0) {
+                offsetCol <- NULL
+              }
+            }
+
             # For known families, Gamma is upper-cased
             jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, 
tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
                                 as.double(var.power), as.double(link.power),
-                                stringIndexerOrderType)
+                                stringIndexerOrderType, offsetCol)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -192,6 +202,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 #'                               "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
 #'                               The default value is "frequencyDesc". When the ordering is set to
 #'                               "alphabetDesc", this drops the same category as R when encoding strings.
+#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
+#'                  as 0.0. The feature specified as offset has a constant coefficient of 1.0.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -209,10 +221,12 @@ setMethod("glm", signature(formula = "formula", family = 
"ANY", data = "SparkDat
           function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 
25, weightCol = NULL,
                    var.power = 0.0, link.power = 1.0 - var.power,
                    stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
-                                              "alphabetDesc", "alphabetAsc")) {
+                                              "alphabetDesc", "alphabetAsc"),
+                   offsetCol = NULL) {
             spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, 
weightCol = weightCol,
                       var.power = var.power, link.power = link.power,
-                      stringIndexerOrderType = stringIndexerOrderType)
+                      stringIndexerOrderType = stringIndexerOrderType,
+                      offsetCol = offsetCol)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().

http://git-wip-us.apache.org/repos/asf/spark/blob/55aa4da2/R/pkg/tests/fulltests/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R 
b/R/pkg/tests/fulltests/test_mllib_regression.R
index 6b72a09..23daca7 100644
--- a/R/pkg/tests/fulltests/test_mllib_regression.R
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -173,6 +173,14 @@ test_that("spark.glm summary", {
   expect_equal(stats$df.residual, rStats$df.residual)
   expect_equal(stats$aic, rStats$aic)
 
+  # Test spark.glm works with offset
+  training <- suppressWarnings(createDataFrame(iris))
+  stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+                             family = poisson(), offsetCol = "Petal_Length"))
+  rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
+                        data = iris, family = poisson(), offset = 
iris$Petal.Length)))
+  expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
+
   # Test summary works on base GLM models
   baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
   baseSummary <- summary(baseModel)

http://git-wip-us.apache.org/repos/asf/spark/blob/55aa4da2/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 176a6cf..64575b0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -77,7 +77,8 @@ private[r] object GeneralizedLinearRegressionWrapper
       regParam: Double,
       variancePower: Double,
       linkPower: Double,
-      stringIndexerOrderType: String): GeneralizedLinearRegressionWrapper = {
+      stringIndexerOrderType: String,
+      offsetCol: String): GeneralizedLinearRegressionWrapper = {
   // scalastyle:on
     val rFormula = new RFormula().setFormula(formula)
       .setStringIndexerOrderType(stringIndexerOrderType)
@@ -99,6 +100,7 @@ private[r] object GeneralizedLinearRegressionWrapper
       glr.setLink(link)
     }
     if (weightCol != null) glr.setWeightCol(weightCol)
+    if (offsetCol != null) glr.setOffsetCol(offsetCol)
 
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to