spark git commit: [SPARK-18326][SPARKR][ML] Review SparkR ML wrappers API for 2.1
Repository: spark Updated Branches: refs/heads/master 82253617f -> 97255497d [SPARK-18326][SPARKR][ML] Review SparkR ML wrappers API for 2.1 ## What changes were proposed in this pull request? Reviewing the SparkR ML wrappers API for the 2.1 release, mainly addressing two issues: * Remove ```probabilityCol``` from the argument list of ```spark.logit``` and ```spark.randomForest```. It is used when making predictions and so should be an argument of ```predict```; we will work on this in [SPARK-18618](https://issues.apache.org/jira/browse/SPARK-18618) in the next release cycle. * Fix ```spark.als``` params to make them consistent with MLlib. ## How was this patch tested? Existing tests. Author: Yanbo Liang Closes #16169 from yanboliang/spark-18326. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/97255497 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/97255497 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/97255497 Branch: refs/heads/master Commit: 97255497d885f0f8ccfc808e868bc8aa5e4d1063 Parents: 8225361 Author: Yanbo Liang Authored: Wed Dec 7 20:23:28 2016 -0800 Committer: Yanbo Liang Committed: Wed Dec 7 20:23:28 2016 -0800 -- R/pkg/R/mllib.R | 23 +--- R/pkg/inst/tests/testthat/test_mllib.R | 4 ++-- .../spark/ml/r/LogisticRegressionWrapper.scala | 4 +--- .../r/RandomForestClassificationWrapper.scala | 2 -- 4 files changed, 13 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/97255497/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 074e9cb..632e4ad 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -733,7 +733,6 @@ setMethod("predict", signature(object = "KMeansModel"), #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p #' is the original probability of that class and t is the class's threshold. #' @param weightCol The weight column name. 
-#' @param probabilityCol column name for predicted class conditional probabilities. #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model #' @rdname spark.logit @@ -772,7 +771,7 @@ setMethod("predict", signature(object = "KMeansModel"), setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL, probabilityCol = "probability") { + thresholds = 0.5, weightCol = NULL) { formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { @@ -784,7 +783,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(elasticNetParam), as.integer(maxIter), as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), -as.character(weightCol), as.character(probabilityCol)) +as.character(weightCol)) new("LogisticRegressionModel", jobj = jobj) }) @@ -1425,7 +1424,7 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' @param userCol column name for user ids. Ids must be (or can be coerced into) integers. #' @param itemCol column name for item ids. Ids must be (or can be coerced into) integers. #' @param rank rank of the matrix factorization (> 0). -#' @param reg regularization parameter (>= 0). +#' @param regParam regularization parameter (>= 0). #' @param maxIter maximum number of iterations (>= 0). #' @param nonnegative logical value indicating whether to apply nonnegativity constraints. #' @param implicitPrefs logical value indicating whether to use implicit preference. 
@@ -1464,21 +1463,21 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' #' # set other arguments #' modelS <- spark.als(df, "rating", "user", "item", rank = 20, -#' reg = 0.1, nonnegative = TRUE) +#' regParam = 0.1, nonnegative = TRUE) #' statsS <- summary(modelS) #' } #' @note spark.als since 2.1.0 setMethod("spark.als", signature(data = "SparkDataFrame"), function(data, ratingCol = "rating", userCol = "user", itemCol = "item", - rank = 10, reg = 0.1, maxIter = 10, nonnegative = FALSE, + rank = 10, regParam = 0.1, maxIter = 10, nonnegative =
spark git commit: [SPARK-18326][SPARKR][ML] Review SparkR ML wrappers API for 2.1
Repository: spark Updated Branches: refs/heads/branch-2.1 ab865cfd9 -> 1c3f1da82 [SPARK-18326][SPARKR][ML] Review SparkR ML wrappers API for 2.1 ## What changes were proposed in this pull request? Reviewing the SparkR ML wrappers API for the 2.1 release, mainly addressing two issues: * Remove ```probabilityCol``` from the argument list of ```spark.logit``` and ```spark.randomForest```. It is used when making predictions and so should be an argument of ```predict```; we will work on this in [SPARK-18618](https://issues.apache.org/jira/browse/SPARK-18618) in the next release cycle. * Fix ```spark.als``` params to make them consistent with MLlib. ## How was this patch tested? Existing tests. Author: Yanbo Liang Closes #16169 from yanboliang/spark-18326. (cherry picked from commit 97255497d885f0f8ccfc808e868bc8aa5e4d1063) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1c3f1da8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1c3f1da8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1c3f1da8 Branch: refs/heads/branch-2.1 Commit: 1c3f1da82356426b6b550fee67e66dc82eaf1c85 Parents: ab865cf Author: Yanbo Liang Authored: Wed Dec 7 20:23:28 2016 -0800 Committer: Yanbo Liang Committed: Wed Dec 7 20:23:45 2016 -0800 -- R/pkg/R/mllib.R | 23 +--- R/pkg/inst/tests/testthat/test_mllib.R | 4 ++-- .../spark/ml/r/LogisticRegressionWrapper.scala | 4 +--- .../r/RandomForestClassificationWrapper.scala | 2 -- 4 files changed, 13 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1c3f1da8/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 074e9cb..632e4ad 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -733,7 +733,6 @@ setMethod("predict", signature(object = "KMeansModel"), #' excepting that at most one value may be 0. 
The class with largest value p/t is predicted, where p #' is the original probability of that class and t is the class's threshold. #' @param weightCol The weight column name. -#' @param probabilityCol column name for predicted class conditional probabilities. #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model #' @rdname spark.logit @@ -772,7 +771,7 @@ setMethod("predict", signature(object = "KMeansModel"), setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL, probabilityCol = "probability") { + thresholds = 0.5, weightCol = NULL) { formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { @@ -784,7 +783,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(elasticNetParam), as.integer(maxIter), as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), -as.character(weightCol), as.character(probabilityCol)) +as.character(weightCol)) new("LogisticRegressionModel", jobj = jobj) }) @@ -1425,7 +1424,7 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' @param userCol column name for user ids. Ids must be (or can be coerced into) integers. #' @param itemCol column name for item ids. Ids must be (or can be coerced into) integers. #' @param rank rank of the matrix factorization (> 0). -#' @param reg regularization parameter (>= 0). +#' @param regParam regularization parameter (>= 0). #' @param maxIter maximum number of iterations (>= 0). #' @param nonnegative logical value indicating whether to apply nonnegativity constraints. #' @param implicitPrefs logical value indicating whether to use implicit preference. 
@@ -1464,21 +1463,21 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' #' # set other arguments #' modelS <- spark.als(df, "rating", "user", "item", rank = 20, -#' reg = 0.1, nonnegative = TRUE) +#' regParam = 0.1, nonnegative = TRUE) #' statsS <- summary(modelS) #' } #' @note spark.als since 2.1.0 setMethod("spark.als", signature(data = "SparkDataFrame"), function(data, ratingCol = "rating", userCol = "user", itemCol = "item", - rank = 10, reg = 0.1, maxI