Repository: spark
Updated Branches:
  refs/heads/master a3ceb875c -> 9a9c6f5c2


[SPARK-15222][SPARKR][ML] SparkR ML examples update in 2.0

## What changes were proposed in this pull request?
Update example code in examples/src/main/r/ml.R to reflect the new algorithms.
* spark.glm and glm
* spark.survreg
* spark.naiveBayes
* spark.kmeans

## How was this patch tested?
Offline test.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13000 from yanboliang/spark-15222.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9a9c6f5c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9a9c6f5c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9a9c6f5c

Branch: refs/heads/master
Commit: 9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2
Parents: a3ceb87
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Fri May 20 09:30:20 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri May 20 09:30:20 2016 -0700

----------------------------------------------------------------------
 examples/src/main/r/ml.R | 129 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 112 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/9a9c6f5c/examples/src/main/r/ml.R
----------------------------------------------------------------------
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
index a0c9039..fd35936 100644
--- a/examples/src/main/r/ml.R
+++ b/examples/src/main/r/ml.R
@@ -16,7 +16,7 @@
 #
 
 # To run this example use
-# ./bin/sparkR examples/src/main/r/ml.R
+# ./bin/spark-submit examples/src/main/r/ml.R
 
 # Load SparkR library into your R session
 library(SparkR)
@@ -25,30 +25,125 @@ library(SparkR)
 sc <- sparkR.init(appName="SparkR-ML-example")
 sqlContext <- sparkRSQL.init(sc)
 
-# Train GLM of family 'gaussian'
-training1 <- suppressWarnings(createDataFrame(sqlContext, iris))
-test1 <- training1
-model1 <- glm(Sepal_Length ~ Sepal_Width + Species, training1, family = "gaussian")
+############################ spark.glm and glm ##############################################
+
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
 
 # Model summary
-summary(model1)
+summary(gaussianGLM)
 
 # Prediction
-predictions1 <- predict(model1, test1)
-head(select(predictions1, "Sepal_Length", "prediction"))
+gaussianPredictions <- predict(gaussianGLM, gaussianTestDF)
+showDF(gaussianPredictions)
+
+# Fit a generalized linear model with glm (R-compliant)
+gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian")
+summary(gaussianGLM2)
+
+# Fit a generalized linear model of family "binomial" with spark.glm
+binomialDF <- filter(irisDF, irisDF$Species != "setosa")
+binomialTestDF <- binomialDF
+binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial")
+
+# Model summary
+summary(binomialGLM)
+
+# Prediction
+binomialPredictions <- predict(binomialGLM, binomialTestDF)
+showDF(binomialPredictions)
+
+############################ spark.survreg ##############################################
+
+# Use the ovarian dataset available in R survival package
+library(survival)
 
-# Train GLM of family 'binomial'
-training2 <- filter(training1, training1$Species != "setosa")
-test2 <- training2
-model2 <- glm(Species ~ Sepal_Length + Sepal_Width, data = training2, family = "binomial")
+# Fit an accelerated failure time (AFT) survival regression model with spark.survreg
+ovarianDF <- suppressWarnings(createDataFrame(sqlContext, ovarian))
+aftDF <- ovarianDF
+aftTestDF <- ovarianDF
+aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx)
 
 # Model summary
-summary(model2)
+summary(aftModel)
+
+# Prediction
+aftPredictions <- predict(aftModel, aftTestDF)
+showDF(aftPredictions)
+
+############################ spark.naiveBayes ##############################################
+
+# Fit a Bernoulli naive Bayes model with spark.naiveBayes
+titanic <- as.data.frame(Titanic)
+titanicDF <- suppressWarnings(createDataFrame(sqlContext, titanic[titanic$Freq > 0, -5]))
+nbDF <- titanicDF
+nbTestDF <- titanicDF
+nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age)
+
+# Model summary
+summary(nbModel)
+
+# Prediction
+nbPredictions <- predict(nbModel, nbTestDF)
+showDF(nbPredictions)
+
+############################ spark.kmeans ##############################################
+
+# Fit a k-means model with spark.kmeans
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+kmeansDF <- irisDF
+kmeansTestDF <- irisDF
+kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+                            k = 3)
+
+# Model summary
+summary(kmeansModel)
+
+# Get fitted result from the k-means model
+showDF(fitted(kmeansModel))
+
+# Prediction
+kmeansPredictions <- predict(kmeansModel, kmeansTestDF)
+showDF(kmeansPredictions)
+
+############################ model read/write ##############################################
+
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
+
+# Save and then load a fitted MLlib model
+modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
+write.ml(gaussianGLM, modelPath)
+gaussianGLM2 <- read.ml(modelPath)
+
+# Check model summary
+summary(gaussianGLM2)
+
+# Check model prediction
+gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF)
+showDF(gaussianPredictions)
+
+unlink(modelPath)
+
+############################ fit models with spark.lapply #####################################
+
+# Perform distributed training of multiple models with spark.lapply
+families <- c("gaussian", "poisson")
+train <- function(family) {
+  model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family)
+  summary(model)
+}
+model.summaries <- spark.lapply(sc, families, train)
+
+# Print the summary of each model
+print(model.summaries)
 
-# Prediction (Currently the output of prediction for binomial GLM is the indexed label,
-# we need to transform back to the original string label later)
-predictions2 <- predict(model2, test2)
-head(select(predictions2, "Species", "prediction"))
 
 # Stop the SparkContext now
 sparkR.stop()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to