spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source

2016-07-07 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 28710b42b -> f4767bcc7


[SPARK-16310][SPARKR] R na.string-like default for csv source

## What changes were proposed in this pull request?

Apply default "NA" as the null string for R, like the R read.csv na.strings parameter.

https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html
na.strings = "NA"

A user passing a csv file with NA values should get the same behavior with 
SparkR read.df(... source = "csv")

(couldn't open JIRA, will do that later)

## How was this patch tested?

unit tests

shivaram

Author: Felix Cheung 

Closes #13984 from felixcheung/rcsvnastring.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4767bcc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4767bcc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4767bcc

Branch: refs/heads/master
Commit: f4767bcc7a9d1bdd301f054776aa45e7c9f344a7
Parents: 28710b4
Author: Felix Cheung 
Authored: Thu Jul 7 15:21:57 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Thu Jul 7 15:21:57 2016 -0700

--
 R/pkg/R/SQLContext.R  | 10 ++--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +-
 2 files changed, 34 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 8df73db..bc0daa2 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -714,11 +714,14 @@ dropTempView <- function(viewName) {
 #'
 #' The data source is specified by the `source` and a set of options(...).
 #' If `source` is not specified, the default data source configured by
-#' "spark.sql.sources.default" will be used.
+#' "spark.sql.sources.default" will be used. \cr
+#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" 
will be interpreted
+#' as NA.
 #'
 #' @param path The path of files to load
 #' @param source The name of external data source
 #' @param schema The data schema defined in structType
+#' @param na.strings Default string value for NA when source is "csv"
 #' @return SparkDataFrame
 #' @rdname read.df
 #' @name read.df
@@ -735,7 +738,7 @@ dropTempView <- function(viewName) {
 #' @name read.df
 #' @method read.df default
 #' @note read.df since 1.4.0
-read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) {
+read.df.default <- function(path = NULL, source = NULL, schema = NULL, 
na.strings = "NA", ...) {
   sparkSession <- getSparkSession()
   options <- varargsToEnv(...)
   if (!is.null(path)) {
@@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, 
schema = NULL, ...) {
   if (is.null(source)) {
 source <- getDefaultSqlSource()
   }
+  if (source == "csv" && is.null(options[["nullValue"]])) {
+options[["nullValue"]] <- na.strings
+  }
   if (!is.null(schema)) {
 stopifnot(class(schema) == "structType")
 sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", 
sparkSession, source,

http://git-wip-us.apache.org/repos/asf/spark/blob/f4767bcc/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index a3aa26d..a0ab719 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -213,15 +213,35 @@ test_that("read csv as DataFrame", {
   mockLinesCsv <- c("year,make,model,comment,blank",
"\"2012\",\"Tesla\",\"S\",\"No comment\",",
"1997,Ford,E350,\"Go get one now they are going fast\",",
-   "2015,Chevy,Volt")
+   "2015,Chevy,Volt",
+   "NA,Dummy,Placeholder")
   writeLines(mockLinesCsv, csvPath)
 
-  # default "header" is false
-  df <- read.df(csvPath, "csv", header = "true")
-  expect_equal(count(df), 3)
+  # default "header" is false, inferSchema to handle "year" as "int"
+  df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+  expect_equal(count(df), 4)
   expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
-  expect_equal(sort(unlist(collect(where(df, df$year == "2015",
-   sort(unlist(list(year = "2015", make = "Chevy", model = 
"Volt"
+  expect_equal(sort(unlist(collect(where(df, df$year == 2015,
+   sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"
+
+  # since "year" is "int", let's skip the NA values
+  withoutna <- na.omit(df, how = "any", cols = "year")
+  

spark git commit: [SPARK-16310][SPARKR] R na.string-like default for csv source

2016-07-07 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 30cb3f1d3 -> 5828da41c


[SPARK-16310][SPARKR] R na.string-like default for csv source

## What changes were proposed in this pull request?

Apply default "NA" as the null string for R, like the R read.csv na.strings parameter.

https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html
na.strings = "NA"

A user passing a csv file with NA values should get the same behavior with 
SparkR read.df(... source = "csv")

(couldn't open JIRA, will do that later)

## How was this patch tested?

unit tests

shivaram

Author: Felix Cheung 

Closes #13984 from felixcheung/rcsvnastring.

(cherry picked from commit f4767bcc7a9d1bdd301f054776aa45e7c9f344a7)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5828da41
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5828da41
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5828da41

Branch: refs/heads/branch-2.0
Commit: 5828da41cb2d815708191bd9a5cf3bd82795aa41
Parents: 30cb3f1
Author: Felix Cheung 
Authored: Thu Jul 7 15:21:57 2016 -0700
Committer: Shivaram Venkataraman 
Committed: Thu Jul 7 15:22:06 2016 -0700

--
 R/pkg/R/SQLContext.R  | 10 ++--
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 32 +-
 2 files changed, 34 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/R/SQLContext.R
--
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 8df73db..bc0daa2 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -714,11 +714,14 @@ dropTempView <- function(viewName) {
 #'
 #' The data source is specified by the `source` and a set of options(...).
 #' If `source` is not specified, the default data source configured by
-#' "spark.sql.sources.default" will be used.
+#' "spark.sql.sources.default" will be used. \cr
+#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" 
will be interpreted
+#' as NA.
 #'
 #' @param path The path of files to load
 #' @param source The name of external data source
 #' @param schema The data schema defined in structType
+#' @param na.strings Default string value for NA when source is "csv"
 #' @return SparkDataFrame
 #' @rdname read.df
 #' @name read.df
@@ -735,7 +738,7 @@ dropTempView <- function(viewName) {
 #' @name read.df
 #' @method read.df default
 #' @note read.df since 1.4.0
-read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) {
+read.df.default <- function(path = NULL, source = NULL, schema = NULL, 
na.strings = "NA", ...) {
   sparkSession <- getSparkSession()
   options <- varargsToEnv(...)
   if (!is.null(path)) {
@@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, 
schema = NULL, ...) {
   if (is.null(source)) {
 source <- getDefaultSqlSource()
   }
+  if (source == "csv" && is.null(options[["nullValue"]])) {
+options[["nullValue"]] <- na.strings
+  }
   if (!is.null(schema)) {
 stopifnot(class(schema) == "structType")
 sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", 
sparkSession, source,

http://git-wip-us.apache.org/repos/asf/spark/blob/5828da41/R/pkg/inst/tests/testthat/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R 
b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index d22baf6..003fcce 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -213,15 +213,35 @@ test_that("read csv as DataFrame", {
   mockLinesCsv <- c("year,make,model,comment,blank",
"\"2012\",\"Tesla\",\"S\",\"No comment\",",
"1997,Ford,E350,\"Go get one now they are going fast\",",
-   "2015,Chevy,Volt")
+   "2015,Chevy,Volt",
+   "NA,Dummy,Placeholder")
   writeLines(mockLinesCsv, csvPath)
 
-  # default "header" is false
-  df <- read.df(csvPath, "csv", header = "true")
-  expect_equal(count(df), 3)
+  # default "header" is false, inferSchema to handle "year" as "int"
+  df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+  expect_equal(count(df), 4)
   expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
-  expect_equal(sort(unlist(collect(where(df, df$year == "2015",
-   sort(unlist(list(year = "2015", make = "Chevy", model = 
"Volt"
+  expect_equal(sort(unlist(collect(where(df, df$year == 2015,
+   sort(unlist(list(year = 2015, make = "Chevy", model =