Repository: spark
Updated Branches:
  refs/heads/master c793d2d9a -> cc7a1bc93

[SPARK-11781][SPARKR] SparkR has problem in inferring type of raw type.

Author: Sun Rui <rui....@intel.com>

Closes #9769 from sun-rui/SPARK-11781.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc7a1bc9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc7a1bc9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc7a1bc9

Branch: refs/heads/master
Commit: cc7a1bc9370b163f51230e5ca4be612d133a5086
Parents: c793d2d
Author: Sun Rui <rui....@intel.com>
Authored: Sun Nov 29 11:08:26 2015 -0800
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Sun Nov 29 11:08:26 2015 -0800

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R              | 34 ++++++++++++++++++--------------
 R/pkg/R/SQLContext.R             |  2 +-
 R/pkg/R/types.R                  | 37 ++++++++++++++++++++---------------
 R/pkg/inst/tests/test_sparkSQL.R |  6 ++++++
 4 files changed, 47 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/cc7a1bc9/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index f89e268..a82ded9 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -793,8 +793,8 @@ setMethod("dim",
 setMethod("collect",
           signature(x = "DataFrame"),
           function(x, stringsAsFactors = FALSE) {
-            names <- columns(x)
-            ncol <- length(names)
+            dtypes <- dtypes(x)
+            ncol <- length(dtypes)
             if (ncol <= 0) {
               # empty data.frame with 0 columns and 0 rows
               data.frame()
@@ -817,25 +817,29 @@ setMethod("collect",
                 # data of complex type can be held. But getting a cell from a column
                 # of list type returns a list instead of a vector. So for columns of
                 # non-complex type, append them as vector.
+                #
+                # For columns of complex type, be careful to access them.
+                # Get a column of complex type returns a list.
+                # Get a cell from a column of complex type returns a list instead of a vector.
                 col <- listCols[[colIndex]]
+                colName <- dtypes[[colIndex]][[1]]
                 if (length(col) <= 0) {
-                  df[[names[colIndex]]] <- col
+                  df[[colName]] <- col
                 } else {
-                  # TODO: more robust check on column of primitive types
-                  vec <- do.call(c, col)
-                  if (class(vec) != "list") {
-                    df[[names[colIndex]]] <- vec
+                  colType <- dtypes[[colIndex]][[2]]
+                  # Note that "binary" columns behave like complex types.
+                  if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
+                    vec <- do.call(c, col)
+                    stopifnot(class(vec) != "list")
+                    df[[colName]] <- vec
                   } else {
-                    # For columns of complex type, be careful to access them.
-                    # Get a column of complex type returns a list.
-                    # Get a cell from a column of complex type returns a list instead of a vector.
-                    df[[names[colIndex]]] <- col
-                  }
+                    df[[colName]] <- col
+                  }
+                }
               }
+              df
             }
-            df
-          }
-          })
+          })
 
 #' Limit
 #'


http://git-wip-us.apache.org/repos/asf/spark/blob/cc7a1bc9/R/pkg/R/SQLContext.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a62b25f..85541c8 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -63,7 +63,7 @@ infer_type <- function(x) {
     })
     type <- Reduce(paste0, type)
     type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">")
-  } else if (length(x) > 1) {
+  } else if (length(x) > 1 && type != "binary") {
     paste0("array<", infer_type(x[[1]]), ">")
   } else {
     type


http://git-wip-us.apache.org/repos/asf/spark/blob/cc7a1bc9/R/pkg/R/types.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index dae4fe8..1f06af7 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -19,25 +19,30 @@
 # values are equivalent R types. This is stored in an environment to allow for
 # more efficient look up (environments use hashmaps).
 PRIMITIVE_TYPES <- as.environment(list(
-  "byte"="integer",
-  "tinyint"="integer",
-  "smallint"="integer",
-  "integer"="integer",
-  "bigint"="numeric",
-  "float"="numeric",
-  "double"="numeric",
-  "decimal"="numeric",
-  "string"="character",
-  "binary"="raw",
-  "boolean"="logical",
-  "timestamp"="POSIXct",
-  "date"="Date"))
+  "tinyint" = "integer",
+  "smallint" = "integer",
+  "int" = "integer",
+  "bigint" = "numeric",
+  "float" = "numeric",
+  "double" = "numeric",
+  "decimal" = "numeric",
+  "string" = "character",
+  "binary" = "raw",
+  "boolean" = "logical",
+  "timestamp" = "POSIXct",
+  "date" = "Date",
+  # following types are not SQL types returned by dtypes(). They are listed here for usage
+  # by checkType() in schema.R.
+  # TODO: refactor checkType() in schema.R.
+  "byte" = "integer",
+  "integer" = "integer"
+  ))
 
 # The complex data types. These do not have any direct mapping to R's types.
 COMPLEX_TYPES <- list(
-  "map"=NA,
-  "array"=NA,
-  "struct"=NA)
+  "map" = NA,
+  "array" = NA,
+  "struct" = NA)
 
 # The full list of data types.
 DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))


http://git-wip-us.apache.org/repos/asf/spark/blob/cc7a1bc9/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index d3b2f20..92ec820 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -72,6 +72,8 @@ test_that("infer types and check types", {
   expect_equal(infer_type(e), "map<string,integer>")
   expect_error(checkType("map<integer,integer>"),
                "Key type in a map must be string or character")
+
+  expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
 })
 
 test_that("structType and structField", {
@@ -250,6 +252,10 @@ test_that("create DataFrame from list or data.frame", {
 
   mtcarsdf <- createDataFrame(sqlContext, mtcars)
   expect_equivalent(collect(mtcarsdf), mtcars)
+
+  bytes <- as.raw(c(1, 2, 3))
+  df <- createDataFrame(sqlContext, list(list(bytes)))
+  expect_equal(collect(df)[[1]][[1]], bytes)
 })
 
 test_that("create DataFrame with different data types", {
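
For illustration, a minimal SparkR sketch of the behavior this change establishes, mirroring the new tests. The local sparkR.init()/sparkRSQL.init() setup and the call to the internal infer_type helper via ::: are assumptions made for the example, not part of the patch:

  library(SparkR)

  # Assumed setup for a local SparkR (Spark 1.6-era) session; adjust as needed.
  sc <- sparkR.init(master = "local")
  sqlContext <- sparkRSQL.init(sc)

  bytes <- as.raw(c(1, 2, 3))

  # A raw vector is now inferred as a single "binary" value, not as
  # array<binary> with one element per byte (infer_type is internal).
  SparkR:::infer_type(bytes)               # "binary"

  # Round trip: a binary column collects back as a list of raw vectors,
  # so the cell holds the original raw vector.
  df <- createDataFrame(sqlContext, list(list(bytes)))
  identical(collect(df)[[1]][[1]], bytes)  # TRUE

  sparkR.stop()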