This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7490391 ARROW-6537 [R]: Pass column_types to CSV reader
7490391 is described below
commit 74903919625220f5d67a96cf21411acb645c789f
Author: Romain Francois <[email protected]>
AuthorDate: Thu Oct 8 07:58:58 2020 -0700
ARROW-6537 [R]: Pass column_types to CSV reader
Either passing down NULL or a Schema.
But perhaps a schema is confusing because the only thing that is being
controlled by it here is the types, not their order, etc., which I believe
feels implied if you supply a schema.
Closes #7807 from romainfrancois/ARROW-6537/column_types
Lead-authored-by: Romain Francois <[email protected]>
Co-authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 4 +
r/R/arrow-package.R | 4 +-
r/R/arrowExports.R | 20 ++++
r/R/csv.R | 239 +++++++++++++++++++++++++++++++++++++++-----
r/R/schema.R | 3 +
r/man/CsvReadOptions.Rd | 36 ++++++-
r/man/read_delim_arrow.Rd | 78 ++++++++++++++-
r/src/arrowExports.cpp | 79 +++++++++++++++
r/src/arrow_exports.h | 1 +
r/src/csv.cpp | 97 ++++++++++++++++--
r/tests/testthat/test-csv.R | 80 +++++++++++++++
11 files changed, 599 insertions(+), 42 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index f33d392..656f461 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -34,6 +34,7 @@ S3method(as.integer,Array)
S3method(as.integer,ChunkedArray)
S3method(as.integer,Scalar)
S3method(as.list,RecordBatch)
+S3method(as.list,Schema)
S3method(as.list,Table)
S3method(as.raw,Buffer)
S3method(as.vector,Array)
@@ -183,6 +184,7 @@ export(StructScalar)
export(SubTreeFileSystem)
export(Table)
export(TimeUnit)
+export(TimestampParser)
export(Type)
export(UnionDataset)
export(arrow_available)
@@ -275,6 +277,7 @@ importFrom(bit64,print.integer64)
importFrom(bit64,str.integer64)
importFrom(methods,as)
importFrom(purrr,as_mapper)
+importFrom(purrr,keep)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(purrr,map_chr)
@@ -291,6 +294,7 @@ importFrom(rlang,env)
importFrom(rlang,env_bind)
importFrom(rlang,eval_tidy)
importFrom(rlang,exec)
+importFrom(rlang,is_bare_character)
importFrom(rlang,is_false)
importFrom(rlang,is_integerish)
importFrom(rlang,list2)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index f17b4f9..613f2ac 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -16,9 +16,9 @@
# under the License.
#' @importFrom R6 R6Class
-#' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl
+#' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep
#' @importFrom assertthat assert_that is.string
-#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null
enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label
set_names exec
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null
enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label
set_names exec is_bare_character
#' @importFrom tidyselect vars_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 932d953..a79cbe7 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -312,6 +312,10 @@ csv___ParseOptions__initialize <- function(options){
.Call(`_arrow_csv___ParseOptions__initialize` , options)
}
+csv___ReadOptions__column_names <- function(options){
+ .Call(`_arrow_csv___ReadOptions__column_names` , options)
+}
+
csv___ConvertOptions__initialize <- function(options){
.Call(`_arrow_csv___ConvertOptions__initialize` , options)
}
@@ -324,6 +328,22 @@ csv___TableReader__Read <- function(table_reader){
.Call(`_arrow_csv___TableReader__Read` , table_reader)
}
+TimestampParser__kind <- function(parser){
+ .Call(`_arrow_TimestampParser__kind` , parser)
+}
+
+TimestampParser__format <- function(parser){
+ .Call(`_arrow_TimestampParser__format` , parser)
+}
+
+TimestampParser__MakeStrptime <- function(format){
+ .Call(`_arrow_TimestampParser__MakeStrptime` , format)
+}
+
+TimestampParser__MakeISO8601 <- function(){
+ .Call(`_arrow_TimestampParser__MakeISO8601` )
+}
+
dataset___Dataset__NewScan <- function(ds){
.Call(`_arrow_dataset___Dataset__NewScan` , ds)
}
diff --git a/r/R/csv.R b/r/R/csv.R
index c4460d7..a7da10b 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -32,6 +32,51 @@
#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
#' use [CsvTableReader] directly for lower-level access.
#'
+#' @section Specifying column types and names:
+#'
+#' By default, the CSV reader will infer the column names and data types from
the file, but there
+#' are a few ways you can specify them directly.
+#'
+#' One way is to provide an Arrow [Schema] in the `schema` argument,
+#' which is an ordered map of column name to type.
+#' When provided, it satisfies both the `col_names` and `col_types` arguments.
+#' This is good if you know all of this information up front.
+#'
+#' You can also pass a `Schema` to the `col_types` argument. If you do this,
+#' column names will still be inferred from the file unless you also specify
+#' `col_names`. In either case, the column names in the `Schema` must match the
+#' data's column names, whether they are explicitly provided or inferred. That
+#' said, this `Schema` does not have to reference all columns: those omitted
+#' will have their types inferred.
+#'
+#' Alternatively, you can declare column types by providing the compact string
representation
+#' that `readr` uses to the `col_types` argument. This means you provide a
+#' single string, one character per column, where the characters map to Arrow
+#' types analogously to the `readr` type mapping:
+#'
+#' * "c": `utf8()`
+#' * "i": `int32()`
+#' * "n": `float64()`
+#' * "d": `float64()`
+#' * "l": `bool()`
+#' * "f": `dictionary()`
+#' * "D": `date32()`
+#' * "T": `time32()`
+#' * "t": `timestamp()`
+#' * "_": `null()`
+#' * "-": `null()`
+#' * "?": infer the type from the data
+#'
+#' If you use the compact string representation for `col_types`, you must also
+#' specify `col_names`.
+#'
+#' Regardless of how types are specified, all columns with a `null()` type will
+#' be dropped.
+#'
+#' Note that if you are specifying column names, whether by `schema` or
+#' `col_names`, and the CSV file has a header row that would otherwise be used
+#' to identify column names, you'll need to add `skip = 1` to skip that row.
+#'
#' @param file A character file name or URI, `raw` vector, or an Arrow input
stream.
#' If a file name, a memory-mapped Arrow [InputStream] will be opened and
#' closed when finished; compression will be detected from the file extension
@@ -46,10 +91,14 @@
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
+#' @param schema [Schema] that describes the table. If provided, it will be
+#' used to satisfy both `col_names` and `col_types`.
#' @param col_names If `TRUE`, the first row of the input will be used as the
#' column names and will not be included in the data frame. If `FALSE`, column
#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
#' Alternatively, you can specify a character vector of column names.
+#' @param col_types A compact string representation of the column types, or
+#' `NULL` (the default) to infer types from the data.
#' @param col_select A character vector of column names to keep, as in the
#' "select" argument to `data.table::fread()`, or a
#' [tidy selection specification][tidyselect::vars_select()]
@@ -63,6 +112,12 @@
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
#' @param skip Number of lines to skip before reading data.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are:
+#' - `NULL`: the default, which uses the ISO-8601 parser
+#' - a character vector of [strptime][base::strptime()] parse strings
+#' - a list of [TimestampParser] objects
#' @param parse_options see [file reader options][CsvReadOptions].
#' If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
@@ -90,8 +145,9 @@ read_delim_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -101,8 +157,12 @@ read_delim_arrow <- function(file,
convert_options = NULL,
read_options = NULL,
filesystem = NULL,
- as_data_frame = TRUE) {
-
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ if (inherits(schema, "Schema")) {
+ col_names <- names(schema)
+ col_types <- schema
+ }
if (is.null(parse_options)) {
parse_options <- readr_to_csv_parse_options(
delim,
@@ -112,13 +172,17 @@ read_delim_arrow <- function(file,
skip_empty_rows
)
}
-
if (is.null(read_options)) {
read_options <- readr_to_csv_read_options(skip, col_names)
}
if (is.null(convert_options)) {
- # TODO: col_types (needs wiring in CsvConvertOptions)
- convert_options <- readr_to_csv_convert_options(na, quoted_na)
+ convert_options <- readr_to_csv_convert_options(
+ na,
+ quoted_na,
+ col_types = col_types,
+ col_names = read_options$column_names,
+ timestamp_parsers = timestamp_parsers
+ )
}
if (!inherits(file, "InputStream")) {
@@ -134,6 +198,7 @@ read_delim_arrow <- function(file,
tab <- reader$Read()
+ # TODO: move this into convert_options using include_columns
col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
@@ -152,8 +217,9 @@ read_csv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -162,7 +228,8 @@ read_csv_arrow <- function(file,
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE) {
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
mc <- match.call()
mc$delim <- ","
@@ -176,8 +243,9 @@ read_tsv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -186,7 +254,8 @@ read_tsv_arrow <- function(file,
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE) {
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
mc <- match.call()
mc$delim <- "\t"
@@ -244,7 +313,7 @@ CsvTableReader$create <- function(file,
#' @usage NULL
#' @format NULL
#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,
-#' `JsonReadOptions`, and `JsonParseOptions` are containers for various
+#' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers
for various
#' file reading options. See their usage in [read_csv_arrow()] and
#' [read_json_arrow()], respectively.
#'
@@ -292,19 +361,48 @@ CsvTableReader$create <- function(file,
#' - `strings_can_be_null` Logical: can string / binary columns have
#' null values? Similar to the `quoted_na` argument to `readr::read_csv()`.
#' (default `FALSE`)
+#' - `true_values` character vector of recognized spellings for `TRUE` values
+#' - `false_values` character vector of recognized spellings for `FALSE` values
+#' - `col_types` A `Schema` or `NULL` to infer types
+#' - `auto_dict_encode` Logical: Whether to try to automatically
+#' dictionary-encode string / binary data (think `stringsAsFactors`).
Default `FALSE`.
+#' This setting is ignored for non-inferred columns (those in `col_types`).
+#' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns
+#' are dictionary-encoded up to this number of unique values (default 50),
+#' after which it switches to regular encoding.
+#' - `include_columns` If non-empty, indicates the names of columns from the
+#' CSV file that should be actually read and converted (in the vector's
order).
+#' - `include_missing_columns` Logical: if `include_columns` is provided,
should
+#' columns named in it but not found in the data be included as a column of
+#' type `null()`? The default (`FALSE`) means that the reader will instead
+#' raise an error.
+#' - `timestamp_parsers` User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are
+#' (a) `NULL`, the default, which uses the ISO-8601 parser;
+#' (b) a character vector of [strptime][base::strptime()] parse strings; or
+#' (c) a list of [TimestampParser] objects.
#'
-#' @section Methods:
+#' `TimestampParser$create()` takes an optional `format` string argument.
+#' See [`strptime()`][base::strptime()] for example syntax.
+#' The default is to use an ISO-8601 format parser.
+#' @section Active bindings:
#'
-#' These classes have no implemented methods. They are containers for the
-#' options.
+#' - `column_names`: from `CsvReadOptions`
#'
#' @export
-CsvReadOptions <- R6Class("CsvReadOptions", inherit = ArrowObject)
+CsvReadOptions <- R6Class("CsvReadOptions",
+ inherit = ArrowObject,
+ active = list(
+ column_names = function() csv___ReadOptions__column_names(self)
+ )
+)
CsvReadOptions$create <- function(use_threads = option_use_threads(),
block_size = 1048576L,
skip_rows = 0L,
column_names = character(0),
autogenerate_column_names = FALSE) {
+
shared_ptr(CsvReadOptions, csv___ReadOptions__initialize(
list(
use_threads = use_threads,
@@ -316,7 +414,7 @@ CsvReadOptions$create <- function(use_threads =
option_use_threads(),
))
}
-readr_to_csv_read_options <- function(skip, col_names) {
+readr_to_csv_read_options <- function(skip, col_names, col_types) {
if (isTRUE(col_names)) {
# C++ default to parse is 0-length string array
col_names <- character(0)
@@ -381,26 +479,115 @@ readr_to_csv_parse_options <- function(delim = ",",
#' @format NULL
#' @docType class
#' @export
+TimestampParser <- R6Class("TimestampParser", inherit = ArrowObject,
+ public = list(
+ kind = function() TimestampParser__kind(self),
+ format = function() TimestampParser__format(self)
+ )
+)
+TimestampParser$create <- function(format = NULL) {
+ if (is.null(format)) {
+ shared_ptr(TimestampParser, TimestampParser__MakeISO8601())
+ } else {
+ shared_ptr(TimestampParser, TimestampParser__MakeStrptime(format))
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
CsvConvertOptions$create <- function(check_utf8 = TRUE,
null_values = c("", "NA"),
- strings_can_be_null = FALSE) {
- # TODO: there are more conversion options available:
- # // Optional per-column types (disabling type inference on those columns)
- # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
- # // Recognized spellings for boolean values
- # std::vector<std::string> true_values;
- # std::vector<std::string> false_values;
+ true_values = c("T", "true", "TRUE"),
+                                     false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL) {
+
+ if (!is.null(col_types) && !inherits(col_types, "Schema")) {
+ abort(c(
+ "Unsupported `col_types` specification.",
+ i = "`col_types` must be NULL, or a <Schema>."
+ ))
+ }
shared_ptr(CsvConvertOptions, csv___ConvertOptions__initialize(
list(
check_utf8 = check_utf8,
null_values = null_values,
- strings_can_be_null = strings_can_be_null
+ strings_can_be_null = strings_can_be_null,
+ col_types = col_types,
+ true_values = true_values,
+ false_values = false_values,
+ auto_dict_encode = auto_dict_encode,
+ auto_dict_max_cardinality = auto_dict_max_cardinality,
+ include_columns = include_columns,
+ include_missing_columns = include_missing_columns,
+ timestamp_parsers = timestamp_parsers
)
))
}
-readr_to_csv_convert_options <- function(na, quoted_na) {
- CsvConvertOptions$create(null_values = na, strings_can_be_null = quoted_na)
+readr_to_csv_convert_options <- function(na,
+ quoted_na,
+ col_types = NULL,
+ col_names = NULL,
+ timestamp_parsers = NULL) {
+ include_columns <- character()
+
+ if (is.character(col_types)) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` is a character vector that is not of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names`")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~{
+ switch(.x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = time32(),
+ "t" = timestamp(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+                 abort(paste0("Unsupported compact specification: '", .x, "' for column '", .y, "'"))
+ )
+ }))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~!is.null(.x))
+ col_types <- schema(!!!col_types)
+ }
+
+ if (!is.null(col_types)) {
+ assert_is(col_types, "Schema")
+ # If any columns are null(), drop them
+ # (by specifying the other columns in include_columns)
+ nulls <- map_lgl(col_types$fields, ~.$type$Equals(null()))
+ if (any(nulls)) {
+ include_columns <- setdiff(col_names, names(col_types)[nulls])
+ }
+ }
+ CsvConvertOptions$create(
+ null_values = na,
+ strings_can_be_null = quoted_na,
+ col_types = col_types,
+ timestamp_parsers = timestamp_parsers,
+ include_columns = include_columns
+ )
}
diff --git a/r/R/schema.R b/r/R/schema.R
index ef25988..4bba8b8 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -182,6 +182,9 @@ length.Schema <- function(x) x$num_fields
}
}
+#' @export
+as.list.Schema <- function(x, ...) x$fields
+
#' read a Schema from a stream
#'
#' @param stream a `Message`, `InputStream`, or `Buffer`
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 81c6be4..8053307 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -4,13 +4,14 @@
\name{CsvReadOptions}
\alias{CsvReadOptions}
\alias{CsvParseOptions}
+\alias{TimestampParser}
\alias{CsvConvertOptions}
\alias{JsonReadOptions}
\alias{JsonParseOptions}
\title{File reader options}
\description{
\code{CsvReadOptions}, \code{CsvParseOptions}, \code{CsvConvertOptions},
-\code{JsonReadOptions}, and \code{JsonParseOptions} are containers for various
+\code{JsonReadOptions}, \code{JsonParseOptions}, and \code{TimestampParser}
are containers for various
file reading options. See their usage in
\code{\link[=read_csv_arrow]{read_csv_arrow()}} and
\code{\link[=read_json_arrow]{read_json_arrow()}}, respectively.
}
@@ -62,13 +63,38 @@ Analogous to the \code{na.strings} argument to
\item \code{strings_can_be_null} Logical: can string / binary columns have
null values? Similar to the \code{quoted_na} argument to
\code{readr::read_csv()}.
(default \code{FALSE})
-}
+\item \code{true_values} character vector of recognized spellings for
\code{TRUE} values
+\item \code{false_values} character vector of recognized spellings for
\code{FALSE} values
+\item \code{col_types} A \code{Schema} or \code{NULL} to infer types
+\item \code{auto_dict_encode} Logical: Whether to try to automatically
+dictionary-encode string / binary data (think \code{stringsAsFactors}).
Default \code{FALSE}.
+This setting is ignored for non-inferred columns (those in \code{col_types}).
+\item \code{auto_dict_max_cardinality} If \code{auto_dict_encode},
string/binary columns
+are dictionary-encoded up to this number of unique values (default 50),
+after which it switches to regular encoding.
+\item \code{include_columns} If non-empty, indicates the names of columns from
the
+CSV file that should be actually read and converted (in the vector's order).
+\item \code{include_missing_columns} Logical: if \code{include_columns} is
provided, should
+columns named in it but not found in the data be included as a column of
+type \code{null()}? The default (\code{FALSE}) means that the reader will
instead
+raise an error.
+\item \code{timestamp_parsers} User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are
+(a) \code{NULL}, the default, which uses the ISO-8601 parser;
+(b) a character vector of \link[base:strptime]{strptime} parse strings; or
+(c) a list of \link{TimestampParser} objects.
}
-\section{Methods}{
+\code{TimestampParser$create()} takes an optional \code{format} string
argument.
+See \code{\link[base:strptime]{strptime()}} for example syntax.
+The default is to use an ISO-8601 format parser.
+}
+\section{Active bindings}{
-These classes have no implemented methods. They are containers for the
-options.
+\itemize{
+\item \code{column_names}: from \code{CsvReadOptions}
+}
}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index 38251c7..335549e 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -12,7 +12,9 @@ read_delim_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -22,7 +24,8 @@ read_delim_arrow(
convert_options = NULL,
read_options = NULL,
filesystem = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
read_csv_arrow(
@@ -30,7 +33,9 @@ read_csv_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -39,7 +44,8 @@ read_csv_arrow(
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
read_tsv_arrow(
@@ -47,7 +53,9 @@ read_tsv_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -56,7 +64,8 @@ read_tsv_arrow(
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
}
\arguments{
@@ -79,11 +88,17 @@ characters? This is more general than \code{escape_double}
as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \verb{\\\\n}.}
+\item{schema}{\link{Schema} that describes the table. If provided, it will be
+used to satisfy both \code{col_names} and \code{col_types}.}
+
\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
column names and will not be included in the data frame. If \code{FALSE},
column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
+\item{col_types}{A compact string representation of the column types, or
+\code{NULL} (the default) to infer types from the data.}
+
\item{col_select}{A character vector of column names to keep, as in the
"select" argument to \code{data.table::fread()}, or a
\link[tidyselect:vars_select]{tidy selection specification}
@@ -115,6 +130,15 @@ string file path; default is the local file system}
\item{as_data_frame}{Should the function return a \code{data.frame} (default)
or
an Arrow \link{Table}?}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are:
+\itemize{
+\item \code{NULL}: the default, which uses the ISO-8601 parser
+\item a character vector of \link[base:strptime]{strptime} parse strings
+\item a list of \link{TimestampParser} objects
+}}
}
\value{
A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
@@ -136,6 +160,54 @@ equivalent in \code{readr::read_csv()}, you can either
provide them in the
\code{parse_options}, \code{convert_options}, or \code{read_options}
arguments, or you can
use \link{CsvTableReader} directly for lower-level access.
}
+\section{Specifying column types and names}{
+
+
+By default, the CSV reader will infer the column names and data types from the
file, but there
+are a few ways you can specify them directly.
+
+One way is to provide an Arrow \link{Schema} in the \code{schema} argument,
+which is an ordered map of column name to type.
+When provided, it satisfies both the \code{col_names} and \code{col_types}
arguments.
+This is good if you know all of this information up front.
+
+You can also pass a \code{Schema} to the \code{col_types} argument. If you do
this,
+column names will still be inferred from the file unless you also specify
+\code{col_names}. In either case, the column names in the \code{Schema} must
match the
+data's column names, whether they are explicitly provided or inferred. That
+said, this \code{Schema} does not have to reference all columns: those omitted
+will have their types inferred.
+
+Alternatively, you can declare column types by providing the compact string
representation
+that \code{readr} uses to the \code{col_types} argument. This means you
provide a
+single string, one character per column, where the characters map to Arrow
+types analogously to the \code{readr} type mapping:
+\itemize{
+\item "c": \code{utf8()}
+\item "i": \code{int32()}
+\item "n": \code{float64()}
+\item "d": \code{float64()}
+\item "l": \code{bool()}
+\item "f": \code{dictionary()}
+\item "D": \code{date32()}
+\item "T": \code{time32()}
+\item "t": \code{timestamp()}
+\item "_": \code{null()}
+\item "-": \code{null()}
+\item "?": infer the type from the data
+}
+
+If you use the compact string representation for \code{col_types}, you must
also
+specify \code{col_names}.
+
+Regardless of how types are specified, all columns with a \code{null()} type
will
+be dropped.
+
+Note that if you are specifying column names, whether by \code{schema} or
+\code{col_names}, and the CSV file has a header row that would otherwise be
used
+to identify column names, you'll need to add \code{skip = 1} to skip that row.
+}
+
\examples{
\donttest{
tf <- tempfile()
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 1c7ab14..d2f4465 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -1227,6 +1227,21 @@ extern "C" SEXP
_arrow_csv___ParseOptions__initialize(SEXP options_sexp){
// csv.cpp
#if defined(ARROW_R_WITH_ARROW)
+SEXP csv___ReadOptions__column_names(const
std::shared_ptr<arrow::csv::ReadOptions>& options);
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type
options(options_sexp);
+ return cpp11::as_sexp(csv___ReadOptions__column_names(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+ Rf_error("Cannot call csv___ReadOptions__column_names(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<arrow::csv::ConvertOptions>
csv___ConvertOptions__initialize(cpp11::list options);
extern "C" SEXP _arrow_csv___ConvertOptions__initialize(SEXP options_sexp){
BEGIN_CPP11
@@ -1273,6 +1288,65 @@ extern "C" SEXP _arrow_csv___TableReader__Read(SEXP
table_reader_sexp){
}
#endif
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__kind(const
std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type
parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__kind(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__kind(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__format(const
std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type
parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__format(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__format(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser>
TimestampParser__MakeStrptime(std::string format);
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type format(format_sexp);
+ return cpp11::as_sexp(TimestampParser__MakeStrptime(format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+ Rf_error("Cannot call TimestampParser__MakeStrptime(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601();
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+BEGIN_CPP11
+ return cpp11::as_sexp(TimestampParser__MakeISO8601());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+ Rf_error("Cannot call TimestampParser__MakeISO8601(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// dataset.cpp
#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<ds::ScannerBuilder> dataset___Dataset__NewScan(const
std::shared_ptr<ds::Dataset>& ds);
@@ -6250,9 +6324,14 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_compute__CallFunction", (DL_FUNC)
&_arrow_compute__CallFunction, 3},
{ "_arrow_csv___ReadOptions__initialize", (DL_FUNC)
&_arrow_csv___ReadOptions__initialize, 1},
{ "_arrow_csv___ParseOptions__initialize", (DL_FUNC)
&_arrow_csv___ParseOptions__initialize, 1},
+ { "_arrow_csv___ReadOptions__column_names", (DL_FUNC)
&_arrow_csv___ReadOptions__column_names, 1},
{ "_arrow_csv___ConvertOptions__initialize", (DL_FUNC)
&_arrow_csv___ConvertOptions__initialize, 1},
{ "_arrow_csv___TableReader__Make", (DL_FUNC)
&_arrow_csv___TableReader__Make, 4},
{ "_arrow_csv___TableReader__Read", (DL_FUNC)
&_arrow_csv___TableReader__Read, 1},
+ { "_arrow_TimestampParser__kind", (DL_FUNC)
&_arrow_TimestampParser__kind, 1},
+ { "_arrow_TimestampParser__format", (DL_FUNC)
&_arrow_TimestampParser__format, 1},
+ { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC)
&_arrow_TimestampParser__MakeStrptime, 1},
+ { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC)
&_arrow_TimestampParser__MakeISO8601, 0},
{ "_arrow_dataset___Dataset__NewScan", (DL_FUNC)
&_arrow_dataset___Dataset__NewScan, 1},
{ "_arrow_dataset___Dataset__schema", (DL_FUNC)
&_arrow_dataset___Dataset__schema, 1},
{ "_arrow_dataset___Dataset__type_name", (DL_FUNC)
&_arrow_dataset___Dataset__type_name, 1},
diff --git a/r/src/arrow_exports.h b/r/src/arrow_exports.h
index 92d451d..c4cc0ff 100644
--- a/r/src/arrow_exports.h
+++ b/r/src/arrow_exports.h
@@ -29,6 +29,7 @@
#include <arrow/status.h>
#include <arrow/type_fwd.h>
#include <arrow/util/compression.h>
+#include <arrow/util/value_parsing.h>
namespace arrow {
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index ce9c068..efa6462 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -20,6 +20,7 @@
#if defined(ARROW_R_WITH_ARROW)
#include <arrow/csv/reader.h>
+#include <arrow/util/value_parsing.h>
// [[arrow::export]]
std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(
@@ -32,6 +33,7 @@ std::shared_ptr<arrow::csv::ReadOptions>
csv___ReadOptions__initialize(
res->column_names =
cpp11::as_cpp<std::vector<std::string>>(options["column_names"]);
res->autogenerate_column_names =
cpp11::as_cpp<bool>(options["autogenerate_column_names"]);
+
return res;
}
@@ -51,6 +53,16 @@ std::shared_ptr<arrow::csv::ParseOptions>
csv___ParseOptions__initialize(
}
// [[arrow::export]]
+SEXP csv___ReadOptions__column_names(
+ const std::shared_ptr<arrow::csv::ReadOptions>& options) {
+ if (options->autogenerate_column_names) {
+ return R_NilValue;
+ }
+
+ return cpp11::as_sexp(options->column_names);
+}
+
+// [[arrow::export]]
std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(
cpp11::list options) {
auto res = std::make_shared<arrow::csv::ConvertOptions>(
@@ -62,12 +74,63 @@ std::shared_ptr<arrow::csv::ConvertOptions>
csv___ConvertOptions__initialize(
// If true, then strings in "null_values" are considered null for string
columns.
// If false, then all strings are valid string values.
res->strings_can_be_null =
cpp11::as_cpp<bool>(options["strings_can_be_null"]);
- // TODO: there are more conversion options available:
- // // Optional per-column types (disabling type inference on those columns)
- // std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
- // // Recognized spellings for boolean values
- // std::vector<std::string> true_values;
- // std::vector<std::string> false_values;
+
+ res->true_values =
cpp11::as_cpp<std::vector<std::string>>(options["true_values"]);
+ res->false_values =
cpp11::as_cpp<std::vector<std::string>>(options["false_values"]);
+
+ SEXP col_types = options["col_types"];
+ if (Rf_inherits(col_types, "Schema")) {
+ auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(col_types);
+ std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>
column_types;
+ for (const auto& field : schema->fields()) {
+ column_types.insert(std::make_pair(field->name(), field->type()));
+ }
+ res->column_types = column_types;
+ }
+
+ res->auto_dict_encode = cpp11::as_cpp<bool>(options["auto_dict_encode"]);
+ res->auto_dict_max_cardinality =
+ cpp11::as_cpp<int>(options["auto_dict_max_cardinality"]);
+ res->include_columns =
+ cpp11::as_cpp<std::vector<std::string>>(options["include_columns"]);
+ res->include_missing_columns =
cpp11::as_cpp<bool>(options["include_missing_columns"]);
+
+ SEXP op_timestamp_parsers = options["timestamp_parsers"];
+ if (!Rf_isNull(op_timestamp_parsers)) {
+ std::vector<std::shared_ptr<arrow::TimestampParser>> timestamp_parsers;
+
+ // if we have a character vector, convert to arrow::TimestampParser
+ if (TYPEOF(op_timestamp_parsers) == STRSXP) {
+ cpp11::strings s_timestamp_parsers(op_timestamp_parsers);
+ for (cpp11::r_string s : s_timestamp_parsers) {
+ timestamp_parsers.push_back(arrow::TimestampParser::MakeStrptime(s));
+ }
+
+ } else if (TYPEOF(op_timestamp_parsers) == VECSXP) {
+ cpp11::list lst_parsers(op_timestamp_parsers);
+
+ for (SEXP x : lst_parsers) {
+ // handle scalar string and TimestampParser instances
+ if (TYPEOF(x) == STRSXP && XLENGTH(x) == 1) {
+ timestamp_parsers.push_back(
+ arrow::TimestampParser::MakeStrptime(CHAR(STRING_ELT(x, 0))));
+ } else if (Rf_inherits(x, "TimestampParser")) {
+ timestamp_parsers.push_back(
+ cpp11::as_cpp<std::shared_ptr<arrow::TimestampParser>>(x));
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be a scalar string or a "
+ "<TimestampParser> object");
+ }
+ }
+
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be character vector of strptime "
+ "specifications, or a list of <TimestampParser> objects");
+ }
+ res->timestamp_parsers = timestamp_parsers;
+ }
return res;
}
@@ -89,4 +152,26 @@ std::shared_ptr<arrow::Table> csv___TableReader__Read(
return ValueOrStop(table_reader->Read());
}
+// [[arrow::export]]
+std::string TimestampParser__kind(const
std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->kind();
+}
+
+// [[arrow::export]]
+std::string TimestampParser__format(
+ const std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->format();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeStrptime(
+ std::string format) {
+ return arrow::TimestampParser::MakeStrptime(format);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601() {
+ return arrow::TimestampParser::MakeISO8601();
+}
+
#endif
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 2d85437..94dd10b 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -174,3 +174,83 @@ test_that("read_csv_arrow() can detect compression from
file name", {
tab1 <- read_csv_arrow(tf)
expect_equivalent(tbl, tab1)
})
+
test_that("read_csv_arrow(schema=)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # A full schema plus skip = 1 replaces both header parsing and type guessing
  result <- read_csv_arrow(tf, schema = schema(int = float64()), skip = 1)
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))
})
+
test_that("read_csv_arrow(col_types = <Schema>)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # Passing a Schema as col_types overrides type inference for those columns
  result <- read_csv_arrow(tf, col_types = schema(int = float64()))
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))
})
+
test_that("read_csv_arrow(col_types=string, col_names)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # readr-style compact specification: one type letter per column
  result <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))

  # Invalid specs: non-scalar spec, missing col_names, length mismatch,
  # and an unrecognized type letter must all error
  expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
  expect_error(read_csv_arrow(tf, col_types = "d"))
  expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
  expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
})
+
test_that("read_csv_arrow() can read timestamps", {
  tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # Timestamp type declared via a Schema.
  # NOTE: testthat's expect_equal() signature is (object, expected); the
  # actual result goes first so failure messages label values correctly.
  df <- read_csv_arrow(tf, col_types = schema(time = timestamp()))
  expect_equal(df, tbl)

  # Same declaration via the readr-style shorthand "t"
  df <- read_csv_arrow(tf, col_types = "t", col_names = "time", skip = 1)
  expect_equal(df, tbl)
})
+
test_that("read_csv_arrow(timestamp_parsers=)", {
  tf <- tempfile()
  on.exit(unlink(tf))
  tbl <- tibble::tibble(time = "23/09/2020")
  write.csv(tbl, tf, row.names = FALSE)

  # A custom strptime spec lets non-ISO dates be parsed as timestamps
  result <- read_csv_arrow(
    tf,
    col_types = schema(time = timestamp(timezone = "UTC")),
    timestamp_parsers = "%d/%m/%Y"
  )
  expect_equal(result$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC"))
})
+
test_that("Skipping columns with null()", {
  tf <- tempfile()
  on.exit(unlink(tf))
  cols <- c("dbl", "lgl", "false", "chr")
  tbl <- example_data[, cols]
  write.csv(tbl, tf, row.names = FALSE)

  # "-" and "_" in the compact spec both drop the corresponding column
  result <- read_csv_arrow(tf, col_types = "d-_c", col_names = cols, skip = 1)
  expect_identical(result, tbl[, c("dbl", "chr")])
})
+
test_that("Mix of guessing and declaring types", {
  tf <- tempfile()
  on.exit(unlink(tf))
  cols <- c("dbl", "lgl", "false", "chr")
  tbl <- example_data[, cols]
  write.csv(tbl, tf, row.names = FALSE)

  # Declaring one column via Schema leaves the others to be inferred
  tab <- read_csv_arrow(
    tf,
    col_types = schema(dbl = float32()),
    as_data_frame = FALSE
  )
  expect_equal(
    tab$schema,
    schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8())
  )

  # "?" in the compact spec means "guess this column's type"
  result <- read_csv_arrow(tf, col_types = "d-?c", col_names = cols, skip = 1)
  expect_identical(result, tbl[, c("dbl", "false", "chr")])
})