This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7490391 ARROW-6537 [R]: Pass column_types to CSV reader
7490391 is described below
commit 74903919625220f5d67a96cf21411acb645c789f
Author: Romain Francois <[email protected]>
AuthorDate: Thu Oct 8 07:58:58 2020 -0700
ARROW-6537 [R]: Pass column_types to CSV reader
Either passing down NULL or a Schema.
But perhaps a schema is confusing because the only thing that is being
controlled by it here is the types, not their order, etc., which I believe
feels implied if you supply a schema.
Closes #7807 from romainfrancois/ARROW-6537/column_types
Lead-authored-by: Romain Francois <[email protected]>
Co-authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 4 +
r/R/arrow-package.R | 4 +-
r/R/arrowExports.R | 20 ++++
r/R/csv.R | 239 +++++++++++++++++++++++++++++++++++++++-----
r/R/schema.R | 3 +
r/man/CsvReadOptions.Rd | 36 ++++++-
r/man/read_delim_arrow.Rd | 78 ++++++++++++++-
r/src/arrowExports.cpp | 79 +++++++++++++++
r/src/arrow_exports.h | 1 +
r/src/csv.cpp | 97 ++++++++++++++++--
r/tests/testthat/test-csv.R | 80 +++++++++++++++
11 files changed, 599 insertions(+), 42 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index f33d392..656f461 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -34,6 +34,7 @@ S3method(as.integer,Array)
S3method(as.integer,ChunkedArray)
S3method(as.integer,Scalar)
S3method(as.list,RecordBatch)
+S3method(as.list,Schema)
S3method(as.list,Table)
S3method(as.raw,Buffer)
S3method(as.vector,Array)
@@ -183,6 +184,7 @@ export(StructScalar)
export(SubTreeFileSystem)
export(Table)
export(TimeUnit)
+export(TimestampParser)
export(Type)
export(UnionDataset)
export(arrow_available)
@@ -275,6 +277,7 @@ importFrom(bit64,print.integer64)
importFrom(bit64,str.integer64)
importFrom(methods,as)
importFrom(purrr,as_mapper)
+importFrom(purrr,keep)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(purrr,map_chr)
@@ -291,6 +294,7 @@ importFrom(rlang,env)
importFrom(rlang,env_bind)
importFrom(rlang,eval_tidy)
importFrom(rlang,exec)
+importFrom(rlang,is_bare_character)
importFrom(rlang,is_false)
importFrom(rlang,is_integerish)
importFrom(rlang,list2)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index f17b4f9..613f2ac 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -16,9 +16,9 @@
# under the License.
#' @importFrom R6 R6Class
-#' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl
+#' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl keep
#' @importFrom assertthat assert_that is.string
-#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null
enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label
set_names exec
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null
enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label
set_names exec is_bare_character
#' @importFrom tidyselect vars_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 932d953..a79cbe7 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -312,6 +312,10 @@ csv___ParseOptions__initialize <- function(options){
.Call(`_arrow_csv___ParseOptions__initialize` , options)
}
+csv___ReadOptions__column_names <- function(options){
+ .Call(`_arrow_csv___ReadOptions__column_names` , options)
+}
+
csv___ConvertOptions__initialize <- function(options){
.Call(`_arrow_csv___ConvertOptions__initialize` , options)
}
@@ -324,6 +328,22 @@ csv___TableReader__Read <- function(table_reader){
.Call(`_arrow_csv___TableReader__Read` , table_reader)
}
+TimestampParser__kind <- function(parser){
+ .Call(`_arrow_TimestampParser__kind` , parser)
+}
+
+TimestampParser__format <- function(parser){
+ .Call(`_arrow_TimestampParser__format` , parser)
+}
+
+TimestampParser__MakeStrptime <- function(format){
+ .Call(`_arrow_TimestampParser__MakeStrptime` , format)
+}
+
+TimestampParser__MakeISO8601 <- function(){
+ .Call(`_arrow_TimestampParser__MakeISO8601` )
+}
+
dataset___Dataset__NewScan <- function(ds){
.Call(`_arrow_dataset___Dataset__NewScan` , ds)
}
diff --git a/r/R/csv.R b/r/R/csv.R
index c4460d7..a7da10b 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -32,6 +32,51 @@
#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
#' use [CsvTableReader] directly for lower-level access.
#'
+#' @section Specifying column types and names:
+#'
+#' By default, the CSV reader will infer the column names and data types from
the file, but there
+#' are a few ways you can specify them directly.
+#'
+#' One way is to provide an Arrow [Schema] in the `schema` argument,
+#' which is an ordered map of column name to type.
+#' When provided, it satisfies both the `col_names` and `col_types` arguments.
+#' This is good if you know all of this information up front.
+#'
+#' You can also pass a `Schema` to the `col_types` argument. If you do this,
+#' column names will still be inferred from the file unless you also specify
+#' `col_names`. In either case, the column names in the `Schema` must match the
+#' data's column names, whether they are explicitly provided or inferred. That
+#' said, this `Schema` does not have to reference all columns: those omitted
+#' will have their types inferred.
+#'
+#' Alternatively, you can declare column types by providing the compact string
representation
+#' that `readr` uses to the `col_types` argument. This means you provide a
+#' single string, one character per column, where the characters map to Arrow
+#' types analogously to the `readr` type mapping:
+#'
+#' * "c": `utf8()`
+#' * "i": `int32()`
+#' * "n": `float64()`
+#' * "d": `float64()`
+#' * "l": `bool()`
+#' * "f": `dictionary()`
+#' * "D": `date32()`
+#' * "T": `time32()`
+#' * "t": `timestamp()`
+#' * "_": `null()`
+#' * "-": `null()`
+#' * "?": infer the type from the data
+#'
+#' If you use the compact string representation for `col_types`, you must also
+#' specify `col_names`.
+#'
+#' Regardless of how types are specified, all columns with a `null()` type will
+#' be dropped.
+#'
+#' Note that if you are specifying column names, whether by `schema` or
+#' `col_names`, and the CSV file has a header row that would otherwise be used
+#' to identify column names, you'll need to add `skip = 1` to skip that row.
+#'
#' @param file A character file name or URI, `raw` vector, or an Arrow input
stream.
#' If a file name, a memory-mapped Arrow [InputStream] will be opened and
#' closed when finished; compression will be detected from the file extension
@@ -46,10 +91,14 @@
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
+#' @param schema [Schema] that describes the table. If provided, it will be
+#' used to satisfy both `col_names` and `col_types`.
#' @param col_names If `TRUE`, the first row of the input will be used as the
#' column names and will not be included in the data frame. If `FALSE`, column
#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
#' Alternatively, you can specify a character vector of column names.
+#' @param col_types A compact string representation of the column types, or
+#' `NULL` (the default) to infer types from the data.
#' @param col_select A character vector of column names to keep, as in the
#' "select" argument to `data.table::fread()`, or a
#' [tidy selection specification][tidyselect::vars_select()]
@@ -63,6 +112,12 @@
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
#' @param skip Number of lines to skip before reading data.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are:
+#' - `NULL`: the default, which uses the ISO-8601 parser
+#' - a character vector of [strptime][base::strptime()] parse strings
+#' - a list of [TimestampParser] objects
#' @param parse_options see [file reader options][CsvReadOptions].
#' If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
@@ -90,8 +145,9 @@ read_delim_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -101,8 +157,12 @@ read_delim_arrow <- function(file,
convert_options = NULL,
read_options = NULL,
filesystem = NULL,
- as_data_frame = TRUE) {
-
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ if (inherits(schema, "Schema")) {
+ col_names <- names(schema)
+ col_types <- schema
+ }
if (is.null(parse_options)) {
parse_options <- readr_to_csv_parse_options(
delim,
@@ -112,13 +172,17 @@ read_delim_arrow <- function(file,
skip_empty_rows
)
}
-
if (is.null(read_options)) {
read_options <- readr_to_csv_read_options(skip, col_names)
}
if (is.null(convert_options)) {
- # TODO: col_types (needs wiring in CsvConvertOptions)
- convert_options <- readr_to_csv_convert_options(na, quoted_na)
+ convert_options <- readr_to_csv_convert_options(
+ na,
+ quoted_na,
+ col_types = col_types,
+ col_names = read_options$column_names,
+ timestamp_parsers = timestamp_parsers
+ )
}
if (!inherits(file, "InputStream")) {
@@ -134,6 +198,7 @@ read_delim_arrow <- function(file,
tab <- reader$Read()
+ # TODO: move this into convert_options using include_columns
col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
@@ -152,8 +217,9 @@ read_csv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -162,7 +228,8 @@ read_csv_arrow <- function(file,
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE) {
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
mc <- match.call()
mc$delim <- ","
@@ -176,8 +243,9 @@ read_tsv_arrow <- function(file,
quote = '"',
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
- # col_types = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -186,7 +254,8 @@ read_tsv_arrow <- function(file,
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE) {
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
mc <- match.call()
mc$delim <- "\t"
@@ -244,7 +313,7 @@ CsvTableReader$create <- function(file,
#' @usage NULL
#' @format NULL
#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,
-#' `JsonReadOptions`, and `JsonParseOptions` are containers for various
+#' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers
for various
#' file reading options. See their usage in [read_csv_arrow()] and
#' [read_json_arrow()], respectively.
#'
@@ -292,19 +361,48 @@ CsvTableReader$create <- function(file,
#' - `strings_can_be_null` Logical: can string / binary columns have
#' null values? Similar to the `quoted_na` argument to `readr::read_csv()`.
#' (default `FALSE`)
+#' - `true_values` character vector of recognized spellings for `TRUE` values
+#' - `false_values` character vector of recognized spellings for `FALSE` values
+#' - `col_types` A `Schema` or `NULL` to infer types
+#' - `auto_dict_encode` Logical: Whether to try to automatically
+#' dictionary-encode string / binary data (think `stringsAsFactors`).
Default `FALSE`.
+#' This setting is ignored for non-inferred columns (those in `col_types`).
+#' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns
+#' are dictionary-encoded up to this number of unique values (default 50),
+#' after which it switches to regular encoding.
+#' - `include_columns` If non-empty, indicates the names of columns from the
+#' CSV file that should be actually read and converted (in the vector's
order).
+#' - `include_missing_columns` Logical: if `include_columns` is provided,
should
+#' columns named in it but not found in the data be included as a column of
+#' type `null()`? The default (`FALSE`) means that the reader will instead
+#' raise an error.
+#' - `timestamp_parsers` User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are
+#' (a) `NULL`, the default, which uses the ISO-8601 parser;
+#' (b) a character vector of [strptime][base::strptime()] parse strings; or
+#' (c) a list of [TimestampParser] objects.
#'
-#' @section Methods:
+#' `TimestampParser$create()` takes an optional `format` string argument.
+#' See [`strptime()`][base::strptime()] for example syntax.
+#' The default is to use an ISO-8601 format parser.
+#' @section Active bindings:
#'
-#' These classes have no implemented methods. They are containers for the
-#' options.
+#' - `column_names`: from `CsvReadOptions`
#'
#' @export
-CsvReadOptions <- R6Class("CsvReadOptions", inherit = ArrowObject)
+CsvReadOptions <- R6Class("CsvReadOptions",
+ inherit = ArrowObject,
+ active = list(
+ column_names = function() csv___ReadOptions__column_names(self)
+ )
+)
CsvReadOptions$create <- function(use_threads = option_use_threads(),
block_size = 1048576L,
skip_rows = 0L,
column_names = character(0),
autogenerate_column_names = FALSE) {
+
shared_ptr(CsvReadOptions, csv___ReadOptions__initialize(
list(
use_threads = use_threads,
@@ -316,7 +414,7 @@ CsvReadOptions$create <- function(use_threads =
option_use_threads(),
))
}
-readr_to_csv_read_options <- function(skip, col_names) {
+readr_to_csv_read_options <- function(skip, col_names, col_types) {
if (isTRUE(col_names)) {
# C++ default to parse is 0-length string array
col_names <- character(0)
@@ -381,26 +479,115 @@ readr_to_csv_parse_options <- function(delim = ",",
#' @format NULL
#' @docType class
#' @export
+TimestampParser <- R6Class("TimestampParser", inherit = ArrowObject,
+ public = list(
+ kind = function() TimestampParser__kind(self),
+ format = function() TimestampParser__format(self)
+ )
+)
+TimestampParser$create <- function(format = NULL) {
+ if (is.null(format)) {
+ shared_ptr(TimestampParser, TimestampParser__MakeISO8601())
+ } else {
+ shared_ptr(TimestampParser, TimestampParser__MakeStrptime(format))
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
CsvConvertOptions$create <- function(check_utf8 = TRUE,
null_values = c("", "NA"),
- strings_can_be_null = FALSE) {
- # TODO: there are more conversion options available:
- # // Optional per-column types (disabling type inference on those columns)
- # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
- # // Recognized spellings for boolean values
- # std::vector<std::string> true_values;
- # std::vector<std::string> false_values;
+ true_values = c("T", "true", "TRUE"),
+                                     false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL) {
+
+ if (!is.null(col_types) && !inherits(col_types, "Schema")) {
+ abort(c(
+ "Unsupported `col_types` specification.",
+ i = "`col_types` must be NULL, or a <Schema>."
+ ))
+ }
shared_ptr(CsvConvertOptions, csv___ConvertOptions__initialize(
list(
check_utf8 = check_utf8,
null_values = null_values,
- strings_can_be_null = strings_can_be_null
+ strings_can_be_null = strings_can_be_null,
+ col_types = col_types,
+ true_values = true_values,
+ false_values = false_values,
+ auto_dict_encode = auto_dict_encode,
+ auto_dict_max_cardinality = auto_dict_max_cardinality,
+ include_columns = include_columns,
+ include_missing_columns = include_missing_columns,
+ timestamp_parsers = timestamp_parsers
)
))
}
-readr_to_csv_convert_options <- function(na, quoted_na) {
- CsvConvertOptions$create(null_values = na, strings_can_be_null = quoted_na)
+readr_to_csv_convert_options <- function(na,
+ quoted_na,
+ col_types = NULL,
+ col_names = NULL,
+ timestamp_parsers = NULL) {
+ include_columns <- character()
+
+ if (is.character(col_types)) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` is a character vector that is not of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names`")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~{
+ switch(.x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = time32(),
+ "t" = timestamp(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+                 abort(paste0("Unsupported compact specification: '", .x, "' for column '", .y, "'"))
+ )
+ }))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~!is.null(.x))
+ col_types <- schema(!!!col_types)
+ }
+
+ if (!is.null(col_types)) {
+ assert_is(col_types, "Schema")
+ # If any columns are null(), drop them
+ # (by specifying the other columns in include_columns)
+ nulls <- map_lgl(col_types$fields, ~.$type$Equals(null()))
+ if (any(nulls)) {
+ include_columns <- setdiff(col_names, names(col_types)[nulls])
+ }
+ }
+ CsvConvertOptions$create(
+ null_values = na,
+ strings_can_be_null = quoted_na,
+ col_types = col_types,
+ timestamp_parsers = timestamp_parsers,
+ include_columns = include_columns
+ )
}
diff --git a/r/R/schema.R b/r/R/schema.R
index ef25988..4bba8b8 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -182,6 +182,9 @@ length.Schema <- function(x) x$num_fields
}
}
+#' @export
+as.list.Schema <- function(x, ...) x$fields
+
#' read a Schema from a stream
#'
#' @param stream a `Message`, `InputStream`, or `Buffer`
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 81c6be4..8053307 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -4,13 +4,14 @@
\name{CsvReadOptions}
\alias{CsvReadOptions}
\alias{CsvParseOptions}
+\alias{TimestampParser}
\alias{CsvConvertOptions}
\alias{JsonReadOptions}
\alias{JsonParseOptions}
\title{File reader options}
\description{
\code{CsvReadOptions}, \code{CsvParseOptions}, \code{CsvConvertOptions},
-\code{JsonReadOptions}, and \code{JsonParseOptions} are containers for various
+\code{JsonReadOptions}, \code{JsonParseOptions}, and \code{TimestampParser}
are containers for various
file reading options. See their usage in
\code{\link[=read_csv_arrow]{read_csv_arrow()}} and
\code{\link[=read_json_arrow]{read_json_arrow()}}, respectively.
}
@@ -62,13 +63,38 @@ Analogous to the \code{na.strings} argument to
\item \code{strings_can_be_null} Logical: can string / binary columns have
null values? Similar to the \code{quoted_na} argument to
\code{readr::read_csv()}.
(default \code{FALSE})
-}
+\item \code{true_values} character vector of recognized spellings for
\code{TRUE} values
+\item \code{false_values} character vector of recognized spellings for
\code{FALSE} values
+\item \code{col_types} A \code{Schema} or \code{NULL} to infer types
+\item \code{auto_dict_encode} Logical: Whether to try to automatically
+dictionary-encode string / binary data (think \code{stringsAsFactors}).
Default \code{FALSE}.
+This setting is ignored for non-inferred columns (those in \code{col_types}).
+\item \code{auto_dict_max_cardinality} If \code{auto_dict_encode},
string/binary columns
+are dictionary-encoded up to this number of unique values (default 50),
+after which it switches to regular encoding.
+\item \code{include_columns} If non-empty, indicates the names of columns from
the
+CSV file that should be actually read and converted (in the vector's order).
+\item \code{include_missing_columns} Logical: if \code{include_columns} is
provided, should
+columns named in it but not found in the data be included as a column of
+type \code{null()}? The default (\code{FALSE}) means that the reader will
instead
+raise an error.
+\item \code{timestamp_parsers} User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are
+(a) \code{NULL}, the default, which uses the ISO-8601 parser;
+(b) a character vector of \link[base:strptime]{strptime} parse strings; or
+(c) a list of \link{TimestampParser} objects.
}
-\section{Methods}{
+\code{TimestampParser$create()} takes an optional \code{format} string
argument.
+See \code{\link[base:strptime]{strptime()}} for example syntax.
+The default is to use an ISO-8601 format parser.
+}
+\section{Active bindings}{
-These classes have no implemented methods. They are containers for the
-options.
+\itemize{
+\item \code{column_names}: from \code{CsvReadOptions}
+}
}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index 38251c7..335549e 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -12,7 +12,9 @@ read_delim_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -22,7 +24,8 @@ read_delim_arrow(
convert_options = NULL,
read_options = NULL,
filesystem = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
read_csv_arrow(
@@ -30,7 +33,9 @@ read_csv_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -39,7 +44,8 @@ read_csv_arrow(
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
read_tsv_arrow(
@@ -47,7 +53,9 @@ read_tsv_arrow(
quote = "\\"",
escape_double = TRUE,
escape_backslash = FALSE,
+ schema = NULL,
col_names = TRUE,
+ col_types = NULL,
col_select = NULL,
na = c("", "NA"),
quoted_na = TRUE,
@@ -56,7 +64,8 @@ read_tsv_arrow(
parse_options = NULL,
convert_options = NULL,
read_options = NULL,
- as_data_frame = TRUE
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
)
}
\arguments{
@@ -79,11 +88,17 @@ characters? This is more general than \code{escape_double}
as backslashes
can be used to escape the delimiter character, the quote character, or
to add special characters like \verb{\\\\n}.}
+\item{schema}{\link{Schema} that describes the table. If provided, it will be
+used to satisfy both \code{col_names} and \code{col_types}.}
+
\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
column names and will not be included in the data frame. If \code{FALSE},
column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
+\item{col_types}{A compact string representation of the column types, or
+\code{NULL} (the default) to infer types from the data.}
+
\item{col_select}{A character vector of column names to keep, as in the
"select" argument to \code{data.table::fread()}, or a
\link[tidyselect:vars_select]{tidy selection specification}
@@ -115,6 +130,15 @@ string file path; default is the local file system}
\item{as_data_frame}{Should the function return a \code{data.frame} (default)
or
an Arrow \link{Table}?}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are:
+\itemize{
+\item \code{NULL}: the default, which uses the ISO-8601 parser
+\item a character vector of \link[base:strptime]{strptime} parse strings
+\item a list of \link{TimestampParser} objects
+}}
}
\value{
A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
@@ -136,6 +160,54 @@ equivalent in \code{readr::read_csv()}, you can either
provide them in the
\code{parse_options}, \code{convert_options}, or \code{read_options}
arguments, or you can
use \link{CsvTableReader} directly for lower-level access.
}
+\section{Specifying column types and names}{
+
+
+By default, the CSV reader will infer the column names and data types from the
file, but there
+are a few ways you can specify them directly.
+
+One way is to provide an Arrow \link{Schema} in the \code{schema} argument,
+which is an ordered map of column name to type.
+When provided, it satisfies both the \code{col_names} and \code{col_types}
arguments.
+This is good if you know all of this information up front.
+
+You can also pass a \code{Schema} to the \code{col_types} argument. If you do
this,
+column names will still be inferred from the file unless you also specify
+\code{col_names}. In either case, the column names in the \code{Schema} must
match the
+data's column names, whether they are explicitly provided or inferred. That
+said, this \code{Schema} does not have to reference all columns: those omitted
+will have their types inferred.
+
+Alternatively, you can declare column types by providing the compact string
representation
+that \code{readr} uses to the \code{col_types} argument. This means you
provide a
+single string, one character per column, where the characters map to Arrow
+types analogously to the \code{readr} type mapping:
+\itemize{
+\item "c": \code{utf8()}
+\item "i": \code{int32()}
+\item "n": \code{float64()}
+\item "d": \code{float64()}
+\item "l": \code{bool()}
+\item "f": \code{dictionary()}
+\item "D": \code{date32()}
+\item "T": \code{time32()}
+\item "t": \code{timestamp()}
+\item "_": \code{null()}
+\item "-": \code{null()}
+\item "?": infer the type from the data
+}
+
+If you use the compact string representation for \code{col_types}, you must
also
+specify \code{col_names}.
+
+Regardless of how types are specified, all columns with a \code{null()} type
will
+be dropped.
+
+Note that if you are specifying column names, whether by \code{schema} or
+\code{col_names}, and the CSV file has a header row that would otherwise be
used
+to identify column names, you'll need to add \code{skip = 1} to skip that row.
+}
+
\examples{
\donttest{
tf <- tempfile()
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 1c7ab14..d2f4465 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -1227,6 +1227,21 @@ extern "C" SEXP
_arrow_csv___ParseOptions__initialize(SEXP options_sexp){
// csv.cpp
#if defined(ARROW_R_WITH_ARROW)
+SEXP csv___ReadOptions__column_names(const
std::shared_ptr<arrow::csv::ReadOptions>& options);
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type
options(options_sexp);
+ return cpp11::as_sexp(csv___ReadOptions__column_names(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+ Rf_error("Cannot call csv___ReadOptions__column_names(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<arrow::csv::ConvertOptions>
csv___ConvertOptions__initialize(cpp11::list options);
extern "C" SEXP _arrow_csv___ConvertOptions__initialize(SEXP options_sexp){
BEGIN_CPP11
@@ -1273,6 +1288,65 @@ extern "C" SEXP _arrow_csv___TableReader__Read(SEXP
table_reader_sexp){
}
#endif
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__kind(const
std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type
parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__kind(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__kind(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__format(const
std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type
parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__format(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__format(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser>
TimestampParser__MakeStrptime(std::string format);
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type format(format_sexp);
+ return cpp11::as_sexp(TimestampParser__MakeStrptime(format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+ Rf_error("Cannot call TimestampParser__MakeStrptime(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601();
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+BEGIN_CPP11
+ return cpp11::as_sexp(TimestampParser__MakeISO8601());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+ Rf_error("Cannot call TimestampParser__MakeISO8601(). Please use
arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// dataset.cpp
#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<ds::ScannerBuilder> dataset___Dataset__NewScan(const
std::shared_ptr<ds::Dataset>& ds);
@@ -6250,9 +6324,14 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_compute__CallFunction", (DL_FUNC)
&_arrow_compute__CallFunction, 3},
{ "_arrow_csv___ReadOptions__initialize", (DL_FUNC)
&_arrow_csv___ReadOptions__initialize, 1},
{ "_arrow_csv___ParseOptions__initialize", (DL_FUNC)
&_arrow_csv___ParseOptions__initialize, 1},
+ { "_arrow_csv___ReadOptions__column_names", (DL_FUNC)
&_arrow_csv___ReadOptions__column_names, 1},
{ "_arrow_csv___ConvertOptions__initialize", (DL_FUNC)
&_arrow_csv___ConvertOptions__initialize, 1},
{ "_arrow_csv___TableReader__Make", (DL_FUNC)
&_arrow_csv___TableReader__Make, 4},
{ "_arrow_csv___TableReader__Read", (DL_FUNC)
&_arrow_csv___TableReader__Read, 1},
+ { "_arrow_TimestampParser__kind", (DL_FUNC)
&_arrow_TimestampParser__kind, 1},
+ { "_arrow_TimestampParser__format", (DL_FUNC)
&_arrow_TimestampParser__format, 1},
+ { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC)
&_arrow_TimestampParser__MakeStrptime, 1},
+ { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC)
&_arrow_TimestampParser__MakeISO8601, 0},
{ "_arrow_dataset___Dataset__NewScan", (DL_FUNC)
&_arrow_dataset___Dataset__NewScan, 1},
{ "_arrow_dataset___Dataset__schema", (DL_FUNC)
&_arrow_dataset___Dataset__schema, 1},
{ "_arrow_dataset___Dataset__type_name", (DL_FUNC)
&_arrow_dataset___Dataset__type_name, 1},
diff --git a/r/src/arrow_exports.h b/r/src/arrow_exports.h
index 92d451d..c4cc0ff 100644
--- a/r/src/arrow_exports.h
+++ b/r/src/arrow_exports.h
@@ -29,6 +29,7 @@
#include <arrow/status.h>
#include <arrow/type_fwd.h>
#include <arrow/util/compression.h>
+#include <arrow/util/value_parsing.h>
namespace arrow {
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index ce9c068..efa6462 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -20,6 +20,7 @@
#if defined(ARROW_R_WITH_ARROW)
#include <arrow/csv/reader.h>
+#include <arrow/util/value_parsing.h>
// [[arrow::export]]
std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(
@@ -32,6 +33,7 @@ std::shared_ptr<arrow::csv::ReadOptions>
csv___ReadOptions__initialize(
res->column_names =
cpp11::as_cpp<std::vector<std::string>>(options["column_names"]);
res->autogenerate_column_names =
cpp11::as_cpp<bool>(options["autogenerate_column_names"]);
+
return res;
}
@@ -51,6 +53,16 @@ std::shared_ptr<arrow::csv::ParseOptions>
csv___ParseOptions__initialize(
}
// [[arrow::export]]
+SEXP csv___ReadOptions__column_names(
+ const std::shared_ptr<arrow::csv::ReadOptions>& options) {
+ if (options->autogenerate_column_names) {
+ return R_NilValue;
+ }
+
+ return cpp11::as_sexp(options->column_names);
+}
+
+// [[arrow::export]]
std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(
cpp11::list options) {
auto res = std::make_shared<arrow::csv::ConvertOptions>(
@@ -62,12 +74,63 @@ std::shared_ptr<arrow::csv::ConvertOptions>
csv___ConvertOptions__initialize(
// If true, then strings in "null_values" are considered null for string
columns.
// If false, then all strings are valid string values.
res->strings_can_be_null =
cpp11::as_cpp<bool>(options["strings_can_be_null"]);
- // TODO: there are more conversion options available:
- // // Optional per-column types (disabling type inference on those columns)
- // std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
- // // Recognized spellings for boolean values
- // std::vector<std::string> true_values;
- // std::vector<std::string> false_values;
+
+ res->true_values =
cpp11::as_cpp<std::vector<std::string>>(options["true_values"]);
+ res->false_values =
cpp11::as_cpp<std::vector<std::string>>(options["false_values"]);
+
+ SEXP col_types = options["col_types"];
+ if (Rf_inherits(col_types, "Schema")) {
+ auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(col_types);
+ std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>
column_types;
+ for (const auto& field : schema->fields()) {
+ column_types.insert(std::make_pair(field->name(), field->type()));
+ }
+ res->column_types = column_types;
+ }
+
+ res->auto_dict_encode = cpp11::as_cpp<bool>(options["auto_dict_encode"]);
+ res->auto_dict_max_cardinality =
+ cpp11::as_cpp<int>(options["auto_dict_max_cardinality"]);
+ res->include_columns =
+ cpp11::as_cpp<std::vector<std::string>>(options["include_columns"]);
+ res->include_missing_columns =
cpp11::as_cpp<bool>(options["include_missing_columns"]);
+
+ SEXP op_timestamp_parsers = options["timestamp_parsers"];
+ if (!Rf_isNull(op_timestamp_parsers)) {
+ std::vector<std::shared_ptr<arrow::TimestampParser>> timestamp_parsers;
+
+ // if we have a character vector, convert to arrow::TimestampParser
+ if (TYPEOF(op_timestamp_parsers) == STRSXP) {
+ cpp11::strings s_timestamp_parsers(op_timestamp_parsers);
+ for (cpp11::r_string s : s_timestamp_parsers) {
+ timestamp_parsers.push_back(arrow::TimestampParser::MakeStrptime(s));
+ }
+
+ } else if (TYPEOF(op_timestamp_parsers) == VECSXP) {
+ cpp11::list lst_parsers(op_timestamp_parsers);
+
+ for (SEXP x : lst_parsers) {
+ // handle scalar string and TimestampParser instances
+ if (TYPEOF(x) == STRSXP && XLENGTH(x) == 1) {
+ timestamp_parsers.push_back(
+ arrow::TimestampParser::MakeStrptime(CHAR(STRING_ELT(x, 0))));
+ } else if (Rf_inherits(x, "TimestampParser")) {
+ timestamp_parsers.push_back(
+ cpp11::as_cpp<std::shared_ptr<arrow::TimestampParser>>(x));
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be a scalar string or a "
+ "<TimestampParser> object");
+ }
+ }
+
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be character vector of strptime "
+ "specifications, or a list of <TimestampParser> objects");
+ }
+ res->timestamp_parsers = timestamp_parsers;
+ }
return res;
}
@@ -89,4 +152,26 @@ std::shared_ptr<arrow::Table> csv___TableReader__Read(
return ValueOrStop(table_reader->Read());
}
+// [[arrow::export]]
+std::string TimestampParser__kind(const
std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->kind();
+}
+
+// [[arrow::export]]
+std::string TimestampParser__format(
+ const std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->format();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeStrptime(
+ std::string format) {
+ return arrow::TimestampParser::MakeStrptime(format);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601() {
+ return arrow::TimestampParser::MakeISO8601();
+}
+
#endif
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 2d85437..94dd10b 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -174,3 +174,83 @@ test_that("read_csv_arrow() can detect compression from
file name", {
tab1 <- read_csv_arrow(tf)
expect_equivalent(tbl, tab1)
})
+
test_that("read_csv_arrow(schema=)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # A full schema plus skip = 1 replaces both header parsing and type guessing
  result <- read_csv_arrow(tf, schema = schema(int = float64()), skip = 1)
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))
})
+
test_that("read_csv_arrow(col_types = <Schema>)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # Passing a Schema as col_types overrides type inference for those columns
  result <- read_csv_arrow(tf, col_types = schema(int = float64()))
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))
})
+
test_that("read_csv_arrow(col_types=string, col_names)", {
  tbl <- example_data[, "int"]
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # readr-style compact specification: one type letter per column
  result <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
  expect_identical(result, tibble::tibble(int = as.numeric(tbl$int)))

  # Invalid specs: non-scalar spec, missing col_names, length mismatch,
  # and an unrecognized type letter must all error
  expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
  expect_error(read_csv_arrow(tf, col_types = "d"))
  expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
  expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
})
+
test_that("read_csv_arrow() can read timestamps", {
  tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
  tf <- tempfile()
  on.exit(unlink(tf))
  write.csv(tbl, tf, row.names = FALSE)

  # Timestamp type declared via a Schema.
  # NOTE: testthat's expect_equal() signature is (object, expected); the
  # actual result goes first so failure messages label values correctly.
  df <- read_csv_arrow(tf, col_types = schema(time = timestamp()))
  expect_equal(df, tbl)

  # Same declaration via the readr-style shorthand "t"
  df <- read_csv_arrow(tf, col_types = "t", col_names = "time", skip = 1)
  expect_equal(df, tbl)
})
+
test_that("read_csv_arrow(timestamp_parsers=)", {
  tf <- tempfile()
  on.exit(unlink(tf))
  tbl <- tibble::tibble(time = "23/09/2020")
  write.csv(tbl, tf, row.names = FALSE)

  # A custom strptime spec lets non-ISO dates be parsed as timestamps
  result <- read_csv_arrow(
    tf,
    col_types = schema(time = timestamp(timezone = "UTC")),
    timestamp_parsers = "%d/%m/%Y"
  )
  expect_equal(result$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC"))
})
+
test_that("Skipping columns with null()", {
  tf <- tempfile()
  on.exit(unlink(tf))
  cols <- c("dbl", "lgl", "false", "chr")
  tbl <- example_data[, cols]
  write.csv(tbl, tf, row.names = FALSE)

  # "-" and "_" in the compact spec both drop the corresponding column
  result <- read_csv_arrow(tf, col_types = "d-_c", col_names = cols, skip = 1)
  expect_identical(result, tbl[, c("dbl", "chr")])
})
+
test_that("Mix of guessing and declaring types", {
  tf <- tempfile()
  on.exit(unlink(tf))
  cols <- c("dbl", "lgl", "false", "chr")
  tbl <- example_data[, cols]
  write.csv(tbl, tf, row.names = FALSE)

  # Declaring one column via Schema leaves the others to be inferred
  tab <- read_csv_arrow(
    tf,
    col_types = schema(dbl = float32()),
    as_data_frame = FALSE
  )
  expect_equal(
    tab$schema,
    schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8())
  )

  # "?" in the compact spec means "guess this column's type"
  result <- read_csv_arrow(tf, col_types = "d-?c", col_names = cols, skip = 1)
  expect_identical(result, tbl[, c("dbl", "false", "chr")])
})