This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d9ee741b19 GH-39811: [R] better documentation for col_types argument
in open_delim_dataset (#45719)
d9ee741b19 is described below
commit d9ee741b19c94984ab3eb13219ffbcb10eb26144
Author: Anatolii Tsyplenkov <[email protected]>
AuthorDate: Fri Apr 11 21:05:28 2025 +1200
GH-39811: [R] better documentation for col_types argument in
open_delim_dataset (#45719)
### Rationale for this change
Hi, can you please consider this tiny update to the docs? In the current
documentation, it's misleading how to specify col_types when a delimited file
is scanned using `open_csv_dataset`, `open_delim_dataset`, etc. Reading what is
currently written, readers may assume that they can declare column types by
providing the compact string representation that `readr` uses.
https://github.com/apache/arrow/blob/3c8fe098c7f5e0e40bd06bc6afca8412eb81f56e/r/man/open_delim_dataset.Rd#L164-L165
But it doesn't work. See reprex below
```r
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
tf <- tempfile()
dir.create(tf)
df <- data.frame(x = c("1", "2", "NULL"))
file_path <- file.path(tf, "file1.txt")
write.table(df, file_path, sep = ",", row.names = FALSE)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "c")
#> Error:
#> ! Unsupported `col_types` specification.
#> ℹ `col_types` must be NULL, or a <Schema>.
unlink(tf)
```
### What changes are included in this PR?
The current PR provides a clearer explanation of what should be passed to
the `col_types` argument, along with a basic example for the
`open_csv_dataset()`.
### Are these changes tested?
Not needed, as only the R documentation has been updated
### Are there any user-facing changes?
Only the R documentation has been updated
Lead-authored-by: Anatolii Tsyplenkov <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/csv.R | 30 +----------------------
r/R/dataset-format.R | 43 ++++++++++++++++++++++++++-------
r/R/dataset.R | 2 ++
r/R/util.R | 47 +++++++++++++++++++++++++++++++++++++
r/man/open_delim_dataset.Rd | 4 ++--
r/tests/testthat/test-dataset-csv.R | 17 +++++++++++++-
r/tests/testthat/test-util.R | 31 ++++++++++++++++++++++++
7 files changed, 133 insertions(+), 41 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 7335475703..7652858b4a 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -842,35 +842,7 @@ readr_to_csv_convert_options <- function(na,
include_columns <- character()
if (is.character(col_types)) {
- if (length(col_types) != 1L) {
- abort("`col_types` is a character vector that is not of size 1")
- }
- n <- nchar(col_types)
- specs <- substring(col_types, seq_len(n), seq_len(n))
- if (!is_bare_character(col_names, n)) {
- abort("Compact specification for `col_types` requires `col_names`")
- }
-
- col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
- switch(.x,
- "c" = utf8(),
- "i" = int32(),
- "n" = float64(),
- "d" = float64(),
- "l" = bool(),
- "f" = dictionary(),
- "D" = date32(),
- "T" = timestamp(unit = "ns"),
- "t" = time32(),
- "_" = null(),
- "-" = null(),
- "?" = NULL,
- abort("Unsupported compact specification: '", .x, "' for column '",
.y, "'")
- )
- }))
- # To "guess" types, omit them from col_types
- col_types <- keep(col_types, ~ !is.null(.x))
- col_types <- schema(col_types)
+ col_types <- parse_compact_col_spec(col_types, col_names)
}
if (!is.null(col_types)) {
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index c25a505f89..efe3545fb8 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -191,7 +191,6 @@ JsonFileFormat$create <- function(...) {
#' @export
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
CsvFileFormat$create <- function(..., partitioning = NULL) {
-
dots <- list(...)
options <- check_csv_file_format_args(dots, partitioning = partitioning)
@@ -202,7 +201,6 @@ CsvFileFormat$create <- function(..., partitioning = NULL) {
# Check all arguments are valid
check_csv_file_format_args <- function(args, partitioning = NULL) {
-
options <- list(
parse_options = args$parse_options,
convert_options = args$convert_options,
@@ -220,18 +218,24 @@ check_csv_file_format_args <- function(args, partitioning
= NULL) {
options$parse_options <- do.call(csv_parse_options, args$parse_options)
}
- if (is.null(args$convert_options)) {
- options$convert_options <- do.call(csv_file_format_convert_opts, args)
- } else if (is.list(args$convert_options)) {
- options$convert_options <- do.call(csv_convert_options,
args$convert_options)
- }
-
+ # Set up read_options before convert_options since convert_options needs
column names
if (is.null(args$read_options)) {
options$read_options <- do.call(csv_file_format_read_opts, c(args,
list(partitioning = partitioning)))
} else if (is.list(args$read_options)) {
options$read_options <- do.call(csv_read_options, args$read_options)
}
+ # If col_names is provided, add it to read_options
+ if ("col_names" %in% names(args)) {
+ args$read_options <- list(col_names = args$col_names)
+ }
+
+ if (is.null(args$convert_options)) {
+ options$convert_options <- do.call(csv_file_format_convert_opts, c(args,
list(read_options = options$read_options)))
+ } else if (is.list(args$convert_options)) {
+ options$convert_options <- do.call(csv_convert_options,
args$convert_options)
+ }
+
options
}
@@ -458,11 +462,32 @@ csv_file_format_convert_opts <- function(...) {
opts[["quoted_na"]] <- NULL
}
+ # Handle readr-style col_types specification
+ if ("col_types" %in% names(opts) && is.character(opts[["col_types"]])) {
+ # Get column names from read_options if available
+ col_names <- if (!is.null(opts[["read_options"]])) {
+ if (!is.null(opts[["read_options"]]$column_names)) {
+ opts[["read_options"]]$column_names
+ } else if (!is.null(opts[["read_options"]]$col_names)) {
+ opts[["read_options"]]$col_names
+ } else {
+ abort("Compact specification for `col_types` requires column names in
read_options")
+ }
+ } else if ("col_names" %in% names(opts)) {
+ opts[["col_names"]]
+ } else {
+ abort("Compact specification for `col_types` requires column names")
+ }
+
+ opts[["col_types"]] <- parse_compact_col_spec(opts[["col_types"]],
col_names)
+ }
+
+ # Remove read_options from opts before calling csv_convert_options
+ opts[["read_options"]] <- NULL
do.call(csv_convert_options, opts)
}
csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...)
{
-
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 7a6c6c694a..c7bd602ce3 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -255,6 +255,8 @@ open_dataset <- function(sources,
#'
#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip
= 1)
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i",
col_names = "y", skip = 1)
#'
#' unlink(tf)
#' @seealso [open_dataset()]
diff --git a/r/R/util.R b/r/R/util.R
index 14e4544ab1..bba52b1876 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -248,3 +248,50 @@ check_named_cols <- function(df) {
)
}
}
+
+#' Parse a compact column type specification into Arrow schema
+#'
+#' @param col_types A single character string where each character represents
+#' a column type, like in readr
+#' @param col_names Character vector of column names (must match the length of
+#' col_types characters)
+#' @return A Schema object
+#'
+#' @examples
+#' parse_compact_col_spec("ci", col_names = c("x", "y"))
+#'
+#' @keywords internal
+parse_compact_col_spec <- function(col_types, col_names) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` must be a character vector of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names` of
matching length")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~
col_type_from_compact(.x, .y)))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~ !is.null(.x))
+ schema(col_types)
+}
+
+col_type_from_compact <- function(x, y) {
+ switch(x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = timestamp(unit = "ns"),
+ "t" = time32(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+ abort(paste0("Unsupported compact specification: '", x, "' for column '",
y, "'"))
+ )
+}
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index 7b81f0033e..280a3e6acb 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -161,8 +161,7 @@ column names and will not be included in the data frame. If
\code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
-\item{col_types}{A compact string representation of the column types,
-an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the
data.}
+\item{col_types}{an Arrow \link{Schema}, or \code{NULL} (the default) to infer
types from the data.}
\item{na}{A character vector of strings to interpret as missing values.}
@@ -221,6 +220,7 @@ write.table(df, file_path, sep = ",", row.names = FALSE)
read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
+open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
unlink(tf)
\dontshow{\}) # examplesIf}
diff --git a/r/tests/testthat/test-dataset-csv.R
b/r/tests/testthat/test-dataset-csv.R
index 387346a0d6..91e93192ed 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -523,6 +523,21 @@ test_that("open_delim_dataset params passed through to
open_dataset", {
ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema)
expect_equal(ds_strings$schema, schema(a = string(), b = string()))
+ # col_types - as compact schema
+ compact_schema <- schema(
+ int = int32(), dbl = float64(), lgl = bool(), chr = utf8(),
+ fct = dictionary(), ts = timestamp(unit = "ns")
+ )
+
+ ds <- open_csv_dataset(
+ csv_dir,
+ col_names = c("int", "dbl", "lgl", "chr", "fct", "ts"),
+ col_types = "idlcfT",
+ skip = 1
+ )
+
+ expect_equal(schema(ds), compact_schema)
+
# skip_empty_rows
tf <- tempfile()
writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf)
@@ -553,7 +568,7 @@ test_that("open_delim_dataset params passed through to
open_dataset", {
ds <- open_csv_dataset(
csv_dir,
schema = schema(
- int = int64(), dbl = int64(), lgl = bool(), chr = utf8(),
+ int = int64(), dbl = float64(), lgl = bool(), chr = utf8(),
fct = utf8(), ts = timestamp(unit = "s")
),
skip = 1
diff --git a/r/tests/testthat/test-util.R b/r/tests/testthat/test-util.R
index 15aece7c3f..878700f2fc 100644
--- a/r/tests/testthat/test-util.R
+++ b/r/tests/testthat/test-util.R
@@ -70,3 +70,34 @@ test_that("all_funs() identifies namespace-qualified and
unqualified functions",
c("other_fun", "fun", "sum", "base::log")
)
})
+
+test_that("parse_compact_col_spec() converts string specs to schema", {
+ compact_schema <- parse_compact_col_spec(
+ col_types = "cidlDTtf_-?",
+ col_names = c("c", "i", "d", "l", "D", "T", "t", "f", "_", "-", "?")
+ )
+
+ expect_equal(
+ compact_schema,
+ schema(
+ c = utf8(), i = int32(), d = float64(), l = bool(), D = date32(),
+ T = timestamp(unit = "ns"), t = time32(unit = "ms"), f = dictionary(),
+ `_` = null(), `-` = null()
+ )
+ )
+
+ expect_error(
+ parse_compact_col_spec(c("i", "d"), c("a", "b")),
+ "`col_types` must be a character vector of size 1"
+ )
+
+ expect_error(
+ parse_compact_col_spec("idc", c("a", "b")),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+
+ expect_error(
+ parse_compact_col_spec("y", "a"),
+ "Unsupported compact specification: 'y' for column 'a'"
+ )
+})