This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d9ee741b19 GH-39811: [R] better documentation for col_types argument
in open_delim_dataset (#45719)
d9ee741b19 is described below
commit d9ee741b19c94984ab3eb13219ffbcb10eb26144
Author: Anatolii Tsyplenkov <[email protected]>
AuthorDate: Fri Apr 11 21:05:28 2025 +1200
GH-39811: [R] better documentation for col_types argument in
open_delim_dataset (#45719)
### Rationale for this change
Hi, can you please consider this tiny update to the docs? In the current
documentation, it's misleading how to specify col_types when a delimited file
is scanned using `open_csv_dataset`, `open_delim_dataset`, etc. Reading what is
currently written, readers may assume that they can declare column types by
providing the compact string representation that `readr` uses.
https://github.com/apache/arrow/blob/3c8fe098c7f5e0e40bd06bc6afca8412eb81f56e/r/man/open_delim_dataset.Rd#L164-L165
But it doesn't work. See reprex below
```r
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
tf <- tempfile()
dir.create(tf)
df <- data.frame(x = c("1", "2", "NULL"))
file_path <- file.path(tf, "file1.txt")
write.table(df, file_path, sep = ",", row.names = FALSE)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "c")
#> Error:
#> ! Unsupported `col_types` specification.
#> ℹ `col_types` must be NULL, or a <Schema>.
unlink(tf)
```
### What changes are included in this PR?
The current PR provides a clearer explanation of what should be passed to
the `col_types` argument, along with a basic example for the
`open_csv_dataset()`.
### Are these changes tested?
Not needed, as only the R documentation has been updated
### Are there any user-facing changes?
Only the R documentation has been updated
Lead-authored-by: Anatolii Tsyplenkov <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/csv.R | 30 +----------------------
r/R/dataset-format.R | 43 ++++++++++++++++++++++++++-------
r/R/dataset.R | 2 ++
r/R/util.R | 47 +++++++++++++++++++++++++++++++++++++
r/man/open_delim_dataset.Rd | 4 ++--
r/tests/testthat/test-dataset-csv.R | 17 +++++++++++++-
r/tests/testthat/test-util.R | 31 ++++++++++++++++++++++++
7 files changed, 133 insertions(+), 41 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 7335475703..7652858b4a 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -842,35 +842,7 @@ readr_to_csv_convert_options <- function(na,
include_columns <- character()
if (is.character(col_types)) {
- if (length(col_types) != 1L) {
- abort("`col_types` is a character vector that is not of size 1")
- }
- n <- nchar(col_types)
- specs <- substring(col_types, seq_len(n), seq_len(n))
- if (!is_bare_character(col_names, n)) {
- abort("Compact specification for `col_types` requires `col_names`")
- }
-
- col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
- switch(.x,
- "c" = utf8(),
- "i" = int32(),
- "n" = float64(),
- "d" = float64(),
- "l" = bool(),
- "f" = dictionary(),
- "D" = date32(),
- "T" = timestamp(unit = "ns"),
- "t" = time32(),
- "_" = null(),
- "-" = null(),
- "?" = NULL,
- abort("Unsupported compact specification: '", .x, "' for column '",
.y, "'")
- )
- }))
- # To "guess" types, omit them from col_types
- col_types <- keep(col_types, ~ !is.null(.x))
- col_types <- schema(col_types)
+ col_types <- parse_compact_col_spec(col_types, col_names)
}
if (!is.null(col_types)) {
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index c25a505f89..efe3545fb8 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -191,7 +191,6 @@ JsonFileFormat$create <- function(...) {
#' @export
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
CsvFileFormat$create <- function(..., partitioning = NULL) {
-
dots <- list(...)
options <- check_csv_file_format_args(dots, partitioning = partitioning)
@@ -202,7 +201,6 @@ CsvFileFormat$create <- function(..., partitioning = NULL) {
# Check all arguments are valid
check_csv_file_format_args <- function(args, partitioning = NULL) {
-
options <- list(
parse_options = args$parse_options,
convert_options = args$convert_options,
@@ -220,18 +218,24 @@ check_csv_file_format_args <- function(args, partitioning
= NULL) {
options$parse_options <- do.call(csv_parse_options, args$parse_options)
}
- if (is.null(args$convert_options)) {
- options$convert_options <- do.call(csv_file_format_convert_opts, args)
- } else if (is.list(args$convert_options)) {
- options$convert_options <- do.call(csv_convert_options,
args$convert_options)
- }
-
+ # Set up read_options before convert_options since convert_options needs
column names
if (is.null(args$read_options)) {
options$read_options <- do.call(csv_file_format_read_opts, c(args,
list(partitioning = partitioning)))
} else if (is.list(args$read_options)) {
options$read_options <- do.call(csv_read_options, args$read_options)
}
+ # If col_names is provided, add it to read_options
+ if ("col_names" %in% names(args)) {
+ args$read_options <- list(col_names = args$col_names)
+ }
+
+ if (is.null(args$convert_options)) {
+ options$convert_options <- do.call(csv_file_format_convert_opts, c(args,
list(read_options = options$read_options)))
+ } else if (is.list(args$convert_options)) {
+ options$convert_options <- do.call(csv_convert_options,
args$convert_options)
+ }
+
options
}
@@ -458,11 +462,32 @@ csv_file_format_convert_opts <- function(...) {
opts[["quoted_na"]] <- NULL
}
+ # Handle readr-style col_types specification
+ if ("col_types" %in% names(opts) && is.character(opts[["col_types"]])) {
+ # Get column names from read_options if available
+ col_names <- if (!is.null(opts[["read_options"]])) {
+ if (!is.null(opts[["read_options"]]$column_names)) {
+ opts[["read_options"]]$column_names
+ } else if (!is.null(opts[["read_options"]]$col_names)) {
+ opts[["read_options"]]$col_names
+ } else {
+ abort("Compact specification for `col_types` requires column names in
read_options")
+ }
+ } else if ("col_names" %in% names(opts)) {
+ opts[["col_names"]]
+ } else {
+ abort("Compact specification for `col_types` requires column names")
+ }
+
+ opts[["col_types"]] <- parse_compact_col_spec(opts[["col_types"]],
col_names)
+ }
+
+ # Remove read_options from opts before calling csv_convert_options
+ opts[["read_options"]] <- NULL
do.call(csv_convert_options, opts)
}
csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...)
{
-
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 7a6c6c694a..c7bd602ce3 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -255,6 +255,8 @@ open_dataset <- function(sources,
#'
#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip
= 1)
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i",
col_names = "y", skip = 1)
#'
#' unlink(tf)
#' @seealso [open_dataset()]
diff --git a/r/R/util.R b/r/R/util.R
index 14e4544ab1..bba52b1876 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -248,3 +248,50 @@ check_named_cols <- function(df) {
)
}
}
+
+#' Parse a compact column type specification into Arrow schema
+#'
+#' @param col_types A single character string where each character represents
+#' a column type, like in readr
+#' @param col_names Character vector of column names (must match the length of
+#' col_types characters)
+#' @return A Schema object
+#'
+#' @examples
+#' parse_compact_col_spec("ci", col_names = c("x", "y"))
+#'
+#' @keywords internal
+parse_compact_col_spec <- function(col_types, col_names) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` must be a character vector of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names` of
matching length")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~
col_type_from_compact(.x, .y)))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~ !is.null(.x))
+ schema(col_types)
+}
+
+col_type_from_compact <- function(x, y) {
+ switch(x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = timestamp(unit = "ns"),
+ "t" = time32(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+ abort(paste0("Unsupported compact specification: '", x, "' for column '",
y, "'"))
+ )
+}
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index 7b81f0033e..280a3e6acb 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -161,8 +161,7 @@ column names and will not be included in the data frame. If
\code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
-\item{col_types}{A compact string representation of the column types,
-an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the
data.}
+\item{col_types}{an Arrow \link{Schema}, or \code{NULL} (the default) to infer
types from the data.}
\item{na}{A character vector of strings to interpret as missing values.}
@@ -221,6 +220,7 @@ write.table(df, file_path, sep = ",", row.names = FALSE)
read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
+open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
unlink(tf)
\dontshow{\}) # examplesIf}
diff --git a/r/tests/testthat/test-dataset-csv.R
b/r/tests/testthat/test-dataset-csv.R
index 387346a0d6..91e93192ed 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -523,6 +523,21 @@ test_that("open_delim_dataset params passed through to
open_dataset", {
ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema)
expect_equal(ds_strings$schema, schema(a = string(), b = string()))
+ # col_types - as compact schema
+ compact_schema <- schema(
+ int = int32(), dbl = float64(), lgl = bool(), chr = utf8(),
+ fct = dictionary(), ts = timestamp(unit = "ns")
+ )
+
+ ds <- open_csv_dataset(
+ csv_dir,
+ col_names = c("int", "dbl", "lgl", "chr", "fct", "ts"),
+ col_types = "idlcfT",
+ skip = 1
+ )
+
+ expect_equal(schema(ds), compact_schema)
+
# skip_empty_rows
tf <- tempfile()
writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf)
@@ -553,7 +568,7 @@ test_that("open_delim_dataset params passed through to
open_dataset", {
ds <- open_csv_dataset(
csv_dir,
schema = schema(
- int = int64(), dbl = int64(), lgl = bool(), chr = utf8(),
+ int = int64(), dbl = float64(), lgl = bool(), chr = utf8(),
fct = utf8(), ts = timestamp(unit = "s")
),
skip = 1
diff --git a/r/tests/testthat/test-util.R b/r/tests/testthat/test-util.R
index 15aece7c3f..878700f2fc 100644
--- a/r/tests/testthat/test-util.R
+++ b/r/tests/testthat/test-util.R
@@ -70,3 +70,34 @@ test_that("all_funs() identifies namespace-qualified and
unqualified functions",
c("other_fun", "fun", "sum", "base::log")
)
})
+
+test_that("parse_compact_col_spec() converts string specs to schema", {
+ compact_schema <- parse_compact_col_spec(
+ col_types = "cidlDTtf_-?",
+ col_names = c("c", "i", "d", "l", "D", "T", "t", "f", "_", "-", "?")
+ )
+
+ expect_equal(
+ compact_schema,
+ schema(
+ c = utf8(), i = int32(), d = float64(), l = bool(), D = date32(),
+ T = timestamp(unit = "ns"), t = time32(unit = "ms"), f = dictionary(),
+ `_` = null(), `-` = null()
+ )
+ )
+
+ expect_error(
+ parse_compact_col_spec(c("i", "d"), c("a", "b")),
+ "`col_types` must be a character vector of size 1"
+ )
+
+ expect_error(
+ parse_compact_col_spec("idc", c("a", "b")),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+
+ expect_error(
+ parse_compact_col_spec("y", "a"),
+ "Unsupported compact specification: 'y' for column 'a'"
+ )
+})