This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 486670a726 GH-38903: [R][Docs] Improve documentation of col_types
(#46145)
486670a726 is described below
commit 486670a7266cf6f49d0b7cc0209359332b27572a
Author: Nic Crane <[email protected]>
AuthorDate: Wed Apr 23 08:44:01 2025 +0100
GH-38903: [R][Docs] Improve documentation of col_types (#46145)
### Rationale for this change
Add clarity to the documentation around the `col_types` parameter for
`open_delim_dataset()`
### What changes are included in this PR?
Additional docs. I also improved some of our tests to make error messages
more explicit.
### Are these changes tested?
No new tests of behavior; the changes are mainly documentation, though existing tests were updated to assert explicit error messages.
### Are there any user-facing changes?
Nope.
* GitHub Issue: #38903
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/dataset.R | 17 ++++++++++-------
r/R/util.R | 12 ------------
r/man/open_dataset.Rd | 3 ---
r/man/open_delim_dataset.Rd | 15 +++++++++++----
r/tests/testthat/test-csv.R | 20 ++++++++++++++++----
5 files changed, 37 insertions(+), 30 deletions(-)
diff --git a/r/R/dataset.R b/r/R/dataset.R
index c7bd602ce3..e78055dedb 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -131,7 +131,6 @@
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
-#' on.exit(unlink(tf))
#'
#' write_dataset(mtcars, tf, partitioning = "cyl")
#'
@@ -145,7 +144,6 @@
#' ## You must specify the file format if using a format other than parquet.
#' tf2 <- tempfile()
#' dir.create(tf2)
-#' on.exit(unlink(tf2))
#' write_dataset(mtcars, tf2, format = "ipc")
#' # This line will results in errors when you try to work with the data
#' \dontrun{
@@ -158,7 +156,6 @@
#' # Create a temporary directory and write example dataset
#' tf3 <- tempfile()
#' dir.create(tf3)
-#' on.exit(unlink(tf3))
#' write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style
= FALSE)
#'
#' # View files - you can see the partitioning means that files have been
written
@@ -248,17 +245,23 @@ open_dataset <- function(sources,
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
-#' df <- data.frame(x = c("1", "2", "NULL"))
#'
+#' df <- data.frame(x = c("1", "2", "NULL"))
#' file_path <- file.path(tf, "file1.txt")
#' write.table(df, file_path, sep = ",", row.names = FALSE)
#'
+#' # Use readr-style params identically in both `read_csv_arrow()` and
`open_csv_dataset()`
#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip
= 1)
-#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
-#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i",
col_names = "y", skip = 1)
#'
-#' unlink(tf)
+#' # Use `col_types` to specify a schema, partial schema, or compact
representation
+#' tf2 <- tempfile()
+#' write_csv_dataset(cars, tf2)
+#'
+#' open_csv_dataset(tf2, col_types = schema(speed = int32(), dist = int32()))
+#' open_csv_dataset(tf2, col_types = schema(speed = int32()))
+#' open_csv_dataset(tf2, col_types = "ii", col_names = c("speed", "dist"),
skip = 1)
+#'
#' @seealso [open_dataset()]
#' @export
open_delim_dataset <- function(sources,
diff --git a/r/R/util.R b/r/R/util.R
index bba52b1876..ca9a9efd9d 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -249,18 +249,6 @@ check_named_cols <- function(df) {
}
}
-#' Parse a compact column type specification into Arrow schema
-#'
-#' @param col_types A single character string where each character represents
-#' a column type, like in readr
-#' @param col_names Character vector of column names (must match the length of
-#' col_types characters)
-#' @return A Schema object
-#'
-#' @examples
-#' parse_compact_col_spec("ci", colnames = c("x", "y"))
-#'
-#' @keywords internal
parse_compact_col_spec <- function(col_types, col_names) {
if (length(col_types) != 1L) {
abort("`col_types` must be a character vector of size 1")
diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd
index 7028f38467..b48384c436 100644
--- a/r/man/open_dataset.Rd
+++ b/r/man/open_dataset.Rd
@@ -166,7 +166,6 @@ information will be taken from the file paths.
# Set up directory for examples
tf <- tempfile()
dir.create(tf)
-on.exit(unlink(tf))
write_dataset(mtcars, tf, partitioning = "cyl")
@@ -180,7 +179,6 @@ open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"),
file.path(tf, "cyl=8/part-
## You must specify the file format if using a format other than parquet.
tf2 <- tempfile()
dir.create(tf2)
-on.exit(unlink(tf2))
write_dataset(mtcars, tf2, format = "ipc")
# This line will results in errors when you try to work with the data
\dontrun{
@@ -193,7 +191,6 @@ open_dataset(tf2, format = "ipc")
# Create a temporary directory and write example dataset
tf3 <- tempfile()
dir.create(tf3)
-on.exit(unlink(tf3))
write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style =
FALSE)
# View files - you can see the partitioning means that files have been written
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index 280a3e6acb..17ccb9f027 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -161,7 +161,8 @@ column names and will not be included in the data frame. If
\code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
-\item{col_types}{an Arrow \link{Schema}, or \code{NULL} (the default) to infer
types from the data.}
+\item{col_types}{A compact string representation of the column types,
+an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the
data.}
\item{na}{A character vector of strings to interpret as missing values.}
@@ -213,16 +214,22 @@ for opening single files and functions for opening
datasets.
# Set up directory for examples
tf <- tempfile()
dir.create(tf)
-df <- data.frame(x = c("1", "2", "NULL"))
+df <- data.frame(x = c("1", "2", "NULL"))
file_path <- file.path(tf, "file1.txt")
write.table(df, file_path, sep = ",", row.names = FALSE)
+# Use readr-style params identically in both `read_csv_arrow()` and
`open_csv_dataset()`
read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
-open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
-unlink(tf)
+# Use `col_types` to specify a schema, partial schema, or compact
representation
+tf2 <- tempfile()
+write_csv_dataset(cars, tf2)
+
+open_csv_dataset(tf2, col_types = schema(speed = int32(), dist = int32()))
+open_csv_dataset(tf2, col_types = schema(speed = int32()))
+open_csv_dataset(tf2, col_types = "ii", col_names = c("speed", "dist"), skip =
1)
\dontshow{\}) # examplesIf}
}
\seealso{
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 84c1786f6f..769c2c98f1 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -209,10 +209,22 @@ test_that("read_csv_arrow(col_types=string, col_names)", {
df <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
- expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
- expect_error(read_csv_arrow(tf, col_types = "d"))
- expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
- expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
+ expect_error(
+ read_csv_arrow(tf, col_types = c("i", "d")),
+ "`col_types` must be a character vector of size 1"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "d"),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "y", col_names = "a"),
+ "Unsupported compact specification: 'y' for column 'a'"
+ )
})
test_that("read_csv_arrow() can read timestamps", {