This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 486670a726 GH-38903: [R][Docs] Improve documentation of col_types
(#46145)
486670a726 is described below
commit 486670a7266cf6f49d0b7cc0209359332b27572a
Author: Nic Crane <[email protected]>
AuthorDate: Wed Apr 23 08:44:01 2025 +0100
GH-38903: [R][Docs] Improve documentation of col_types (#46145)
### Rationale for this change
Add clarity to the documentation around the `col_types` parameter for
`open_delim_dataset()`
### What changes are included in this PR?
Additional docs. I also improved some of our tests to make error messages
more explicit.
### Are these changes tested?
No new tests of behavior; the changes are mainly documentation, though existing tests were updated to assert explicit error messages.
### Are there any user-facing changes?
Nope.
* GitHub Issue: #38903
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/dataset.R | 17 ++++++++++-------
r/R/util.R | 12 ------------
r/man/open_dataset.Rd | 3 ---
r/man/open_delim_dataset.Rd | 15 +++++++++++----
r/tests/testthat/test-csv.R | 20 ++++++++++++++++----
5 files changed, 37 insertions(+), 30 deletions(-)
diff --git a/r/R/dataset.R b/r/R/dataset.R
index c7bd602ce3..e78055dedb 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -131,7 +131,6 @@
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
-#' on.exit(unlink(tf))
#'
#' write_dataset(mtcars, tf, partitioning = "cyl")
#'
@@ -145,7 +144,6 @@
#' ## You must specify the file format if using a format other than parquet.
#' tf2 <- tempfile()
#' dir.create(tf2)
-#' on.exit(unlink(tf2))
#' write_dataset(mtcars, tf2, format = "ipc")
#' # This line will results in errors when you try to work with the data
#' \dontrun{
@@ -158,7 +156,6 @@
#' # Create a temporary directory and write example dataset
#' tf3 <- tempfile()
#' dir.create(tf3)
-#' on.exit(unlink(tf3))
#' write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style
= FALSE)
#'
#' # View files - you can see the partitioning means that files have been
written
@@ -248,17 +245,23 @@ open_dataset <- function(sources,
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
-#' df <- data.frame(x = c("1", "2", "NULL"))
#'
+#' df <- data.frame(x = c("1", "2", "NULL"))
#' file_path <- file.path(tf, "file1.txt")
#' write.table(df, file_path, sep = ",", row.names = FALSE)
#'
+#' # Use readr-style params identically in both `read_csv_arrow()` and
`open_csv_dataset()`
#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip
= 1)
-#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
-#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i",
col_names = "y", skip = 1)
#'
-#' unlink(tf)
+#' # Use `col_types` to specify a schema, partial schema, or compact
representation
+#' tf2 <- tempfile()
+#' write_csv_dataset(cars, tf2)
+#'
+#' open_csv_dataset(tf2, col_types = schema(speed = int32(), dist = int32()))
+#' open_csv_dataset(tf2, col_types = schema(speed = int32()))
+#' open_csv_dataset(tf2, col_types = "ii", col_names = c("speed", "dist"),
skip = 1)
+#'
#' @seealso [open_dataset()]
#' @export
open_delim_dataset <- function(sources,
diff --git a/r/R/util.R b/r/R/util.R
index bba52b1876..ca9a9efd9d 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -249,18 +249,6 @@ check_named_cols <- function(df) {
}
}
-#' Parse a compact column type specification into Arrow schema
-#'
-#' @param col_types A single character string where each character represents
-#' a column type, like in readr
-#' @param col_names Character vector of column names (must match the length of
-#' col_types characters)
-#' @return A Schema object
-#'
-#' @examples
-#' parse_compact_col_spec("ci", colnames = c("x", "y"))
-#'
-#' @keywords internal
parse_compact_col_spec <- function(col_types, col_names) {
if (length(col_types) != 1L) {
abort("`col_types` must be a character vector of size 1")
diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd
index 7028f38467..b48384c436 100644
--- a/r/man/open_dataset.Rd
+++ b/r/man/open_dataset.Rd
@@ -166,7 +166,6 @@ information will be taken from the file paths.
# Set up directory for examples
tf <- tempfile()
dir.create(tf)
-on.exit(unlink(tf))
write_dataset(mtcars, tf, partitioning = "cyl")
@@ -180,7 +179,6 @@ open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"),
file.path(tf, "cyl=8/part-
## You must specify the file format if using a format other than parquet.
tf2 <- tempfile()
dir.create(tf2)
-on.exit(unlink(tf2))
write_dataset(mtcars, tf2, format = "ipc")
# This line will results in errors when you try to work with the data
\dontrun{
@@ -193,7 +191,6 @@ open_dataset(tf2, format = "ipc")
# Create a temporary directory and write example dataset
tf3 <- tempfile()
dir.create(tf3)
-on.exit(unlink(tf3))
write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style =
FALSE)
# View files - you can see the partitioning means that files have been written
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
index 280a3e6acb..17ccb9f027 100644
--- a/r/man/open_delim_dataset.Rd
+++ b/r/man/open_delim_dataset.Rd
@@ -161,7 +161,8 @@ column names and will not be included in the data frame. If
\code{FALSE}, column
names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
Alternatively, you can specify a character vector of column names.}
-\item{col_types}{an Arrow \link{Schema}, or \code{NULL} (the default) to infer
types from the data.}
+\item{col_types}{A compact string representation of the column types,
+an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the
data.}
\item{na}{A character vector of strings to interpret as missing values.}
@@ -213,16 +214,22 @@ for opening single files and functions for opening
datasets.
# Set up directory for examples
tf <- tempfile()
dir.create(tf)
-df <- data.frame(x = c("1", "2", "NULL"))
+df <- data.frame(x = c("1", "2", "NULL"))
file_path <- file.path(tf, "file1.txt")
write.table(df, file_path, sep = ",", row.names = FALSE)
+# Use readr-style params identically in both `read_csv_arrow()` and
`open_csv_dataset()`
read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip =
1)
-open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types =
schema(list(x = int32())))
-unlink(tf)
+# Use `col_types` to specify a schema, partial schema, or compact
representation
+tf2 <- tempfile()
+write_csv_dataset(cars, tf2)
+
+open_csv_dataset(tf2, col_types = schema(speed = int32(), dist = int32()))
+open_csv_dataset(tf2, col_types = schema(speed = int32()))
+open_csv_dataset(tf2, col_types = "ii", col_names = c("speed", "dist"), skip =
1)
\dontshow{\}) # examplesIf}
}
\seealso{
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 84c1786f6f..769c2c98f1 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -209,10 +209,22 @@ test_that("read_csv_arrow(col_types=string, col_names)", {
df <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
- expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
- expect_error(read_csv_arrow(tf, col_types = "d"))
- expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
- expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
+ expect_error(
+ read_csv_arrow(tf, col_types = c("i", "d")),
+ "`col_types` must be a character vector of size 1"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "d"),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")),
+ "Compact specification for `col_types` requires `col_names` of matching
length"
+ )
+ expect_error(
+ read_csv_arrow(tf, col_types = "y", col_names = "a"),
+ "Unsupported compact specification: 'y' for column 'a'"
+ )
})
test_that("read_csv_arrow() can read timestamps", {