thisisnic commented on code in PR #45719:
URL: https://github.com/apache/arrow/pull/45719#discussion_r2030753282


##########
r/R/dataset-format.R:
##########
@@ -220,18 +219,23 @@ check_csv_file_format_args <- function(args, partitioning = NULL) {
     options$parse_options <- do.call(csv_parse_options, args$parse_options)
   }
 
-  if (is.null(args$convert_options)) {
-    options$convert_options <- do.call(csv_file_format_convert_opts, args)
-  } else if (is.list(args$convert_options)) {
-    options$convert_options <- do.call(csv_convert_options, args$convert_options)
-  }
-
+  # Set up read_options before convert_options since convert_options needs column names
   if (is.null(args$read_options)) {
+    # If col_names is provided, add it to read_options
+    if ("col_names" %in% names(args)) {
+      args$read_options <- list(col_names = args$col_names)
+    }

Review Comment:
   We may want to move this conditional out of this block to make the code easier to skim.
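   For example, something along these lines (just a sketch, untested; `args` is the list this function already handles):

   ```r
   # Hoist the col_names shortcut ahead of the read_options handling so
   # each block does one thing and is easier to skim
   if ("col_names" %in% names(args) && is.null(args$read_options)) {
     args$read_options <- list(col_names = args$col_names)
   }

   if (is.null(args$read_options)) {
     # ... existing default read_options handling ...
   }
   ```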



##########
r/R/dataset-format.R:
##########
@@ -458,6 +462,57 @@ csv_file_format_convert_opts <- function(...) {
     opts[["quoted_na"]] <- NULL
   }
 
+  # Handle readr-style col_types specification
+  if ("col_types" %in% names(opts) && is.character(opts[["col_types"]])) {
+    if (length(opts[["col_types"]]) != 1L) {
+      abort("`col_types` is a character vector that is not of size 1")
+    }
+    n <- nchar(opts[["col_types"]])
+    specs <- substring(opts[["col_types"]], seq_len(n), seq_len(n))
+    
+    # Get column names from read_options if available
+    col_names <- if (!is.null(opts[["read_options"]])) {
+      if (!is.null(opts[["read_options"]]$column_names)) {
+        opts[["read_options"]]$column_names
+      } else if (!is.null(opts[["read_options"]]$col_names)) {
+        opts[["read_options"]]$col_names
+      } else {
+        abort("Compact specification for `col_types` requires column names in 
read_options")
+      }
+    } else if ("col_names" %in% names(opts)) {
+      opts[["col_names"]]
+    } else {
+      abort("Compact specification for `col_types` requires column names")
+    }
+
+    if (!is_bare_character(col_names, n)) {
+      abort("Compact specification for `col_types` requires `col_names`")
+    }
+
+    col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
+      switch(.x,
+        "c" = utf8(),
+        "i" = int32(),
+        "n" = float64(),
+        "d" = float64(),
+        "l" = bool(),
+        "f" = dictionary(),
+        "D" = date32(),
+        "T" = timestamp(unit = "ns"),
+        "t" = time32(),
+        "_" = null(),
+        "-" = null(),
+        "?" = NULL,
+        abort("Unsupported compact specification: '", .x, "' for column '", 
.y, "'")
+      )
+    }))
+    # To "guess" types, omit them from col_types
+    col_types <- keep(col_types, ~ !is.null(.x))
+    opts[["col_types"]] <- schema(col_types)
+  }
+
+  # Remove read_options from opts before calling csv_convert_options
+  opts[["read_options"]] <- NULL

Review Comment:
   A lot of this code already exists in `csv.R`, so if we want to use it here we should refactor it into helper functions that both places reuse rather than making another copy. That cuts down on duplicated code, and anything that needs updating later gets updated everywhere at once.
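   For example, roughly this shape (a sketch only; the helper name is made up, and it assumes the same rlang/purrr-style utilities the diff already uses):

   ```r
   # Hypothetical shared helper: translate a readr-style compact string
   # (e.g. "cidl") into an Arrow schema. Defined once, called from both
   # csv.R and dataset-format.R.
   compact_col_types_to_schema <- function(col_types, col_names) {
     if (!is.character(col_types) || length(col_types) != 1L) {
       abort("`col_types` must be a character vector of size 1")
     }
     n <- nchar(col_types)
     if (!is_bare_character(col_names, n)) {
       abort("Compact specification for `col_types` requires `col_names` of matching length")
     }
     specs <- substring(col_types, seq_len(n), seq_len(n))
     types <- set_names(nm = col_names, map2(specs, col_names, ~ {
       switch(.x,
         "c" = utf8(),
         "i" = int32(),
         "n" = float64(),
         "d" = float64(),
         "l" = bool(),
         "f" = dictionary(),
         "D" = date32(),
         "T" = timestamp(unit = "ns"),
         "t" = time32(),
         "_" = null(),
         "-" = null(),
         "?" = NULL,
         abort(paste0("Unsupported compact specification: '", .x, "' for column '", .y, "'"))
       )
     }))
     # "?" entries are dropped so Arrow infers those columns' types
     schema(keep(types, ~ !is.null(.x)))
   }
   ```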



##########
r/tests/testthat/test-dataset-csv.R:
##########
@@ -662,3 +662,130 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+test_that("open_csv_dataset(col_types = <Schema>)", {
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.csv(tbl, tf, row.names = FALSE)
+
+  df <- open_csv_dataset(tf, col_types = schema(int = float64()))
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("open_tsv/delim_dataset(col_types = <Schema>)", {
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.table(tbl, tf, sep = "\t", row.names = FALSE)
+
+  df <- open_tsv_dataset(tf, col_types = schema(int = float64()))
+  df_delim <- open_delim_dataset(tf, delim = "\t", col_types = schema(int = float64()))
+
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+  expect_identical(dplyr::collect(df), dplyr::collect(df_delim))
+})
+
+test_that("open_*_dataset(col_types=string, col_names)", {
+  # Test data setup
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.csv(tbl, tf, row.names = FALSE)
+
+  # Basic tests with float64 type
+  df <- open_csv_dataset(tf, col_names = "int", col_types = "d", skip = 1)
+  df_delim <- open_delim_dataset(tf, col_names = "int", col_types = "d", skip = 1, delim = ",")
+  df_general <- open_dataset(tf, col_names = "int", col_types = "d", skip = 1, format = "csv")
+
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+  expect_identical(dplyr::collect(df_delim), tibble::tibble(int = as.numeric(tbl$int)))
+  expect_identical(dplyr::collect(df_general), tibble::tibble(int = as.numeric(tbl$int)))
+
+  # Test with compact string representation
+  expect_identical(
+    tibble::tibble(int = as.numeric(tbl$int)),
+    open_dataset(tf, col_names = "int", col_types = "d", skip = 1, format = "csv") |>
+      dplyr::collect()
+  )
+
+  # Error cases
+  expect_error(open_csv_dataset(tf, col_types = c("i", "d")))
+  expect_error(open_csv_dataset(tf, col_types = "d"))
+  expect_error(open_csv_dataset(tf, col_types = "i", col_names = c("a", "b")))
+  expect_error(open_csv_dataset(tf, col_types = "y", col_names = "a"))

Review Comment:
   Including this error testing is great, though I reckon we should wrap it up into tests for whatever new helper functions we create (see the earlier comment about extracting helpers rather than copying code), rather than doing it here.
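   For instance (assuming the hypothetical helper sketched in the earlier comment):

   ```r
   test_that("compact col_types specification validates its inputs", {
     expect_error(compact_col_types_to_schema(c("i", "d"), "a"))
     expect_error(compact_col_types_to_schema("i", c("a", "b")))
     expect_error(compact_col_types_to_schema("y", "a"), "Unsupported compact specification")
   })
   ```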



##########
r/tests/testthat/test-dataset-csv.R:
##########
@@ -662,3 +662,130 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+test_that("open_csv_dataset(col_types = <Schema>)", {
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.csv(tbl, tf, row.names = FALSE)
+
+  df <- open_csv_dataset(tf, col_types = schema(int = float64()))
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+})

Review Comment:
   We can incorporate some of this into the test labelled "open_delim_dataset params passed through to open_dataset", maybe [after the test for schema](https://github.com/apache/arrow/blob/b9afcfa6f50cf097c98c34ac84c396ffe8e27005/r/tests/testthat/test-dataset-csv.R#L561).
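   Maybe something like this (a sketch; `tf` stands in for the CSV fixture that test already uses):

   ```r
   # Check that col_types passed to open_csv_dataset() matches what
   # open_dataset(format = "csv") produces for the same file
   expect_identical(
     open_csv_dataset(tf, col_types = schema(int = float64())) |> dplyr::collect(),
     open_dataset(tf, format = "csv", col_types = schema(int = float64())) |> dplyr::collect()
   )
   ```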



##########
r/tests/testthat/test-dataset-csv.R:
##########
@@ -662,3 +662,130 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+test_that("open_csv_dataset(col_types = <Schema>)", {
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.csv(tbl, tf, row.names = FALSE)
+
+  df <- open_csv_dataset(tf, col_types = schema(int = float64()))
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("open_tsv/delim_dataset(col_types = <Schema>)", {
+  tbl <- example_data[, "int"]
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.table(tbl, tf, sep = "\t", row.names = FALSE)
+
+  df <- open_tsv_dataset(tf, col_types = schema(int = float64()))
+  df_delim <- open_delim_dataset(tf, delim = "\t", col_types = schema(int = float64()))
+
+  expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
+  expect_identical(dplyr::collect(df), dplyr::collect(df_delim))
+})
+

Review Comment:
   As `open_delim_dataset()` is just a thin wrapper around `open_csv_dataset()`, we don't need to test them both here, even if the bug was found in both.
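   In other words, keeping just one of them would cover it, e.g. (sketch):

   ```r
   test_that("open_tsv_dataset(col_types = <Schema>)", {
     tbl <- example_data[, "int"]
     tf <- tempfile()
     on.exit(unlink(tf))
     write.table(tbl, tf, sep = "\t", row.names = FALSE)

     df <- open_tsv_dataset(tf, col_types = schema(int = float64()))
     expect_identical(dplyr::collect(df), tibble::tibble(int = as.numeric(tbl$int)))
   })
   ```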



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
