This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5962184 ARROW-14063: [R] open_dataset() does not work on CSVs without header rows
5962184 is described below
commit 5962184e7a3051ac8a7610a601bf017e7a8843fd
Author: Nic Crane <[email protected]>
AuthorDate: Wed Oct 13 15:04:34 2021 -0700
ARROW-14063: [R] open_dataset() does not work on CSVs without header rows
Closes #11346 from thisisnic/ARROW-14063_schemas
Lead-authored-by: Nic Crane <[email protected]>
Co-authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/dataset-format.R | 16 +++++++++++-----
r/R/dataset.R | 2 +-
r/tests/testthat/test-dataset-csv.R | 9 +++++++++
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 2e1c673..b0b9321 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -74,12 +74,12 @@ FileFormat <- R6Class("FileFormat",
type = function() dataset___FileFormat__type_name(self)
)
)
-FileFormat$create <- function(format, ...) {
+FileFormat$create <- function(format, schema = NULL, ...) {
opt_names <- names(list(...))
if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) {
- CsvFileFormat$create(...)
+ CsvFileFormat$create(schema = schema, ...)
} else if (format == c("tsv")) {
- CsvFileFormat$create(delimiter = "\t", ...)
+ CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
} else if (format == "parquet") {
ParquetFileFormat$create(...)
} else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing
@@ -118,7 +118,8 @@ IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
#' @rdname FileFormat
#' @export
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
-CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...),
+CsvFileFormat$create <- function(...,
+ opts = csv_file_format_parse_options(...),
                                 convert_options = csv_file_format_convert_opts(...),
                                 read_options = csv_file_format_read_opts(...)) {
dataset___CsvFileFormat__Make(opts, convert_options, read_options)
@@ -132,6 +133,7 @@ csv_file_format_parse_options <- function(...) {
read_opts <- names(formals(CsvReadOptions$create))
opts[convert_opts] <- NULL
opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
opt_names <- names(opts)
# Catch any readr-style options specified with full option names that are
# supported by read_delim_arrow() (and its wrappers) but are not yet
@@ -205,10 +207,11 @@ csv_file_format_convert_opts <- function(...) {
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
do.call(CsvConvertOptions$create, opts)
}
-csv_file_format_read_opts <- function(...) {
+csv_file_format_read_opts <- function(schema = NULL, ...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
arrow_opts <- names(formals(CsvParseOptions$create))
@@ -217,6 +220,9 @@ csv_file_format_read_opts <- function(...) {
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[convert_opts] <- NULL
+ if (!is.null(schema)) {
+ opts[["column_names"]] <- names(schema)
+ }
do.call(CsvReadOptions$create, opts)
}
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 2297c9c..7207a55 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -157,7 +157,7 @@ open_dataset <- function(sources,
return(dataset___UnionDataset__create(sources, schema))
}
-  factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, ...)
+  factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, schema = schema, ...)
tryCatch(
# Default is _not_ to inspect/unify schemas
factory$Finish(schema, isTRUE(unify_schemas)),
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 8d140853..ab66931 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -279,3 +279,12 @@ test_that("Error if no format specified and files are not parquet", {
"Parquet magic bytes not found"
)
})
+
+test_that("Column names inferred from schema for headerless CSVs (ARROW-14063)", {
+ headerless_csv_dir <- make_temp_dir()
+ tbl <- df1[, c("int", "dbl")]
+  write.table(tbl, file.path(headerless_csv_dir, "file1.csv"), sep = ",", row.names = FALSE, col.names = FALSE)
+
+  ds <- open_dataset(headerless_csv_dir, format = "csv", schema = schema(int = int32(), dbl = float64()))
+ expect_equal(ds %>% collect(), tbl)
+})