This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5962184 ARROW-14063: [R] open_dataset() does not work on CSVs without header rows
5962184 is described below
commit 5962184e7a3051ac8a7610a601bf017e7a8843fd
Author: Nic Crane <[email protected]>
AuthorDate: Wed Oct 13 15:04:34 2021 -0700
ARROW-14063: [R] open_dataset() does not work on CSVs without header rows
Closes #11346 from thisisnic/ARROW-14063_schemas
Lead-authored-by: Nic Crane <[email protected]>
Co-authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/dataset-format.R | 16 +++++++++++-----
r/R/dataset.R | 2 +-
r/tests/testthat/test-dataset-csv.R | 9 +++++++++
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 2e1c673..b0b9321 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -74,12 +74,12 @@ FileFormat <- R6Class("FileFormat",
type = function() dataset___FileFormat__type_name(self)
)
)
-FileFormat$create <- function(format, ...) {
+FileFormat$create <- function(format, schema = NULL, ...) {
opt_names <- names(list(...))
if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) {
- CsvFileFormat$create(...)
+ CsvFileFormat$create(schema = schema, ...)
} else if (format == c("tsv")) {
- CsvFileFormat$create(delimiter = "\t", ...)
+ CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
} else if (format == "parquet") {
ParquetFileFormat$create(...)
} else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing
@@ -118,7 +118,8 @@ IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
#' @rdname FileFormat
#' @export
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
-CsvFileFormat$create <- function(..., opts = csv_file_format_parse_options(...),
+CsvFileFormat$create <- function(...,
+ opts = csv_file_format_parse_options(...),
                                 convert_options = csv_file_format_convert_opts(...),
                                 read_options = csv_file_format_read_opts(...)) {
dataset___CsvFileFormat__Make(opts, convert_options, read_options)
@@ -132,6 +133,7 @@ csv_file_format_parse_options <- function(...) {
read_opts <- names(formals(CsvReadOptions$create))
opts[convert_opts] <- NULL
opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
opt_names <- names(opts)
# Catch any readr-style options specified with full option names that are
# supported by read_delim_arrow() (and its wrappers) but are not yet
@@ -205,10 +207,11 @@ csv_file_format_convert_opts <- function(...) {
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
do.call(CsvConvertOptions$create, opts)
}
-csv_file_format_read_opts <- function(...) {
+csv_file_format_read_opts <- function(schema = NULL, ...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
arrow_opts <- names(formals(CsvParseOptions$create))
@@ -217,6 +220,9 @@ csv_file_format_read_opts <- function(...) {
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[convert_opts] <- NULL
+ if (!is.null(schema)) {
+ opts[["column_names"]] <- names(schema)
+ }
do.call(CsvReadOptions$create, opts)
}
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 2297c9c..7207a55 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -157,7 +157,7 @@ open_dataset <- function(sources,
return(dataset___UnionDataset__create(sources, schema))
}
-  factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, ...)
+  factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, schema = schema, ...)
tryCatch(
# Default is _not_ to inspect/unify schemas
factory$Finish(schema, isTRUE(unify_schemas)),
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 8d140853..ab66931 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -279,3 +279,12 @@ test_that("Error if no format specified and files are not parquet", {
"Parquet magic bytes not found"
)
})
+
+test_that("Column names inferred from schema for headerless CSVs (ARROW-14063)", {
+ headerless_csv_dir <- make_temp_dir()
+ tbl <- df1[, c("int", "dbl")]
+  write.table(tbl, file.path(headerless_csv_dir, "file1.csv"), sep = ",", row.names = FALSE, col.names = FALSE)
+
+  ds <- open_dataset(headerless_csv_dir, format = "csv", schema = schema(int = int32(), dbl = float64()))
+ expect_equal(ds %>% collect(), tbl)
+})