thisisnic commented on code in PR #36436:
URL: https://github.com/apache/arrow/pull/36436#discussion_r1262303120


##########
r/R/csv.R:
##########
@@ -512,30 +512,46 @@ CsvReadOptions$create <- function(use_threads = 
option_use_threads(),
 
 readr_to_csv_write_options <- function(include_header = TRUE,
                                        batch_size = 1024L,
-                                       na = "") {
+                                       delim = ",",
+                                       na = "",
+                                       eol = "\n") {
   CsvWriteOptions$create(
     include_header = include_header,
     batch_size = batch_size,
-    null_string = na
+    delimiter = delim,
+    null_string = na,
+    eol = eol
   )
 }
 
 #' @rdname CsvReadOptions
 #' @export
 CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, 
null_string = "") {
-  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+CsvWriteOptions$create <- function(include_header = TRUE,
+                                   batch_size = 1024L,
+                                   delimiter = ",",
+                                   null_string = "",
+                                   eol = "\n") {
   assert_that(is.logical(include_header))
+
+  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+
+  assert_that(is.character(delimiter))
+
   assert_that(is.character(null_string))
   assert_that(!is.na(null_string))
   assert_that(length(null_string) == 1)
   assert_that(!grepl('"', null_string), msg = "na argument must not contain 
quote characters.")
 
+  assert_that(is.character(eol))

Review Comment:
   Another comment re the extra spacing.  Thanks for adding all these checks 
though, this will be nice and robust!



##########
r/R/csv.R:
##########
@@ -512,30 +512,46 @@ CsvReadOptions$create <- function(use_threads = 
option_use_threads(),
 
 readr_to_csv_write_options <- function(include_header = TRUE,
                                        batch_size = 1024L,
-                                       na = "") {
+                                       delim = ",",
+                                       na = "",
+                                       eol = "\n") {
   CsvWriteOptions$create(
     include_header = include_header,
     batch_size = batch_size,
-    null_string = na
+    delimiter = delim,
+    null_string = na,
+    eol = eol
   )
 }
 
 #' @rdname CsvReadOptions
 #' @export
 CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, 
null_string = "") {
-  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+CsvWriteOptions$create <- function(include_header = TRUE,
+                                   batch_size = 1024L,
+                                   delimiter = ",",
+                                   null_string = "",
+                                   eol = "\n") {

Review Comment:
   Would you mind rearranging these so that the previously included options 
come before the added ones? The reasoning here is that people might be using 
this function already (as it's exported) without naming the options, so 
reordering them could constitute a breaking change to their code.



##########
r/R/dataset-format.R:
##########
@@ -76,9 +76,10 @@ FileFormat <- R6Class("FileFormat",
 )
 FileFormat$create <- function(format, schema = NULL, ...) {
   opt_names <- names(list(...))
-  if (format %in% c("csv", "text") || any(opt_names %in% c("delim", 
"delimiter"))) {
+  if (format %in% c("csv", "text", "txt") || any(opt_names %in% c("delim", 
"delimiter"))) {
     CsvFileFormat$create(schema = schema, ...)
   } else if (format == "tsv") {
+    # This delimiter argument is ignored.

Review Comment:
   Could this be solved by using [`FileFormat$create(format = "tsv", ...)` 
](https://arrow.apache.org/docs/dev/r/reference/FileFormat.html) on the line 
below?



##########
r/R/csv.R:
##########
@@ -512,30 +512,46 @@ CsvReadOptions$create <- function(use_threads = 
option_use_threads(),
 
 readr_to_csv_write_options <- function(include_header = TRUE,
                                        batch_size = 1024L,
-                                       na = "") {
+                                       delim = ",",
+                                       na = "",
+                                       eol = "\n") {
   CsvWriteOptions$create(
     include_header = include_header,
     batch_size = batch_size,
-    null_string = na
+    delimiter = delim,
+    null_string = na,
+    eol = eol
   )
 }
 
 #' @rdname CsvReadOptions
 #' @export
 CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, 
null_string = "") {
-  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+CsvWriteOptions$create <- function(include_header = TRUE,
+                                   batch_size = 1024L,
+                                   delimiter = ",",
+                                   null_string = "",
+                                   eol = "\n") {
   assert_that(is.logical(include_header))
+
+  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+
+  assert_that(is.character(delimiter))
+

Review Comment:
   Really minor, but would you mind removing the extra spaces here?



##########
r/R/dataset-write.R:
##########
@@ -178,12 +178,32 @@ write_dataset <- function(dataset,
   }
 
   path_and_fs <- get_path_and_filesystem(path)
+
+  dots <- list(...)
+  if (format %in% c("csv", "tsv") && any(c("delimiter", "delim") %in% 
names(dots))) {
+    stop("Do not set a delimiter for csv or tsv formats.")
+  }

Review Comment:
   I like your reasoning here, though to note, when I did a search for 
"alternative delimiter for csv", it appears that, despite the name "comma 
separated values", it's not uncommon for csv files to use alternative 
delimiters.  
   
   What I'd recommend is either:
   
   1. let's let people specify CSV with an alternative delimiter and go with 
the delimiter they specify, or
   2. we update this text in this error message to use "must" phrasing as per 
[the tidyverse style guide](https://style.tidyverse.org/error-messages.html), 
and recommend they specify the format as "text" instead of whatever format 
they've specified (and output that too).
   
   I'll leave the design decision up to you - let me know what you think is 
best!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to