thisisnic commented on a change in pull request #12083:
URL: https://github.com/apache/arrow/pull/12083#discussion_r793688513



##########
File path: r/R/dataset-format.R
##########
@@ -122,6 +122,30 @@ CsvFileFormat$create <- function(...,
                                  opts = csv_file_format_parse_options(...),
                                  convert_options = 
csv_file_format_convert_opts(...),
                                  read_options = 
csv_file_format_read_opts(...)) {
+
+  # Evaluate opts first to catch any unsupported arguments
+  force(opts)
+
+  options <- list(...)
+  schema <- options[["schema"]]
+
+  column_names <- read_options$column_names
+  schema_names <- names(schema)
+
+  if (!is.null(schema) & !identical(schema_names, column_names)) {
+
+    # Element wise comparison and set differnce of column_names and names in 
schema
+    mismatch_colnames <- column_names[match(column_names, schema_names, 
nomatch = 0) != seq(column_names)]
+    not_in_schema <- setdiff(schema_names, column_names)

Review comment:
       ```suggestion
       
   ```
   Removing these as no longer needed due to changes below

##########
File path: r/tests/testthat/test-dataset-csv.R
##########
@@ -299,3 +299,20 @@ test_that("open_dataset() deals with BOMs 
(byte-order-marks) correctly", {
     tibble(a = c(1, 3), b = c(2, 4))
   )
 })
+
+test_that("Error if read_options$column_names and schema-names differ 
(ARROW-14744)", {
+
+  dst_dir  <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "file.csv")
+  df <- df1[, c("int", "dbl")]
+  write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+  # Mismatch of column names vs. schema-names should raise an error
+  schema  <- schema(int = int32(), dbl = float64())
+
+  expect_error(
+    open_dataset(csv_dir, format = "csv", schema = schema, column_names = 
c("these", "wont", "match")),
+    "column_names must match schema-names"

Review comment:
       ```suggestion
       "Values in `column_names` must match schema field names"
   ```
   Updating this to match change made above

##########
File path: r/R/dataset-format.R
##########
@@ -122,6 +122,30 @@ CsvFileFormat$create <- function(...,
                                  opts = csv_file_format_parse_options(...),
                                  convert_options = 
csv_file_format_convert_opts(...),
                                  read_options = 
csv_file_format_read_opts(...)) {
+
+  # Evaluate opts first to catch any unsupported arguments
+  force(opts)
+
+  options <- list(...)
+  schema <- options[["schema"]]
+
+  column_names <- read_options$column_names
+  schema_names <- names(schema)
+
+  if (!is.null(schema) & !identical(schema_names, column_names)) {
+
+    # Element wise comparison and set differnce of column_names and names in 
schema
+    mismatch_colnames <- column_names[match(column_names, schema_names, 
nomatch = 0) != seq(column_names)]
+    not_in_schema <- setdiff(schema_names, column_names)
+
+    abort(
+      paste(
+        "column_names must match schema-names:",
+        deparse1(c(mismatch_colnames, not_in_schema))
+      )

Review comment:
       ```suggestion
         paste(
           "Values in `column_names` must match schema field names"
         )
   ```

##########
File path: r/R/dataset-format.R
##########
@@ -122,6 +122,30 @@ CsvFileFormat$create <- function(...,
                                  opts = csv_file_format_parse_options(...),
                                  convert_options = 
csv_file_format_convert_opts(...),
                                  read_options = 
csv_file_format_read_opts(...)) {
+
+  # Evaluate opts first to catch any unsupported arguments
+  force(opts)
+
+  options <- list(...)
+  schema <- options[["schema"]]
+
+  column_names <- read_options$column_names
+  schema_names <- names(schema)
+
+  if (!is.null(schema) & !identical(schema_names, column_names)) {
+
+    # Element wise comparison and set differnce of column_names and names in 
schema
+    mismatch_colnames <- column_names[match(column_names, schema_names, 
nomatch = 0) != seq(column_names)]
+    not_in_schema <- setdiff(schema_names, column_names)
+
+    abort(
+      paste(
+        "column_names must match schema-names:",
+        deparse1(c(mismatch_colnames, not_in_schema))
+      )

Review comment:
       I've just removed the pasting of the fields as it's a little unclear 
which are missing from which, so best to leave out to reduce ambiguity!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to