This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b3362f2d24 GH-29184: [R] Read CSV with comma as decimal mark (#38002)
b3362f2d24 is described below
commit b3362f2d241248a435ce53ae85bf3cf5c65d1432
Author: Nic Crane <[email protected]>
AuthorDate: Mon Oct 9 18:13:56 2023 +0200
GH-29184: [R] Read CSV with comma as decimal mark (#38002)
### Rationale for this change
Allow customisable decimal points when reading data
### What changes are included in this PR?
Expose the C++ option in R
### Are these changes tested?
Aye
### Are there any user-facing changes?
Indeed
* Closes: #29184
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/NAMESPACE | 1 +
r/R/csv.R | 50 +++++++++++++++++++++++++++++++------
r/man/CsvReadOptions.Rd | 1 +
r/man/csv_convert_options.Rd | 11 +++++---
r/man/csv_parse_options.Rd | 6 ++---
r/man/csv_read_options.Rd | 2 +-
r/man/read_delim_arrow.Rd | 29 +++++++++++++++++++--
r/src/csv.cpp | 2 ++
r/tests/testthat/test-csv.R | 7 ++++++
r/tests/testthat/test-dataset-csv.R | 15 +++++++++++
10 files changed, 107 insertions(+), 17 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index b675952d01..dac2cbda9c 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -371,6 +371,7 @@ export(open_csv_dataset)
export(open_dataset)
export(open_delim_dataset)
export(open_tsv_dataset)
+export(read_csv2_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
export(read_feather)
diff --git a/r/R/csv.R b/r/R/csv.R
index e68a05720d..89df75a4b0 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -22,7 +22,8 @@
#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#'
#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
-#' `read_delim_arrow()` that specify a delimiter.
+#' `read_delim_arrow()` that specify a delimiter. `read_csv2_arrow()` uses `;` for
+#' the delimiter and `,` for the decimal point.
#'
#' Note that not all `readr` options are currently implemented here. Please file
#' an issue if you encounter one that `arrow` should support.
@@ -129,6 +130,7 @@
#' @param read_options see [CSV reading options][csv_read_options()]
#' @param as_data_frame Should the function return a `tibble` (default) or
#' an Arrow [Table]?
+#' @param decimal_point Character to use for decimal point in floating point numbers.
#'
#' @return A `tibble`, or a Table if `as_data_frame = FALSE`.
#' @export
@@ -178,7 +180,8 @@ read_delim_arrow <- function(file,
convert_options = NULL,
read_options = NULL,
as_data_frame = TRUE,
- timestamp_parsers = NULL) {
+ timestamp_parsers = NULL,
+ decimal_point = ".") {
if (inherits(schema, "Schema")) {
col_names <- names(schema)
col_types <- schema
@@ -197,8 +200,9 @@ read_delim_arrow <- function(file,
}
if (is.null(convert_options)) {
convert_options <- readr_to_csv_convert_options(
- na,
- quoted_na,
+ na = na,
+ quoted_na = quoted_na,
+ decimal_point = decimal_point,
col_types = col_types,
col_names = read_options$column_names,
timestamp_parsers = timestamp_parsers
@@ -279,6 +283,32 @@ read_csv_arrow <- function(file,
eval.parent(mc)
}
+#' @rdname read_delim_arrow
+#' @export
+read_csv2_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ mc <- match.call()
+ mc$delim <- ";"
+  mc$decimal_point <- ","
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
#' @rdname read_delim_arrow
#' @export
read_tsv_arrow <- function(file,
@@ -497,6 +527,7 @@ csv_read_options <- function(use_threads = option_use_threads(),
#' (a) `NULL`, the default, which uses the ISO-8601 parser;
#' (b) a character vector of [strptime][base::strptime()] parse strings; or
#' (c) a list of [TimestampParser] objects.
+#' - `decimal_point` Character to use for decimal point in floating point numbers. Default: "."
#'
#' `TimestampParser$create()` takes an optional `format` string argument.
#' See [`strptime()`][base::strptime()] for example syntax.
@@ -747,6 +778,7 @@ TimestampParser$create <- function(format = NULL) {
#' (a) `NULL`, the default, which uses the ISO-8601 parser;
#' (b) a character vector of [strptime][base::strptime()] parse strings; or
#' (c) a list of [TimestampParser] objects.
+#' @param decimal_point Character to use for decimal point in floating point numbers.
#'
#' @examples
#' tf <- tempfile()
@@ -765,7 +797,8 @@ csv_convert_options <- function(check_utf8 = TRUE,
auto_dict_max_cardinality = 50L,
include_columns = character(),
include_missing_columns = FALSE,
- timestamp_parsers = NULL) {
+ timestamp_parsers = NULL,
+ decimal_point = ".") {
if (!is.null(col_types) && !inherits(col_types, "Schema")) {
abort(c(
"Unsupported `col_types` specification.",
@@ -785,7 +818,8 @@ csv_convert_options <- function(check_utf8 = TRUE,
auto_dict_max_cardinality = auto_dict_max_cardinality,
include_columns = include_columns,
include_missing_columns = include_missing_columns,
- timestamp_parsers = timestamp_parsers
+ timestamp_parsers = timestamp_parsers,
+ decimal_point = decimal_point
)
)
}
@@ -800,6 +834,7 @@ CsvConvertOptions$create <- csv_convert_options
readr_to_csv_convert_options <- function(na,
quoted_na,
+ decimal_point,
col_types = NULL,
col_names = NULL,
timestamp_parsers = NULL) {
@@ -851,7 +886,8 @@ readr_to_csv_convert_options <- function(na,
strings_can_be_null = quoted_na,
col_types = col_types,
timestamp_parsers = timestamp_parsers,
- include_columns = include_columns
+ include_columns = include_columns,
+ decimal_point = decimal_point
)
}
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 32742280cc..d4544cf829 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -94,6 +94,7 @@ starting from the beginning of this vector. Possible values are
(a) \code{NULL}, the default, which uses the ISO-8601 parser;
(b) a character vector of \link[base:strptime]{strptime} parse strings; or
(c) a list of \link{TimestampParser} objects.
+\item \code{decimal_point} Character to use for decimal point in floating point numbers. Default: "."
}
\code{TimestampParser$create()} takes an optional \code{format} string
argument.
diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd
index 4fd6eac1c3..f48da56eaa 100644
--- a/r/man/csv_convert_options.Rd
+++ b/r/man/csv_convert_options.Rd
@@ -15,7 +15,8 @@ csv_convert_options(
auto_dict_max_cardinality = 50L,
include_columns = character(),
include_missing_columns = FALSE,
- timestamp_parsers = NULL
+ timestamp_parsers = NULL,
+ decimal_point = "."
)
}
\arguments{
@@ -56,6 +57,8 @@ starting from the beginning of this vector. Possible values are
(a) \code{NULL}, the default, which uses the ISO-8601 parser;
(b) a character vector of \link[base:strptime]{strptime} parse strings; or
(c) a list of \link{TimestampParser} objects.}
+
+\item{decimal_point}{Character to use for decimal point in floating point numbers.}
}
\description{
CSV Convert Options
@@ -63,7 +66,7 @@ CSV Convert Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('x\n1\nNULL\n2\nNA', tf)
-read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
-open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+writeLines("x\n1\nNULL\n2\nNA", tf)
+read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
}
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
index e2e8fd19a5..23c647470b 100644
--- a/r/man/csv_parse_options.Rd
+++ b/r/man/csv_parse_options.Rd
@@ -40,7 +40,7 @@ CSV Parsing Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('x\n1\n\n2', tf)
-read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
-open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+writeLines("x\n1\n\n2", tf)
+read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index ed2436f316..622e8d5e5b 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -45,7 +45,7 @@ CSV Reading Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('my file has a non-data header\nx\n1\n2', tf)
+writeLines("my file has a non-data header\nx\n1\n2", tf)
read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index 09c20fa013..04447fefc7 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -3,6 +3,7 @@
\name{read_delim_arrow}
\alias{read_delim_arrow}
\alias{read_csv_arrow}
+\alias{read_csv2_arrow}
\alias{read_tsv_arrow}
\title{Read a CSV or other delimited file with Arrow}
\usage{
@@ -24,7 +25,8 @@ read_delim_arrow(
convert_options = NULL,
read_options = NULL,
as_data_frame = TRUE,
- timestamp_parsers = NULL
+ timestamp_parsers = NULL,
+ decimal_point = "."
)
read_csv_arrow(
@@ -47,6 +49,26 @@ read_csv_arrow(
timestamp_parsers = NULL
)
+read_csv2_arrow(
+ file,
+ quote = "\\"",
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
+)
+
read_tsv_arrow(
file,
quote = "\\"",
@@ -139,6 +161,8 @@ starting from the beginning of this vector. Possible values are:
\item a character vector of \link[base:strptime]{strptime} parse strings
\item a list of \link{TimestampParser} objects
}}
+
+\item{decimal_point}{Character to use for decimal point in floating point numbers.}
}
\value{
A \code{tibble}, or a Table if \code{as_data_frame = FALSE}.
@@ -150,7 +174,8 @@ Arrow C++ options have been mapped to argument names that
follow those of
}
\details{
\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around
-\code{read_delim_arrow()} that specify a delimiter.
-\code{read_delim_arrow()} that specify a delimiter.
+\code{read_delim_arrow()} that specify a delimiter. \code{read_csv2_arrow()} uses \verb{;} for
+the delimiter and \verb{,} for the decimal point.
Note that not all \code{readr} options are currently implemented here. Please
file
an issue if you encounter one that \code{arrow} should support.
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index ffb8a11e6b..d253aa878b 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -181,6 +181,8 @@ std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(
res->timestamp_parsers = timestamp_parsers;
}
+ res->decimal_point = cpp11::as_cpp<char>(options["decimal_point"]);
+
return res;
}
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 22ccc7950f..36f1f229a6 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -733,3 +733,10 @@ test_that("Can read CSV files from a URL", {
expect_true(tibble::is_tibble(cu))
expect_identical(dim(cu), c(100L, 13L))
})
+
+test_that("read_csv2_arrow correctly parses comma decimals", {
+ tf <- tempfile()
+ writeLines("x;y\n1,2;c", con = tf)
+ expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c"))
+
+})
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index bee701e5ef..2698cd854a 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -647,3 +647,18 @@ test_that("GH-34640 - CSV datasets are read in correctly when both schema and pa
summarize(mean = mean(integer))
)
})
+
+test_that("open_dataset() with `decimal_point` argument", {
+ temp_dir <- make_temp_dir()
+ writeLines("x\ty\n1,2\tc", con = file.path(temp_dir, "file1.csv"))
+
+ expect_equal(
+ open_dataset(temp_dir, format = "tsv") %>% collect(),
+ tibble(x = "1,2", y = "c")
+ )
+
+ expect_equal(
+ open_dataset(temp_dir, format = "tsv", decimal_point = ",") %>% collect(),
+ tibble(x = 1.2, y = "c")
+ )
+})