This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b3362f2d24 GH-29184: [R] Read CSV with comma as decimal mark (#38002)
b3362f2d24 is described below
commit b3362f2d241248a435ce53ae85bf3cf5c65d1432
Author: Nic Crane <[email protected]>
AuthorDate: Mon Oct 9 18:13:56 2023 +0200
GH-29184: [R] Read CSV with comma as decimal mark (#38002)
### Rationale for this change
Allow customisable decimal points when reading data
### What changes are included in this PR?
Expose the C++ option in R
### Are these changes tested?
Aye
### Are there any user-facing changes?
Indeed
* Closes: #29184
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/NAMESPACE | 1 +
r/R/csv.R | 50 +++++++++++++++++++++++++++++++------
r/man/CsvReadOptions.Rd | 1 +
r/man/csv_convert_options.Rd | 11 +++++---
r/man/csv_parse_options.Rd | 6 ++---
r/man/csv_read_options.Rd | 2 +-
r/man/read_delim_arrow.Rd | 29 +++++++++++++++++++--
r/src/csv.cpp | 2 ++
r/tests/testthat/test-csv.R | 7 ++++++
r/tests/testthat/test-dataset-csv.R | 15 +++++++++++
10 files changed, 107 insertions(+), 17 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index b675952d01..dac2cbda9c 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -371,6 +371,7 @@ export(open_csv_dataset)
export(open_dataset)
export(open_delim_dataset)
export(open_tsv_dataset)
+export(read_csv2_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
export(read_feather)
diff --git a/r/R/csv.R b/r/R/csv.R
index e68a05720d..89df75a4b0 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -22,7 +22,8 @@
#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#'
#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
-#' `read_delim_arrow()` that specify a delimiter.
+#' `read_delim_arrow()` that specify a delimiter. `read_csv2_arrow()` uses `;` for
+#' the delimiter and `,` for the decimal point.
#'
#' Note that not all `readr` options are currently implemented here. Please file
#' an issue if you encounter one that `arrow` should support.
@@ -129,6 +130,7 @@
#' @param read_options see [CSV reading options][csv_read_options()]
#' @param as_data_frame Should the function return a `tibble` (default) or
#' an Arrow [Table]?
+#' @param decimal_point Character to use for decimal point in floating point numbers.
#'
#' @return A `tibble`, or a Table if `as_data_frame = FALSE`.
#' @export
@@ -178,7 +180,8 @@ read_delim_arrow <- function(file,
convert_options = NULL,
read_options = NULL,
as_data_frame = TRUE,
- timestamp_parsers = NULL) {
+ timestamp_parsers = NULL,
+ decimal_point = ".") {
if (inherits(schema, "Schema")) {
col_names <- names(schema)
col_types <- schema
@@ -197,8 +200,9 @@ read_delim_arrow <- function(file,
}
if (is.null(convert_options)) {
convert_options <- readr_to_csv_convert_options(
- na,
- quoted_na,
+ na = na,
+ quoted_na = quoted_na,
+ decimal_point = decimal_point,
col_types = col_types,
col_names = read_options$column_names,
timestamp_parsers = timestamp_parsers
@@ -279,6 +283,32 @@ read_csv_arrow <- function(file,
eval.parent(mc)
}
+#' @rdname read_delim_arrow
+#' @export
+read_csv2_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ mc <- match.call()
+ mc$delim <- ";"
+  mc$decimal_point <- ","
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
#' @rdname read_delim_arrow
#' @export
read_tsv_arrow <- function(file,
@@ -497,6 +527,7 @@ csv_read_options <- function(use_threads = option_use_threads(),
#' (a) `NULL`, the default, which uses the ISO-8601 parser;
#' (b) a character vector of [strptime][base::strptime()] parse strings; or
#' (c) a list of [TimestampParser] objects.
+#' - `decimal_point` Character to use for decimal point in floating point numbers. Default: "."
#'
#' `TimestampParser$create()` takes an optional `format` string argument.
#' See [`strptime()`][base::strptime()] for example syntax.
@@ -747,6 +778,7 @@ TimestampParser$create <- function(format = NULL) {
#' (a) `NULL`, the default, which uses the ISO-8601 parser;
#' (b) a character vector of [strptime][base::strptime()] parse strings; or
#' (c) a list of [TimestampParser] objects.
+#' @param decimal_point Character to use for decimal point in floating point numbers.
#'
#' @examples
#' tf <- tempfile()
@@ -765,7 +797,8 @@ csv_convert_options <- function(check_utf8 = TRUE,
auto_dict_max_cardinality = 50L,
include_columns = character(),
include_missing_columns = FALSE,
- timestamp_parsers = NULL) {
+ timestamp_parsers = NULL,
+ decimal_point = ".") {
if (!is.null(col_types) && !inherits(col_types, "Schema")) {
abort(c(
"Unsupported `col_types` specification.",
@@ -785,7 +818,8 @@ csv_convert_options <- function(check_utf8 = TRUE,
auto_dict_max_cardinality = auto_dict_max_cardinality,
include_columns = include_columns,
include_missing_columns = include_missing_columns,
- timestamp_parsers = timestamp_parsers
+ timestamp_parsers = timestamp_parsers,
+ decimal_point = decimal_point
)
)
}
@@ -800,6 +834,7 @@ CsvConvertOptions$create <- csv_convert_options
readr_to_csv_convert_options <- function(na,
quoted_na,
+ decimal_point,
col_types = NULL,
col_names = NULL,
timestamp_parsers = NULL) {
@@ -851,7 +886,8 @@ readr_to_csv_convert_options <- function(na,
strings_can_be_null = quoted_na,
col_types = col_types,
timestamp_parsers = timestamp_parsers,
- include_columns = include_columns
+ include_columns = include_columns,
+ decimal_point = decimal_point
)
}
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index 32742280cc..d4544cf829 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -94,6 +94,7 @@ starting from the beginning of this vector. Possible values are
(a) \code{NULL}, the default, which uses the ISO-8601 parser;
(b) a character vector of \link[base:strptime]{strptime} parse strings; or
(c) a list of \link{TimestampParser} objects.
+\item \code{decimal_point} Character to use for decimal point in floating point numbers. Default: "."
}
\code{TimestampParser$create()} takes an optional \code{format} string
argument.
diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd
index 4fd6eac1c3..f48da56eaa 100644
--- a/r/man/csv_convert_options.Rd
+++ b/r/man/csv_convert_options.Rd
@@ -15,7 +15,8 @@ csv_convert_options(
auto_dict_max_cardinality = 50L,
include_columns = character(),
include_missing_columns = FALSE,
- timestamp_parsers = NULL
+ timestamp_parsers = NULL,
+ decimal_point = "."
)
}
\arguments{
@@ -56,6 +57,8 @@ starting from the beginning of this vector. Possible values are
(a) \code{NULL}, the default, which uses the ISO-8601 parser;
(b) a character vector of \link[base:strptime]{strptime} parse strings; or
(c) a list of \link{TimestampParser} objects.}
+
+\item{decimal_point}{Character to use for decimal point in floating point numbers.}
}
\description{
CSV Convert Options
@@ -63,7 +66,7 @@ CSV Convert Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('x\n1\nNULL\n2\nNA', tf)
-read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
-open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+writeLines("x\n1\nNULL\n2\nNA", tf)
+read_csv_arrow(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
+open_csv_dataset(tf, convert_options = csv_convert_options(null_values = c("", "NA", "NULL")))
}
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
index e2e8fd19a5..23c647470b 100644
--- a/r/man/csv_parse_options.Rd
+++ b/r/man/csv_parse_options.Rd
@@ -40,7 +40,7 @@ CSV Parsing Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('x\n1\n\n2', tf)
-read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
-open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+writeLines("x\n1\n\n2", tf)
+read_csv_arrow(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
+open_csv_dataset(tf, parse_options = csv_parse_options(ignore_empty_lines = FALSE))
}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index ed2436f316..622e8d5e5b 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -45,7 +45,7 @@ CSV Reading Options
\examples{
tf <- tempfile()
on.exit(unlink(tf))
-writeLines('my file has a non-data header\nx\n1\n2', tf)
+writeLines("my file has a non-data header\nx\n1\n2", tf)
read_csv_arrow(tf, read_options = csv_read_options(skip_rows = 1))
open_csv_dataset(tf, read_options = csv_read_options(skip_rows = 1))
}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index 09c20fa013..04447fefc7 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -3,6 +3,7 @@
\name{read_delim_arrow}
\alias{read_delim_arrow}
\alias{read_csv_arrow}
+\alias{read_csv2_arrow}
\alias{read_tsv_arrow}
\title{Read a CSV or other delimited file with Arrow}
\usage{
@@ -24,7 +25,8 @@ read_delim_arrow(
convert_options = NULL,
read_options = NULL,
as_data_frame = TRUE,
- timestamp_parsers = NULL
+ timestamp_parsers = NULL,
+ decimal_point = "."
)
read_csv_arrow(
@@ -47,6 +49,26 @@ read_csv_arrow(
timestamp_parsers = NULL
)
+read_csv2_arrow(
+ file,
+ quote = "\\"",
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
+)
+
read_tsv_arrow(
file,
quote = "\\"",
@@ -139,6 +161,8 @@ starting from the beginning of this vector. Possible values are:
\item a character vector of \link[base:strptime]{strptime} parse strings
\item a list of \link{TimestampParser} objects
}}
+
+\item{decimal_point}{Character to use for decimal point in floating point numbers.}
}
\value{
A \code{tibble}, or a Table if \code{as_data_frame = FALSE}.
@@ -150,7 +174,8 @@ Arrow C++ options have been mapped to argument names that
follow those of
}
\details{
\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around
-\code{read_delim_arrow()} that specify a delimiter.
-\code{read_delim_arrow()} that specify a delimiter.
+\code{read_delim_arrow()} that specify a delimiter. \code{read_csv2_arrow()} uses \verb{;} for
+the delimiter and \verb{,} for the decimal point.
Note that not all \code{readr} options are currently implemented here. Please
file
an issue if you encounter one that \code{arrow} should support.
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index ffb8a11e6b..d253aa878b 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -181,6 +181,8 @@ std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(
res->timestamp_parsers = timestamp_parsers;
}
+ res->decimal_point = cpp11::as_cpp<char>(options["decimal_point"]);
+
return res;
}
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 22ccc7950f..36f1f229a6 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -733,3 +733,10 @@ test_that("Can read CSV files from a URL", {
expect_true(tibble::is_tibble(cu))
expect_identical(dim(cu), c(100L, 13L))
})
+
+test_that("read_csv2_arrow correctly parses comma decimals", {
+ tf <- tempfile()
+ writeLines("x;y\n1,2;c", con = tf)
+ expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c"))
+
+})
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index bee701e5ef..2698cd854a 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -647,3 +647,18 @@ test_that("GH-34640 - CSV datasets are read in correctly when both schema and pa
summarize(mean = mean(integer))
)
})
+
+test_that("open_dataset() with `decimal_point` argument", {
+ temp_dir <- make_temp_dir()
+ writeLines("x\ty\n1,2\tc", con = file.path(temp_dir, "file1.csv"))
+
+ expect_equal(
+ open_dataset(temp_dir, format = "tsv") %>% collect(),
+ tibble(x = "1,2", y = "c")
+ )
+
+ expect_equal(
+ open_dataset(temp_dir, format = "tsv", decimal_point = ",") %>% collect(),
+ tibble(x = 1.2, y = "c")
+ )
+})