This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8f347c559d GH-36247: [R] Add write_csv_dataset (#36436)
8f347c559d is described below
commit 8f347c559d93f5c4087558e48cd4300d0fe8b806
Author: David Greiss <[email protected]>
AuthorDate: Fri Aug 25 08:56:11 2023 -0400
GH-36247: [R] Add write_csv_dataset (#36436)
### Rationale for this change
Create a convenience wrapper around `write_dataset()` for writing CSV files.
### What changes are included in this PR?
Adds `write_csv_dataset()`, along with `write_delim_dataset()` and `write_tsv_dataset()`.
### Are these changes tested?
Yes, a few tests were added.
### Are there any user-facing changes?
Yes, a new function has been added. If this looks good, I can add more to the
docs as well.
* Closes: #36247
Lead-authored-by: David Greiss <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Co-authored-by: David Greiss <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/NAMESPACE | 3 +
r/R/csv.R | 40 +++-
r/R/dataset-format.R | 19 +-
r/R/dataset-write.R | 199 +++++++++++++++++++-
r/_pkgdown.yml | 3 +
r/man/write_dataset.Rd | 2 +-
r/man/{write_dataset.Rd => write_delim_dataset.Rd} | 144 ++++++++-------
r/src/csv.cpp | 6 +-
r/tests/testthat/_snaps/dataset-write.md | 2 +-
r/tests/testthat/test-dataset-write.R | 204 +++++++++++++++++++++
10 files changed, 532 insertions(+), 90 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index f479917642..21f88b4180 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -406,12 +406,15 @@ export(value_counts)
export(vctrs_extension_array)
export(vctrs_extension_type)
export(write_csv_arrow)
+export(write_csv_dataset)
export(write_dataset)
+export(write_delim_dataset)
export(write_feather)
export(write_ipc_file)
export(write_ipc_stream)
export(write_parquet)
export(write_to_raw)
+export(write_tsv_dataset)
importFrom(R6,R6Class)
importFrom(assertthat,assert_that)
importFrom(assertthat,is.string)
diff --git a/r/R/csv.R b/r/R/csv.R
index d53dc07b42..b119d16a84 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -510,32 +510,56 @@ CsvReadOptions$create <- function(use_threads =
option_use_threads(),
options
}
-readr_to_csv_write_options <- function(include_header = TRUE,
+readr_to_csv_write_options <- function(col_names = TRUE,
batch_size = 1024L,
- na = "") {
+ delim = ",",
+ na = "",
+ eol = "\n",
+ quote = c("needed", "all", "none")) {
+ quoting_style_arrow_opts <- c("Needed", "AllValid", "None")
+ quote <- match(match.arg(quote), c("needed", "all", "none"))
+ quote <- quoting_style_arrow_opts[quote]
+
CsvWriteOptions$create(
- include_header = include_header,
+ include_header = col_names,
batch_size = batch_size,
- null_string = na
+ delimiter = delim,
+ null_string = na,
+ eol = eol,
+ quoting_style = quote
)
}
#' @rdname CsvReadOptions
#' @export
CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L,
null_string = "") {
- assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+CsvWriteOptions$create <- function(include_header = TRUE,
+ batch_size = 1024L,
+ null_string = "",
+ delimiter = ",",
+ eol = "\n",
+ quoting_style = c("Needed", "AllValid",
"None")) {
+ quoting_style <- match.arg(quoting_style)
+ quoting_style_opts <- c("Needed", "AllValid", "None")
+ quoting_style <- match(quoting_style, quoting_style_opts) - 1L
+
assert_that(is.logical(include_header))
+ assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+ assert_that(is.character(delimiter))
assert_that(is.character(null_string))
assert_that(!is.na(null_string))
assert_that(length(null_string) == 1)
assert_that(!grepl('"', null_string), msg = "na argument must not contain
quote characters.")
+ assert_that(is.character(eol))
csv___WriteOptions__initialize(
list(
include_header = include_header,
batch_size = as.integer(batch_size),
- null_string = as.character(null_string)
+ delimiter = delimiter,
+ null_string = as.character(null_string),
+ eol = eol,
+ quoting_style = quoting_style
)
)
}
@@ -787,7 +811,7 @@ write_csv_arrow <- function(x,
if (is.null(write_options)) {
write_options <- readr_to_csv_write_options(
- include_header = include_header,
+ col_names = include_header,
batch_size = batch_size,
na = na
)
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 9c7e332f5e..e1f434d60c 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -76,9 +76,10 @@ FileFormat <- R6Class("FileFormat",
)
FileFormat$create <- function(format, schema = NULL, ...) {
opt_names <- names(list(...))
- if (format %in% c("csv", "text") || any(opt_names %in% c("delim",
"delimiter"))) {
+ if (format %in% c("csv", "text", "txt") || any(opt_names %in% c("delim",
"delimiter"))) {
CsvFileFormat$create(schema = schema, ...)
} else if (format == "tsv") {
+ # This delimiter argument is ignored.
CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
} else if (format == "parquet") {
ParquetFileFormat$create(...)
@@ -635,7 +636,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
"codec",
"null_fallback"
)
- } else if (format == "csv") {
+ } else if (format %in% c("csv", "tsv", "txt", "text")) {
supported_args <- c(
names(formals(CsvWriteOptions$create)),
names(formals(readr_to_csv_write_options))
@@ -691,14 +692,20 @@ FileWriteOptions <- R6Class("FileWriteOptions",
get_ipc_metadata_version(args$metadata_version)
)
}
- } else if (self$type == "csv") {
+ } else if (self$type %in% c("csv", "tsv", "txt", "text")) {
arrow_opts <- names(formals(CsvWriteOptions$create))
readr_opts <- names(formals(readr_to_csv_write_options))
readr_only_opts <- setdiff(readr_opts, arrow_opts)
+ arrow_only_opts <- setdiff(arrow_opts, readr_opts)
- is_arrow_opt <- !is.na(pmatch(names(args), arrow_opts))
- is_readr_opt <- !is.na(pmatch(names(args), readr_opts))
- is_readr_only_opt <- !is.na(pmatch(names(args), readr_only_opts))
+ is_arrow_opt <- !is.na(match(names(args), arrow_opts))
+ is_readr_opt <- !is.na(match(names(args), readr_opts))
+ is_arrow_only_opt <- !is.na(match(names(args), arrow_only_opts))
+ is_readr_only_opt <- !is.na(match(names(args), readr_only_opts))
+
+ if (any(is_arrow_only_opt) && any(is_readr_only_opt)) {
+ stop("Use either Arrow write options or readr write options, not
both")
+ }
# These option names aren't mutually exclusive, so only use readr path
# if we have at least one readr-specific option.
diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index f971c0e9cd..288da4ffde 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -122,7 +122,7 @@
#' @export
write_dataset <- function(dataset,
path,
- format = c("parquet", "feather", "arrow", "ipc",
"csv"),
+ format = c("parquet", "feather", "arrow", "ipc",
"csv", "tsv", "txt", "text"),
partitioning = dplyr::group_vars(dataset),
basename_template = paste0("part-{i}.",
as.character(format)),
hive_style = TRUE,
@@ -178,12 +178,31 @@ write_dataset <- function(dataset,
}
path_and_fs <- get_path_and_filesystem(path)
+
+ dots <- list(...)
+ if (format %in% c("txt", "text") && !any(c("delimiter", "delim") %in%
names(dots))) {
+ stop("A delimiter must be given for a txt format.")
+ }
+ if (format == "tsv" && any(c("delimiter", "delim") %in% names(dots))) {
+ stop("Can't set a delimiter for the tsv format.")
+ }
+
output_schema <- final_node$schema
- options <- FileWriteOptions$create(
- format,
- column_names = names(output_schema),
- ...
- )
+ # This is a workaround because CsvFileFormat$create defaults the delimiter
to ","
+ if (format == "tsv") {
+ options <- FileWriteOptions$create(
+ format,
+ column_names = names(output_schema),
+ delimiter = "\t",
+ ...
+ )
+ } else {
+ options <- FileWriteOptions$create(
+ format,
+ column_names = names(output_schema),
+ ...
+ )
+ }
# TODO(ARROW-16200): expose FileSystemDatasetWriteOptions in R
# and encapsulate this logic better
@@ -209,6 +228,174 @@ write_dataset <- function(dataset,
)
}
+#' Write a dataset into partitioned flat files.
+#'
+#' The `write_*_dataset()` are a family of wrappers around [write_dataset] to
allow for easy switching
+#' between functions for writing datasets.
+#'
+#' @inheritParams write_dataset
+#' @param col_names Whether to write an initial header line with column names.
+#' @param batch_size Maximum number of rows processed at a time. Default is
1024L.
+#' @param delim Delimiter used to separate values. Defaults to `","` for
`write_delim_dataset()` and
+#' `write_csv_dataset()`, and `"\t` for `write_tsv_dataset()`. Cannot be
changed for `write_tsv_dataset()`.
+#' @param na a character vector of strings to interpret as missing values.
Quotes are not allowed in this string.
+#' The default is an empty string `""`.
+#' @param eol the end of line character to use for ending rows. The default is
`"\n"`.
+#' @param quote How to handle fields which contain characters that need to be
quoted.
+#' - `needed` - Enclose all strings and binary values in quotes which need
them, because their CSV rendering can
+#' contain quotes itself (the default)
+#' - `all` - Enclose all valid values in quotes. Nulls are not quoted. May
cause readers to
+#' interpret all values as strings if schema is inferred.
+#' - `none` - Do not enclose any values in quotes. Prevents values from
containing quotes ("),
+#' cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If
values
+#' contain these characters, an error is caused when attempting to write.
+#' @return The input `dataset`, invisibly.
+#'
+#' @seealso [write_dataset()]
+#' @export
+write_delim_dataset <- function(dataset,
+ path,
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = "part-{i}.txt",
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite",
"error", "delete_matching"),
+ max_partitions = 1024L,
+ max_open_files = 900L,
+ max_rows_per_file = 0L,
+ min_rows_per_group = 0L,
+ max_rows_per_group = bitwShiftL(1, 20),
+ col_names = TRUE,
+ batch_size = 1024L,
+ delim = ",",
+ na = "",
+ eol = "\n",
+ quote = c("needed", "all", "none")) {
+ if (!missing(max_rows_per_file) && missing(max_rows_per_group) &&
max_rows_per_group > max_rows_per_file) {
+ max_rows_per_group <- max_rows_per_file
+ }
+
+ quoting_style_arrow_opts <- c("Needed", "AllValid", "None")
+ quote <- match(match.arg(quote), c("needed", "all", "none"))
+ quote <- quoting_style_arrow_opts[quote]
+
+ write_dataset(
+ dataset = dataset,
+ path = path,
+ format = "txt",
+ partitioning = partitioning,
+ basename_template = basename_template,
+ hive_style = hive_style,
+ existing_data_behavior = existing_data_behavior,
+ max_partitions = max_partitions,
+ max_open_files = max_open_files,
+ max_rows_per_file = max_rows_per_file,
+ min_rows_per_group = min_rows_per_group,
+ max_rows_per_group = max_rows_per_group,
+ include_header = col_names,
+ batch_size = batch_size,
+ delimiter = delim,
+ null_string = na,
+ eol = eol,
+ quoting_style = quote
+ )
+}
+
+#' @rdname write_delim_dataset
+#' @export
+write_csv_dataset <- function(dataset,
+ path,
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = "part-{i}.csv",
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error",
"delete_matching"),
+ max_partitions = 1024L,
+ max_open_files = 900L,
+ max_rows_per_file = 0L,
+ min_rows_per_group = 0L,
+ max_rows_per_group = bitwShiftL(1, 20),
+ col_names = TRUE,
+ batch_size = 1024L,
+ delim = ",",
+ na = "",
+ eol = "\n",
+ quote = c("needed", "all", "none")) {
+ if (!missing(max_rows_per_file) && missing(max_rows_per_group) &&
max_rows_per_group > max_rows_per_file) {
+ max_rows_per_group <- max_rows_per_file
+ }
+
+ quoting_style_arrow_opts <- c("Needed", "AllValid", "None")
+ quote <- match(match.arg(quote), c("needed", "all", "none"))
+ quote <- quoting_style_arrow_opts[quote]
+
+ write_dataset(
+ dataset = dataset,
+ path = path,
+ format = "csv",
+ partitioning = partitioning,
+ basename_template = basename_template,
+ hive_style = hive_style,
+ existing_data_behavior = existing_data_behavior,
+ max_partitions = max_partitions,
+ max_open_files = max_open_files,
+ max_rows_per_file = max_rows_per_file,
+ min_rows_per_group = min_rows_per_group,
+ max_rows_per_group = max_rows_per_group,
+ include_header = col_names,
+ batch_size = batch_size,
+ delimiter = delim,
+ null_string = na,
+ eol = eol,
+ quoting_style = quote
+ )
+}
+
+#' @rdname write_delim_dataset
+#' @export
+write_tsv_dataset <- function(dataset,
+ path,
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = "part-{i}.tsv",
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error",
"delete_matching"),
+ max_partitions = 1024L,
+ max_open_files = 900L,
+ max_rows_per_file = 0L,
+ min_rows_per_group = 0L,
+ max_rows_per_group = bitwShiftL(1, 20),
+ col_names = TRUE,
+ batch_size = 1024L,
+ na = "",
+ eol = "\n",
+ quote = c("needed", "all", "none")) {
+ if (!missing(max_rows_per_file) && missing(max_rows_per_group) &&
max_rows_per_group > max_rows_per_file) {
+ max_rows_per_group <- max_rows_per_file
+ }
+
+ quoting_style_arrow_opts <- c("Needed", "AllValid", "None")
+ quote <- match(match.arg(quote), c("needed", "all", "none"))
+ quote <- quoting_style_arrow_opts[quote]
+
+ write_dataset(
+ dataset = dataset,
+ path = path,
+ format = "tsv",
+ partitioning = partitioning,
+ basename_template = basename_template,
+ hive_style = hive_style,
+ existing_data_behavior = existing_data_behavior,
+ max_partitions = max_partitions,
+ max_open_files = max_open_files,
+ max_rows_per_file = max_rows_per_file,
+ min_rows_per_group = min_rows_per_group,
+ max_rows_per_group = max_rows_per_group,
+ include_header = col_names,
+ batch_size = batch_size,
+ null_string = na,
+ eol = eol,
+ quoting_style = quote
+ )
+}
+
validate_positive_int_value <- function(value, msg) {
if (!is_integerish(value, n = 1) || is.na(value) || value < 0) {
abort(paste(substitute(value), "must be a positive, non-missing integer"))
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 10323a4796..57c21e3c0f 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -152,6 +152,9 @@ reference:
Write multi-file datasets to disk.
contents:
- write_dataset
+ - write_delim_dataset
+ - write_csv_dataset
+ - write_tsv_dataset
- title: Read files
desc: >
diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd
index 1bc940697c..34cffefbce 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_dataset.Rd
@@ -7,7 +7,7 @@
write_dataset(
dataset,
path,
- format = c("parquet", "feather", "arrow", "ipc", "csv"),
+ format = c("parquet", "feather", "arrow", "ipc", "csv", "tsv", "txt",
"text"),
partitioning = dplyr::group_vars(dataset),
basename_template = paste0("part-{i}.", as.character(format)),
hive_style = TRUE,
diff --git a/r/man/write_dataset.Rd b/r/man/write_delim_dataset.Rd
similarity index 50%
copy from r/man/write_dataset.Rd
copy to r/man/write_delim_dataset.Rd
index 1bc940697c..2dcd9707dc 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_delim_dataset.Rd
@@ -1,15 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset-write.R
-\name{write_dataset}
-\alias{write_dataset}
-\title{Write a dataset}
+\name{write_delim_dataset}
+\alias{write_delim_dataset}
+\alias{write_csv_dataset}
+\alias{write_tsv_dataset}
+\title{Write a dataset into partitioned flat files.}
\usage{
-write_dataset(
+write_delim_dataset(
dataset,
path,
- format = c("parquet", "feather", "arrow", "ipc", "csv"),
partitioning = dplyr::group_vars(dataset),
- basename_template = paste0("part-{i}.", as.character(format)),
+ basename_template = "part-{i}.txt",
hive_style = TRUE,
existing_data_behavior = c("overwrite", "error", "delete_matching"),
max_partitions = 1024L,
@@ -17,7 +18,51 @@ write_dataset(
max_rows_per_file = 0L,
min_rows_per_group = 0L,
max_rows_per_group = bitwShiftL(1, 20),
- ...
+ col_names = TRUE,
+ batch_size = 1024L,
+ delim = ",",
+ na = "",
+ eol = "\\n",
+ quote = c("needed", "all", "none")
+)
+
+write_csv_dataset(
+ dataset,
+ path,
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = "part-{i}.csv",
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error", "delete_matching"),
+ max_partitions = 1024L,
+ max_open_files = 900L,
+ max_rows_per_file = 0L,
+ min_rows_per_group = 0L,
+ max_rows_per_group = bitwShiftL(1, 20),
+ col_names = TRUE,
+ batch_size = 1024L,
+ delim = ",",
+ na = "",
+ eol = "\\n",
+ quote = c("needed", "all", "none")
+)
+
+write_tsv_dataset(
+ dataset,
+ path,
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = "part-{i}.tsv",
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error", "delete_matching"),
+ max_partitions = 1024L,
+ max_open_files = 900L,
+ max_rows_per_file = 0L,
+ min_rows_per_group = 0L,
+ max_rows_per_group = bitwShiftL(1, 20),
+ col_names = TRUE,
+ batch_size = 1024L,
+ na = "",
+ eol = "\\n",
+ quote = c("needed", "all", "none")
)
}
\arguments{
@@ -29,9 +74,6 @@ etc. to transform the data before it is written if you need
to.}
\item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a
directory
to write to (directory will be created if it does not exist)}
-\item{format}{a string identifier of the file format. Default is to use
-"parquet" (see \link{FileFormat})}
-
\item{partitioning}{\code{Partitioning} or a character vector of columns to
use as partition keys (to be written as path segments). Default is to
use the current \code{group_by()} columns.}
@@ -81,68 +123,36 @@ group and when this number of rows is exceeded, it is
split and the next set
of rows is written to the next group. This value must be set such that it is
greater than \code{min_rows_per_group}. Default is 1024 * 1024.}
-\item{...}{additional format-specific arguments. For available Parquet
-options, see \code{\link[=write_parquet]{write_parquet()}}. The available
Feather options are:
+\item{col_names}{Whether to write an initial header line with column names.}
+
+\item{batch_size}{Maximum number of rows processed at a time. Default is
1024L.}
+
+\item{delim}{Delimiter used to separate values. Defaults to \code{","} for
\code{write_delim_dataset()} and
+\code{write_csv_dataset()}, and \verb{"\\t} for \code{write_tsv_dataset()}.
Cannot be changed for \code{write_tsv_dataset()}.}
+
+\item{na}{a character vector of strings to interpret as missing values. Quotes
are not allowed in this string.
+The default is an empty string \code{""}.}
+
+\item{eol}{the end of line character to use for ending rows. The default is
\code{"\\n"}.}
+
+\item{quote}{How to handle fields which contain characters that need to be
quoted.
\itemize{
-\item \code{use_legacy_format} logical: write data formatted so that Arrow
libraries
-versions 0.14 and lower can read it. Default is \code{FALSE}. You can also
-enable this by setting the environment variable
\code{ARROW_PRE_0_15_IPC_FORMAT=1}.
-\item \code{metadata_version}: A string like "V5" or the equivalent integer
indicating
-the Arrow IPC MetadataVersion. Default (\code{NULL}) will use the latest
version,
-unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in
-which case it will be V4.
-\item \code{codec}: A \link{Codec} which will be used to compress body buffers
of written
-files. Default (NULL) will not compress body buffers.
-\item \code{null_fallback}: character to be used in place of missing values
(\code{NA} or
-\code{NULL}) when using Hive-style partitioning. See
\code{\link[=hive_partition]{hive_partition()}}.
+\item \code{needed} - Enclose all strings and binary values in quotes which
need them, because their CSV rendering can
+contain quotes itself (the default)
+\item \code{all} - Enclose all valid values in quotes. Nulls are not quoted.
May cause readers to
+interpret all values as strings if schema is inferred.
+\item \code{none} - Do not enclose any values in quotes. Prevents values
from containing quotes ("),
+cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values
+contain these characters, an error is caused when attempting to write.
}}
}
\value{
-The input \code{dataset}, invisibly
+The input \code{dataset}, invisibly.
}
\description{
-This function allows you to write a dataset. By writing to more efficient
-binary storage formats, and by specifying relevant partitioning, you can
-make it much faster to read and query.
+The \verb{write_*_dataset()} are a family of wrappers around
\link{write_dataset} to allow for easy switching
+between functions for writing datasets.
}
-\examples{
-\dontshow{if (arrow_with_dataset() & arrow_with_parquet() &
requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4")
withAutoprint else force)(\{ # examplesIf}
-# You can write datasets partitioned by the values in a column (here: "cyl").
-# This creates a structure of the form cyl=X/part-Z.parquet.
-one_level_tree <- tempfile()
-write_dataset(mtcars, one_level_tree, partitioning = "cyl")
-list.files(one_level_tree, recursive = TRUE)
-
-# You can also partition by the values in multiple columns
-# (here: "cyl" and "gear").
-# This creates a structure of the form cyl=X/gear=Y/part-Z.parquet.
-two_levels_tree <- tempfile()
-write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear"))
-list.files(two_levels_tree, recursive = TRUE)
-
-# In the two previous examples we would have:
-# X = {4,6,8}, the number of cylinders.
-# Y = {3,4,5}, the number of forward gears.
-# Z = {0,1,2}, the number of saved parts, starting from 0.
-
-# You can obtain the same result as as the previous examples using arrow with
-# a dplyr pipeline. This will be the same as two_levels_tree above, but the
-# output directory will be different.
-library(dplyr)
-two_levels_tree_2 <- tempfile()
-mtcars \%>\%
- group_by(cyl, gear) \%>\%
- write_dataset(two_levels_tree_2)
-list.files(two_levels_tree_2, recursive = TRUE)
-
-# And you can also turn off the Hive-style directory naming where the column
-# name is included with the values by using `hive_style = FALSE`.
-
-# Write a structure X/Y/part-Z.parquet.
-two_levels_tree_no_hive <- tempfile()
-mtcars \%>\%
- group_by(cyl, gear) \%>\%
- write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
-list.files(two_levels_tree_no_hive, recursive = TRUE)
-\dontshow{\}) # examplesIf}
+\seealso{
+\code{\link[=write_dataset]{write_dataset()}}
}
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index d04caf5c1f..ffb8a11e6b 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -31,8 +31,12 @@ std::shared_ptr<arrow::csv::WriteOptions>
csv___WriteOptions__initialize(
std::make_shared<arrow::csv::WriteOptions>(arrow::csv::WriteOptions::Defaults());
res->include_header = cpp11::as_cpp<bool>(options["include_header"]);
res->batch_size = cpp11::as_cpp<int>(options["batch_size"]);
- res->io_context = MainRThread::GetInstance().CancellableIOContext();
+ res->delimiter = cpp11::as_cpp<char>(options["delimiter"]);
res->null_string = cpp11::as_cpp<std::string>(options["null_string"]);
+ res->io_context = MainRThread::GetInstance().CancellableIOContext();
+ res->eol = cpp11::as_cpp<std::string>(options["eol"]);
+ res->quoting_style =
+ cpp11::as_cpp<enum arrow::csv::QuotingStyle>(options["quoting_style"]);
return res;
}
diff --git a/r/tests/testthat/_snaps/dataset-write.md
b/r/tests/testthat/_snaps/dataset-write.md
index e302d8463d..19f687be67 100644
--- a/r/tests/testthat/_snaps/dataset-write.md
+++ b/r/tests/testthat/_snaps/dataset-write.md
@@ -42,7 +42,7 @@
Condition
Error in `check_additional_args()`:
! `nonsensical_arg` is not a valid argument for your chosen `format`.
- i Supported arguments: `include_header`, `batch_size`, `null_string`,
and `na`.
+ i Supported arguments: `include_header`, `batch_size`, `null_string`,
`delimiter`, `eol`, `quoting_style`, `col_names`, `delim`, `na`, and `quote`.
---
diff --git a/r/tests/testthat/test-dataset-write.R
b/r/tests/testthat/test-dataset-write.R
index 8553dafd80..e2c3c68b50 100644
--- a/r/tests/testthat/test-dataset-write.R
+++ b/r/tests/testthat/test-dataset-write.R
@@ -812,3 +812,207 @@ test_that("write_dataset() errors on data.frame with NULL
names", {
names(df) <- NULL
expect_error(write_dataset(df, tempfile()), "Input data frame columns must
be named")
})
+
+test_that("Writing a dataset to text files with wrapper functions.", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ dst_dir <- make_temp_dir()
+ write_delim_dataset(df, dst_dir)
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir, format = "text")
+ expect_equal(new_ds %>% collect(), df)
+
+ dst_dir <- make_temp_dir()
+ write_csv_dataset(df, dst_dir)
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(new_ds %>% collect(), df)
+
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(df, dst_dir)
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir, format = "tsv")
+ expect_equal(new_ds %>% collect(), df)
+})
+
+test_that("Writing a flat file dataset: `basename_template` default behavier",
{
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+
+ dst_dir <- make_temp_dir()
+ write_delim_dataset(ds, dst_dir, max_rows_per_file = 5L)
+ expect_identical(
+ dir(dst_dir, full.names = FALSE, recursive = TRUE),
+ paste0("part-", 0:3, ".txt")
+ )
+
+ dst_dir <- make_temp_dir()
+ write_csv_dataset(ds, dst_dir, max_rows_per_file = 5L)
+ expect_identical(
+ dir(dst_dir, full.names = FALSE, recursive = TRUE),
+ paste0("part-", 0:3, ".csv")
+ )
+
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(ds, dst_dir, max_rows_per_file = 5L)
+ expect_identical(
+ dir(dst_dir, full.names = FALSE, recursive = TRUE),
+ paste0("part-", 0:3, ".tsv")
+ )
+})
+
+test_that("max_rows_per_group is adjusted if at odds with max_rows_per_file in
write_delim_dataset()", {
+ skip_if_not_available("parquet")
+ df <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ # max_rows_per_group unset adjust silently
+ dst_dir <- make_temp_dir()
+ expect_silent(
+ write_delim_dataset(df, dst_dir, max_rows_per_file = 5)
+ )
+
+ dst_dir <- make_temp_dir()
+ expect_silent(
+ write_csv_dataset(df, dst_dir, max_rows_per_file = 5)
+ )
+
+ dst_dir <- make_temp_dir()
+ expect_silent(
+ write_tsv_dataset(df, dst_dir, max_rows_per_file = 5)
+ )
+})
+
+test_that("Writing a flat file dataset without a delimiter throws an error.", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ dst_dir <- make_temp_dir()
+ expect_error(
+ write_dataset(df, dst_dir, format = "txt"),
+ "A delimiter must be given for a txt format."
+ )
+
+ expect_error(
+ write_dataset(df, dst_dir, format = "text"),
+ "A delimiter must be given for a txt format."
+ )
+})
+
+test_that("Dataset can write flat files using readr::write_csv() options.", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv", col_names = FALSE)
+ expect_true(dir.exists(dst_dir))
+
+ header <- readLines(file(paste0(dst_dir, "/part-0.csv")), n = 1L)
+ expect_equal(header, "1,1,true,\"a\"")
+
+ df2 <- tibble(x = "")
+ dst_dir <- make_temp_dir()
+ write_dataset(df2, dst_dir, format = "csv", eol = "\r\n")
+ expect_true(dir.exists(dst_dir))
+
+ header <- readBin(con <- file(paste0(dst_dir, "/part-0.csv"), "rb"), "raw",
n = 5)
+ close(con)
+
+ # 0d and 0a are the character codes for CRLF (https://www.asciitable.com)
+ expect_equal(header[4:5], as.raw(c(0x0d, 0x0a)))
+
+ dst_dir <- make_temp_dir()
+ expect_error(
+ write_dataset(df, dst_dir, format = "csv", include_header = FALSE, delim =
";"),
+ "Use either Arrow write options or readr write options, not both"
+ )
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv", quoting_style = "AllValid")
+ ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(df, ds |> collect())
+
+ lines <- paste(readLines(paste0(dst_dir, "/part-0.csv")), sep = "\n")
+ expect_equal(lines[2], "\"1\",\"1\",\"true\",\"a\"")
+
+ expect_error(
+ write_dataset(df, dst_dir, format = "tsv", delimiter = ";"),
+ "Can't set a delimiter for the tsv format."
+ )
+})
+
+test_that("Dataset write wrappers can write flat files using
readr::write_csv() options.", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ dst_dir <- make_temp_dir()
+ write_csv_dataset(df, dst_dir, col_names = FALSE)
+
+ header <- readLines(file(paste0(dst_dir, "/part-0.csv")), n = 1L)
+ expect_equal(header, "1,1,true,\"a\"")
+
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(df, dst_dir, col_names = FALSE)
+
+ header <- readLines(file(paste0(dst_dir, "/part-0.tsv")), n = 1L)
+ expect_equal(header, "1\t1\ttrue\t\"a\"")
+
+ df2 <- tibble(x = "")
+ dst_dir <- make_temp_dir()
+ write_csv_dataset(df2, dst_dir, eol = "\r\n")
+ header <- readBin(con <- file(paste0(dst_dir, "/part-0.csv"), "rb"), "raw",
n = 5)
+ close(con)
+ # 0d and 0a are the character codes for CRLF (https://www.asciitable.com)
+ expect_equal(header[4:5], as.raw(c(0x0d, 0x0a)))
+
+ df2 <- tibble(x = "")
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(df2, dst_dir, eol = "\r\n")
+ header <- readBin(con <- file(paste0(dst_dir, "/part-0.tsv"), "rb"), "raw",
n = 5)
+ close(con)
+ # 0d and 0a are the character codes for CRLF (https://www.asciitable.com)
+ expect_equal(header[4:5], as.raw(c(0x0d, 0x0a)))
+
+ dst_dir <- make_temp_dir()
+ write_csv_dataset(df, dst_dir, quote = "all", delim = ";")
+ ds <- open_dataset(dst_dir, format = "csv", delim = ";")
+ expect_equal(df, ds |> collect())
+
+ lines <- paste(readLines(paste0(dst_dir, "/part-0.csv")), sep = "\n")
+ expect_equal(lines[2], "\"1\";\"1\";\"true\";\"a\"")
+
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(df, dst_dir, quote = "all", eol = "\r\n")
+ ds <- open_dataset(dst_dir, format = "tsv")
+ expect_equal(df, ds |> collect())
+
+ lines <- paste(readLines(paste0(dst_dir, "/part-0.tsv")), sep = "\n")
+ expect_equal(lines[2], "\"1\"\t\"1\"\t\"true\"\t\"a\"")
+ dst_dir <- make_temp_dir()
+ write_tsv_dataset(df, dst_dir, na = "NOVALUE")
+ ds <- open_dataset(dst_dir, format = "tsv") |> collect()
+
+ expect_equal(
+ ds$lgl,
+ c("true", "false", "NOVALUE", "true", "false", "true", "false", "NOVALUE",
"true", "false")
+ )
+})