This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f0ff8d015a ARROW-16715: [R] Bump default parquet version (#13555)
f0ff8d015a is described below
commit f0ff8d015a26a780426a13b556d9db082daed200
Author: Neal Richardson <[email protected]>
AuthorDate: Mon Jul 11 11:26:51 2022 -0400
ARROW-16715: [R] Bump default parquet version (#13555)
Also removes deprecated args to `write_parquet()`
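For illustration only (not part of the commit), the user-facing change in a
minimal R sketch:

    library(arrow)
    tf <- tempfile(fileext = ".parquet")
    # The default is now Parquet format version 2.4; the deprecated
    # `properties` and `arrow_properties` arguments are gone
    write_parquet(data.frame(x = 1:5), tf)
    # Writers targeting older readers can still pin the format explicitly
    write_parquet(data.frame(x = 1:5), tf, version = "1.0")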
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 1 +
r/NEWS.md | 1 +
r/R/arrow-package.R | 2 +-
r/R/enums.R | 2 +-
r/R/parquet.R | 99 ++++++++++++++++----------------
r/man/enums.Rd | 2 +-
r/man/write_parquet.Rd | 48 ++++++++--------
r/tests/testthat/_snaps/dataset-write.md | 2 +-
r/tests/testthat/test-parquet.R | 52 ++++++++++++++---
9 files changed, 122 insertions(+), 87 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 5762df9eb0..023e9bb831 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -395,6 +395,7 @@ importFrom(rlang,"%||%")
importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,abort)
+importFrom(rlang,arg_match)
importFrom(rlang,as_function)
importFrom(rlang,as_label)
importFrom(rlang,as_quosure)
diff --git a/r/NEWS.md b/r/NEWS.md
index 119974f74a..fca55b047e 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -25,6 +25,7 @@
* `orders` with year, month, day, hours, minutes, and seconds components are
supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are
transformed into `formats` which subsequently get applied in turn. There is no
`select_formats` parameter and no inference takes place (unlike in
`lubridate::parse_date_time()`).
* `read_arrow()` and `write_arrow()`, deprecated since 1.0.0 (July 2020), have
been removed. Use the `read/write_feather()` and `read/write_ipc_stream()`
functions depending on whether you're working with the Arrow IPC file or stream
format, respectively.
+* `write_parquet()` now defaults to writing Parquet format version 2.4 (was
+  1.0). Previously deprecated arguments `properties` and `arrow_properties` have
+  been removed; if you need to deal with these lower-level properties objects
+  directly, use `ParquetFileWriter`, which `write_parquet()` wraps.
# arrow 8.0.0
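A hedged sketch of the lower-level `ParquetFileWriter` path the NEWS entry
points to; the `create()` signatures are taken from the diff below, while the
`WriteTable()`/`Close()` method names are assumptions about the R6 API:

    library(arrow)
    tab <- Table$create(x = 1:5)
    sink <- FileOutputStream$create(tempfile(fileext = ".parquet"))
    props <- ParquetWriterProperties$create(names(tab), version = "2.4")
    writer <- ParquetFileWriter$create(tab$schema, sink, properties = props)
    writer$WriteTable(tab, chunk_size = 5L)
    writer$Close()
    sink$close()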
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 7b59854f1e..05270ef6bb 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -23,7 +23,7 @@
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
-#' @importFrom rlang is_list call2 is_empty as_function as_label
+#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
diff --git a/r/R/enums.R b/r/R/enums.R
index 17d0484b99..727ca9388c 100644
--- a/r/R/enums.R
+++ b/r/R/enums.R
@@ -122,7 +122,7 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
ParquetVersionType <- enum("ParquetVersionType",
- PARQUET_1_0 = 0L, PARQUET_2_0 = 1L
+ PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
)
#' @export
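For reference, a quick interactive check of the new enum entries (integer
values as added above; sketch only):

    library(arrow)
    ParquetVersionType$PARQUET_2_4  # maps to 2L
    ParquetVersionType$PARQUET_2_6  # maps to 3L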
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 62da28fd1e..8cd9daa857 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -83,30 +83,29 @@ read_parquet <- function(file,
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param chunk_size how many rows of data to write to disk at once. This
-#' directly corresponds to how many rows will be in each row group in parquet.
-#' If `NULL`, a best guess will be made for optimal size (based on the number of
-#' columns and number of rows), though if the data has fewer than 250 million
-#' cells (rows x cols), then the total number of rows is used.
-#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values
-#' are coerced to character.
+#' directly corresponds to how many rows will be in each row group in
+#' parquet. If `NULL`, a best guess will be made for optimal size (based on
+#' the number of columns and number of rows), though if the data has fewer
+#' than 250 million cells (rows x cols), then the total number of rows is
+#' used.
+#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
+#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+#' coerced to character.
#' @param compression compression algorithm. Default "snappy". See details.
-#' @param compression_level compression level. Meaning depends on compression algorithm
-#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
-#' @param write_statistics Specify if we should write statistics. Default `TRUE`
+#' @param compression_level compression level. Meaning depends on compression
+#' algorithm
+#' @param use_dictionary logical: use dictionary encoding? Default `TRUE`
+#' @param write_statistics logical: include statistics? Default `TRUE`
#' @param data_page_size Set a target threshold for the approximate encoded
#' size of data pages within a column chunk (in bytes). Default 1 MiB.
-#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
+#' @param use_deprecated_int96_timestamps logical: write timestamps to INT96
+#' Parquet format, which has been deprecated? Default `FALSE`.
#' @param coerce_timestamps Cast timestamps to a particular resolution. Can be
#' `NULL`, "ms" or "us". Default `NULL` (no casting)
-#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
-#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
-#' to "ms", do not raise an exception
-#' @param properties A `ParquetWriterProperties` object, used instead of the options
-#' enumerated in this function's signature. Providing `properties` as an argument
-#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
-#' of `write_parquet()`, use `ParquetFileWriter` instead.
-#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
-#' `properties`, this argument is deprecated.
+#' @param allow_truncated_timestamps logical: Allow loss of data when coercing
+#' timestamps to a particular resolution. E.g. if microsecond or nanosecond
+#' data is lost when coercing to "ms", do not raise an exception. Default
+#' `FALSE`.
#'
#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
#' `write_statistics` support various patterns:
@@ -128,7 +127,7 @@ read_parquet <- function(file,
#' Note that "uncompressed" columns may still have dictionary encoding.
#'
#' @return the input `x` invisibly.
-#'
+#' @seealso [ParquetFileWriter] for a lower-level interface to Parquet writing.
#' @examplesIf arrow_with_parquet()
#' tf1 <- tempfile(fileext = ".parquet")
#' write_parquet(data.frame(x = 1:5), tf1)
@@ -143,7 +142,7 @@ write_parquet <- function(x,
sink,
chunk_size = NULL,
# writer properties
- version = NULL,
+ version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
@@ -152,9 +151,7 @@ write_parquet <- function(x,
# arrow writer properties
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
- allow_truncated_timestamps = FALSE,
- properties = NULL,
- arrow_properties = NULL) {
+ allow_truncated_timestamps = FALSE) {
x_out <- x
x <- as_writable_table(x)
@@ -163,24 +160,10 @@ write_parquet <- function(x,
on.exit(sink$close())
}
- # Deprecation warnings
- if (!is.null(properties)) {
- warning(
-      "Providing 'properties' is deprecated. If you need to assemble properties outside ",
- "this function, use ParquetFileWriter instead."
- )
- }
- if (!is.null(arrow_properties)) {
- warning(
-      "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
- "outside this function, use ParquetFileWriter instead."
- )
- }
-
writer <- ParquetFileWriter$create(
x$schema,
sink,
- properties = properties %||% ParquetWriterProperties$create(
+ properties = ParquetWriterProperties$create(
names(x),
version = version,
compression = compression,
@@ -189,7 +172,7 @@ write_parquet <- function(x,
write_statistics = write_statistics,
data_page_size = data_page_size
),
-    arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
+ arrow_properties = ParquetArrowWriterProperties$create(
use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
coerce_timestamps = coerce_timestamps,
allow_truncated_timestamps = allow_truncated_timestamps
@@ -238,19 +221,35 @@ ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps
valid_parquet_version <- c(
"1.0" = ParquetVersionType$PARQUET_1_0,
- "2.0" = ParquetVersionType$PARQUET_2_0
+ "2.0" = ParquetVersionType$PARQUET_2_0,
+ "2.4" = ParquetVersionType$PARQUET_2_4,
+ "2.6" = ParquetVersionType$PARQUET_2_6,
+ "latest" = ParquetVersionType$PARQUET_2_6
)
-make_valid_version <- function(version, valid_versions = valid_parquet_version) {
+make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) {
if (is_integerish(version)) {
- version <- as.character(version)
+ version <- as.numeric(version)
}
- tryCatch(
- valid_versions[[match.arg(version, choices = names(valid_versions))]],
- error = function(cond) {
-      stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. = FALSE)
- }
- )
+ if (is.numeric(version)) {
+ version <- format(version, nsmall = 1)
+ }
+
+ if (!is.string(version)) {
+ stop(
+ "`version` must be one of ", oxford_paste(names(valid_versions), "or"),
+ call. = FALSE
+ )
+ }
+ out <- valid_versions[[arg_match(version, values = names(valid_versions))]]
+
+ if (identical(out, ParquetVersionType$PARQUET_2_0)) {
+ warning(
+      'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.',
+ call. = FALSE
+ )
+ }
+ out
}
#' @title ParquetWriterProperties class
@@ -300,7 +299,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
inherit = ArrowObject,
public = list(
set_version = function(version) {
-      parquet___WriterProperties___Builder__version(self, make_valid_version(version))
+      parquet___WriterProperties___Builder__version(self, make_valid_parquet_version(version))
},
set_compression = function(column_names, compression) {
compression <- compression_from_name(compression)
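The coercion logic in make_valid_parquet_version() above normalizes numeric
input like `2.4` or `1L` to a string before matching. A standalone sketch of
the same idea (hypothetical helper, not in the package):

    normalize_parquet_version <- function(version) {
      # 1L -> "1.0", 2.4 -> "2.4"; strings pass through untouched
      if (is.numeric(version)) {
        version <- format(as.numeric(version), nsmall = 1)
      }
      version
    }
    normalize_parquet_version(1L)    # "1.0"
    normalize_parquet_version(2.4)   # "2.4"
    normalize_parquet_version("2.6") # "2.6"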
diff --git a/r/man/enums.Rd b/r/man/enums.Rd
index 7ec126a019..614c196fde 100644
--- a/r/man/enums.Rd
+++ b/r/man/enums.Rd
An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 10.

An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4.

-An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2.
+An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 4.

An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5.
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
index efc6856e5e..ff57e4c8e9 100644
--- a/r/man/write_parquet.Rd
+++ b/r/man/write_parquet.Rd
@@ -8,7 +8,7 @@ write_parquet(
x,
sink,
chunk_size = NULL,
- version = NULL,
+ version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
@@ -16,9 +16,7 @@ write_parquet(
data_page_size = NULL,
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
- allow_truncated_timestamps = FALSE,
- properties = NULL,
- arrow_properties = NULL
+ allow_truncated_timestamps = FALSE
)
}
\arguments{
@@ -28,41 +26,38 @@ write_parquet(
system (\code{SubTreeFileSystem})}
\item{chunk_size}{how many rows of data to write to disk at once. This
-directly corresponds to how many rows will be in each row group in parquet.
-If \code{NULL}, a best guess will be made for optimal size (based on the number of
-columns and number of rows), though if the data has fewer than 250 million
-cells (rows x cols), then the total number of rows is used.}
+directly corresponds to how many rows will be in each row group in
+parquet. If \code{NULL}, a best guess will be made for optimal size (based on
+the number of columns and number of rows), though if the data has fewer
+than 250 million cells (rows x cols), then the total number of rows is
+used.}
-\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values
-are coerced to character.}
+\item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
+"2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+coerced to character.}
\item{compression}{compression algorithm. Default "snappy". See details.}
-\item{compression_level}{compression level. Meaning depends on compression algorithm}
+\item{compression_level}{compression level. Meaning depends on compression
+algorithm}
-\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}}
+\item{use_dictionary}{logical: use dictionary encoding? Default \code{TRUE}}
-\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}}
+\item{write_statistics}{logical: include statistics? Default \code{TRUE}}
\item{data_page_size}{Set a target threshold for the approximate encoded
size of data pages within a column chunk (in bytes). Default 1 MiB.}
-\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.}
+\item{use_deprecated_int96_timestamps}{logical: write timestamps to INT96
+Parquet format, which has been deprecated? Default \code{FALSE}.}
\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
\code{NULL}, "ms" or "us". Default \code{NULL} (no casting)}
-\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a
-particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
-to "ms", do not raise an exception}
-
-\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options
-enumerated in this function's signature. Providing \code{properties} as an argument
-is deprecated; if you need to assemble \code{ParquetWriterProperties} outside
-of \code{write_parquet()}, use \code{ParquetFileWriter} instead.}
-
-\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like
-\code{properties}, this argument is deprecated.}
+\item{allow_truncated_timestamps}{logical: Allow loss of data when coercing
+timestamps to a particular resolution. E.g. if microsecond or nanosecond
+data is lost when coercing to "ms", do not raise an exception. Default
+\code{FALSE}.}
}
\value{
the input \code{x} invisibly.
@@ -110,3 +105,6 @@ if (codec_is_available("gzip")) {
}
\dontshow{\}) # examplesIf}
}
+\seealso{
+\link{ParquetFileWriter} for a lower-level interface to Parquet writing.
+}
diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md
index 34cb46e9cb..e9ca7e0998 100644
--- a/r/tests/testthat/_snaps/dataset-write.md
+++ b/r/tests/testthat/_snaps/dataset-write.md
@@ -45,5 +45,5 @@
write_dataset(df, dst_dir, format = "parquet", nonsensical_arg = "blah-blah")
Error <rlang_error>
`nonsensical_arg` is not a valid argument for your chosen `format`.
-      i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, `allow_truncated_timestamps`, `properties`, and `arrow_properties`.
+      i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, and `allow_truncated_timestamps`.
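Since `version` remains among the supported pass-through arguments, datasets
can still pin the format version; a hedged usage sketch:

    library(arrow)
    dst_dir <- tempfile()
    write_dataset(mtcars, dst_dir, format = "parquet", version = "2.6")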
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 482f508575..b75892bc84 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -129,15 +129,51 @@ test_that("write_parquet() can truncate timestamps", {
expect_equal(as.data.frame(tab), as.data.frame(new))
})
-test_that("make_valid_version()", {
- expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0)
+test_that("make_valid_parquet_version()", {
+ expect_equal(
+ make_valid_parquet_version("1.0"),
+ ParquetVersionType$PARQUET_1_0
+ )
+ expect_deprecated(
+ expect_equal(
+ make_valid_parquet_version("2.0"),
+ ParquetVersionType$PARQUET_2_0
+ )
+ )
+ expect_equal(
+ make_valid_parquet_version("2.4"),
+ ParquetVersionType$PARQUET_2_4
+ )
+ expect_equal(
+ make_valid_parquet_version("2.6"),
+ ParquetVersionType$PARQUET_2_6
+ )
+ expect_equal(
+ make_valid_parquet_version("latest"),
+ ParquetVersionType$PARQUET_2_6
+ )
- expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0)
+ expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
+ expect_deprecated(
+ expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0)
+ )
+ expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4)
+})
- expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0)
+test_that("make_valid_parquet_version() input validation", {
+ expect_error(
+ make_valid_parquet_version("0.3.14"),
+ "`version` must be one of"
+ )
+ expect_error(
+ make_valid_parquet_version(NULL),
+ "`version` must be one of"
+ )
+ expect_error(
+ make_valid_parquet_version(c("2", "4")),
+ "`version` must be one of"
+ )
})
test_that("write_parquet() defaults to snappy compression", {
@@ -239,7 +275,7 @@ test_that("write_parquet() handles version argument", {
tf <- tempfile()
on.exit(unlink(tf))
- purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
+ purrr::walk(list("1.0", "2.4", "2.6", "latest", 1.0, 2.4, 2.6, 1L), ~ {
write_parquet(df, tf, version = .x)
expect_identical(read_parquet(tf), df)
})
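For completeness, the "2.0" deprecation path exercised by these tests looks
like this in a session (sketch; warning text taken from the diff above):

    write_parquet(data.frame(x = 1), tempfile(), version = "2.0")
    #> Warning: Parquet format version "2.0" is deprecated. Use "2.4" or
    #> "2.6" to select format features.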