This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f0ff8d015a ARROW-16715: [R] Bump default parquet version (#13555)
f0ff8d015a is described below
commit f0ff8d015a26a780426a13b556d9db082daed200
Author: Neal Richardson <[email protected]>
AuthorDate: Mon Jul 11 11:26:51 2022 -0400
ARROW-16715: [R] Bump default parquet version (#13555)
Also removes deprecated args to `write_parquet()`
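For illustration only (not part of the commit), the user-facing change in a
minimal R sketch:

    library(arrow)
    tf <- tempfile(fileext = ".parquet")
    # The default is now Parquet format version 2.4; the deprecated
    # `properties` and `arrow_properties` arguments are gone
    write_parquet(data.frame(x = 1:5), tf)
    # Writers targeting older readers can still pin the format explicitly
    write_parquet(data.frame(x = 1:5), tf, version = "1.0")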
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 1 +
r/NEWS.md | 1 +
r/R/arrow-package.R | 2 +-
r/R/enums.R | 2 +-
r/R/parquet.R | 99 ++++++++++++++++----------------
r/man/enums.Rd | 2 +-
r/man/write_parquet.Rd | 48 ++++++++--------
r/tests/testthat/_snaps/dataset-write.md | 2 +-
r/tests/testthat/test-parquet.R | 52 ++++++++++++++---
9 files changed, 122 insertions(+), 87 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 5762df9eb0..023e9bb831 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -395,6 +395,7 @@ importFrom(rlang,"%||%")
importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,abort)
+importFrom(rlang,arg_match)
importFrom(rlang,as_function)
importFrom(rlang,as_label)
importFrom(rlang,as_quosure)
diff --git a/r/NEWS.md b/r/NEWS.md
index 119974f74a..fca55b047e 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -25,6 +25,7 @@
* `orders` with year, month, day, hours, minutes, and seconds components are
supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are
transformed into `formats` which subsequently get applied in turn. There is no
`select_formats` parameter and no inference takes place (unlike in
`lubridate::parse_date_time()`).
* `read_arrow()` and `write_arrow()`, deprecated since 1.0.0 (July 2020), have
been removed. Use the `read/write_feather()` and `read/write_ipc_stream()`
functions depending on whether you're working with the Arrow IPC file or stream
format, respectively.
+* `write_parquet()` now defaults to writing Parquet format version 2.4 (was
+  1.0). Previously deprecated arguments `properties` and `arrow_properties` have
+  been removed; if you need to deal with these lower-level properties objects
+  directly, use `ParquetFileWriter`, which `write_parquet()` wraps.
# arrow 8.0.0
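A hedged sketch of the lower-level `ParquetFileWriter` path the NEWS entry
points to; the `create()` signatures are taken from the diff below, while the
`WriteTable()`/`Close()` method names are assumptions about the R6 API:

    library(arrow)
    tab <- Table$create(x = 1:5)
    sink <- FileOutputStream$create(tempfile(fileext = ".parquet"))
    props <- ParquetWriterProperties$create(names(tab), version = "2.4")
    writer <- ParquetFileWriter$create(tab$schema, sink, properties = props)
    writer$WriteTable(tab, chunk_size = 5L)
    writer$Close()
    sink$close()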
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 7b59854f1e..05270ef6bb 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -23,7 +23,7 @@
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
-#' @importFrom rlang is_list call2 is_empty as_function as_label
+#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
diff --git a/r/R/enums.R b/r/R/enums.R
index 17d0484b99..727ca9388c 100644
--- a/r/R/enums.R
+++ b/r/R/enums.R
@@ -122,7 +122,7 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
ParquetVersionType <- enum("ParquetVersionType",
- PARQUET_1_0 = 0L, PARQUET_2_0 = 1L
+ PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
)
#' @export
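For reference, a quick interactive check of the new enum entries (integer
values as added above; sketch only):

    library(arrow)
    ParquetVersionType$PARQUET_2_4  # maps to 2L
    ParquetVersionType$PARQUET_2_6  # maps to 3L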
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 62da28fd1e..8cd9daa857 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -83,30 +83,29 @@ read_parquet <- function(file,
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param chunk_size how many rows of data to write to disk at once. This
-#' directly corresponds to how many rows will be in each row group in parquet.
-#' If `NULL`, a best guess will be made for optimal size (based on the number of
-#' columns and number of rows), though if the data has fewer than 250 million
-#' cells (rows x cols), then the total number of rows is used.
-#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values
-#' are coerced to character.
+#' directly corresponds to how many rows will be in each row group in
+#' parquet. If `NULL`, a best guess will be made for optimal size (based on
+#' the number of columns and number of rows), though if the data has fewer
+#' than 250 million cells (rows x cols), then the total number of rows is
+#' used.
+#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
+#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+#' coerced to character.
#' @param compression compression algorithm. Default "snappy". See details.
-#' @param compression_level compression level. Meaning depends on compression algorithm
-#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
-#' @param write_statistics Specify if we should write statistics. Default `TRUE`
+#' @param compression_level compression level. Meaning depends on compression
+#' algorithm
+#' @param use_dictionary logical: use dictionary encoding? Default `TRUE`
+#' @param write_statistics logical: include statistics? Default `TRUE`
#' @param data_page_size Set a target threshold for the approximate encoded
#' size of data pages within a column chunk (in bytes). Default 1 MiB.
-#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
+#' @param use_deprecated_int96_timestamps logical: write timestamps to INT96
+#' Parquet format, which has been deprecated? Default `FALSE`.
#' @param coerce_timestamps Cast timestamps to a particular resolution. Can be
#' `NULL`, "ms" or "us". Default `NULL` (no casting)
-#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
-#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
-#' to "ms", do not raise an exception
-#' @param properties A `ParquetWriterProperties` object, used instead of the options
-#' enumerated in this function's signature. Providing `properties` as an argument
-#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
-#' of `write_parquet()`, use `ParquetFileWriter` instead.
-#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
-#' `properties`, this argument is deprecated.
+#' @param allow_truncated_timestamps logical: Allow loss of data when coercing
+#' timestamps to a particular resolution. E.g. if microsecond or nanosecond
+#' data is lost when coercing to "ms", do not raise an exception. Default
+#' `FALSE`.
#'
#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
#' `write_statistics` support various patterns:
@@ -128,7 +127,7 @@ read_parquet <- function(file,
#' Note that "uncompressed" columns may still have dictionary encoding.
#'
#' @return the input `x` invisibly.
-#'
+#' @seealso [ParquetFileWriter] for a lower-level interface to Parquet writing.
#' @examplesIf arrow_with_parquet()
#' tf1 <- tempfile(fileext = ".parquet")
#' write_parquet(data.frame(x = 1:5), tf1)
@@ -143,7 +142,7 @@ write_parquet <- function(x,
sink,
chunk_size = NULL,
# writer properties
- version = NULL,
+ version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
@@ -152,9 +151,7 @@ write_parquet <- function(x,
# arrow writer properties
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
- allow_truncated_timestamps = FALSE,
- properties = NULL,
- arrow_properties = NULL) {
+ allow_truncated_timestamps = FALSE) {
x_out <- x
x <- as_writable_table(x)
@@ -163,24 +160,10 @@ write_parquet <- function(x,
on.exit(sink$close())
}
- # Deprecation warnings
- if (!is.null(properties)) {
- warning(
-      "Providing 'properties' is deprecated. If you need to assemble properties outside ",
- "this function, use ParquetFileWriter instead."
- )
- }
- if (!is.null(arrow_properties)) {
- warning(
-      "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
- "outside this function, use ParquetFileWriter instead."
- )
- }
-
writer <- ParquetFileWriter$create(
x$schema,
sink,
- properties = properties %||% ParquetWriterProperties$create(
+ properties = ParquetWriterProperties$create(
names(x),
version = version,
compression = compression,
@@ -189,7 +172,7 @@ write_parquet <- function(x,
write_statistics = write_statistics,
data_page_size = data_page_size
),
-    arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
+ arrow_properties = ParquetArrowWriterProperties$create(
use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
coerce_timestamps = coerce_timestamps,
allow_truncated_timestamps = allow_truncated_timestamps
@@ -238,19 +221,35 @@ ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps
valid_parquet_version <- c(
"1.0" = ParquetVersionType$PARQUET_1_0,
- "2.0" = ParquetVersionType$PARQUET_2_0
+ "2.0" = ParquetVersionType$PARQUET_2_0,
+ "2.4" = ParquetVersionType$PARQUET_2_4,
+ "2.6" = ParquetVersionType$PARQUET_2_6,
+ "latest" = ParquetVersionType$PARQUET_2_6
)
-make_valid_version <- function(version, valid_versions = valid_parquet_version) {
+make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) {
if (is_integerish(version)) {
- version <- as.character(version)
+ version <- as.numeric(version)
}
- tryCatch(
- valid_versions[[match.arg(version, choices = names(valid_versions))]],
- error = function(cond) {
-      stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. = FALSE)
- }
- )
+ if (is.numeric(version)) {
+ version <- format(version, nsmall = 1)
+ }
+
+ if (!is.string(version)) {
+ stop(
+ "`version` must be one of ", oxford_paste(names(valid_versions), "or"),
+ call. = FALSE
+ )
+ }
+ out <- valid_versions[[arg_match(version, values = names(valid_versions))]]
+
+ if (identical(out, ParquetVersionType$PARQUET_2_0)) {
+ warning(
+      'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.',
+ call. = FALSE
+ )
+ }
+ out
}
#' @title ParquetWriterProperties class
@@ -300,7 +299,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
inherit = ArrowObject,
public = list(
set_version = function(version) {
-      parquet___WriterProperties___Builder__version(self, make_valid_version(version))
+      parquet___WriterProperties___Builder__version(self, make_valid_parquet_version(version))
},
set_compression = function(column_names, compression) {
compression <- compression_from_name(compression)
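The coercion logic in make_valid_parquet_version() above normalizes numeric
input like `2.4` or `1L` to a string before matching. A standalone sketch of
the same idea (hypothetical helper, not in the package):

    normalize_parquet_version <- function(version) {
      # 1L -> "1.0", 2.4 -> "2.4"; strings pass through untouched
      if (is.numeric(version)) {
        version <- format(as.numeric(version), nsmall = 1)
      }
      version
    }
    normalize_parquet_version(1L)    # "1.0"
    normalize_parquet_version(2.4)   # "2.4"
    normalize_parquet_version("2.6") # "2.6"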
diff --git a/r/man/enums.Rd b/r/man/enums.Rd
index 7ec126a019..614c196fde 100644
--- a/r/man/enums.Rd
+++ b/r/man/enums.Rd
An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 10.

An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4.

-An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2.
+An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 4.

An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5.
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
index efc6856e5e..ff57e4c8e9 100644
--- a/r/man/write_parquet.Rd
+++ b/r/man/write_parquet.Rd
@@ -8,7 +8,7 @@ write_parquet(
x,
sink,
chunk_size = NULL,
- version = NULL,
+ version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
@@ -16,9 +16,7 @@ write_parquet(
data_page_size = NULL,
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
- allow_truncated_timestamps = FALSE,
- properties = NULL,
- arrow_properties = NULL
+ allow_truncated_timestamps = FALSE
)
}
\arguments{
@@ -28,41 +26,38 @@ write_parquet(
system (\code{SubTreeFileSystem})}
\item{chunk_size}{how many rows of data to write to disk at once. This
-directly corresponds to how many rows will be in each row group in parquet.
-If \code{NULL}, a best guess will be made for optimal size (based on the number of
-columns and number of rows), though if the data has fewer than 250 million
-cells (rows x cols), then the total number of rows is used.}
+directly corresponds to how many rows will be in each row group in
+parquet. If \code{NULL}, a best guess will be made for optimal size (based on
+the number of columns and number of rows), though if the data has fewer
+than 250 million cells (rows x cols), then the total number of rows is
+used.}
-\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values
-are coerced to character.}
+\item{version}{parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
+"2.6", or "latest" (currently equivalent to 2.6). Numeric values are
+coerced to character.}
\item{compression}{compression algorithm. Default "snappy". See details.}
-\item{compression_level}{compression level. Meaning depends on compression algorithm}
+\item{compression_level}{compression level. Meaning depends on compression
+algorithm}
-\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}}
+\item{use_dictionary}{logical: use dictionary encoding? Default \code{TRUE}}
-\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}}
+\item{write_statistics}{logical: include statistics? Default \code{TRUE}}
\item{data_page_size}{Set a target threshold for the approximate encoded
size of data pages within a column chunk (in bytes). Default 1 MiB.}
-\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.}
+\item{use_deprecated_int96_timestamps}{logical: write timestamps to INT96
+Parquet format, which has been deprecated? Default \code{FALSE}.}
\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
\code{NULL}, "ms" or "us". Default \code{NULL} (no casting)}
-\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a
-particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
-to "ms", do not raise an exception}
-
-\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options
-enumerated in this function's signature. Providing \code{properties} as an argument
-is deprecated; if you need to assemble \code{ParquetWriterProperties} outside
-of \code{write_parquet()}, use \code{ParquetFileWriter} instead.}
-
-\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like
-\code{properties}, this argument is deprecated.}
+\item{allow_truncated_timestamps}{logical: Allow loss of data when coercing
+timestamps to a particular resolution. E.g. if microsecond or nanosecond
+data is lost when coercing to "ms", do not raise an exception. Default
+\code{FALSE}.}
}
\value{
the input \code{x} invisibly.
@@ -110,3 +105,6 @@ if (codec_is_available("gzip")) {
}
\dontshow{\}) # examplesIf}
}
+\seealso{
+\link{ParquetFileWriter} for a lower-level interface to Parquet writing.
+}
diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md
index 34cb46e9cb..e9ca7e0998 100644
--- a/r/tests/testthat/_snaps/dataset-write.md
+++ b/r/tests/testthat/_snaps/dataset-write.md
@@ -45,5 +45,5 @@
write_dataset(df, dst_dir, format = "parquet", nonsensical_arg = "blah-blah")
Error <rlang_error>
`nonsensical_arg` is not a valid argument for your chosen `format`.
-      i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, `allow_truncated_timestamps`, `properties`, and `arrow_properties`.
+      i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, and `allow_truncated_timestamps`.
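Since `version` remains among the supported pass-through arguments, datasets
can still pin the format version; a hedged usage sketch:

    library(arrow)
    dst_dir <- tempfile()
    write_dataset(mtcars, dst_dir, format = "parquet", version = "2.6")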
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 482f508575..b75892bc84 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -129,15 +129,51 @@ test_that("write_parquet() can truncate timestamps", {
expect_equal(as.data.frame(tab), as.data.frame(new))
})
-test_that("make_valid_version()", {
- expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0)
+test_that("make_valid_parquet_version()", {
+ expect_equal(
+ make_valid_parquet_version("1.0"),
+ ParquetVersionType$PARQUET_1_0
+ )
+ expect_deprecated(
+ expect_equal(
+ make_valid_parquet_version("2.0"),
+ ParquetVersionType$PARQUET_2_0
+ )
+ )
+ expect_equal(
+ make_valid_parquet_version("2.4"),
+ ParquetVersionType$PARQUET_2_4
+ )
+ expect_equal(
+ make_valid_parquet_version("2.6"),
+ ParquetVersionType$PARQUET_2_6
+ )
+ expect_equal(
+ make_valid_parquet_version("latest"),
+ ParquetVersionType$PARQUET_2_6
+ )
- expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0)
+ expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
+ expect_deprecated(
+ expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0)
+ )
+ expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4)
+})
- expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0)
- expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0)
+test_that("make_valid_parquet_version() input validation", {
+ expect_error(
+ make_valid_parquet_version("0.3.14"),
+ "`version` must be one of"
+ )
+ expect_error(
+ make_valid_parquet_version(NULL),
+ "`version` must be one of"
+ )
+ expect_error(
+ make_valid_parquet_version(c("2", "4")),
+ "`version` must be one of"
+ )
})
test_that("write_parquet() defaults to snappy compression", {
@@ -239,7 +275,7 @@ test_that("write_parquet() handles version argument", {
tf <- tempfile()
on.exit(unlink(tf))
- purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
+ purrr::walk(list("1.0", "2.4", "2.6", "latest", 1.0, 2.4, 2.6, 1L), ~ {
write_parquet(df, tf, version = .x)
expect_identical(read_parquet(tf), df)
})
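For completeness, the "2.0" deprecation path exercised by these tests looks
like this in a session (sketch; warning text taken from the diff above):

    write_parquet(data.frame(x = 1), tempfile(), version = "2.0")
    #> Warning: Parquet format version "2.0" is deprecated. Use "2.4" or
    #> "2.6" to select format features.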