This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6deb892 ARROW-10386: [R] List column class attributes not preserved in roundtrip
6deb892 is described below
commit 6deb892f7e80e4c1ebb9f434af14fa0cb096cfe7
Author: Jonathan Keane <[email protected]>
AuthorDate: Wed Jan 13 15:15:57 2021 -0800
ARROW-10386: [R] List column class attributes not preserved in roundtrip
Closes #9182 from jonkeane/ARROW-10386/List_metadata
Lead-authored-by: Jonathan Keane <[email protected]>
Co-authored-by: Romain Francois <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 1 +
r/NEWS.md | 2 +
r/R/arrow-package.R | 4 ++
r/R/feather.R | 1 +
r/R/parquet.R | 1 +
r/R/record-batch.R | 45 +++++++++++++++----
r/R/schema.R | 25 +++++++++++
r/R/table.R | 21 +++++++--
r/extra-tests/helpers.R | 4 ++
r/extra-tests/test-read-files.R | 32 ++++++++++++++
r/extra-tests/write-files.R | 3 ++
r/man/ParquetWriterProperties.Rd | 2 +
r/man/RecordBatch.Rd | 2 +-
r/man/Schema.Rd | 26 +++++++++++
r/man/Table.Rd | 2 +-
r/man/write_feather.Rd | 2 +
.../data-arrow-extra-meta_3.0.0.parquet | Bin 0 -> 13263 bytes
r/tests/testthat/helper-data.R | 7 +++
r/tests/testthat/test-backwards-compatibility.R | 9 ++++
r/tests/testthat/test-metadata.R | 48 +++++++++++++++++++++
r/vignettes/arrow.Rmd | 2 +-
21 files changed, 223 insertions(+), 16 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 9ce89ca..fdc84aa 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -332,6 +332,7 @@ importFrom(tidyselect,vars_select)
importFrom(utils,head)
importFrom(utils,install.packages)
importFrom(utils,modifyList)
+importFrom(utils,object.size)
importFrom(utils,packageVersion)
importFrom(utils,tail)
importFrom(vctrs,s3_register)
diff --git a/r/NEWS.md b/r/NEWS.md
index 40a943c..521af3e 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -37,6 +37,8 @@
* Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows
conversion of Arrow string (`utf8()`) type data containing embedded nul `\0`
characters to R. If set to `TRUE`, nuls will be stripped and a warning is
emitted if any are found.
* `arrow_info()` for an overview of various run-time and build-time Arrow
configurations, useful for debugging
* Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the
Arrow package to change memory allocators. Windows packages are built with
`mimalloc`; most others have `jemalloc`. These are used by default if they were
built, and they're generally much faster than the system malloc, but sometimes
it is useful to turn them off for debugging purposes. To disable them, set
`ARROW_DEFAULT_MEMORY_POOL=system`.
+* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to be faithfully preserved and roundtripped ([ARROW-10386](https://issues.apache.org/jira/browse/ARROW-10386)).
+* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details.
## Bug fixes
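As a rough illustration of the compression behavior described in the NEWS entry above, the following sketch builds a column attribute large enough to cross the ~100 kB threshold and then opts out via the `arrow.compress_metadata` option; the data frame and attribute name here are invented for the example and are not part of this patch:

    library(arrow)
    tf <- tempfile(fileext = ".parquet")
    df <- data.frame(x = 1:3)
    # an attribute whose serialized form exceeds the ~100 kB threshold
    attr(df$x, "big") <- paste(sample(letters, 2e5, replace = TRUE), collapse = "")
    write_parquet(df, tf)                       # R metadata is compressed by default
    options(arrow.compress_metadata = FALSE)    # opt out, e.g. for readers on arrow < 3.0.0
    write_parquet(df, tf)                       # same data, metadata stored uncompressed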
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 8743037..540cbcd 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -147,6 +147,10 @@ print.arrow_info <- function(x, ...) {
invisible(x)
}
+option_compress_metadata <- function() {
+ !is_false(getOption("arrow.compress_metadata"))
+}
+
#' @include enums.R
ArrowObject <- R6Class("ArrowObject",
public = list(
diff --git a/r/R/feather.R b/r/R/feather.R
index 6d29b7d..5aaf340 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -44,6 +44,7 @@
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
+#' @seealso [Schema] for information about schemas and metadata handling.
#' @examples
#' \donttest{
#' tf <- tempfile()
diff --git a/r/R/parquet.R b/r/R/parquet.R
index ccf87c2..4fe3216 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -275,6 +275,7 @@ make_valid_version <- function(version, valid_versions = valid_parquet_version)
#' "snappy" for the `compression` argument.
#'
#' @seealso [write_parquet]
+#' @seealso [Schema] for information about schemas and metadata handling.
#'
#' @export
ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject)
diff --git a/r/R/record-batch.R b/r/R/record-batch.R
index f7589f9..1d36453 100644
--- a/r/R/record-batch.R
+++ b/r/R/record-batch.R
@@ -67,7 +67,7 @@
#' - `$schema`
#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
#' Modify or replace by assigning in (`batch$metadata <- new_metadata`).
-#' All list elements are coerced to string.
+#' All list elements are coerced to string. See `schema()` for more information.
#' - `$columns`: Returns a list of `Array`s
#' @rdname RecordBatch
#' @name RecordBatch
@@ -274,17 +274,36 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...
df
}
+#' @importFrom utils object.size
.serialize_arrow_r_metadata <- function(x) {
assert_is(x, "list")
# drop problems attributes (most likely from readr)
x[["attributes"]][["problems"]] <- NULL
- rawToChar(serialize(x, NULL, ascii = TRUE))
+ out <- serialize(x, NULL, ascii = TRUE)
+
+ # if the metadata is over 100 kB, compress
+ if (option_compress_metadata() && object.size(out) > 100000) {
+ out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
+
+ # but ensure that the compression+serialization is effective.
+ if (object.size(out) > object.size(out_comp)) out <- out_comp
+ }
+
+ rawToChar(out)
}
.unserialize_arrow_r_metadata <- function(x) {
- tryCatch(unserialize(charToRaw(x)), error = function(e) {
+ tryCatch({
+ out <- unserialize(charToRaw(x))
+
+ # if this is still raw, try decompressing
+ if (is.raw(out)) {
+ out <- unserialize(memDecompress(out, type = "gzip"))
+ }
+ out
+ }, error = function(e) {
warning("Invalid metadata$r", call. = FALSE)
NULL
})
@@ -292,6 +311,20 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...
apply_arrow_r_metadata <- function(x, r_metadata) {
tryCatch({
+ columns_metadata <- r_metadata$columns
+ if (is.data.frame(x)) {
+ if (length(names(x)) && !is.null(columns_metadata)) {
+ for (name in intersect(names(columns_metadata), names(x))) {
+ x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
+ }
+ }
+ } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
+ x <- map2(x, columns_metadata, function(.x, .y) {
+ apply_arrow_r_metadata(.x, .y)
+ })
+ x
+ }
+
if (!is.null(r_metadata$attributes)) {
attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
if (inherits(x, "POSIXlt")) {
@@ -303,12 +336,6 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
}
}
- columns_metadata <- r_metadata$columns
- if (length(names(x)) && !is.null(columns_metadata)) {
- for (name in intersect(names(columns_metadata), names(x))) {
- x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
- }
- }
}, error = function(e) {
warning("Invalid metadata$r", call. = FALSE)
})
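To make the control flow above concrete, here is a sketch of the serialize/compress round trip at the R prompt; `.serialize_arrow_r_metadata()` and `.unserialize_arrow_r_metadata()` are internal helpers, accessed with `:::` purely for illustration, and the example metadata list is made up:

    library(arrow)
    meta <- list(attributes = list(big = paste(rep("a", 2e5), collapse = "")))
    s <- arrow:::.serialize_arrow_r_metadata(meta)     # > 100 kB once serialized, so gzip-compressed
    is.character(s)                                    # stored as a single string in the schema metadata
    #> [1] TRUE
    identical(arrow:::.unserialize_arrow_r_metadata(s), meta)
    #> [1] TRUE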
diff --git a/r/R/schema.R b/r/R/schema.R
index 9a0ad85..46eab69 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -50,6 +50,31 @@
#' - `$metadata`: returns the key-value metadata as a named list.
#' Modify or replace by assigning in (`sch$metadata <- new_metadata`).
#' All list elements are coerced to string.
+#'
+#' @section R Metadata:
+#'
+#' When converting a data.frame to an Arrow Table or RecordBatch, attributes
+#' from the `data.frame` are saved alongside tables so that the object can be
+#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
+#' can be at the top level of the `data.frame` (e.g. `attributes(df)`), at the
+#' column level (e.g. `attributes(df$col_a)`), or, for list columns only, at
+#' the element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
+#' for storing `haven` columns in a table and being able to faithfully
+#' re-create them when pulled back into R. This metadata is separate from the
+#' schema (column names and types) which is compatible with other Arrow
+#' clients. The R metadata is only read by R and is ignored by other clients
+#' (e.g. Pandas has its own custom metadata). This metadata is stored in
+#' `$metadata$r`.
+#'
+#' Since Schema metadata keys and values must be strings, this metadata is
+#' saved by serializing R's attribute list structure to a string. If the
+#' serialized metadata exceeds 100Kb in size, by default it is compressed
+#' starting in version 3.0.0. To disable this compression (e.g. for tables
+#' that are compatible with Arrow versions before 3.0.0 and include large
+#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
+#' Files with compressed metadata are readable by older versions of arrow, but
+#' the metadata is dropped.
+#'
#' @rdname Schema
#' @name Schema
#' @examples
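The new "R Metadata" section above can be summarized with a small sketch of the user-facing behavior; the column name and `units` attribute are arbitrary examples, not part of the patch:

    library(arrow)
    df <- data.frame(x = 1:3)
    attr(df$x, "units") <- "seconds"       # column-level attribute
    tab <- Table$create(df)
    "r" %in% names(tab$metadata)           # serialized attributes are stored under $metadata$r
    #> [1] TRUE
    attr(as.data.frame(tab)$x, "units")    # restored when converting back to a data.frame
    #> [1] "seconds"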
diff --git a/r/R/table.R b/r/R/table.R
index 9513943..730ed24 100644
--- a/r/R/table.R
+++ b/r/R/table.R
@@ -76,7 +76,7 @@
#' - `$schema`
#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
#' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
-#' All list elements are coerced to string.
+#' All list elements are coerced to string. See `schema()` for more information.
#' - `$columns`: Returns a list of `ChunkedArray`s
#' @rdname Table
#' @name Table
@@ -211,11 +211,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) {
if (is.data.frame(x)) {
columns <- map(x, arrow_attributes)
- if (length(att) || !all(map_lgl(columns, is.null))) {
+ out <- if (length(att) || !all(map_lgl(columns, is.null))) {
list(attributes = att, columns = columns)
}
- } else if (length(att)) {
- list(attributes = att, columns = NULL)
+ return(out)
+ }
+
+ columns <- NULL
+ if (is.list(x) && !inherits(x, "POSIXlt")) {
+ # for list columns, we also keep attributes of each
+ # element in columns
+ columns <- map(x, arrow_attributes)
+ if (all(map_lgl(columns, is.null))) {
+ columns <- NULL
+ }
+ }
+
+ if (length(att) || !is.null(columns)) {
+ list(attributes = att, columns = columns)
} else {
NULL
}
diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R
index 61b7da4..af57d45 100644
--- a/r/extra-tests/helpers.R
+++ b/r/extra-tests/helpers.R
@@ -19,6 +19,10 @@ if_version <- function(version, op = `==`) {
op(packageVersion("arrow"), version)
}
+if_version_less_than <- function(version) {
+ if_version(version, op = `<`)
+}
+
skip_if_version_less_than <- function(version, msg) {
if(if_version(version, `<`)) {
skip(msg)
diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R
index 90efce3..10e9f95 100644
--- a/r/extra-tests/test-read-files.R
+++ b/r/extra-tests/test-read-files.R
@@ -162,4 +162,36 @@ test_that("Can see the metadata (stream)", {
)
})
+test_that("Can see the extra metadata (parquet)", {
+ pq_file <- "files/ex_data_extra_metadata.parquet"
+
+ if (if_version_less_than("3.0.0")) {
+ expect_warning(
+ df <- read_parquet(pq_file),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+ expect_s3_class(df, "tbl")
+ } else {
+ # version 3.0.0 and greater
+ df <- read_parquet(pq_file)
+ expect_s3_class(df, "tbl")
+ expect_equal(
+ attributes(df),
+ list(
+ names = letters[1:4],
+ row.names = 1L,
+ class = c("tbl_df", "tbl", "data.frame"),
+ top_level = list(
+ field_one = 12,
+ field_two = "more stuff"
+ )
+ )
+ )
+
+ # column-level attributes for the large column.
+ expect_named(attributes(df$b), "lots")
+ expect_length(attributes(df$b)$lots, 100)
+ }
+})
diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R
index e0927ea..75889b6 100644
--- a/r/extra-tests/write-files.R
+++ b/r/extra-tests/write-files.R
@@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL
write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)
write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")
+
+write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet")
+
diff --git a/r/man/ParquetWriterProperties.Rd b/r/man/ParquetWriterProperties.Rd
index a2fab2a..7beb8a8 100644
--- a/r/man/ParquetWriterProperties.Rd
+++ b/r/man/ParquetWriterProperties.Rd
@@ -44,4 +44,6 @@ size of data pages within a column chunk (in bytes). Default 1 MiB.
\seealso{
\link{write_parquet}
+
+\link{Schema} for information about schemas and metadata handling.
}
diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd
index 86d230b..06f9f67 100644
--- a/r/man/RecordBatch.Rd
+++ b/r/man/RecordBatch.Rd
@@ -69,7 +69,7 @@ There are also some active bindings
\item \verb{$schema}
\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list.
Modify or replace by assigning in (\code{batch$metadata <- new_metadata}).
-All list elements are coerced to string.
+All list elements are coerced to string. See \code{schema()} for more information.
\item \verb{$columns}: Returns a list of \code{Array}s
}
}
diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd
index 1c1f75e..c2fb2fa 100644
--- a/r/man/Schema.Rd
+++ b/r/man/Schema.Rd
@@ -48,6 +48,32 @@ All list elements are coerced to string.
}
}
+\section{R Metadata}{
+
+
+When converting a data.frame to an Arrow Table or RecordBatch, attributes
+from the \code{data.frame} are saved alongside tables so that the object can be
+reconstructed faithfully in R (e.g. with \code{as.data.frame()}). This metadata
+can be at the top level of the \code{data.frame} (e.g. \code{attributes(df)}),
+at the column level (e.g. \code{attributes(df$col_a)}), or, for list columns
+only, at the element level (e.g. \code{attributes(df[1, "col_a"])}). For
+example, this allows
+for storing \code{haven} columns in a table and being able to faithfully
+re-create them when pulled back into R. This metadata is separate from the
+schema (column names and types) which is compatible with other Arrow
+clients. The R metadata is only read by R and is ignored by other clients
+(e.g. Pandas has its own custom metadata). This metadata is stored in
+\verb{$metadata$r}.
+
+Since Schema metadata keys and values must be strings, this metadata is
+saved by serializing R's attribute list structure to a string. If the
+serialized metadata exceeds 100Kb in size, by default it is compressed
+starting in version 3.0.0. To disable this compression (e.g. for tables
+that are compatible with Arrow versions before 3.0.0 and include large
+amounts of metadata), set the option \code{arrow.compress_metadata} to \code{FALSE}.
+Files with compressed metadata are readable by older versions of arrow, but
+the metadata is dropped.
+}
+
\examples{
\donttest{
df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
diff --git a/r/man/Table.Rd b/r/man/Table.Rd
index 5dc2194..14c0b0b 100644
--- a/r/man/Table.Rd
+++ b/r/man/Table.Rd
@@ -69,7 +69,7 @@ There are also some active bindings:
\item \verb{$schema}
\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list.
Modify or replace by assigning in (\code{tab$metadata <- new_metadata}).
-All list elements are coerced to string.
+All list elements are coerced to string. See \code{schema()} for more information.
\item \verb{$columns}: Returns a list of \code{ChunkedArray}s
}
}
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd
index 277c819..691adbe 100644
--- a/r/man/write_feather.Rd
+++ b/r/man/write_feather.Rd
@@ -56,4 +56,6 @@ write_feather(mtcars, tf)
}
\seealso{
\link{RecordBatchWriter} for lower-level access to writing Arrow IPC data.
+
+\link{Schema} for information about schemas and metadata handling.
}
diff --git a/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
new file mode 100644
index 0000000..bf95f23
Binary files /dev/null and b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet differ
diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R
index 26b1cf0..ecce773 100644
--- a/r/tests/testthat/helper-data.R
+++ b/r/tests/testthat/helper-data.R
@@ -67,3 +67,10 @@ make_big_string <- function() {
# This creates a character vector that would exceed the capacity of BinaryArray
rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
}
+
+make_string_of_size <- function(size = 1) {
+ purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
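For reference, a quick sanity check of what the new `make_string_of_size()` helper above returns (the string length in characters is 1000 times `size`); illustrative only, run with the helpers in r/tests/testthat/helper-data.R sourced:

    nchar(make_string_of_size(1))
    #> [1] 1000
    nchar(make_string_of_size(50))
    #> [1] 50000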
diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R
index fb06897..c6bd514 100644
--- a/r/tests/testthat/test-backwards-compatibility.R
+++ b/r/tests/testthat/test-backwards-compatibility.R
@@ -45,6 +45,15 @@ expect_identical_with_metadata <- function(object, expected, ..., top_level = TR
expect_identical(object, expected, ...)
}
+test_that("reading a known Parquet file to dataframe with 3.0.0", {
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet")
+
+ df <- read_parquet(pq_file)
+ # this is equivalent to `expect_identical()`
+ expect_identical_with_metadata(df, example_with_extra_metadata)
+})
+
test_that("reading a known Parquet file to dataframe with 2.0.0", {
skip_if_not_available("snappy")
pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 53ee427..17c43bb 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -83,6 +83,46 @@ test_that("Garbage R metadata doesn't break things", {
)
})
+test_that("Metadata serialization compression", {
+ # attributes that (when serialized) are just under 100kb are not compressed,
+ # and simply serialized
+ strings <- as.list(rep(make_string_of_size(1), 98))
+ small <- .serialize_arrow_r_metadata(strings)
+ expect_equal(
+ object.size(small),
+ object.size(rawToChar(serialize(strings, NULL, ascii = TRUE)))
+ )
+
+ # Large strings will be compressed
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_lt(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+ # and this compression ends up being smaller than even the "small" strings
+ expect_lt(object.size(large), object.size(small))
+
+ # However strings where compression + serialization is not effective are no
+ # worse than only serialization alone
+ large_few_strings <- as.list(rep(make_string_of_size(50), 2))
+ large_few <- .serialize_arrow_r_metadata(large_few_strings)
+ expect_equal(
+ object.size(large_few),
+ object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE)))
+ )
+
+ # But we can disable compression
+ op <- options(arrow.compress_metadata = FALSE); on.exit(options(op))
+
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_equal(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+})
+
test_that("RecordBatch metadata", {
rb <- RecordBatch$create(x = 1:2, y = c("a", "b"))
expect_equivalent(rb$metadata, list())
@@ -137,6 +177,7 @@ test_that("metadata keeps attribute of top level data frame", {
expect_identical(as.data.frame(tab), df)
})
+
test_that("metadata drops readr's problems attribute", {
readr_like <- tibble::tibble(
dbl = 1.1,
@@ -156,3 +197,10 @@ test_that("metadata drops readr's problems attribute", {
tab <- Table$create(readr_like)
expect_null(attr(as.data.frame(tab), "problems"))
})
+
+test_that("metadata of list elements (ARROW-10386)", {
+ df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+ tab <- Table$create(df)
+ expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+ expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+})
diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd
index 9ea977b..a1604cb 100644
--- a/r/vignettes/arrow.Rmd
+++ b/r/vignettes/arrow.Rmd
@@ -154,7 +154,7 @@ Arrow supports custom key-value metadata attached to Schemas. When we convert a
This metadata is preserved when writing the table to Feather or Parquet, and when reading those files into R, or when calling `as.data.frame()` on a Table/RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow.
-Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys.
+Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. For more details, see the documentati [...]
## Class structure and package conventions
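As a closing sketch of the custom-metadata convention mentioned in the vignette paragraph above, the key name below is hypothetical; only the documented `tab$metadata <- ...` assignment form comes from the package:

    library(arrow)
    tab <- Table$create(data.frame(x = 1:3))
    m <- tab$metadata
    m$my_app_version <- "1.2.0"    # hypothetical application-specific key
    tab$metadata <- m              # values are coerced to string on assignment
    tab$metadata$my_app_version
    #> [1] "1.2.0"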