This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e306c35 ARROW-10624: [R] Proactively remove "problems" attributes
e306c35 is described below
commit e306c35530e43799ff21bb14d23387b86a4eff05
Author: Jonathan Keane <[email protected]>
AuthorDate: Mon Jan 4 15:50:15 2021 -0800
ARROW-10624: [R] Proactively remove "problems" attributes
Closes #9092 from jonkeane/r_attr
Authored-by: Jonathan Keane <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NEWS.md | 6 ++++--
r/R/record-batch.R | 5 +++++
r/tests/testthat/test-metadata.R | 24 +++++++++++++++++++++++-
3 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/r/NEWS.md b/r/NEWS.md
index 66f37bb..ebf80ee 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -29,10 +29,11 @@
## Enhancements
-* Table columns can now be added, replaced, or removed by assigning `<-` with either `$` or `[[`
+* Table columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[`
* Column names of Tables and RecordBatches can be renamed by assigning `names()`
* Large string types can now be written to Parquet files
-* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow-dplyr pipelines.
+* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow `dplyr` pipelines.
+* Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found.
## Bug fixes
@@ -40,6 +41,7 @@
* C++ functions now trigger garbage collection when needed
* `write_parquet()` can now write RecordBatches
* Reading a Table from a RecordBatchStreamReader containing 0 batches no longer crashes
+* `readr`'s `problems` attribute is removed when converting to Arrow RecordBatch and table to prevent large amounts of metadata from accumulating inadvertently [ARROW-10624](https://issues.apache.org/jira/browse/ARROW-10624)
## Packaging and installation
diff --git a/r/R/record-batch.R b/r/R/record-batch.R
index 331a7a7..ef42c8d 100644
--- a/r/R/record-batch.R
+++ b/r/R/record-batch.R
@@ -274,6 +274,11 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...
}
.serialize_arrow_r_metadata <- function(x) {
+ assert_is(x, "list")
+
+ # drop problems attributes (most likely from readr)
+ x[["attributes"]][["problems"]] <- NULL
+
rawToChar(serialize(x, NULL, ascii = TRUE))
}
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 1cd6fbc..53ee427 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -73,7 +73,9 @@ test_that("Garbage R metadata doesn't break things", {
"Invalid metadata$r",
fixed = TRUE
)
- tab$metadata$r <- .serialize_arrow_r_metadata("garbage")
+ # serialize data like .serialize_arrow_r_metadata does, but don't call that
+ # directly since it checks to ensure that the data is a list
+ tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
expect_warning(
expect_identical(as.data.frame(tab), example_data[1:6]),
"Invalid metadata$r",
@@ -134,3 +136,23 @@ test_that("metadata keeps attribute of top level data frame", {
expect_identical(attr(as.data.frame(tab), "foo"), "bar")
expect_identical(as.data.frame(tab), df)
})
+
+test_that("metadata drops readr's problems attribute", {
+ readr_like <- tibble::tibble(
+ dbl = 1.1,
+ not_here = NA_character_
+ )
+ attributes(readr_like) <- append(
+ attributes(readr_like),
+ list(problems = tibble::tibble(
+ row = 1L,
+ col = NA_character_,
+ expected = "2 columns",
+ actual = "1 columns",
+ file = "'test'"
+ ))
+ )
+
+ tab <- Table$create(readr_like)
+ expect_null(attr(as.data.frame(tab), "problems"))
+})