This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9ea91ca01f GH-48057: [R] Slow reading performance caused by
apply_arrow_r_metadata() looping through all columns, including NULL ones
(#48104)
9ea91ca01f is described below
commit 9ea91ca01fb300d8b74d19652d1ea68d39c2564d
Author: Nic Crane <[email protected]>
AuthorDate: Thu Nov 13 17:49:14 2025 +0000
GH-48057: [R] Slow reading performance caused by apply_arrow_r_metadata()
looping through all columns, including NULL ones (#48104)
### Rationale for this change
Slow reading due to looping through metadata
### What changes are included in this PR?
Skip the per-column metadata loop when every column's metadata entry is NULL
### Are these changes tested?
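The performance improvement is not covered by unit tests (see comment below with microbenchmarks); a unit test confirming the behaviour is unchanged is added to test-metadata.R.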
### Are there any user-facing changes?
No
* GitHub Issue: #48057
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/metadata.R | 2 +-
r/tests/testthat/test-metadata.R | 27 +++++++++++++++++++++++++++
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/r/R/metadata.R b/r/R/metadata.R
index 93aa5018f6..206a18d09f 100644
--- a/r/R/metadata.R
+++ b/r/R/metadata.R
@@ -175,7 +175,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
columns_metadata <- r_metadata$columns
if (is.data.frame(x)) {
# if columns metadata exists, apply it here
- if (length(names(x)) && !is.null(columns_metadata)) {
+ if (length(names(x)) && !is.null(columns_metadata) &&
!all(map_lgl(columns_metadata, is.null))) {
for (name in intersect(names(columns_metadata), names(x))) {
x[[name]] <- apply_arrow_r_metadata(x[[name]],
columns_metadata[[name]])
}
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 869bde5b4d..90b9f599ec 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -490,3 +490,30 @@ test_that("data.frame class attribute is not saved", {
df_arrow <- arrow_table(df)
expect_identical(df_arrow$r_metadata, list(attributes = list(foo = "bar"),
columns = list(x = NULL)))
})
+
+test_that("apply_arrow_r_metadata doesn't add in metadata from plain
data.frame objects - GH48057", {
+ # with just a plain df, applying the (all-NULL) column metadata leaves the data frame unchanged
+ plain_df <- data.frame(x = 1:5)
+ plain_df_arrow <- arrow_table(plain_df)
+
+ expect_equal(plain_df_arrow$metadata$r$columns, list(x = NULL))
+
+ plain_df_no_metadata <- plain_df_arrow$to_data_frame()
+ plain_df_with_metadata <- apply_arrow_r_metadata(plain_df_no_metadata,
plain_df_arrow$metadata$r)
+
+ expect_identical(plain_df_no_metadata, plain_df_with_metadata)
+
+ # with more complex column metadata - it preserves it
+ spicy_df_arrow <- arrow_table(haven_data)
+
+ expect_equal(
+ spicy_df_arrow$metadata$r$columns,
+ list(num = list(attributes = list(format.spss = "F8.2"), columns = NULL),
cat_int = NULL, cat_chr = NULL)
+ )
+
+ spicy_df_no_metadata <- spicy_df_arrow$to_data_frame()
+ spicy_df_with_metadata <- apply_arrow_r_metadata(spicy_df_no_metadata,
spicy_df_arrow$metadata$r)
+
+ expect_null(attr(spicy_df_no_metadata$num, "format.spss"))
+ expect_equal(attr(spicy_df_with_metadata$num, "format.spss"), "F8.2")
+})