(arrow-nanoarrow) branch main updated: fix(r): Fix map type and struct-nested-in-struct convert (#829)

paleolimbot Mon, 08 Dec 2025 19:44:27 -0800

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 04e7521d fix(r): Fix map type and struct-nested-in-struct convert 
(#829)
04e7521d is described below

commit 04e7521d561ba0841a44eb482062a67c05146dae
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Dec 8 21:44:16 2025 -0600

    fix(r): Fix map type and struct-nested-in-struct convert (#829)
    
    Some of the types listed in the Overture maps foundation Parquet files
    are deeply nested in various ways we hadn't encountered yet when trying
    to convert to R.
    
    The motivating reprex here was:
    
    ```r
    library(sedonadb)
    
    
sd_read_parquet("/Volumes/data/overture/data/theme=divisions/type=division_area/")
 |>
      head(100000) |>
      sd_collect() |>
      tibble::as_tibble()
    #> Can't convert `common` <map<key_value: struct<key: string, value: 
string>>> to R vector of type vctrs_list_of
    #> or
    #> Something about expected length mixing up rows and columns of a data 
frame
    ```
    
    After this PR:
    
    ``` r
    library(sedonadb)
    #> Warning: package 'sedonadb' was built under R version 4.5.2
    
    
sd_read_parquet("/Volumes/data/overture/data/theme=divisions/type=division_area/")
 |>
      head(100000) |>
      sd_collect() |>
      tibble::as_tibble()
    #> # A tibble: 100,000 × 13
    #>    id     geometry bbox$xmin country version sources subtype class 
names$primary
    #>    <chr>  <grrw_v>     <dbl> <chr>     <int> <list<> <chr>   <chr> <chr>
    #>  1 194bd… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza Paco …
    #>  2 f7334… <MULTIP…     -16.6 ES            3 [1 × 7] locali… land  "Los 
Realejo…
    #>  3 a273d… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza de la…
    #>  4 46d32… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza de la…
    #>  5 897c5… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza Domín…
    #>  6 92c7c… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plazoleta M…
    #>  7 afdbf… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza Poeta…
    #>  8 8ed3a… <POLYGO…     -16.6 ES            1 [1 × 7] microh… land  
"Plaza de Pe…
    #>  9 319f4… <POLYGO…     -16.7 ES            2 [1 × 7] locali… land  "La 
Guancha"
    #> 10 36457… <MULTIP…     -16.7 ES            3 [1 × 7] locali… land  "San 
Juan de…
    #> # ℹ 99,990 more rows
    #> # ℹ 9 more variables: bbox$xmax <dbl>, $ymin <dbl>, $ymax <dbl>,
    #> #   names$common <list<df[,2]>>, $rules <list<df[,6]>>, is_land <lgl>,
    #> #   is_territorial <lgl>, region <chr>, division_id <chr>
    ```
    
    Interestingly, nanoarrow seems to be much faster than arrow at
    converting this deeply nested structure. I have no idea why.
    
    ``` r
    complex_map_file <- system.file("test-data/complex-map.arrows", package = 
"nanoarrow")
    
    bench::mark(
      arrow = as.data.frame(arrow::read_ipc_stream(complex_map_file)),
      nanoarrow = as.data.frame(nanoarrow::read_nanoarrow(complex_map_file)),
      check = FALSE
    )
    #> # A tibble: 2 × 6
    #>   expression      min   median `itr/sec` mem_alloc `gc/sec`
    #>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
    #> 1 arrow       44.95ms  44.95ms      22.2   21.04MB    178.
    #> 2 nanoarrow    3.97ms   5.81ms     178.     2.06MB     23.8
    ```
    
    <sup>Created on 2025-12-05 with [reprex
    v2.1.1](https://reprex.tidyverse.org)</sup>
---
 r/inst/test-data/README.md                   |  40 +++++++++++++++++++++++++
 r/inst/test-data/complex-map.arrows          | Bin 0 -> 785040 bytes
 r/src/materialize.c                          |  43 +++++++++++++++++++++------
 r/tests/testthat/test-convert-array-stream.R |  34 +++++++++++++++++++++
 r/tests/testthat/test-convert-array.R        |  35 ++++++++++++++++++++++
 5 files changed, 143 insertions(+), 9 deletions(-)

diff --git a/r/inst/test-data/README.md b/r/inst/test-data/README.md
new file mode 100644
index 00000000..7704e141
--- /dev/null
+++ b/r/inst/test-data/README.md
@@ -0,0 +1,40 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Extra test files for R
+
+## complex-map.arrows
+
+Sourced from the [Overture Maps Foundation](https://overturemaps.org) 
"divisions_area" table using
+[SedonaDB](https://sedona.apache.org/sedonadb).
+
+```r
+library(sedonadb)
+
+sd_sql("SET datafusion.execution.batch_size = 1024")
+
+sd_read_parquet("/Volumes/data/overture/data/theme=divisions/type=division_area/")
 |>
+  sd_to_view("division_area", overwrite = TRUE)
+
+sd_sql("SELECT ROW_NUMBER() OVER (ORDER BY names.primary) as idx, names FROM 
division_area WHERE names.common IS NOT NULL") |>
+  sd_compute() |> sd_to_view("names_with_common", overwrite = TRUE)
+
+sd_sql("SELECT * FROM names_with_common WHERE (idx % 100) = 0 ORDER BY idx") |>
+  nanoarrow::write_nanoarrow("inst/test-data/complex-map.arrows")
+```
diff --git a/r/inst/test-data/complex-map.arrows 
b/r/inst/test-data/complex-map.arrows
new file mode 100644
index 00000000..965297e1
Binary files /dev/null and b/r/inst/test-data/complex-map.arrows differ
diff --git a/r/src/materialize.c b/r/src/materialize.c
index 77db5cbf..2d42245e 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -64,17 +64,41 @@ static int has_attrib_safe(SEXP x, SEXP sym) {
 }
 
 R_xlen_t nanoarrow_data_frame_size(SEXP x) {
-  if (Rf_length(x) > 0) {
-    // This both avoids materializing the row.names attribute and
-    // makes this work with struct-style vctrs that don't have a
-    // row.names attribute but that always have one or more element
+  // If this is actually a data frame, check the rownames. ALTREP ensures
+  // that "expanding" these rownames by accessing the attribute does not
+  // actually compute millions of strings.
+  if (Rf_inherits(x, "data.frame")) {
+    return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol));
+  }
+
+  // If we are here, this is a struct style vctr. If we have a plain vctr
+  // as the first element we can just compute its length from the Rf_xlength()
+  // of the first column.
+  if (Rf_length(x) > 0 && !Rf_isObject(VECTOR_ELT(x, 0))) {
     return Rf_xlength(VECTOR_ELT(x, 0));
   } else {
-    // Since ALTREP was introduced, materializing the row.names attribute is
-    // usually deferred such that values in the form c(NA, -nrow), 1:nrow, or
-    // as.character(1:nrow) are never actually computed when the length is
-    // taken.
-    return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol));
+    // Otherwise, we need S3 dispatch to compute the length for us.
+    SEXP length_sym = PROTECT(Rf_install("length"));
+    SEXP call_sexp = PROTECT(Rf_lang2(length_sym, x));
+    SEXP length_sexp = PROTECT(Rf_eval(call_sexp, R_BaseEnv));
+    if (Rf_xlength(length_sexp) != 1) {
+      Rf_error("length() return value with size != 1");
+    }
+
+    R_xlen_t out = 0;
+    switch (TYPEOF(length_sexp)) {
+      case INTSXP:
+        out = INTEGER(length_sexp)[0];
+        break;
+      case REALSXP:
+        out = (R_xlen_t)REAL(length_sexp)[0];
+        break;
+      default:
+        Rf_error("length() return value with unexpected type");
+    }
+
+    UNPROTECT(3);
+    return out;
   }
 }
 
@@ -559,6 +583,7 @@ static int nanoarrow_materialize_list_of(struct RConverter* 
converter,
     case NANOARROW_TYPE_NA:
       return NANOARROW_OK;
     case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
       for (int64_t i = 0; i < dst->length; i++) {
         if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
           offset = offsets[raw_src_offset + i];
diff --git a/r/tests/testthat/test-convert-array-stream.R 
b/r/tests/testthat/test-convert-array-stream.R
index c1db7761..068e3958 100644
--- a/r/tests/testthat/test-convert-array-stream.R
+++ b/r/tests/testthat/test-convert-array-stream.R
@@ -252,6 +252,40 @@ test_that("convert array stream works for 
fixed_size_list_of() with parent nulls
   )
 })
 
+test_that("complex nested struct/map combination can be read", {
+  skip_if_not_installed("arrow")
+  skip_if_not_installed("vctrs")
+
+  complex_map_file <- system.file("test-data/complex-map.arrows", package = 
"nanoarrow")
+  df_from_arrow <- as.data.frame(arrow::read_ipc_stream(complex_map_file))
+  df_from_nanoarrow <- as.data.frame(read_nanoarrow(complex_map_file))
+
+  expect_identical(
+    df_from_nanoarrow$idx,
+    as.double(df_from_arrow$idx)
+  )
+
+  expect_identical(
+    df_from_nanoarrow$names$primary,
+    df_from_arrow$names$primary
+  )
+
+  expect_identical(
+    df_from_nanoarrow$names$common[[3000]],
+    as.data.frame(df_from_arrow$names$common[[3000]])
+  )
+
+  expect_identical(
+    lengths(df_from_nanoarrow$names$rules),
+    lengths(df_from_arrow$names$rules)
+  )
+
+  expect_identical(
+    df_from_nanoarrow$names$rules[[4238]][c("variant", "language", "value")],
+    as.data.frame(df_from_arrow$names$rules[[4238]][c("variant", "language", 
"value")])
+  )
+})
+
 test_that("convert array stream respects the value of n", {
   batches <- list(
     data.frame(x = 1:5),
diff --git a/r/tests/testthat/test-convert-array.R 
b/r/tests/testthat/test-convert-array.R
index 973fc861..7b7be069 100644
--- a/r/tests/testthat/test-convert-array.R
+++ b/r/tests/testthat/test-convert-array.R
@@ -154,6 +154,17 @@ test_that("convert to vector works for tibble", {
   )
 })
 
+test_that("convert to vector works for a rcrd-sytle vctr with complex 
columns", {
+  skip_if_not_installed("vctrs")
+
+  rcrd <- vctrs::new_rcrd(list(x = data.frame(y = 1:10)))
+  rcrd_array <- as_nanoarrow_array(vctrs::vec_data(rcrd))
+  expect_identical(
+    convert_array(rcrd_array, rcrd),
+    rcrd
+  )
+})
+
 test_that("convert to vector works for nanoarrow_vctr()", {
   array <- as_nanoarrow_array(c("one", "two", "three"))
 
@@ -1084,6 +1095,30 @@ test_that("convert to vector works for null -> 
vctrs::list_of()", {
   )
 })
 
+test_that("convert to vector works for map -> vctrs::list_of", {
+  skip_if_not_installed("arrow")
+  skip_if_not_installed("vctrs")
+
+  values <- vctrs::list_of(
+    data.frame(key = "key1", value = 1L),
+    data.frame(key = c("key2", "key3"), value = c(2L, 3L)),
+    NULL
+  )
+
+  array_list <- as_nanoarrow_array(
+    arrow::Array$create(
+      values,
+      type = arrow::map_of(arrow::string(), arrow::int32())
+    )
+  )
+
+  # Default conversion
+  expect_identical(
+    convert_array(array_list),
+    values
+  )
+})
+
 test_that("convert to vector works for fixed_size_list_of() -> matrix()", {
   mat <- matrix(1:6, ncol = 2, byrow = TRUE)
   array <- as_nanoarrow_array(mat)

(arrow-nanoarrow) branch main updated: fix(r): Fix map type and struct-nested-in-struct convert (#829)

Reply via email to