Dewey Dunnington created ARROW-16269:
----------------------------------------

             Summary: [R][Python] Roundtrip ChunkedArray with ExtensionType 
drops type
                 Key: ARROW-16269
                 URL: https://issues.apache.org/jira/browse/ARROW-16269
             Project: Apache Arrow
          Issue Type: Improvement
          Components: Python, R
            Reporter: Dewey Dunnington


After ARROW-15168 we will use ExtensionType in more cases to handle R vector 
types that we don't natively implement a conversion for; however, roundtripping 
a Table through Python (pyarrow) results in a Table with a slightly inconsistent 
state where the type of the ChunkedArray doesn't line up with the type in the 
schema:

{code:R}
# remotes::install_github("apache/arrow/r")
library(arrow, warn.conflicts = FALSE)
pa <- reticulate::import("pyarrow", convert = FALSE)

table <- arrow_table(
  ext_col = chunked_array(vctrs_extension_array(1:10))
)
table$ext_col$type
#> VctrsExtensionType
#> integer(0)
table$schema$ext_col$type
#> VctrsExtensionType
#> integer(0)

table_py <- pa$Table$from_arrays(table$columns, schema = table$schema)
table_py$column("ext_col")$type
#> int32
table_py$schema$field("ext_col")$type
#> int32

cols <- reticulate::py_to_r(table_py$columns)
names(cols) <- reticulate::py_to_r(table_py$column_names)
table2 <- Table$create(!!! cols, schema = table$schema)
table2$ext_col$type
#> Int32
#> int32
table2$schema$ext_col$type
#> VctrsExtensionType
#> integer(0)
{code}

The workaround in ARROW-15168 is to go through RecordBatchReader, which is 
probably fine but in some cases might result in ChunkedArray columns getting 
re-chunked to the intersection of all the chunks (see column c2 in the example 
below). This doesn't copy any data, but isn't ideal (we should be able to 
roundtrip column-wise and avoid any re-chunking).

{code:R}
# remotes::install_github("apache/arrow/r#12817")
library(arrow, warn.conflicts = FALSE)

table <- arrow_table(
  c1 = chunked_array(1:2, 3:4, 5:6), 
  c2 = chunked_array(1:6)
)

table$c1
#> ChunkedArray
#> [
#>   [
#>     1,
#>     2
#>   ],
#>   [
#>     3,
#>     4
#>   ],
#>   [
#>     5,
#>     6
#>   ]
#> ]
table$c2
#> ChunkedArray
#> [
#>   [
#>     1,
#>     2,
#>     3,
#>     4,
#>     5,
#>     6
#>   ]
#> ]

rbr <- as_record_batch_reader(table)
table2 <- rbr$read_table()

table2$c1
#> ChunkedArray
#> [
#>   [
#>     1,
#>     2
#>   ],
#>   [
#>     3,
#>     4
#>   ],
#>   [
#>     5,
#>     6
#>   ]
#> ]
table2$c2
#> ChunkedArray
#> [
#>   [
#>     1,
#>     2
#>   ],
#>   [
#>     3,
#>     4
#>   ],
#>   [
#>     5,
#>     6
#>   ]
#> ]
{code}




--
This message was sent by Atlassian Jira
(v8.20.7#820007)

Reply via email to