This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new c164685 ARROW-5625: [R] convert Array of struct type to data frame columns c164685 is described below commit c16468554ab994a6e228e2ed24181be0bb60ab93 Author: Romain Francois <rom...@rstudio.com> AuthorDate: Tue Jun 18 09:59:22 2019 +0200 ARROW-5625: [R] convert Array of struct type to data frame columns struct arrays become data frame columns, i.e. ``` r library(arrow, warn.conflicts = FALSE) library(tibble) tf <- tempfile() writeLines(' { "hello": 3.5, "world": false, "yo": "thing", "nuf": {} } { "hello": 3.25, "world": null, "nuf": null } { "hello": 3.125, "world": null, "yo": "\u5fcd", "nuf": { "ps": 78.0, "house": "Gryffindor"} } { "hello": 0.0, "world": true, "yo": null, "nuf": { "ps": 90.0, "house": "Slytherin" } } ', tf) tab1 <- read_json_arrow(tf, as_tibble = FALSE) array <- tab1$column(3)$data()$chunk(0) array$field(0) #> arrow::Array #> [ #> null, #> null, #> 78, #> 90 #> ] array$as_vector() #> ps house #> 1 NA <NA> #> 2 NA <NA> #> 3 78 Gryffindor #> 4 90 Slytherin as.data.frame(tab1) #> # A tibble: 4 x 4 #> hello world yo nuf$ps $house #> <dbl> <lgl> <chr> <dbl> <chr> #> 1 3.5 FALSE thing NA <NA> #> 2 3.25 NA <NA> NA <NA> #> 3 3.12 NA 忍 78 Gryffindor #> 4 0 TRUE <NA> 90 Slytherin ``` <sup>Created on 2019-06-17 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000)</sup> Author: Romain Francois <rom...@rstudio.com> Closes #4593 from romainfrancois/ARROW-5625/struct_arrays and squashes the following commits: b1f087e9 <Romain Francois> expand on unit test and added comment 17fd51e1 <Romain Francois> effectively use arrays 059c2442 <Romain Francois> lint 394a3d76 <Romain Francois> Converter_Struct::Ingest.*() a76f2671 <Romain Francois> Converter_Struct::Allocate() e425c5b2 <Romain Francois> More StructArray methods bd71f388 <Romain Francois> + methods GetFieldIndex() and GetFieldByName() to arrow::StructType --- r/R/ChunkedArray.R | 4 +- r/R/Struct.R | 9 ++++- r/R/array.R 
| 25 ++++++++---- r/R/arrowExports.R | 20 ++++++++++ r/src/array.cpp | 21 ++++++++++ r/src/array__to_vector.cpp | 62 +++++++++++++++++++++++++++++ r/src/arrowExports.cpp | 84 ++++++++++++++++++++++++++++++++++++++++ r/src/arrow_types.h | 3 +- r/src/datatype.cpp | 12 ++++++ r/src/symbols.cpp | 1 + r/tests/testthat/test-DataType.R | 7 ++++ r/tests/testthat/test-json.R | 28 +++++++++----- 12 files changed, 254 insertions(+), 22 deletions(-) diff --git a/r/R/ChunkedArray.R b/r/R/ChunkedArray.R index 69a0224..fa9aaee 100644 --- a/r/R/ChunkedArray.R +++ b/r/R/ChunkedArray.R @@ -32,7 +32,7 @@ `arrow::ChunkedArray` <- R6Class("arrow::ChunkedArray", inherit = `arrow::Object`, public = list( length = function() ChunkedArray__length(self), - chunk = function(i) shared_ptr(`arrow::Array`, ChunkedArray__chunk(self, i)), + chunk = function(i) `arrow::Array`$dispatch(ChunkedArray__chunk(self, i)), as_vector = function() ChunkedArray__as_vector(self), Slice = function(offset, length = NULL){ if (is.null(length)) { @@ -50,7 +50,7 @@ active = list( null_count = function() ChunkedArray__null_count(self), num_chunks = function() ChunkedArray__num_chunks(self), - chunks = function() map(ChunkedArray__chunks(self), shared_ptr, class = `arrow::Array`), + chunks = function() map(ChunkedArray__chunks(self), ~ `arrow::Array`$dispatch(.x)), type = function() `arrow::DataType`$dispatch(ChunkedArray__type(self)) ) ) diff --git a/r/R/Struct.R b/r/R/Struct.R index ec78699..820e1a8 100644 --- a/r/R/Struct.R +++ b/r/R/Struct.R @@ -18,11 +18,16 @@ #' @include R6.R `arrow::StructType` <- R6Class("arrow::StructType", - inherit = `arrow::NestedType` + inherit = `arrow::NestedType`, + public = list( + GetFieldByName = function(name) shared_ptr(`arrow::Field`, StructType__GetFieldByName(self, name)), + GetFieldIndex = function(name) StructType__GetFieldIndex(self, name) + ) ) #' @rdname DataType #' @export struct <- function(...){ - shared_ptr(`arrow::StructType`, struct_(.fields(list(...)))) + xp <- 
struct_(.fields(list(...))) + shared_ptr(`arrow::StructType`, xp) } diff --git a/r/R/array.R b/r/R/array.R index 244cee0..b6e21ef 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -103,10 +103,27 @@ ) ) +`arrow::DictionaryArray` <- R6Class("arrow::DictionaryArray", inherit = `arrow::Array`, + public = list( + indices = function() `arrow::Array`$dispatch(DictionaryArray__indices(self)), + dictionary = function() `arrow::Array`$dispatch(DictionaryArray__dictionary(self)) + ) +) + +`arrow::StructArray` <- R6Class("arrow::StructArray", inherit = `arrow::Array`, + public = list( + field = function(i) `arrow::Array`$dispatch(StructArray__field(self, i)), + GetFieldByName = function(name) `arrow::Array`$dispatch(StructArray__GetFieldByName(self, name)), + Flatten = function() map(StructArray__Flatten(self), ~ `arrow::Array`$dispatch(.x)) + ) +) + `arrow::Array`$dispatch <- function(xp){ a <- shared_ptr(`arrow::Array`, xp) if(a$type_id() == Type$DICTIONARY){ a <- shared_ptr(`arrow::DictionaryArray`, xp) + } else if (a$type_id() == Type$STRUCT) { + a <- shared_ptr(`arrow::StructArray`, xp) } a } @@ -126,11 +143,3 @@ array <- function(x, type = NULL){ `arrow::Array`$dispatch(Array__from_vector(x, type)) } - -`arrow::DictionaryArray` <- R6Class("arrow::DictionaryArray", inherit = `arrow::Array`, - public = list( - indices = function() `arrow::Array`$dispatch(DictionaryArray__indices(self)), - dictionary = function() `arrow::Array`$dispatch(DictionaryArray__dictionary(self)) - ) -) - diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 52ff492..8609f9b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -68,6 +68,18 @@ DictionaryArray__dictionary <- function(array){ .Call(`_arrow_DictionaryArray__dictionary` , array) } +StructArray__field <- function(array, i){ + .Call(`_arrow_StructArray__field` , array, i) +} + +StructArray__GetFieldByName <- function(array, name){ + .Call(`_arrow_StructArray__GetFieldByName` , array, name) +} + +StructArray__Flatten <- 
function(array){ + .Call(`_arrow_StructArray__Flatten` , array) +} + Array__as_vector <- function(array){ .Call(`_arrow_Array__as_vector` , array) } @@ -436,6 +448,14 @@ DictionaryType__ordered <- function(type){ .Call(`_arrow_DictionaryType__ordered` , type) } +StructType__GetFieldByName <- function(type, name){ + .Call(`_arrow_StructType__GetFieldByName` , type, name) +} + +StructType__GetFieldIndex <- function(type, name){ + .Call(`_arrow_StructType__GetFieldIndex` , type, name) +} + ipc___feather___TableWriter__SetDescription <- function(writer, description){ invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description)) } diff --git a/r/src/array.cpp b/r/src/array.cpp index 60fd7da..35da4b1 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -119,4 +119,25 @@ std::shared_ptr<arrow::Array> DictionaryArray__dictionary( return array->dictionary(); } +// [[arrow::export]] +std::shared_ptr<arrow::Array> StructArray__field( + const std::shared_ptr<arrow::StructArray>& array, int i) { + return array->field(i); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Array> StructArray__GetFieldByName( + const std::shared_ptr<arrow::StructArray>& array, const std::string& name) { + return array->GetFieldByName(name); +} + +// [[arrow::export]] +arrow::ArrayVector StructArray__Flatten( + const std::shared_ptr<arrow::StructArray>& array) { + int nf = array->num_fields(); + arrow::ArrayVector out(nf); + STOP_IF_NOT_OK(array->Flatten(arrow::default_memory_pool(), &out)); + return out; +} + #endif diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp index 17d0600..4e26f8d 100644 --- a/r/src/array__to_vector.cpp +++ b/r/src/array__to_vector.cpp @@ -345,6 +345,65 @@ class Converter_Dictionary : public Converter { } }; +class Converter_Struct : public Converter { + public: + explicit Converter_Struct(const ArrayVector& arrays) : Converter(arrays), converters() { + auto first_array = + 
internal::checked_cast<arrow::StructArray*>(Converter::arrays_[0].get()); + int nf = first_array->num_fields(); + for (int i = 0; i < nf; i++) { + converters.push_back(Converter::Make({first_array->field(i)})); + } + } + + SEXP Allocate(R_xlen_t n) const { + // allocate a data frame column to host each array + auto first_array = + internal::checked_cast<arrow::StructArray*>(Converter::arrays_[0].get()); + auto type = first_array->struct_type(); + int nf = first_array->num_fields(); + Rcpp::List out(nf); + Rcpp::CharacterVector colnames(nf); + for (int i = 0; i < nf; i++) { + out[i] = converters[i]->Allocate(n); + colnames[i] = type->child(i)->name(); + } + IntegerVector rn(2); + rn[0] = NA_INTEGER; + rn[1] = -n; + Rf_setAttrib(out, symbols::row_names, rn); + Rf_setAttrib(out, R_NamesSymbol, colnames); + Rf_setAttrib(out, R_ClassSymbol, Rf_mkString("data.frame")); + return out; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + int nf = converters.size(); + for (int i = 0; i < nf; i++) { + STOP_IF_NOT_OK(converters[i]->Ingest_all_nulls(VECTOR_ELT(data, i), start, n)); + } + return Status::OK(); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array, + R_xlen_t start, R_xlen_t n) const { + auto struct_array = internal::checked_cast<arrow::StructArray*>(array.get()); + int nf = converters.size(); + // Flatten() deals with merging of nulls + ArrayVector arrays(nf); + STOP_IF_NOT_OK(struct_array->Flatten(default_memory_pool(), &arrays)); + for (int i = 0; i < nf; i++) { + STOP_IF_NOT_OK( + converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), arrays[i], start, n)); + } + + return Status::OK(); + } + + private: + std::vector<std::shared_ptr<Converter>> converters; +}; + double ms_to_seconds(int64_t ms) { return static_cast<double>(ms / 1000); } class Converter_Date64 : public Converter { @@ -599,6 +658,9 @@ std::shared_ptr<Converter> Converter::Make(const ArrayVector& arrays) { case Type::DECIMAL: return 
std::make_shared<arrow::r::Converter_Decimal>(arrays); + case Type::STRUCT: + return std::make_shared<arrow::r::Converter_Struct>(arrays); + default: break; } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index f16179b..2352184 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -270,6 +270,53 @@ RcppExport SEXP _arrow_DictionaryArray__dictionary(SEXP array_sexp){ } #endif +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Array> StructArray__field(const std::shared_ptr<arrow::StructArray>& array, int i); +RcppExport SEXP _arrow_StructArray__field(SEXP array_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp); + Rcpp::traits::input_parameter<int>::type i(i_sexp); + return Rcpp::wrap(StructArray__field(array, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_StructArray__field(SEXP array_sexp, SEXP i_sexp){ + Rf_error("Cannot call StructArray__field(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Array> StructArray__GetFieldByName(const std::shared_ptr<arrow::StructArray>& array, const std::string& name); +RcppExport SEXP _arrow_StructArray__GetFieldByName(SEXP array_sexp, SEXP name_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp); + Rcpp::traits::input_parameter<const std::string&>::type name(name_sexp); + return Rcpp::wrap(StructArray__GetFieldByName(array, name)); +END_RCPP +} +#else +RcppExport SEXP _arrow_StructArray__GetFieldByName(SEXP array_sexp, SEXP name_sexp){ + Rf_error("Cannot call StructArray__GetFieldByName(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// array.cpp +#if defined(ARROW_R_WITH_ARROW) +arrow::ArrayVector StructArray__Flatten(const std::shared_ptr<arrow::StructArray>& array); +RcppExport SEXP _arrow_StructArray__Flatten(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp); + return Rcpp::wrap(StructArray__Flatten(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_StructArray__Flatten(SEXP array_sexp){ + Rf_error("Cannot call StructArray__Flatten(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // array__to_vector.cpp #if defined(ARROW_R_WITH_ARROW) SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array); @@ -1664,6 +1711,38 @@ RcppExport SEXP _arrow_DictionaryType__ordered(SEXP type_sexp){ } #endif +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Field> StructType__GetFieldByName(const std::shared_ptr<arrow::StructType>& type, const std::string& name); +RcppExport SEXP _arrow_StructType__GetFieldByName(SEXP type_sexp, SEXP name_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::StructType>&>::type type(type_sexp); + Rcpp::traits::input_parameter<const std::string&>::type name(name_sexp); + return Rcpp::wrap(StructType__GetFieldByName(type, name)); +END_RCPP +} +#else +RcppExport SEXP _arrow_StructType__GetFieldByName(SEXP type_sexp, SEXP name_sexp){ + Rf_error("Cannot call StructType__GetFieldByName(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// datatype.cpp +#if defined(ARROW_R_WITH_ARROW) +int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type, const std::string& name); +RcppExport SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::StructType>&>::type type(type_sexp); + Rcpp::traits::input_parameter<const std::string&>::type name(name_sexp); + return Rcpp::wrap(StructType__GetFieldIndex(type, name)); +END_RCPP +} +#else +RcppExport SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp){ + Rf_error("Cannot call StructType__GetFieldIndex(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___feather___TableWriter__SetDescription(const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer, const std::string& description); @@ -3289,6 +3368,9 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Array__Mask", (DL_FUNC) &_arrow_Array__Mask, 1}, { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, + { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, + { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, + { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1}, { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, @@ -3381,6 +3463,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, { 
"_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, + { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, + { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, { "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, { "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, { "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index ca1e2d6..c93d448 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -31,6 +31,7 @@ struct symbols { static SEXP xp; static SEXP dot_Internal; static SEXP inspect; + static SEXP row_names; }; } // namespace r } // namespace arrow @@ -172,9 +173,9 @@ inline std::shared_ptr<T> extract(SEXP x) { #include <arrow/ipc/feather.h> #include <arrow/ipc/reader.h> #include <arrow/ipc/writer.h> +#include <arrow/json/reader.h> #include <arrow/type.h> #include <arrow/util/compression.h> -#include <arrow/json/reader.h> RCPP_EXPOSED_ENUM_NODECL(arrow::Type::type) RCPP_EXPOSED_ENUM_NODECL(arrow::DateUnit) diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index 0ab881d..18920f2 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -269,4 +269,16 @@ bool DictionaryType__ordered(const std::shared_ptr<arrow::DictionaryType>& type) return type->ordered(); } +// [[arrow::export]] +std::shared_ptr<arrow::Field> StructType__GetFieldByName( + const std::shared_ptr<arrow::StructType>& type, const std::string& name) { + return type->GetFieldByName(name); +} + +// [[arrow::export]] +int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type, + const std::string& name) { + return type->GetFieldIndex(name); +} + #endif diff --git a/r/src/symbols.cpp b/r/src/symbols.cpp index de3fcf9..828033b 100644 --- 
a/r/src/symbols.cpp +++ b/r/src/symbols.cpp @@ -23,6 +23,7 @@ SEXP symbols::units = Rf_install("units"); SEXP symbols::xp = Rf_install(".:xp:."); SEXP symbols::dot_Internal = Rf_install(".Internal"); SEXP symbols::inspect = Rf_install("inspect"); +SEXP symbols::row_names = Rf_install("row.names"); void inspect(SEXP obj) { Rcpp::Shield<SEXP> call_inspect(Rf_lang2(symbols::inspect, obj)); diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R index 5faf721..6f77b3b 100644 --- a/r/tests/testthat/test-DataType.R +++ b/r/tests/testthat/test-DataType.R @@ -311,6 +311,13 @@ test_that("struct type works as expected", { x$children(), list(field("x", int32()), field("y", boolean())) ) + expect_equal(x$GetFieldIndex("x"), 0L) + expect_equal(x$GetFieldIndex("y"), 1L) + expect_equal(x$GetFieldIndex("z"), -1L) + + expect_equal(x$GetFieldByName("x"), field("x", int32())) + expect_equal(x$GetFieldByName("y"), field("y", boolean())) + expect_null(x$GetFieldByName("z")) }) test_that("DictionaryType works as expected (ARROW-3355)", { diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index 0321fb4..38b20a8 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -75,10 +75,12 @@ test_that("read_json_arrow() converts to tibble", { test_that("Can read json file with nested columns (ARROW-5503)", { tf <- tempfile() writeLines(' - { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} } - { "hello": 3.25, "world": null, "arr": [2], "nuf": null } - { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } } - { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } } + { "nuf": {} } + { "nuf": null } + { "nuf": { "ps": 78.0, "hello": "hi" } } + { "nuf": { "ps": 90.0, "hello": "bonjour" } } + { "nuf": { "hello": "ciao" } } + { "nuf": { "ps": 19 } } ', tf) tab1 <- read_json_arrow(tf, as_tibble = FALSE) @@ -91,13 +93,21 @@ test_that("Can read json file with 
nested columns (ARROW-5503)", { expect_equal( tab1$schema, schema( - hello = float64(), - world = boolean(), - yo = utf8(), - arr = list_of(int64()), - nuf = struct(ps = int64()) + nuf = struct(ps = float64(), hello = utf8()) ) ) + + struct_array <- tab1$column(0)$data()$chunk(0) + ps <- array(c(NA, NA, 78, 90, NA, 19)) + hello <- array(c(NA, NA, "hi", "bonjour", "ciao", NA)) + expect_equal(struct_array$field(0L), ps) + expect_equal(struct_array$GetFieldByName("ps"), ps) + expect_equal(struct_array$Flatten(), list(ps, hello)) + expect_equal( + struct_array$as_vector(), + data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE) + ) + # cannot yet test list and struct types in R api # tib <- as.data.frame(tab1)