This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit cad5c424526600b61037c3b7fffdb33ae4def2a4 Author: Neal Richardson <neal.p.richard...@gmail.com> AuthorDate: Thu Oct 3 16:54:31 2019 -0700 ARROW-3808: [R] Array extract, including Take method I implemented `Array__Take` and `Array__Filter` and then wrote `RecordBatch__Take` and `RecordBatch__Filter` to wrap them. This provides full R `[` semantics for these two classes. `ChunkedArray__Filter`, and `Table__Filter` around it, were not too bad to write, but `ChunkedArray__Take` and `Table__Take` are a bit more involved. `[` is also fully supported for those two classes. In addition to R numerics and logicals, `[` supports using an Array to Filter or Take on, and you can use a `ChunkedArray` too for `Table` and `ChunkedArray`. Closes #5531 from nealrichardson/array-extract and squashes the following commits: ff9e04ee8 <Neal Richardson> Lint a1eeb984f <Neal Richardson> Note about vctrs::vec_recycle() 038e2a846 <Neal Richardson> ChunkedArray__Take calls Concatenate if necessary 62446597a <Neal Richardson> Support filtering ChunkedArrays and Tables with ChunkedArray b62aac507 <Neal Richardson> Unskip test and address some PR feedback d6baf6dc9 <Neal Richardson> :rat: 9ec81d0f8 <Neal Richardson> Test some error handling and support passing Arrays (somewhat) to subset rows 71084fd21 <Neal Richardson> Add Table methods and update doc 9d4dfd2a8 <Neal Richardson> Move ChunkedArray__Take from R to Rcpp 58b0313bc <Neal Richardson> Add ChunkedArray__Filter and find a bug in Array->Filter() 96b0e3abd <Neal Richardson> Array__Filter and RecordBatch__Filter; start sketching ChunkedArray__Take in R b067be320 <Neal Richardson> Add Take and Filter for RecordBatch ab3c7b155 <Neal Richardson> Add Array__Take and [ method around that Authored-by: Neal Richardson <neal.p.richard...@gmail.com> Signed-off-by: Neal Richardson <neal.p.richard...@gmail.com> --- r/Makefile | 1 + r/NAMESPACE | 3 +- r/R/array.R | 79 +++++++++++++- r/R/arrowExports.R | 48 ++++++++- r/R/chunked-array.R | 58 
+++++++--- r/R/record-batch.R | 31 ++++-- r/R/table.R | 25 +++++ r/R/type.R | 6 +- r/man/ChunkedArray.Rd | 31 ++++-- r/man/RecordBatch.Rd | 4 + r/man/Table.Rd | 5 + r/man/array.Rd | 10 +- r/src/arrowExports.cpp | 194 +++++++++++++++++++++++++++++++--- r/src/chunkedarray.cpp | 4 +- r/src/compute.cpp | 193 +++++++++++++++++++++++++++++++++ r/tests/testthat/helper-expectation.R | 24 +++++ r/tests/testthat/test-Array.R | 55 +++++++++- r/tests/testthat/test-RecordBatch.R | 23 ++-- r/tests/testthat/test-Table.R | 29 +++-- r/tests/testthat/test-chunked-array.R | 35 ++++++ 20 files changed, 780 insertions(+), 78 deletions(-) diff --git a/r/Makefile b/r/Makefile index f5c35ee..80a7f2a 100644 --- a/r/Makefile +++ b/r/Makefile @@ -44,3 +44,4 @@ clean: -rm src/Makevars -rm src/Makevars.win -rm -rf arrow.Rcheck/ + -find . -name "*.orig" -delete diff --git a/r/NAMESPACE b/r/NAMESPACE index 3f880fb..3f4b5f0 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -4,6 +4,8 @@ S3method("!=",Object) S3method("$",RecordBatch) S3method("$",Table) S3method("==",Object) +S3method("[",Array) +S3method("[",ChunkedArray) S3method("[",RecordBatch) S3method("[",Table) S3method("[[",RecordBatch) @@ -42,7 +44,6 @@ S3method(tail,RecordBatch) S3method(tail,Table) S3method(type,Array) S3method(type,ChunkedArray) -S3method(type,Column) S3method(type,default) S3method(write_arrow,RecordBatchWriter) S3method(write_arrow,character) diff --git a/r/R/array.R b/r/R/array.R index 2c50edb..b294042 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -54,8 +54,16 @@ #' - `$data()`: return the underlying [ArrayData][ArrayData] #' - `$as_vector()`: convert to an R vector #' - `$ToString()`: string representation of the array -#' - `$Slice(offset, length = NULL)` : Construct a zero-copy slice of the array with the indicated offset and length. If length is `NULL`, the slice goes until the end of the array. +#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array +#' with the indicated offset and length. 
If length is `NULL`, the slice goes +#' until the end of the array. +#' - `$Take(i)`: return an `Array` with values at positions given by integers +#' (R vector or Array Array) `i`. +#' - `$Filter(i)`: return an `Array` with values at positions where logical +#' vector (or Arrow boolean Array) `i` is `TRUE`. #' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` : +#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the +#' data in the array to change its type. #' - `$View(type)`: Construct a zero-copy view of this array with the given type. #' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies #' within the array's internal data. This can be an expensive check, potentially `O(length)` @@ -85,6 +93,23 @@ Array <- R6Class("Array", shared_ptr(Array, Array__Slice2(self, offset, length)) } }, + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") # Support ChunkedArray too? + shared_ptr(Array, Array__Take(self, i)) + }, + Filter = function(i) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") + shared_ptr(Array, Array__Filter(self, i)) + }, RangeEquals = function(other, start_idx, end_idx, other_start_idx) { assert_is(other, "Array") Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) @@ -156,3 +181,55 @@ length.Array <- function(x) x$length() #' @export as.vector.Array <- function(x, mode) x$as_vector() + +filter_rows <- function(x, i, ...) 
{ + # General purpose function for [ row subsetting with R semantics + # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take + nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like + if (is.logical(i)) { + i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle() + x$Filter(i) + } else if (is.numeric(i)) { + if (all(i < 0)) { + # in R, negative i means "everything but i" + i <- setdiff(seq_len(nrows), -1 * i) + } + if (is.sliceable(i)) { + x$Slice(i[1] - 1, length(i)) + } else if (all(i > 0)) { + x$Take(i - 1) + } else { + stop("Cannot mix positive and negative indices", call. = FALSE) + } + } else if (is.Array(i, INTEGER_TYPES)) { + # NOTE: this doesn't do the - 1 offset + x$Take(i) + } else if (is.Array(i, "bool")) { + x$Filter(i) + } else { + # Unsupported cases + if (is.Array(i)) { + stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE) + } + stop("Cannot extract rows with an object of class ", class(i), call.=FALSE) + } +} + +#' @export +`[.Array` <- filter_rows + +is.sliceable <- function(i) { + # Determine whether `i` can be expressed as a $Slice() command + is.numeric(i) && + length(i) > 0 && + all(i > 0) && + identical(as.integer(i), i[1]:i[length(i)]) +} + +is.Array <- function(x, type = NULL) { + is_it <- inherits(x, c("Array", "ChunkedArray")) + if (is_it && !is.null(type)) { + is_it <- x$type$ToString() %in% type + } + is_it +} diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 40c05f9..4c3ba83 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -208,12 +208,12 @@ ChunkedArray__type <- function(chunked_array){ .Call(`_arrow_ChunkedArray__type` , chunked_array) } -ChunkArray__Slice1 <- function(chunked_array, offset){ - .Call(`_arrow_ChunkArray__Slice1` , chunked_array, offset) +ChunkedArray__Slice1 <- function(chunked_array, offset){ + .Call(`_arrow_ChunkedArray__Slice1` , chunked_array, offset) } -ChunkArray__Slice2 <- function(chunked_array, 
offset, length){ - .Call(`_arrow_ChunkArray__Slice2` , chunked_array, offset, length) +ChunkedArray__Slice2 <- function(chunked_array, offset, length){ + .Call(`_arrow_ChunkedArray__Slice2` , chunked_array, offset, length) } ChunkedArray__View <- function(array, type){ @@ -264,6 +264,46 @@ Table__cast <- function(table, schema, options){ .Call(`_arrow_Table__cast` , table, schema, options) } +Array__Take <- function(values, indices){ + .Call(`_arrow_Array__Take` , values, indices) +} + +RecordBatch__Take <- function(batch, indices){ + .Call(`_arrow_RecordBatch__Take` , batch, indices) +} + +ChunkedArray__Take <- function(values, indices){ + .Call(`_arrow_ChunkedArray__Take` , values, indices) +} + +Table__Take <- function(table, indices){ + .Call(`_arrow_Table__Take` , table, indices) +} + +Array__Filter <- function(values, filter){ + .Call(`_arrow_Array__Filter` , values, filter) +} + +RecordBatch__Filter <- function(batch, filter){ + .Call(`_arrow_RecordBatch__Filter` , batch, filter) +} + +ChunkedArray__Filter <- function(values, filter){ + .Call(`_arrow_ChunkedArray__Filter` , values, filter) +} + +ChunkedArray__FilterChunked <- function(values, filter){ + .Call(`_arrow_ChunkedArray__FilterChunked` , values, filter) +} + +Table__Filter <- function(table, filter){ + .Call(`_arrow_Table__Filter` , table, filter) +} + +Table__FilterChunked <- function(table, filter){ + .Call(`_arrow_Table__FilterChunked` , table, filter) +} + csv___ReadOptions__initialize <- function(options){ .Call(`_arrow_csv___ReadOptions__initialize` , options) } diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 58f4b4e..6e977f2 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -30,17 +30,26 @@ #' #' @section Methods: #' -#' - `$length()` -#' - `$chunk(i)` -#' - `$as_vector()` -#' - `$Slice(offset, length = NULL)` -#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))` -#' - `$null_count()` -#' - `$chunks()` -#' - `$num_chunks()` -#' - `$type()` -#' - 
`$View(type)`: Construct a zero-copy view of this chunked array with the given type. -#' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies +#' - `$length()`: Size in the number of elements this array contains +#' - `$chunk(i)`: Extract an `Array` chunk by integer position +#' - `$as_vector()`: convert to an R vector +#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array +#' with the indicated offset and length. If length is `NULL`, the slice goes +#' until the end of the array. +#' - `$Take(i)`: return a `ChunkedArray` with values at positions given by +#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be +#' coerced to an R vector before taking. +#' - `$Filter(i)`: return a `ChunkedArray` with values at positions where +#' logical vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`. +#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the +#' data in the array to change its type. +#' - `$null_count()`: The number of null entries in the array +#' - `$chunks()`: return a list of `Array`s +#' - `$num_chunks()`: integer number of chunks in the `ChunkedArray` +#' - `$type()`: logical type of data +#' - `$View(type)`: Construct a zero-copy view of this `ChunkedArray` with the +#' given type. +#' - `$Validate()`: Perform any validation checks to determine obvious inconsistencies #' within the array's internal data. 
This can be an expensive check, potentially `O(length)` #' #' @rdname ChunkedArray @@ -54,11 +63,31 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = Object, as_vector = function() ChunkedArray__as_vector(self), Slice = function(offset, length = NULL){ if (is.null(length)) { - shared_ptr(ChunkedArray, ChunkArray__Slice1(self, offset)) + shared_ptr(ChunkedArray, ChunkedArray__Slice1(self, offset)) } else { - shared_ptr(ChunkedArray, ChunkArray__Slice2(self, offset, length)) + shared_ptr(ChunkedArray, ChunkedArray__Slice2(self, offset, length)) } }, + Take = function(i) { + if (inherits(i, c("Array", "ChunkedArray"))) { + # Hack because ChunkedArray__Take doesn't take Arrays + i <- as.vector(i) + } else if (is.numeric(i)) { + i <- as.integer(i) + } + assert_is(i, "integer") + return(shared_ptr(ChunkedArray, ChunkedArray__Take(self, i))) + }, + Filter = function(i) { + if (is.logical(i)) { + i <- Array$create(i) + } + if (inherits(i, "ChunkedArray")) { + return(shared_ptr(ChunkedArray, ChunkedArray__FilterChunked(self, i))) + } + assert_is(i, "Array") + shared_ptr(ChunkedArray, ChunkedArray__Filter(self, i)) + }, cast = function(target_type, safe = TRUE, options = cast_options(safe)) { assert_is(target_type, "DataType") assert_is(options, "CastOptions") @@ -111,3 +140,6 @@ length.ChunkedArray <- function(x) x$length() #' @export as.vector.ChunkedArray <- function(x, mode) x$as_vector() + +#' @export +`[.ChunkedArray` <- filter_rows diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 6dcb18f..4464ad9 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -58,6 +58,10 @@ #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the #' indicated integer offset and going for the given length, or to the end #' of the table if `NULL`, the default. +#' - `$Take(i)`: return an `RecordBatch` with rows at positions given by +#' integers (R vector or Array Array) `i`. 
+#' - `$Filter(i)`: return an `RecordBatch` with rows at positions where logical +#' vector (or Arrow boolean Array) `i` is `TRUE`. #' - `$serialize()`: Returns a raw vector suitable for interprocess communication #' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter #' the schema of the record batch. @@ -109,7 +113,23 @@ RecordBatch <- R6Class("RecordBatch", inherit = Object, shared_ptr(RecordBatch, RecordBatch__Slice2(self, offset, length)) } }, - + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") + shared_ptr(RecordBatch, RecordBatch__Take(self, i)) + }, + Filter = function(i) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") + shared_ptr(RecordBatch, RecordBatch__Filter(self, i)) + }, serialize = function() ipc___SerializeRecordBatch__Raw(self), ToString = function() ToString_tabular(self), @@ -167,14 +187,7 @@ names.RecordBatch <- function(x) { #' @export `[.RecordBatch` <- function(x, i, j, ..., drop = FALSE) { if (!missing(i)) { - if (is.numeric(i) && - length(i) > 0 && - all(i > 0) && - identical(i, as(seq(i[1], i[length(i)], 1), class(i)))) { - x <- x$Slice(i[1] - 1, length(i)) - } else { - stop('Only row "Slicing" (taking rows a:b) currently supported', call. = FALSE) - } + x <- filter_rows(x, i, ...) } if (!missing(j)) { x <- x$select(j) diff --git a/r/R/table.R b/r/R/table.R index b3175e5..d9fc837 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -65,6 +65,11 @@ #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the #' indicated integer offset and going for the given length, or to the end #' of the table if `NULL`, the default. +#' - `$Take(i)`: return an `Table` with rows at positions given by +#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be +#' coerced to an R vector before taking. 
+#' - `$Filter(i)`: return an `Table` with rows at positions where logical +#' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`. #' - `$serialize(output_stream, ...)`: Write the table to the given #' [OutputStream] #' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter @@ -132,6 +137,26 @@ Table <- R6Class("Table", inherit = Object, shared_ptr(Table, Table__Slice2(self, offset, length)) } }, + Take = function(i) { + if (inherits(i, c("Array", "ChunkedArray"))) { + # Hack because ChunkedArray__Take doesn't take Arrays + i <- as.vector(i) + } else if (is.numeric(i)) { + i <- as.integer(i) + } + assert_is(i, "integer") + shared_ptr(Table, Table__Take(self, i)) + }, + Filter = function(i) { + if (is.logical(i)) { + i <- Array$create(i) + } + if (inherits(i, "ChunkedArray")) { + return(shared_ptr(Table, Table__FilterChunked(self, i))) + } + assert_is(i, "Array") + shared_ptr(Table, Table__Filter(self, i)) + }, Equals = function(other) { Table__Equals(self, other) diff --git a/r/R/type.R b/r/R/type.R index 51601e4..6364cef 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -86,6 +86,9 @@ DataType <- R6Class("DataType", DataType$create <- function(xp) shared_ptr(DataType, xp)$..dispatch() +INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0)) +FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double") + #' infer the arrow Array type from an R vector #' #' @param x an R vector @@ -103,9 +106,6 @@ type.Array <- function(x) x$type #' @export type.ChunkedArray <- function(x) x$type -#' @export -type.Column <- function(x) x$type - #----- metadata diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd index 0678d6e..0628b5c 100644 --- a/r/man/ChunkedArray.Rd +++ b/r/man/ChunkedArray.Rd @@ -27,17 +27,26 @@ various Arrays or R vectors. \code{chunked_array()} is an alias for it. 
\section{Methods}{ \itemize{ -\item \code{$length()} -\item \code{$chunk(i)} -\item \code{$as_vector()} -\item \code{$Slice(offset, length = NULL)} -\item \code{$cast(target_type, safe = TRUE, options = cast_options(safe))} -\item \code{$null_count()} -\item \code{$chunks()} -\item \code{$num_chunks()} -\item \code{$type()} -\item \code{$View(type)}: Construct a zero-copy view of this chunked array with the given type. -\item \code{$Validate()} : Perform any validation checks to determine obvious inconsistencies +\item \code{$length()}: Size in the number of elements this array contains +\item \code{$chunk(i)}: Extract an \code{Array} chunk by integer position +\item \code{$as_vector()}: convert to an R vector +\item \code{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +with the indicated offset and length. If length is \code{NULL}, the slice goes +until the end of the array. +\item \code{$Take(i)}: return a \code{ChunkedArray} with values at positions given by +integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be +coerced to an R vector before taking. +\item \code{$Filter(i)}: return a \code{ChunkedArray} with values at positions where +logical vector or Arrow boolean-type \code{(Chunked)Array} \code{i} is \code{TRUE}. +\item \code{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +data in the array to change its type. +\item \code{$null_count()}: The number of null entries in the array +\item \code{$chunks()}: return a list of \code{Array}s +\item \code{$num_chunks()}: integer number of chunks in the \code{ChunkedArray} +\item \code{$type()}: logical type of data +\item \code{$View(type)}: Construct a zero-copy view of this \code{ChunkedArray} with the +given type. +\item \code{$Validate()}: Perform any validation checks to determine obvious inconsistencies within the array's internal data. 
This can be an expensive check, potentially \code{O(length)} } } diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index 09bec65..d5ee46f 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -58,6 +58,10 @@ methods as well as "tidy select" expressions. \item \code{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the table if \code{NULL}, the default. +\item \code{$Take(i)}: return an \code{RecordBatch} with rows at positions given by +integers (R vector or Array Array) \code{i}. +\item \code{$Filter(i)}: return an \code{RecordBatch} with rows at positions where logical +vector (or Arrow boolean Array) \code{i} is \code{TRUE}. \item \code{$serialize()}: Returns a raw vector suitable for interprocess communication \item \code{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter the schema of the record batch. diff --git a/r/man/Table.Rd b/r/man/Table.Rd index aa6206b..bac7a7b 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -57,6 +57,11 @@ methods as well as "tidy select" expressions. \item \code{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the table if \code{NULL}, the default. +\item \code{$Take(i)}: return an \code{Table} with rows at positions given by +integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be +coerced to an R vector before taking. +\item \code{$Filter(i)}: return an \code{Table} with rows at positions where logical +vector or Arrow boolean-type \code{(Chunked)Array} \code{i} is \code{TRUE}. 
\item \code{$serialize(output_stream, ...)}: Write the table to the given \link{OutputStream} \item \code{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter diff --git a/r/man/array.Rd b/r/man/array.Rd index c3e06da..105404f 100644 --- a/r/man/array.Rd +++ b/r/man/array.Rd @@ -44,8 +44,16 @@ a == a \item \code{$data()}: return the underlying \link{ArrayData} \item \code{$as_vector()}: convert to an R vector \item \code{$ToString()}: string representation of the array -\item \code{$Slice(offset, length = NULL)} : Construct a zero-copy slice of the array with the indicated offset and length. If length is \code{NULL}, the slice goes until the end of the array. +\item \code{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +with the indicated offset and length. If length is \code{NULL}, the slice goes +until the end of the array. +\item \code{$Take(i)}: return an \code{Array} with values at positions given by integers +(R vector or Array Array) \code{i}. +\item \code{$Filter(i)}: return an \code{Array} with values at positions where logical +vector (or Arrow boolean Array) \code{i} is \code{TRUE}. \item \code{$RangeEquals(other, start_idx, end_idx, other_start_idx)} : +\item \code{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +data in the array to change its type. \item \code{$View(type)}: Construct a zero-copy view of this array with the given type. \item \code{$Validate()} : Perform any validation checks to determine obvious inconsistencies within the array's internal data. 
This can be an expensive check, potentially \code{O(length)} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index b00eca9..37914e6 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -810,34 +810,34 @@ RcppExport SEXP _arrow_ChunkedArray__type(SEXP chunked_array_sexp){ // chunkedarray.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::ChunkedArray> ChunkArray__Slice1(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset); -RcppExport SEXP _arrow_ChunkArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){ +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice1(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset); +RcppExport SEXP _arrow_ChunkedArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp); Rcpp::traits::input_parameter<int>::type offset(offset_sexp); - return Rcpp::wrap(ChunkArray__Slice1(chunked_array, offset)); + return Rcpp::wrap(ChunkedArray__Slice1(chunked_array, offset)); END_RCPP } #else -RcppExport SEXP _arrow_ChunkArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){ - Rf_error("Cannot call ChunkArray__Slice1(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ChunkedArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){ + Rf_error("Cannot call ChunkedArray__Slice1(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif // chunkedarray.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::ChunkedArray> ChunkArray__Slice2(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset, int length); -RcppExport SEXP _arrow_ChunkArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){ +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice2(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset, int length); +RcppExport SEXP _arrow_ChunkedArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp); Rcpp::traits::input_parameter<int>::type offset(offset_sexp); Rcpp::traits::input_parameter<int>::type length(length_sexp); - return Rcpp::wrap(ChunkArray__Slice2(chunked_array, offset, length)); + return Rcpp::wrap(ChunkedArray__Slice2(chunked_array, offset, length)); END_RCPP } #else -RcppExport SEXP _arrow_ChunkArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){ - Rf_error("Cannot call ChunkArray__Slice2(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ChunkedArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){ + Rf_error("Cannot call ChunkedArray__Slice2(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -1037,6 +1037,166 @@ RcppExport SEXP _arrow_Table__cast(SEXP table_sexp, SEXP schema_sexp, SEXP optio } #endif +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Array> Array__Take(const std::shared_ptr<arrow::Array>& values, const std::shared_ptr<arrow::Array>& indices); +RcppExport SEXP _arrow_Array__Take(SEXP values_sexp, SEXP indices_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type values(values_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type indices(indices_sexp); + return Rcpp::wrap(Array__Take(values, indices)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Array__Take(SEXP values_sexp, SEXP indices_sexp){ + Rf_error("Cannot call Array__Take(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::RecordBatch> RecordBatch__Take(const std::shared_ptr<arrow::RecordBatch>& batch, const std::shared_ptr<arrow::Array>& indices); +RcppExport SEXP _arrow_RecordBatch__Take(SEXP batch_sexp, SEXP indices_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type indices(indices_sexp); + return Rcpp::wrap(RecordBatch__Take(batch, indices)); +END_RCPP +} +#else +RcppExport SEXP _arrow_RecordBatch__Take(SEXP batch_sexp, SEXP indices_sexp){ + Rf_error("Cannot call RecordBatch__Take(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Take(const std::shared_ptr<arrow::ChunkedArray>& values, Rcpp::IntegerVector& indices); +RcppExport SEXP _arrow_ChunkedArray__Take(SEXP values_sexp, SEXP indices_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type values(values_sexp); + Rcpp::traits::input_parameter<Rcpp::IntegerVector&>::type indices(indices_sexp); + return Rcpp::wrap(ChunkedArray__Take(values, indices)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__Take(SEXP values_sexp, SEXP indices_sexp){ + Rf_error("Cannot call ChunkedArray__Take(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Table> Table__Take(const std::shared_ptr<arrow::Table>& table, Rcpp::IntegerVector& indices); +RcppExport SEXP _arrow_Table__Take(SEXP table_sexp, SEXP indices_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); + Rcpp::traits::input_parameter<Rcpp::IntegerVector&>::type indices(indices_sexp); + return Rcpp::wrap(Table__Take(table, indices)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__Take(SEXP table_sexp, SEXP indices_sexp){ + Rf_error("Cannot call Table__Take(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Array> Array__Filter(const std::shared_ptr<arrow::Array>& values, const std::shared_ptr<arrow::Array>& filter); +RcppExport SEXP _arrow_Array__Filter(SEXP values_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type values(values_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type filter(filter_sexp); + return Rcpp::wrap(Array__Filter(values, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Array__Filter(SEXP values_sexp, SEXP filter_sexp){ + Rf_error("Cannot call Array__Filter(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::RecordBatch> RecordBatch__Filter(const std::shared_ptr<arrow::RecordBatch>& batch, const std::shared_ptr<arrow::Array>& filter); +RcppExport SEXP _arrow_RecordBatch__Filter(SEXP batch_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type filter(filter_sexp); + return Rcpp::wrap(RecordBatch__Filter(batch, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_RecordBatch__Filter(SEXP batch_sexp, SEXP filter_sexp){ + Rf_error("Cannot call RecordBatch__Filter(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Filter(const std::shared_ptr<arrow::ChunkedArray>& values, const std::shared_ptr<arrow::Array>& filter); +RcppExport SEXP _arrow_ChunkedArray__Filter(SEXP values_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type values(values_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type filter(filter_sexp); + return Rcpp::wrap(ChunkedArray__Filter(values, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__Filter(SEXP values_sexp, SEXP filter_sexp){ + Rf_error("Cannot call ChunkedArray__Filter(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__FilterChunked(const std::shared_ptr<arrow::ChunkedArray>& values, const std::shared_ptr<arrow::ChunkedArray>& filter); +RcppExport SEXP _arrow_ChunkedArray__FilterChunked(SEXP values_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type values(values_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type filter(filter_sexp); + return Rcpp::wrap(ChunkedArray__FilterChunked(values, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__FilterChunked(SEXP values_sexp, SEXP filter_sexp){ + Rf_error("Cannot call ChunkedArray__FilterChunked(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Table> Table__Filter(const std::shared_ptr<arrow::Table>& table, const std::shared_ptr<arrow::Array>& filter); +RcppExport SEXP _arrow_Table__Filter(SEXP table_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Array>&>::type filter(filter_sexp); + return Rcpp::wrap(Table__Filter(table, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__Filter(SEXP table_sexp, SEXP filter_sexp){ + Rf_error("Cannot call Table__Filter(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// compute.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Table> Table__FilterChunked(const std::shared_ptr<arrow::Table>& table, const std::shared_ptr<arrow::ChunkedArray>& filter); +RcppExport SEXP _arrow_Table__FilterChunked(SEXP table_sexp, SEXP filter_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ChunkedArray>&>::type filter(filter_sexp); + return Rcpp::wrap(Table__FilterChunked(table, filter)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__FilterChunked(SEXP table_sexp, SEXP filter_sexp){ + Rf_error("Cannot call Table__FilterChunked(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // csv.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(List_ options); @@ -4842,8 +5002,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, - { "_arrow_ChunkArray__Slice1", (DL_FUNC) &_arrow_ChunkArray__Slice1, 2}, - { "_arrow_ChunkArray__Slice2", (DL_FUNC) &_arrow_ChunkArray__Slice2, 3}, + { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, + { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, @@ -4856,6 +5016,16 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkedArray__cast", (DL_FUNC) &_arrow_ChunkedArray__cast, 3}, { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + { "_arrow_Array__Take", (DL_FUNC) &_arrow_Array__Take, 2}, + { "_arrow_RecordBatch__Take", (DL_FUNC) &_arrow_RecordBatch__Take, 2}, + { "_arrow_ChunkedArray__Take", (DL_FUNC) &_arrow_ChunkedArray__Take, 2}, + { "_arrow_Table__Take", (DL_FUNC) &_arrow_Table__Take, 2}, + { "_arrow_Array__Filter", (DL_FUNC) &_arrow_Array__Filter, 2}, + { "_arrow_RecordBatch__Filter", (DL_FUNC) &_arrow_RecordBatch__Filter, 2}, + { "_arrow_ChunkedArray__Filter", (DL_FUNC) &_arrow_ChunkedArray__Filter, 2}, + { "_arrow_ChunkedArray__FilterChunked", (DL_FUNC) &_arrow_ChunkedArray__FilterChunked, 2}, + { "_arrow_Table__Filter", (DL_FUNC) &_arrow_Table__Filter, 2}, + { "_arrow_Table__FilterChunked", (DL_FUNC) &_arrow_Table__FilterChunked, 2}, { 
"_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index aef2a0e..a97cab6 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -55,13 +55,13 @@ std::shared_ptr<arrow::DataType> ChunkedArray__type( } // [[arrow::export]] -std::shared_ptr<arrow::ChunkedArray> ChunkArray__Slice1( +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice1( const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset) { return chunked_array->Slice(offset); } // [[arrow::export]] -std::shared_ptr<arrow::ChunkedArray> ChunkArray__Slice2( +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice2( const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int offset, int length) { return chunked_array->Slice(offset, length); } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index b3a25b7..4979424 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -82,4 +82,197 @@ std::shared_ptr<arrow::Table> Table__cast( return arrow::Table::Make(schema, std::move(columns), table->num_rows()); } +// [[arrow::export]] +std::shared_ptr<arrow::Array> Array__Take(const std::shared_ptr<arrow::Array>& values, + const std::shared_ptr<arrow::Array>& indices) { + std::shared_ptr<arrow::Array> out; + arrow::compute::FunctionContext context; + arrow::compute::TakeOptions options; + STOP_IF_NOT_OK(arrow::compute::Take(&context, *values, *indices, options, &out)); + return out; +} + +// [[arrow::export]] +std::shared_ptr<arrow::RecordBatch> RecordBatch__Take( + const std::shared_ptr<arrow::RecordBatch>& batch, + const std::shared_ptr<arrow::Array>& indices) { + int ncols = batch->num_columns(); + auto nrows = indices->length(); + + std::vector<std::shared_ptr<arrow::Array>> columns(ncols); 
+ + for (R_xlen_t j = 0; j < ncols; j++) { + columns[j] = Array__Take(batch->column(j), indices); + } + + return arrow::RecordBatch::Make(batch->schema(), nrows, columns); +} + +// [[arrow::export]] +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Take( + const std::shared_ptr<arrow::ChunkedArray>& values, Rcpp::IntegerVector& indices) { + int num_chunks = values->num_chunks(); + std::vector<std::shared_ptr<arrow::Array>> new_chunks(1); // Hard-coded 1 for now + // 1) If there's only one chunk, just take from it + if (num_chunks == 1) { + new_chunks[0] = Array__Take( + values->chunk(0), arrow::r::Array__from_vector(indices, arrow::int32(), true)); + return std::make_shared<arrow::ChunkedArray>(std::move(new_chunks)); + } + + std::shared_ptr<arrow::Array> current_chunk; + std::shared_ptr<arrow::Array> current_indices; + int offset = 0; + int len; + int min_i = indices[0]; + int max_i = indices[0]; + + // 2) See if all i are in the same chunk, call Array__Take on that + for (R_xlen_t i = 1; i < indices.size(); i++) { + if (indices[i] < min_i) { + min_i = indices[i]; + } else if (indices[i] > max_i) { + max_i = indices[i]; + } + } + for (R_xlen_t chk = 0; chk < num_chunks; chk++) { + current_chunk = values->chunk(chk); + len = current_chunk->length(); + if (min_i >= offset & max_i < offset + len) { + for (R_xlen_t i = 0; i < indices.size(); i++) { + // Subtract offset from all indices + indices[i] -= offset; + } + current_indices = arrow::r::Array__from_vector(indices, arrow::int32(), true); + new_chunks[0] = Array__Take(current_chunk, current_indices); + return std::make_shared<arrow::ChunkedArray>(std::move(new_chunks)); + } + offset += len; + } + + // TODO 3) If they're not all in the same chunk but are sorted, we can slice + // the indices (offset appropriately) and take from each chunk + + // 4) Last resort: concatenate the chunks + STOP_IF_NOT_OK( + arrow::Concatenate(values->chunks(), arrow::default_memory_pool(), &current_chunk)); + current_indices = 
arrow::r::Array__from_vector(indices, arrow::int32(), true); + new_chunks[0] = Array__Take(current_chunk, current_indices); + return std::make_shared<arrow::ChunkedArray>(std::move(new_chunks)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Table> Table__Take(const std::shared_ptr<arrow::Table>& table, + Rcpp::IntegerVector& indices) { + auto ncols = table->num_columns(); + std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(ncols); + + for (R_xlen_t j = 0; j < ncols; j++) { + columns[j] = ChunkedArray__Take(table->column(j), indices); + } + + return arrow::Table::Make(table->schema(), columns); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Array> Array__Filter(const std::shared_ptr<arrow::Array>& values, + const std::shared_ptr<arrow::Array>& filter) { + std::shared_ptr<arrow::Array> out; + arrow::compute::FunctionContext context; + STOP_IF_NOT_OK(arrow::compute::Filter(&context, *values, *filter, &out)); + return out; +} + +// [[arrow::export]] +std::shared_ptr<arrow::RecordBatch> RecordBatch__Filter( + const std::shared_ptr<arrow::RecordBatch>& batch, + const std::shared_ptr<arrow::Array>& filter) { + int ncols = batch->num_columns(); + + std::vector<std::shared_ptr<arrow::Array>> columns(ncols); + + for (R_xlen_t j = 0; j < ncols; j++) { + columns[j] = Array__Filter(batch->column(j), filter); + } + + return arrow::RecordBatch::Make(batch->schema(), columns[0]->length(), columns); +} + +// [[arrow::export]] +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Filter( + const std::shared_ptr<arrow::ChunkedArray>& values, + const std::shared_ptr<arrow::Array>& filter) { + int num_chunks = values->num_chunks(); + std::vector<std::shared_ptr<arrow::Array>> new_chunks(num_chunks); + std::shared_ptr<arrow::Array> current_chunk; + int offset = 0; + int len; + + for (R_xlen_t i = 0; i < num_chunks; i++) { + current_chunk = values->chunk(i); + len = current_chunk->length(); + new_chunks[i] = Array__Filter(current_chunk, filter->Slice(offset, len)); + 
offset += len; + } + + return std::make_shared<arrow::ChunkedArray>(std::move(new_chunks)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::ChunkedArray> ChunkedArray__FilterChunked( + const std::shared_ptr<arrow::ChunkedArray>& values, + const std::shared_ptr<arrow::ChunkedArray>& filter) { + int num_chunks = values->num_chunks(); + std::vector<std::shared_ptr<arrow::Array>> new_chunks(num_chunks); + std::shared_ptr<arrow::Array> current_chunk; + std::shared_ptr<arrow::ChunkedArray> current_chunked_filter; + std::shared_ptr<arrow::Array> current_filter; + + int offset = 0; + int len; + + for (R_xlen_t i = 0; i < num_chunks; i++) { + current_chunk = values->chunk(i); + len = current_chunk->length(); + current_chunked_filter = filter->Slice(offset, len); + if (current_chunked_filter->num_chunks() == 1) { + current_filter = current_chunked_filter->chunk(0); + } else { + // Concatenate the chunks of the filter so we have an Array + STOP_IF_NOT_OK(arrow::Concatenate(current_chunked_filter->chunks(), + arrow::default_memory_pool(), &current_filter)); + } + new_chunks[i] = Array__Filter(current_chunk, current_filter); + offset += len; + } + + return std::make_shared<arrow::ChunkedArray>(std::move(new_chunks)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Table> Table__Filter(const std::shared_ptr<arrow::Table>& table, + const std::shared_ptr<arrow::Array>& filter) { + auto ncols = table->num_columns(); + std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(ncols); + + for (R_xlen_t j = 0; j < ncols; j++) { + columns[j] = ChunkedArray__Filter(table->column(j), filter); + } + + return arrow::Table::Make(table->schema(), columns); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Table> Table__FilterChunked( + const std::shared_ptr<arrow::Table>& table, + const std::shared_ptr<arrow::ChunkedArray>& filter) { + auto ncols = table->num_columns(); + std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(ncols); + + for (R_xlen_t j = 0; j < ncols; j++) { + 
columns[j] = ChunkedArray__FilterChunked(table->column(j), filter); + } + + return arrow::Table::Make(table->schema(), columns); +} #endif diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R new file mode 100644 index 0000000..d2a54cd --- /dev/null +++ b/r/tests/testthat/helper-expectation.R @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_vector <- function(x, y, ...) { + expect_equal(as.vector(x), y, ...) +} + +expect_data_frame <- function(x, y, ...) { + expect_equal(as.data.frame(x), y, ...) 
+} diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 016b137..766facd 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -26,13 +26,13 @@ test_that("Array", { y <- x$Slice(10) expect_equal(y$type, int32()) expect_equal(length(y), 15L) - expect_equal(y$as_vector(), c(1:10, 1:5)) + expect_vector(y, c(1:10, 1:5)) expect_true(x$RangeEquals(y, 10, 24, 0)) z <- x$Slice(10, 5) expect_equal(z$type, int32()) expect_equal(z$length(), 5L) - expect_equal(z$as_vector(), c(1:5)) + expect_vector(z, c(1:5)) expect_true(x$RangeEquals(z, 10, 15, 0)) x_dbl <- Array$create(c(1,2,3,4,5,6)) @@ -462,3 +462,54 @@ test_that("Array$Validate()", { a <- Array$create(1:10) expect_error(a$Validate(), NA) }) + +test_that("is.Array", { + a <- Array$create(1, type = int32()) + expect_true(is.Array(a)) + expect_true(is.Array(a, "int32")) + expect_true(is.Array(a, c("int32", "int16"))) + expect_false(is.Array(a, "utf8")) + expect_true(is.Array(a$View(float32())), "float32") + expect_false(is.Array(1)) + expect_true(is.Array(ChunkedArray$create(1, 2))) +}) + +test_that("Array$Take()", { + a <- Array$create(10:20) + expect_equal(as.vector(a$Take(c(4, 2))), c(14, 12)) +}) + +test_that("[ method on Array", { + vec <- 11:20 + a <- Array$create(vec) + expect_vector(a[5:9], vec[5:9]) + expect_vector(a[c(9, 3, 5)], vec[c(9, 3, 5)]) + expect_vector(a[rep(c(TRUE, FALSE), 5)], vec[c(1, 3, 5, 7, 9)]) + expect_vector(a[-4], vec[-4]) + expect_vector(a[-1], vec[-1]) +}) + +test_that("[ accepts Arrays and otherwise handles bad input", { + vec <- 11:20 + a <- Array$create(vec) + ind <- c(9, 3, 5) + expect_error( + a[Array$create(ind)], + "Cannot extract rows with an Array of type double" + ) + expect_vector(a[Array$create(ind - 1, type = int8())], vec[ind]) + expect_vector(a[Array$create(ind - 1, type = uint8())], vec[ind]) + expect_error( + # Not currently supported + a[ChunkedArray$create(8, 2, 4, type = uint8())], + 'i must be a "Array"' + ) + + filt 
<- seq_along(vec) %in% ind + expect_vector(a[Array$create(filt)], vec[filt]) + + expect_error( + a["string"], + "Cannot extract rows with an object of class character" + ) +}) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 6ee6300..3777b99 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -87,16 +87,20 @@ test_that("RecordBatch", { }) test_that("[ on RecordBatch", { - expect_identical(as.data.frame(batch[6:7,]), tbl[6:7,]) - expect_identical(as.data.frame(batch[c(6, 7),]), tbl[6:7,]) - expect_identical(as.data.frame(batch[6:7, 2:4]), tbl[6:7, 2:4]) - expect_identical(as.data.frame(batch[, c("dbl", "fct")]), tbl[, c(2, 5)]) + expect_data_frame(batch[6:7,], tbl[6:7,]) + expect_data_frame(batch[c(6, 7),], tbl[6:7,]) + expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) - expect_error( - batch[c(3, 5, 7),], - 'Only row "Slicing" (taking rows a:b) currently supported', - fixed = TRUE + expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame( + batch[rep(c(FALSE, TRUE), 5),], + tbl[c(2, 4, 6, 8, 10),] ) + # bool Array + expect_data_frame(batch[batch$lgl,], tbl[tbl$lgl,]) + # int Array + expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) }) test_that("[[ and $ on RecordBatch", { @@ -246,7 +250,7 @@ test_that("record_batch() auto splices (ARROW-5718)", { batch2 <- record_batch(!!!df) expect_equal(batch1, batch2) expect_equal(batch1$schema, schema(x = int32(), y = utf8())) - expect_equivalent(as.data.frame(batch1), df) + expect_data_frame(batch1, df) batch3 <- record_batch(df, z = 1:10) batch4 <- record_batch(!!!df, z = 1:10) @@ -275,4 +279,3 @@ test_that("record_batch() only auto splice data frames", { regexp = "only data frames are allowed as unnamed arguments to be auto spliced" ) }) - diff --git 
a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 61fb546..3c97f4b 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -94,14 +94,26 @@ test_that("[, [[, $ for Table", { expect_identical(names(tab), names(tbl)) - expect_identical(as.data.frame(tab[6:7,]), tbl[6:7,]) - expect_identical(as.data.frame(tab[6:7, 2:4]), tbl[6:7, 2:4]) - expect_identical(as.data.frame(tab[, c("dbl", "fct")]), tbl[, c(2, 5)]) - expect_identical(as.vector(tab[, "chr", drop = TRUE]), tbl$chr) - - expect_identical(as.vector(tab[["int"]]), tbl$int) - expect_identical(as.vector(tab$int), tbl$int) - expect_identical(as.vector(tab[[4]]), tbl$chr) + expect_data_frame(tab[6:7,], tbl[6:7,]) + expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_vector(tab[, "chr", drop = TRUE], tbl$chr) + # Take within a single chunk + expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame(tab[rep(c(FALSE, TRUE), 5),], tbl[c(2, 4, 6, 8, 10),]) + # bool ChunkedArray (with one chunk) + expect_data_frame(tab[tab$lgl,], tbl[tbl$lgl,]) + # ChunkedArray with multiple chunks + c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE) + c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE) + ca <- ChunkedArray$create(c1, c2) + expect_data_frame(tab[ca,], tbl[c(1, 3, 4, 8, 9),]) + # int Array + expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + + expect_vector(tab[["int"]], tbl$int) + expect_vector(tab$int, tbl$int) + expect_vector(tab[[4]], tbl$chr) expect_null(tab$qwerty) expect_null(tab[["asdf"]]) expect_error(tab[[c(4, 3)]], 'length(i) not equal to 1', fixed = TRUE) @@ -233,4 +245,3 @@ test_that("==.Table", { expect_true(all.equal(tab1, tab2)) expect_equal(tab1, tab2) }) - diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 0e2a21b..02a9261 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -351,3 +351,38 @@ 
test_that("ChunkedArray$Validate()", { a <- ChunkedArray$create(1:10) expect_error(a$Validate(), NA) }) + +test_that("[ ChunkedArray", { + one_chunk <- chunked_array(2:11) + x <- chunked_array(1:10, 31:40, 51:55) + # Slice + expect_vector(x[8:12], c(8:10, 31:32)) + # Take from same chunk + expect_vector(x[c(11, 15, 12)], c(31, 35, 32)) + # Take from multiple chunks (calls Concatenate) + expect_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3)) + + # Filter (with recycling) + expect_vector( + one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)], + c(3, 6, 8, 11) + ) + # Filter where both are 1-chunk + expect_vector( + one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))], + c(3, 6, 8, 11) + ) + # Filter multi-chunk with logical (-> Array) + expect_vector( + x[c(FALSE, TRUE, FALSE, FALSE, TRUE)], + c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55) + ) + # Filter with a chunked array with different sized chunks + p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE) + p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) + filt <- ChunkedArray$create(p1, p2, p2) + expect_vector( + x[filt], + c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55) + ) +})