This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
     new 21ad7ac  ARROW-6340 [R] Implements low-level bindings to Dataset classes
21ad7ac is described below

commit 21ad7ac1162eab188a1e15923fb1de5b795337ec
Author:     Neal Richardson <neal.p.richard...@gmail.com>
AuthorDate: Fri Nov 8 11:22:55 2019 -0800

    ARROW-6340 [R] Implements low-level bindings to Dataset classes

    This patch follows the end-to-end C++ test on #5675 and implements enough
    classes in R to do the whole path from DataSourceDiscovery -> DataSource ->
    Dataset -> ScannerBuilder -> Scanner -> Table -> data.frame. It also
    implements dplyr logic for `select` and `filter` (which plug into the
    `ScannerBuilder$Project()` and `$Filter()` methods), as well as the other
    dplyr verbs implemented in a recent patch. See
    r/tests/testthat/test-dataset.R for examples of the high-level user
    interface a [...]

    To do:
    - Vignette (deferred to https://issues.apache.org/jira/browse/ARROW-7092)
    - Resolve question/hack of FileSystem shared_ptr/global (deferred to https://issues.apache.org/jira/browse/ARROW-7094)
    - ScalarExpression creation in r/src/expression.cpp is limited to logical/integer/double (deferred to https://issues.apache.org/jira/browse/ARROW-7093)
    - Behavior when hitting unsupported queries: https://issues.apache.org/jira/browse/ARROW-7095

    Closes #5454 from romainfrancois/ARROW-6340/Dataset and squashes the following commits:

    9dfba2ea8 <Neal Richardson> Add -DARROW_DS_STATIC to configure.win
    be6621c45 <Neal Richardson> Document all the things
    5e34d760d <Neal Richardson> Some review feedback and cleanup
    c65aaa179 <Neal Richardson> Add hive partitioning and start documenting
    ca3017c7c <Neal Richardson> ScannerBuilder->UseThreads(), plus some assorted fixes and temporary hacks
    e64bf35df <Neal Richardson> Cleanup some TODOs
    f9183a1f9 <Neal Richardson> Add some more input validation
    f1954fe39 <Neal Richardson> Update NAMESPACE
    e9a61888d <Neal Richardson> Make test dataset have multiple files. Start with partitioning (but it errors)
    c45340ba7 <Neal Richardson> and/or/not
    3043a0378 <Neal Richardson> Simple dataset creation with open_dataset()
    c68eab564 <Neal Richardson> dplyr on a Dataset
    072f46d55 <Neal Richardson> More expression support
    568755b73 <Neal Richardson> Add filter expressions. The bear dances!
    440054357 <Neal Richardson> Add Project, schema, names methods
    62a0809e4 <Neal Richardson> Almost a table
    77d3fea73 <Neal Richardson> Hey look a Dataset
    f7b92c93b <Neal Richardson> Look for libarrow_dataset
    7e43ebf6f <Romain Francois> support for std::vector<std::shared_ptr<T>> in Rcpp function input
    a68bb3bfa <Romain Francois> dataset types

Lead-authored-by: Neal Richardson <neal.p.richard...@gmail.com>
Co-authored-by: Romain Francois <rom...@rstudio.com>
Signed-off-by: Neal Richardson <neal.p.richard...@gmail.com>
---
 cpp/src/arrow/dataset/api.h | 1 +
 r/DESCRIPTION | 5 +-
 r/Makefile | 4 +-
 r/NAMESPACE | 26 ++-
 r/R/array.R | 2 +-
 r/R/arrow-package.R | 4 +-
 r/R/arrowExports.R | 108 +++++++++
 r/R/dataset.R | 313 ++++++++++++++++++++++++++
 r/R/dplyr.R | 46 +++-
 r/R/expression.R | 130 ++++++++++-
 r/R/filesystem.R | 6 +-
 r/R/schema.R | 4 +-
 r/R/util.R | 6 +
 r/README.Rmd | 4 +-
 r/README.md | 9 +-
 r/configure | 4 +-
 r/configure.win | 4 +-
 r/man/ArrayData.Rd | 1 -
 r/man/ChunkedArray.Rd | 29 ++-
 r/man/Codec.Rd | 1 -
 r/man/CsvReadOptions.Rd | 1 -
 r/man/CsvTableReader.Rd | 3 +-
 r/man/DataSource.Rd | 55 +++++
 r/man/DataType.Rd | 1 -
 r/man/Dataset.Rd | 39 ++++
 r/man/DictionaryType.Rd | 1 -
 r/man/Expression.Rd | 34 +++
 r/man/FeatherTableReader.Rd | 17 +-
 r/man/FeatherTableWriter.Rd | 17 +-
 r/man/Field.Rd | 1 -
 r/man/FileStats.Rd | 10 +-
 r/man/FileSystem.Rd | 29 ++-
 r/man/FixedWidthType.Rd | 1 -
 r/man/InputStream.Rd | 19 +-
 r/man/MemoryPool.Rd | 1 -
 r/man/Message.Rd | 1 -
 r/man/MessageReader.Rd | 1 -
 r/man/OutputStream.Rd | 15 +-
 r/man/ParquetFileReader.Rd | 5 +-
 r/man/ParquetReaderProperties.Rd | 7 +-
 r/man/PartitionScheme.Rd | 30 +++
 r/man/RecordBatch.Rd | 43 ++--
 r/man/RecordBatchReader.Rd | 11 +-
 r/man/RecordBatchWriter.Rd | 9 +-
 r/man/Scanner.Rd | 29 +++
 r/man/Schema.Rd | 7 +-
 r/man/Selector.Rd | 7 +-
 r/man/Table.Rd | 41 ++--
 r/man/array.Rd | 39 ++--
 r/man/arrow-package.Rd | 2 +-
 r/man/buffer.Rd | 9 +-
 r/man/cast_options.Rd | 8 +-
 r/man/compression.Rd | 1 -
 r/man/dictionary.Rd | 3 +-
 r/man/hive_partition.Rd | 28 +++
 r/man/open_dataset.Rd | 36 +++
 r/man/read_delim_arrow.Rd | 74 ++++--
 r/man/read_parquet.Rd | 9 +-
 r/man/reexports.Rd | 2 +-
 r/man/write_arrow.Rd | 2 +-
 r/man/write_parquet.Rd | 28 ++-
 r/src/arrowExports.cpp | 450 +++++++++++++++++++++++++++++++++++++
 r/src/arrow_types.h | 25 +++
 r/src/dataset.cpp | 133 +++++++++++
 r/src/expression.cpp | 113 ++++++++++
 r/tests/testthat/test-dataset.R | 134 +++++++++++
 r/tests/testthat/test-expression.R | 27 ++-
 r/tests/testthat/test-schema.R | 1 +
 r/tools/autobrew | 2 +-
 69 files changed, 2008 insertions(+), 260 deletions(-)

diff --git a/cpp/src/arrow/dataset/api.h b/cpp/src/arrow/dataset/api.h index f9e49f2..998e686 100644 --- a/cpp/src/arrow/dataset/api.h +++ b/cpp/src/arrow/dataset/api.h @@ -22,5 +22,6 @@ #include "arrow/dataset/file_base.h" #include "arrow/dataset/file_csv.h" #include "arrow/dataset/file_feather.h" +#include "arrow/dataset/file_parquet.h" #include "arrow/dataset/filter.h" #include "arrow/dataset/scanner.h" diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 2960e5f..228aa05 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -36,8 +36,8 @@ Imports: tidyselect, utils, vctrs -Roxygen: list(markdown = TRUE) -RoxygenNote: 6.1.1 +Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") +RoxygenNote: 6.1.99.9001 VignetteBuilder: knitr Suggests: covr, @@ -61,6 +61,7 @@ Collate: 'compression.R' 'compute.R' 'csv.R' + 'dataset.R' 'dictionary.R' 'record-batch.R' 'table.R' diff --git a/r/Makefile b/r/Makefile index 80a7f2a..be6735f 100644 ---
a/r/Makefile +++ b/r/Makefile @@ -19,11 +19,11 @@ VERSION=$(shell grep ^Version DESCRIPTION | sed s/Version:\ //) ARROW_R_DEV="TRUE" doc: - R --slave -e 'devtools::document(); rmarkdown::render("README.Rmd")' + R --slave -e 'rmarkdown::render("README.Rmd"); roxygen2::roxygenize()' -git add --all man/*.Rd test: - export ARROW_R_DEV=$(ARROW_R_DEV) && R CMD INSTALL --install-tests --no-byte-compile . + export ARROW_R_DEV=$(ARROW_R_DEV) && R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile . export NOT_CRAN=true && R --slave -e 'library(testthat); setwd(file.path(.libPaths()[1], "arrow", "tests")); system.time(test_check("arrow", filter="${file}", reporter=ifelse(nchar("${r}"), "${r}", "summary")))' deps: diff --git a/r/NAMESPACE b/r/NAMESPACE index 4134a5a..798b939 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -13,23 +13,27 @@ S3method("[[",Table) S3method(Ops,Array) S3method(Ops,ChunkedArray) S3method(Ops,Expression) +S3method(Ops,array_expression) S3method(all,equal.Object) S3method(as.data.frame,RecordBatch) S3method(as.data.frame,Table) S3method(as.raw,Buffer) S3method(as.vector,Array) S3method(as.vector,ChunkedArray) -S3method(as.vector,Expression) +S3method(as.vector,array_expression) S3method(dim,RecordBatch) S3method(dim,Table) S3method(head,RecordBatch) S3method(head,Table) S3method(length,Array) S3method(length,ChunkedArray) +S3method(names,Dataset) S3method(names,RecordBatch) +S3method(names,ScannerBuilder) +S3method(names,Schema) S3method(names,Table) S3method(print,"arrow-enum") -S3method(print,Expression) +S3method(print,array_expression) S3method(read_message,InputStream) S3method(read_message,MessageReader) S3method(read_message,default) @@ -53,12 +57,14 @@ S3method(type,default) S3method(write_arrow,RecordBatchWriter) S3method(write_arrow,character) S3method(write_arrow,raw) +export(AndExpression) export(Array) export(Buffer) export(BufferOutputStream) export(BufferReader) export(ChunkedArray) export(Codec) +export(ComparisonExpression) export(CompressedInputStream) export(CompressedOutputStream) export(CompressionType) @@ -66,16 +72,23 @@ export(CsvConvertOptions) export(CsvParseOptions) export(CsvReadOptions) export(CsvTableReader) +export(DataSource) +export(DataSourceDiscovery) +export(Dataset) export(DateUnit) +export(Expression) export(FeatherTableReader) export(FeatherTableWriter) export(Field) +export(FieldExpression) export(FileMode) export(FileOutputStream) export(FileStats) export(FileSystem) +export(FileSystemDataSourceDiscovery) export(FileType) export(FixedSizeBufferWriter) +export(HivePartitionScheme) export(JsonParseOptions) export(JsonReadOptions) export(JsonTableReader) @@ -84,16 +97,23 @@ export(MemoryMappedFile) export(MessageReader) export(MessageType) export(MockOutputStream) +export(NotExpression) +export(OrExpression) export(ParquetFileReader) export(ParquetReaderProperties) export(ParquetVersionType) +export(PartitionScheme) export(RandomAccessFile) export(ReadableFile) export(RecordBatchFileReader) export(RecordBatchFileWriter) export(RecordBatchStreamReader) export(RecordBatchStreamWriter) +export(ScalarExpression) +export(Scanner) +export(ScannerBuilder) export(Schema) +export(SchemaPartitionScheme) export(Selector) export(StatusCode) export(SubTreeFileSystem) @@ -133,6 +153,7 @@ export(mmap_open) export(null) export(num_range) export(one_of) +export(open_dataset) export(read_arrow) export(read_csv_arrow) export(read_delim_arrow) @@ -170,6 +191,7 @@ importFrom(methods,as) importFrom(purrr,map) importFrom(purrr,map2) 
importFrom(purrr,map_int) +importFrom(purrr,map_lgl) importFrom(rlang,"%||%") importFrom(rlang,abort) importFrom(rlang,dots_n) diff --git a/r/R/array.R b/r/R/array.R index d05d91d..5f5581f 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -189,7 +189,7 @@ filter_rows <- function(x, i, ...) { # General purpose function for [ row subsetting with R semantics # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like - if (inherits(i, "Expression")) { + if (inherits(i, "array_expression")) { # Evaluate it i <- as.vector(i) } diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 2ed85ec..78a900a 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -16,7 +16,7 @@ # under the License. #' @importFrom R6 R6Class -#' @importFrom purrr map map_int map2 +#' @importFrom purrr map map_int map_lgl map2 #' @importFrom assertthat assert_that #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env #' @importFrom Rcpp sourceCpp @@ -34,7 +34,7 @@ "group_vars", "ungroup", "mutate", "arrange", "rename", "pull" ) ) - for (cl in c("RecordBatch", "Table", "arrow_dplyr_query")) { + for (cl in c("Dataset", "RecordBatch", "Table", "arrow_dplyr_query")) { for (m in dplyr_methods) { s3_register(m, cl) } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 7bf434a..fc7c1a0 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -340,6 +340,66 @@ csv___TableReader__Read <- function(table_reader){ .Call(`_arrow_csv___TableReader__Read` , table_reader) } +dataset___FSDSDiscovery__Make <- function(fs, selector){ + .Call(`_arrow_dataset___FSDSDiscovery__Make` , fs, selector) +} + +dataset___DSDiscovery__Finish <- function(discovery){ + .Call(`_arrow_dataset___DSDiscovery__Finish` , discovery) +} + +dataset___DSDiscovery__Inspect <- function(discovery){ + .Call(`_arrow_dataset___DSDiscovery__Inspect` , discovery) +} + +dataset___DSDiscovery__SetPartitionScheme <- function(discovery, part){ + invisible(.Call(`_arrow_dataset___DSDiscovery__SetPartitionScheme` , discovery, part)) +} + +dataset___SchemaPartitionScheme <- function(schm){ + .Call(`_arrow_dataset___SchemaPartitionScheme` , schm) +} + +dataset___HivePartitionScheme <- function(schm){ + .Call(`_arrow_dataset___HivePartitionScheme` , schm) +} + +dataset___Dataset__create <- function(sources, schm){ + .Call(`_arrow_dataset___Dataset__create` , sources, schm) +} + +dataset___Dataset__schema <- function(ds){ + .Call(`_arrow_dataset___Dataset__schema` , ds) +} + +dataset___Dataset__NewScan <- function(ds){ + .Call(`_arrow_dataset___Dataset__NewScan` , ds) +} + +dataset___ScannerBuilder__Project <- function(sb, cols){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__Project` , sb, cols)) +} + +dataset___ScannerBuilder__Filter <- function(sb, expr){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter` , sb, expr)) +} + +dataset___ScannerBuilder__UseThreads <- function(sb, threads){ + invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads` , sb, threads)) +} + +dataset___ScannerBuilder__schema <- function(sb){ + .Call(`_arrow_dataset___ScannerBuilder__schema` , sb) +} + +dataset___ScannerBuilder__Finish <- function(sb){ + .Call(`_arrow_dataset___ScannerBuilder__Finish` , sb) +} + +dataset___Scanner__ToTable <- function(scn){ + .Call(`_arrow_dataset___Scanner__ToTable` , scn) +} + shared_ptr_is_null <- function(xp){ .Call(`_arrow_shared_ptr_is_null` , xp) } @@ -536,6 +596,54 @@ 
ListType__value_type <- function(type){ .Call(`_arrow_ListType__value_type` , type) } +dataset___expr__field_ref <- function(name){ + .Call(`_arrow_dataset___expr__field_ref` , name) +} + +dataset___expr__equal <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__equal` , lhs, rhs) +} + +dataset___expr__not_equal <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__not_equal` , lhs, rhs) +} + +dataset___expr__greater <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__greater` , lhs, rhs) +} + +dataset___expr__greater_equal <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__greater_equal` , lhs, rhs) +} + +dataset___expr__less <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__less` , lhs, rhs) +} + +dataset___expr__less_equal <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__less_equal` , lhs, rhs) +} + +dataset___expr__and <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__and` , lhs, rhs) +} + +dataset___expr__or <- function(lhs, rhs){ + .Call(`_arrow_dataset___expr__or` , lhs, rhs) +} + +dataset___expr__not <- function(lhs){ + .Call(`_arrow_dataset___expr__not` , lhs) +} + +dataset___expr__scalar <- function(x){ + .Call(`_arrow_dataset___expr__scalar` , x) +} + +dataset___expr__ToString <- function(x){ + .Call(`_arrow_dataset___expr__ToString` , x) +} + ipc___feather___TableWriter__SetDescription <- function(writer, description){ invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description)) } diff --git a/r/R/dataset.R b/r/R/dataset.R new file mode 100644 index 0000000..cac14ec --- /dev/null +++ b/r/R/dataset.R @@ -0,0 +1,313 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Open a multi-file dataset +#' +#' @param path String path to a directory containing the data files +#' @param schema [Schema] for the dataset. If `NULL` (the default), the schema +#' will be inferred from the files +#' @param partition One of +#' * A `Schema`, in which case the file paths relative to `path` will be +#' parsed, and path segments will be matched with the schema fields. For +#' example, `schema(year = int16(), month = int8())` would create partitions +#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc. +#' * A `HivePartitionScheme`, as returned by [hive_partition()] +#' * `NULL`, the default, for no partitioning +#' @param ... additional arguments passed to `DataSourceDiscovery$create()` +#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data, +#' or call `$NewScan()` to construct a query directly. +#' @export +#' @seealso [PartitionScheme] for defining partitioning +#' @include arrow-package.R +open_dataset <- function (path, schema = NULL, partition = NULL, ...) { + dsd <- DataSourceDiscovery$create(path, ...) 
+ if (is.null(schema)) { + schema <- dsd$Inspect() + } + if (!is.null(partition)) { + if (inherits(partition, "Schema")) { + partition <- SchemaPartitionScheme$create(partition) + } + assert_is(partition, "PartitionScheme") + dsd$SetPartitionScheme(partition) + } + Dataset$create(list(dsd$Finish()), schema) +} + +#' Multi-file datasets +#' +#' @description +#' Arrow Datasets allow you to query against data that has been split across +#' multiple files. This sharding of data may indicate partitioning, which +#' can accelerate queries that only touch some partitions (files). +#' +#' @section Factory: +#' The `Dataset$create()` factory method instantiates a `Dataset` and +#' takes the following arguments: +#' * `sources`: a list of [DataSource] objects +#' * `schema`: a [Schema] +#' @section Methods: +#' +#' - `$NewScan()`: Returns a [ScannerBuilder] for building a query +#' - `$schema`: Active binding, returns the [Schema] of the Dataset +#' @export +#' @seealso [open_dataset()] for a simple way to create a Dataset that has a +#' single `DataSource`. +Dataset <- R6Class("Dataset", inherit = Object, + public = list( + #' @description + #' Start a new scan of the data + #' @return A [ScannerBuilder] + NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)) + ), + active = list( + #' @description + #' Return the Dataset's `Schema` + schema = function() shared_ptr(Schema, dataset___Dataset__schema(self)) + ) +) +Dataset$create <- function(sources, schema) { + assert_is_list_of(sources, "DataSource") + assert_is(schema, "Schema") + shared_ptr(Dataset, dataset___Dataset__create(sources, schema)) +} + +#' @export +names.Dataset <- function(x) names(x$schema) + +#' Data sources for a Dataset +#' +#' @description +#' A [Dataset] can have one or more `DataSource`s. A `DataSource` contains one +#' or more `DataFragments`, such as files, of a common type and partition +#' scheme. `DataSourceDiscovery` is used to create a `DataSource`, inspect the +#' [Schema] of the fragments contained in it, and declare a partition scheme. +#' `FileSystemDataSourceDiscovery` is a subclass of `DataSourceDiscovery` for +#' discovering files in the local file system, the only currently supported +#' file system. +#' @section Factory: +#' The `DataSourceDiscovery$create()` factory method instantiates a +#' `DataSourceDiscovery` and takes the following arguments: +#' * `path`: A string file path containing data files +#' * `filesystem`: Currently only "local" is supported +#' * `format`: Currently only "parquet" is supported +#' * `allow_non_existent`: logical: is `path` allowed to not exist? Default +#' `FALSE`. See [Selector]. +#' * `recursive`: logical: should files be discovered in subdirectories of +#' `path`? Default `TRUE`. +#' * `...` Additional arguments passed to the [FileSystem] `$create()` method +#' +#' `FileSystemDataSourceDiscovery$create()` is a lower-level factory method and +#' takes the following arguments: +#' * `filesystem`: A [FileSystem] +#' * `selector`: A [Selector] +#' * `format`: Currently only "parquet" is supported +#' @section Methods: +#' `DataSource` has no defined methods. It is just passed to `Dataset$create()`.
+#' +#' `DataSourceDiscovery` and its subclasses have the following methods: +#' +#' - `$Inspect()`: Walks the files in the directory and returns a common [Schema] +#' - `$SetPartitionScheme(part)`: Takes a [PartitionScheme] +#' - `$Finish()`: Returns a `DataSource` +#' @rdname DataSource +#' @name DataSource +#' @seealso [Dataset] for what to do with a `DataSource` +#' @export +DataSource <- R6Class("DataSource", inherit = Object) + +#' @usage NULL +#' @format NULL +#' @rdname DataSource +#' @export +DataSourceDiscovery <- R6Class("DataSourceDiscovery", inherit = Object, + public = list( + Finish = function() shared_ptr(DataSource, dataset___DSDiscovery__Finish(self)), + SetPartitionScheme = function(part) { + assert_is(part, "PartitionScheme") + dataset___DSDiscovery__SetPartitionScheme(self, part) + self + }, + Inspect = function() shared_ptr(Schema, dataset___DSDiscovery__Inspect(self)) + ) +) +DataSourceDiscovery$create <- function(path, + filesystem = c("auto", "local"), + format = c("parquet"), + allow_non_existent = FALSE, + recursive = TRUE, + ...) { + if (!inherits(filesystem, "FileSystem")) { + filesystem <- match.arg(filesystem) + if (filesystem == "auto") { + # When there are other FileSystems supported, detect e.g. S3 from path + filesystem <- "local" + } + filesystem <- list( + local = LocalFileSystem + # We'll register other file systems here + )[[filesystem]]$create(...) + } + selector <- Selector$create( + path, + allow_non_existent = allow_non_existent, + recursive = recursive + ) + # This may also require different initializers + FileSystemDataSourceDiscovery$create(filesystem, selector, format) +} + +#' @usage NULL +#' @format NULL +#' @rdname DataSource +#' @export +FileSystemDataSourceDiscovery <- R6Class("FileSystemDataSourceDiscovery", + inherit = DataSourceDiscovery +) +FileSystemDataSourceDiscovery$create <- function(filesystem, + selector, + format = "parquet") { + assert_is(filesystem, "FileSystem") + assert_is(selector, "Selector") + format <- match.arg(format) # Only parquet for now + shared_ptr( + FileSystemDataSourceDiscovery, + dataset___FSDSDiscovery__Make(filesystem, selector) + ) +} + +#' Scan the contents of a dataset +#' +#' @description +#' A `Scanner` iterates over a [Dataset]'s data fragments and returns data +#' according to given row filtering and column projection. Use a +#' `ScannerBuilder`, from a `Dataset`'s `$NewScan()` method, to construct one. +#' +#' @section Methods: +#' `ScannerBuilder` has the following methods: +#' +#' - `$Project(cols)`: Indicate that the scan should only return columns given +#' by `cols`, a character vector of column names +#' - `$Filter(expr)`: Filter rows by an [Expression]. +#' - `$UseThreads(threads)`: logical: should the scan use multithreading? +#' The method's default input is `TRUE`, but you must call the method to enable +#' multithreading because the scanner default is `FALSE`. +#' - `$schema`: Active binding, returns the [Schema] of the Dataset +#' - `$Finish()`: Returns a `Scanner` +#' +#' `Scanner` currently has a single method, `$ToTable()`, which evaluates the +#' query and returns an Arrow [Table].
+#' @rdname Scanner +#' @name Scanner +#' @export +Scanner <- R6Class("Scanner", inherit = Object, + public = list( + ToTable = function() shared_ptr(Table, dataset___Scanner__ToTable(self)) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname Scanner +#' @export +ScannerBuilder <- R6Class("ScannerBuilder", inherit = Object, + public = list( + Project = function(cols) { + assert_is(cols, "character") + dataset___ScannerBuilder__Project(self, cols) + self + }, + Filter = function(expr) { + assert_is(expr, "Expression") + dataset___ScannerBuilder__Filter(self, expr) + self + }, + UseThreads = function(threads = option_use_threads()) { + dataset___ScannerBuilder__UseThreads(self, threads) + self + }, + Finish = function() unique_ptr(Scanner, dataset___ScannerBuilder__Finish(self)) + ), + active = list( + schema = function() shared_ptr(Schema, dataset___ScannerBuilder__schema(self)) + ) +) + +#' @export +names.ScannerBuilder <- function(x) names(x$schema) + +#' Define a partition scheme for a DataSource +#' +#' @description +#' Pass a `PartitionScheme` to a [DataSourceDiscovery]'s `$SetPartitionScheme()` +#' method to indicate how the file's paths should be interpreted to define +#' partitioning. +#' +#' A `SchemaPartitionScheme` describes how to interpret raw path segments, in +#' order. For example, `schema(year = int16(), month = int8())` would define +#' partitions for file paths like "2019/01/file.parquet", +#' "2019/02/file.parquet", etc. +#' +#' A `HivePartitionScheme` is for Hive-style partitioning, which embeds field +#' names and values in path segments, such as +#' "/year=2019/month=2/data.parquet". Because fields are named in the path +#' segments, order does not matter. +#' @section Factory: +#' Both `SchemaPartitionScheme$create()` and `HivePartitionScheme$create()` +#' factory methods take a [Schema] as a single input argument. The helper +#' function `hive_partition(...)` is shorthand for +#' `HivePartitionScheme$create(schema(...))`. +#' @name PartitionScheme +#' @rdname PartitionScheme +#' @export +PartitionScheme <- R6Class("PartitionScheme", inherit = Object) +#' @usage NULL +#' @format NULL +#' @rdname PartitionScheme +#' @export +SchemaPartitionScheme <- R6Class("SchemaPartitionScheme", inherit = PartitionScheme) +SchemaPartitionScheme$create <- function(schema) { + shared_ptr(SchemaPartitionScheme, dataset___SchemaPartitionScheme(schema)) +} + +#' @usage NULL +#' @format NULL +#' @rdname PartitionScheme +#' @export +HivePartitionScheme <- R6Class("HivePartitionScheme", inherit = PartitionScheme) +HivePartitionScheme$create <- function(schema) { + shared_ptr(HivePartitionScheme, dataset___HivePartitionScheme(schema)) +} + +#' Construct a Hive partition scheme +#' +#' Hive partitioning embeds field names and values in path segments, such as +#' "/year=2019/month=2/data.parquet". A [HivePartitionScheme][PartitionScheme] +#' is used to parse that in Dataset creation. +#' +#' Because fields are named in the path segments, order of fields passed to +#' `hive_partition()` does not matter. +#' @param ... named list of [data types][data-type], passed to [schema()] +#' @return A `HivePartitionScheme` +#' @examples +#' \donttest{ +#' hive_partition(year = int16(), month = int8()) +#' } +hive_partition <- function(...) { + schm <- schema(...) + HivePartitionScheme$create(schm) +} diff --git a/r/R/dplyr.R b/r/R/dplyr.R index b46eb14..c244ac9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -37,14 +37,14 @@ select.arrow_dplyr_query <- function(.data, ...) 
{ # This S3 method is registered on load if dplyr is present column_select(arrow_dplyr_query(.data), !!!enquos(...)) } -select.Table <- select.RecordBatch <- select.arrow_dplyr_query +select.Dataset <- select.Table <- select.RecordBatch <- select.arrow_dplyr_query #' @importFrom tidyselect vars_rename rename.arrow_dplyr_query <- function(.data, ...) { # This S3 method is registered on load if dplyr is present column_select(arrow_dplyr_query(.data), !!!enquos(...), .FUN = vars_rename) } -rename.Table <- rename.RecordBatch <- rename.arrow_dplyr_query +rename.Dataset <- rename.Table <- rename.RecordBatch <- rename.arrow_dplyr_query column_select <- function(.data, ..., .FUN = vars_select) { out <- .FUN(names(.data$selected_columns), !!!enquos(...)) @@ -75,26 +75,35 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { .data <- arrow_dplyr_query(.data) # Eval filters to generate Expressions with references to Arrays. filter_data <- env() + data_is_dataset <- inherits(.data$.data, "Dataset") for (v in unique(unlist(lapply(filts, all.vars)))) { # Map any renamed vars to their name in the underlying Arrow schema if (!(v %in% names(.data$selected_columns))) { stop("object '", v, "' not found", call. = FALSE) } old_var_name <- .data$selected_columns[v] - this <- .data$.data[[old_var_name]] + if (data_is_dataset) { + # Make a FieldExpression + this <- FieldExpression$create(old_var_name) + } else { + # Get the Array + this <- .data$.data[[old_var_name]] + } assign(v, this, envir = filter_data) } dm <- new_data_mask(filter_data) filters <- try(lapply(filts, function (f) { # This should yield an Expression eval_tidy(f, dm) - }), silent = TRUE) + }), silent = FALSE) # If that errored, bail out and collect(), with a warning # TODO: consider re-evaling with the as.vector Arrays and yielding logical vector if (inherits(filters, "try-error")) { # TODO: only show this in some debug mode? + # TODO: if data_is_dataset, don't auto-collect? warning( "Filter expression not implemented in arrow, pulling data into R", + immediate. = TRUE, call. = FALSE ) return(dplyr::filter(dplyr::collect(.data), ...)) @@ -109,7 +118,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { } .data } -filter.Table <- filter.RecordBatch <- filter.arrow_dplyr_query +filter.Dataset <- filter.Table <- filter.RecordBatch <- filter.arrow_dplyr_query collect.arrow_dplyr_query <- function(x, ...) { # This S3 method is registered on load if dplyr is present @@ -121,7 +130,17 @@ collect.arrow_dplyr_query <- function(x, ...) { } # Pull only the selected rows and cols into R - df <- as.data.frame(x$.data[x$filtered_rows, colnames]) + if (inherits(x$.data, "Dataset")) { + scanner_builder <- x$.data$NewScan() + scanner_builder$UseThreads() + scanner_builder$Project(colnames) + if (!isTRUE(x$filtered_rows)) { + scanner_builder$Filter(x$filtered_rows) + } + df <- as.data.frame(scanner_builder$Finish()$ToTable()) + } else { + df <- as.data.frame(x$.data[x$filtered_rows, colnames]) + } # In case variables were renamed, apply those names names(df) <- names(colnames) @@ -133,6 +152,7 @@ collect.arrow_dplyr_query <- function(x, ...) { } collect.Table <- as.data.frame.Table collect.RecordBatch <- as.data.frame.RecordBatch +collect.Dataset <- function(x, ...) 
stop("not implemented") #' @importFrom tidyselect vars_pull pull.arrow_dplyr_query <- function(.data, var = -1) { @@ -142,7 +162,7 @@ pull.arrow_dplyr_query <- function(.data, var = -1) { .data$selected_columns <- stats::setNames(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } -pull.Table <- pull.RecordBatch <- pull.arrow_dplyr_query +pull.Dataset <- pull.Table <- pull.RecordBatch <- pull.arrow_dplyr_query summarise.arrow_dplyr_query <- function(.data, ...) { # This S3 method is registered on load if dplyr is present @@ -155,7 +175,7 @@ summarise.arrow_dplyr_query <- function(.data, ...) { # TODO: determine whether work can be pushed down to Arrow dplyr::summarise(dplyr::collect(.data), ...) } -summarise.Table <- summarise.RecordBatch <- summarise.arrow_dplyr_query +summarise.Dataset <- summarise.Table <- summarise.RecordBatch <- summarise.arrow_dplyr_query group_by.arrow_dplyr_query <- function(.data, ..., add = FALSE) { # This S3 method is registered on load if dplyr is present @@ -163,31 +183,33 @@ group_by.arrow_dplyr_query <- function(.data, ..., add = FALSE) { .data$group_by_vars <- dplyr::group_by_prepare(.data, ..., add = add)$group_names .data } -group_by.Table <- group_by.RecordBatch <- group_by.arrow_dplyr_query +group_by.Dataset <- group_by.Table <- group_by.RecordBatch <- group_by.arrow_dplyr_query # This S3 method is registered on load if dplyr is present groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x)) -groups.Table <- groups.RecordBatch <- function(x) NULL +groups.Dataset <- groups.Table <- groups.RecordBatch <- function(x) NULL # This S3 method is registered on load if dplyr is present group_vars.arrow_dplyr_query <- function(x) x$group_by_vars -group_vars.Table <- group_vars.RecordBatch <- function(x) NULL +group_vars.Dataset <- group_vars.Table <- group_vars.RecordBatch <- function(x) NULL ungroup.arrow_dplyr_query <- function(x, ...) { # This S3 method is registered on load if dplyr is present x$group_by_vars <- character() x } -ungroup.Table <- ungroup.RecordBatch <- force +ungroup.Dataset <- ungroup.Table <- ungroup.RecordBatch <- force mutate.arrow_dplyr_query <- function(.data, ...) { # This S3 method is registered on load if dplyr is present dplyr::mutate(dplyr::collect(arrow_dplyr_query(.data)), ...) } mutate.Table <- mutate.RecordBatch <- mutate.arrow_dplyr_query +mutate.Dataset <- function(.data, ...) stop("not implemented") arrange.arrow_dplyr_query <- function(.data, ...) { # This S3 method is registered on load if dplyr is present dplyr::arrange(dplyr::collect(arrow_dplyr_query(.data)), ...) } arrange.Table <- arrange.RecordBatch <- arrange.arrow_dplyr_query +arrange.Dataset <- function(.data, ...) stop("not implemented") diff --git a/r/R/expression.R b/r/R/expression.R index e114d0d..90151ba 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -15,22 +15,144 @@ # specific language governing permissions and limitations # under the License. +#' @include arrowExports.R #' @export Ops.Array <- function(e1, e2) { - structure(list(fun = .Generic, args = list(e1, e2)), class = "Expression") + structure(list(fun = .Generic, args = list(e1, e2)), class = "array_expression") } #' @export Ops.ChunkedArray <- Ops.Array #' @export -Ops.Expression <- Ops.Array +Ops.array_expression <- Ops.Array #' @export -as.vector.Expression <- function(x, ...) { +as.vector.array_expression <- function(x, ...) { x$args <- lapply(x$args, as.vector) do.call(x$fun, x$args) } #' @export -print.Expression <- function(x, ...) 
print(as.vector(x)) +print.array_expression <- function(x, ...) print(as.vector(x)) + +########### + +#' Arrow expressions +#' +#' @description +#' `Expression`s are used to define filter logic for passing to a [Dataset] +#' [Scanner]. `FieldExpression`s refer to columns in the `Dataset` and are +#' compared to `ScalarExpression`s using `ComparisonExpression`s. +#' `ComparisonExpression`s may be combined with `AndExpression` or +#' `OrExpression` and negated with `NotExpression`. +#' +#' @section Factory: +#' `FieldExpression$create(name)` takes a string name as input. This string should +#' refer to a column in a `Dataset` at the time it is evaluated, but you can +#' construct a `FieldExpression` independently of any `Dataset`. +#' +#' `ScalarExpression$create(x)` takes a scalar (length-1) R value as input. +#' +#' `ComparisonExpression$create(OP, e1, e2)` takes a string operator name +#' (e.g. "==", "!=", ">", etc.) and two `Expression` objects. +#' +#' `AndExpression$create(e1, e2)` and `OrExpression$create(e1, e2)` take +#' two `Expression` objects, while `NotExpression$create(e1)` takes a single +#' `Expression`. +#' @name Expression +#' @rdname Expression +#' @export +Expression <- R6Class("Expression", inherit = Object, + public = list( + ToString = function() dataset___expr__ToString(self) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +FieldExpression <- R6Class("FieldExpression", inherit = Expression) +FieldExpression$create <- function(name) { + assert_is(name, "character") + assert_that(length(name) == 1) + shared_ptr(FieldExpression, dataset___expr__field_ref(name)) +} + +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +ScalarExpression <- R6Class("ScalarExpression", inherit = Expression) +ScalarExpression$create <- function(x) { + shared_ptr(ScalarExpression, dataset___expr__scalar(x)) +} + +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +ComparisonExpression <- R6Class("ComparisonExpression", inherit = Expression) +ComparisonExpression$create <- function(OP, e1, e2) { + comp_func <- comparison_function_map[[OP]] + if (is.null(comp_func)) { + stop(OP, " is not a supported comparison function", call. 
= FALSE) + } + shared_ptr(ComparisonExpression, comp_func(e1, e2)) +} + +comparison_function_map <- list( + "==" = dataset___expr__equal, + "!=" = dataset___expr__not_equal, + ">" = dataset___expr__greater, + ">=" = dataset___expr__greater_equal, + "<" = dataset___expr__less, + "<=" = dataset___expr__less_equal +) + +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +AndExpression <- R6Class("AndExpression", inherit = Expression) +AndExpression$create <- function(e1, e2) { + shared_ptr(AndExpression, dataset___expr__and(e1, e2)) +} +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +OrExpression <- R6Class("OrExpression", inherit = Expression) +OrExpression$create <- function(e1, e2) { + shared_ptr(OrExpression, dataset___expr__or(e1, e2)) +} +#' @usage NULL +#' @format NULL +#' @rdname Expression +#' @export +NotExpression <- R6Class("NotExpression", inherit = Expression) +NotExpression$create <- function(e1) { + shared_ptr(NotExpression, dataset___expr__not(e1)) +} + +#' @export +Ops.Expression <- function(e1, e2) { + if (.Generic == "!") { + return(NotExpression$create(e1)) + } + # Check for non-expressions and convert to ScalarExpressions + if (!inherits(e1, "Expression")) { + e1 <- ScalarExpression$create(e1) + } + if (!inherits(e2, "Expression")) { + e2 <- ScalarExpression$create(e2) + } + if (.Generic == "&") { + AndExpression$create(e1, e2) + } else if (.Generic == "|") { + OrExpression$create(e1, e2) + } else { + ComparisonExpression$create(.Generic, e1, e2) + } +} diff --git a/r/R/filesystem.R b/r/R/filesystem.R index ce507cc..6be7fcc 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -233,7 +233,11 @@ FileSystem <- R6Class("FileSystem", inherit = Object, #' @export LocalFileSystem <- R6Class("LocalFileSystem", inherit = FileSystem) LocalFileSystem$create <- function() { - shared_ptr(LocalFileSystem, fs___LocalFileSystem__create()) + out <- shared_ptr(LocalFileSystem, fs___LocalFileSystem__create()) + # HACK: For some reason, the filesystem APIs use raw pointers, not shared_ptr + # so we have to preserve them in some more global scope to use them as expected + options(arrow.localfs = out) + out } diff --git a/r/R/schema.R b/r/R/schema.R index 1123015..668963f 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -59,7 +59,6 @@ Schema <- R6Class("Schema", names = function() Schema__names(self) ) ) - Schema$create <- function(...) shared_ptr(Schema, schema_(.fields(list2(...)))) #' @param ... named list of [data types][data-type] @@ -68,6 +67,9 @@ Schema$create <- function(...) 
shared_ptr(Schema, schema_(.fields(list2(...)))) # TODO (npr): add examples once ARROW-5505 merges schema <- Schema$create +#' @export +names.Schema <- function(x) x$names + #' read a Schema from a stream #' #' @param stream a stream diff --git a/r/R/util.R b/r/R/util.R index 0b122f9..4318997 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -34,3 +34,9 @@ assert_is <- function(object, class) { msg <- paste(substitute(object), "must be a", oxford_paste(class, "or")) assert_that(inherits(object, class), msg = msg) } + +assert_is_list_of <- function(object, class) { + msg <- paste(substitute(object), "must be a list of", oxford_paste(class, "or")) + assert_that(is.list(object), msg = msg) + assert_that(all(map_lgl(object, ~inherits(., class))), msg = msg) +} diff --git a/r/README.Rmd b/r/README.Rmd index 2442b83..d34b218 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -154,11 +154,11 @@ set the `ARROW_R_DEV` environment variable to `TRUE` (optionally, add it to your`~/.Renviron` file to persist across sessions) so that the `data-raw/codegen.R` file is used for code generation. -The codegen.R script has these dependencies: +The codegen.R script has these additional dependencies: ```r remotes::install_github("romainfrancois/decor") -install.packages(c("dplyr", "purrr", "glue")) +install.packages("glue") ``` We use Google C++ style in our C++ code. Check for style errors with diff --git a/r/README.md b/r/README.md index 371f2bc..3504d5a 100644 --- a/r/README.md +++ b/r/README.md @@ -67,6 +67,11 @@ Arrow C++ library first. ``` r library(arrow) +#> +#> Attaching package: 'arrow' +#> The following object is masked from 'package:utils': +#> +#> timestamp set.seed(24) tab <- Table$create( @@ -228,11 +233,11 @@ you will need to set the `ARROW_R_DEV` environment variable to `TRUE` sessions) so that the `data-raw/codegen.R` file is used for code generation. -The codegen.R script has these dependencies: +The codegen.R script has these additional dependencies: ``` r remotes::install_github("romainfrancois/decor") -install.packages(c("dplyr", "purrr", "glue")) +install.packages("glue") ``` We use Google C++ style in our C++ code. 
Check for style errors with diff --git a/r/configure b/r/configure index 861f5f4..f2fcb5b 100755 --- a/r/configure +++ b/r/configure @@ -26,12 +26,12 @@ # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' # Library settings -PKG_CONFIG_NAME="arrow parquet" +PKG_CONFIG_NAME="arrow parquet arrow-dataset" PKG_DEB_NAME="(unsuppored)" PKG_RPM_NAME="(unsuppored)" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="<arrow/api.h>" -PKG_LIBS="-larrow -lparquet" +PKG_LIBS="-larrow -lparquet -larrow_dataset" # generate code if [ "$ARROW_R_DEV" = "TRUE" ]; then diff --git a/r/configure.win b/r/configure.win index 44dbb4c..a38021f 100644 --- a/r/configure.win +++ b/r/configure.win @@ -40,8 +40,8 @@ fi RWINLIB="../windows/$(ls windows/ | grep ^arrow-)" OPENSSL_LIBS="-lcrypto -lcrypt32" -PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW" -PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" +PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW" +PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz ${OPENSSL_LIBS} -lws2_32" echo "*** Writing Makevars.win" sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars.win diff --git a/r/man/ArrayData.Rd b/r/man/ArrayData.Rd index 035fee8..24530c4 100644 --- a/r/man/ArrayData.Rd +++ b/r/man/ArrayData.Rd @@ -25,4 +25,3 @@ data$buffers() ... } -\keyword{datasets} diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd index 0628b5c..c26ff3f 100644 --- a/r/man/ChunkedArray.Rd +++ b/r/man/ChunkedArray.Rd @@ -27,26 +27,26 @@ various Arrays or R vectors. \code{chunked_array()} is an alias for it. \section{Methods}{ \itemize{ -\item \code{$length()}: Size in the number of elements this array contains -\item \code{$chunk(i)}: Extract an \code{Array} chunk by integer position -\item \code{$as_vector()}: convert to an R vector -\item \code{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +\item \verb{$length()}: Size in the number of elements this array contains +\item \verb{$chunk(i)}: Extract an \code{Array} chunk by integer position +\item \verb{$as_vector()}: convert to an R vector +\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array with the indicated offset and length. If length is \code{NULL}, the slice goes until the end of the array. -\item \code{$Take(i)}: return a \code{ChunkedArray} with values at positions given by +\item \verb{$Take(i)}: return a \code{ChunkedArray} with values at positions given by integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be coerced to an R vector before taking. -\item \code{$Filter(i)}: return a \code{ChunkedArray} with values at positions where -logical vector or Arrow boolean-type \code{(Chunked)Array} \code{i} is \code{TRUE}. -\item \code{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +\item \verb{$Filter(i)}: return a \code{ChunkedArray} with values at positions where +logical vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}. 
+\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the data in the array to change its type. -\item \code{$null_count()}: The number of null entries in the array -\item \code{$chunks()}: return a list of \code{Array}s -\item \code{$num_chunks()}: integer number of chunks in the \code{ChunkedArray} -\item \code{$type()}: logical type of data -\item \code{$View(type)}: Construct a zero-copy view of this \code{ChunkedArray} with the +\item \verb{$null_count()}: The number of null entries in the array +\item \verb{$chunks()}: return a list of \code{Array}s +\item \verb{$num_chunks()}: integer number of chunks in the \code{ChunkedArray} +\item \verb{$type()}: logical type of data +\item \verb{$View(type)}: Construct a zero-copy view of this \code{ChunkedArray} with the given type. -\item \code{$Validate()}: Perform any validation checks to determine obvious inconsistencies +\item \verb{$Validate()}: Perform any validation checks to determine obvious inconsistencies within the array's internal data. This can be an expensive check, potentially \code{O(length)} } } @@ -54,4 +54,3 @@ within the array's internal data. This can be an expensive check, potentially \c \seealso{ \link{Array} } -\keyword{datasets} diff --git a/r/man/Codec.Rd b/r/man/Codec.Rd index 6619a64..e437d4f 100644 --- a/r/man/Codec.Rd +++ b/r/man/Codec.Rd @@ -20,4 +20,3 @@ compression level for the selected compression \code{type}. } } -\keyword{datasets} diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd index b674010..8ecb7e2 100644 --- a/r/man/CsvReadOptions.Rd +++ b/r/man/CsvReadOptions.Rd @@ -72,4 +72,3 @@ These classes have no implemented methods. They are containers for the options. } -\keyword{datasets} diff --git a/r/man/CsvTableReader.Rd b/r/man/CsvTableReader.Rd index 97abe66..8343653 100644 --- a/r/man/CsvTableReader.Rd +++ b/r/man/CsvTableReader.Rd @@ -26,8 +26,7 @@ take the following arguments: \section{Methods}{ \itemize{ -\item \code{$Read()}: returns an Arrow Table. +\item \verb{$Read()}: returns an Arrow Table. } } -\keyword{datasets} diff --git a/r/man/DataSource.Rd b/r/man/DataSource.Rd new file mode 100644 index 0000000..c83a4d7 --- /dev/null +++ b/r/man/DataSource.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{DataSource} +\alias{DataSource} +\alias{DataSourceDiscovery} +\alias{FileSystemDataSourceDiscovery} +\title{Data sources for a Dataset} +\description{ +A \link{Dataset} can have one or more \code{DataSource}s. A \code{DataSource} contains one +or more \code{DataFragments}, such as files, of a common type and partition +scheme. \code{DataSourceDiscovery} is used to create a \code{DataSource}, inspect the +\link{Schema} of the fragments contained in it, and declare a partition scheme. +\code{FileSystemDataSourceDiscovery} is a subclass of \code{DataSourceDiscovery} for +discovering files in the local file system, the only currently supported +file system. +} +\section{Factory}{ + +The \code{DataSourceDiscovery$create()} factory method instantiates a +\code{DataSourceDiscovery} and takes the following arguments: +\itemize{ +\item \code{path}: A string file path containing data files +\item \code{filesystem}: Currently only "local" is supported +\item \code{format}: Currently only "parquet" is supported +\item \code{allow_non_existent}: logical: is \code{path} allowed to not exist? Default +\code{FALSE}. See \link{Selector}. 
+\item \code{recursive}: logical: should files be discovered in subdirectories of +\code{path}? Default \code{TRUE}. +\item \code{...} Additional arguments passed to the \link{FileSystem} \verb{$create()} method +} + +\code{FileSystemDataSourceDiscovery$create()} is a lower-level factory method and +takes the following arguments: +\itemize{ +\item \code{filesystem}: A \link{FileSystem} +\item \code{selector}: A \link{Selector} +\item \code{format}: Currently only "parquet" is supported +} +} + +\section{Methods}{ + +\code{DataSource} has no defined methods. It is just passed to \code{Dataset$create()}. + +\code{DataSourceDiscovery} and its subclasses have the following methods: +\itemize{ +\item \verb{$Inspect()}: Walks the files in the directory and returns a common \link{Schema} +\item \verb{$SetPartitionScheme(part)}: Takes a \link{PartitionScheme} +\item \verb{$Finish()}: Returns a \code{DataSource} +} +} + +\seealso{ +\link{Dataset} for what to do with a \code{DataSource} +} diff --git a/r/man/DataType.Rd b/r/man/DataType.Rd index 64755c7..8c96141 100644 --- a/r/man/DataType.Rd +++ b/r/man/DataType.Rd @@ -13,4 +13,3 @@ class arrow::DataType TODO } -\keyword{datasets} diff --git a/r/man/Dataset.Rd b/r/man/Dataset.Rd new file mode 100644 index 0000000..9ba23c4 --- /dev/null +++ b/r/man/Dataset.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{Dataset} +\alias{Dataset} +\title{Multi-file datasets} +\value{ +A \link{ScannerBuilder} +} +\description{ +Arrow Datasets allow you to query against data that has been split across +multiple files. This sharding of data may indicate partitioning, which +can accelerate queries that only touch some partitions (files). + +Start a new scan of the data + +Return the Dataset's \code{Schema} +} +\section{Factory}{ + +The \code{Dataset$create()} factory method instantiates a \code{Dataset} and +takes the following arguments: +\itemize{ +\item \code{sources}: a list of \link{DataSource} objects +\item \code{schema}: a \link{Schema} +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$NewScan()}: Returns a \link{ScannerBuilder} for building a query +\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset +} +} + +\seealso{ +\code{\link[=open_dataset]{open_dataset()}} for a simple way to create a Dataset that has a +single \code{DataSource}. +} diff --git a/r/man/DictionaryType.Rd b/r/man/DictionaryType.Rd index 4d64cb1..8c9087f 100644 --- a/r/man/DictionaryType.Rd +++ b/r/man/DictionaryType.Rd @@ -13,4 +13,3 @@ class DictionaryType TODO } -\keyword{datasets} diff --git a/r/man/Expression.Rd b/r/man/Expression.Rd new file mode 100644 index 0000000..c0108e3 --- /dev/null +++ b/r/man/Expression.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expression.R +\name{Expression} +\alias{Expression} +\alias{FieldExpression} +\alias{ScalarExpression} +\alias{ComparisonExpression} +\alias{AndExpression} +\alias{OrExpression} +\alias{NotExpression} +\title{Arrow expressions} +\description{ +\code{Expression}s are used to define filter logic for passing to a \link{Dataset} +\link{Scanner}. \code{FieldExpression}s refer to columns in the \code{Dataset} and are +compared to \code{ScalarExpression}s using \code{ComparisonExpression}s. +\code{ComparisonExpression}s may be combined with \code{AndExpression} or +\code{OrExpression} and negated with \code{NotExpression}.
+} +\section{Factory}{ + +\code{FieldExpression$create(name)} takes a string name as input. This string should +refer to a column in a \code{Dataset} at the time it is evaluated, but you can +construct a \code{FieldExpression} independently of any \code{Dataset}. + +\code{ScalarExpression$create(x)} takes a scalar (length-1) R value as input. + +\code{ComparisonExpression$create(OP, e1, e2)} takes a string operator name +(e.g. "==", "!=", ">", etc.) and two \code{Expression} objects. + +\code{AndExpression$create(e1, e2)} and \code{OrExpression$create(e1, e2)} take +two \code{Expression} objects, while \code{NotExpression$create(e1)} takes a single +\code{Expression}. +} + diff --git a/r/man/FeatherTableReader.Rd b/r/man/FeatherTableReader.Rd index c0956d4..f06197a 100644 --- a/r/man/FeatherTableReader.Rd +++ b/r/man/FeatherTableReader.Rd @@ -25,15 +25,14 @@ takes the following arguments: \section{Methods}{ \itemize{ -\item \code{$GetDescription()} -\item \code{$HasDescription()} -\item \code{$version()} -\item \code{$num_rows()} -\item \code{$num_columns()} -\item \code{$GetColumnName()} -\item \code{$GetColumn()} -\item \code{$Read(columns)} +\item \verb{$GetDescription()} +\item \verb{$HasDescription()} +\item \verb{$version()} +\item \verb{$num_rows()} +\item \verb{$num_columns()} +\item \verb{$GetColumnName()} +\item \verb{$GetColumn()} +\item \verb{$Read(columns)} } } -\keyword{datasets} diff --git a/r/man/FeatherTableWriter.Rd b/r/man/FeatherTableWriter.Rd index e127bd8..3c6afaa 100644 --- a/r/man/FeatherTableWriter.Rd +++ b/r/man/FeatherTableWriter.Rd @@ -21,15 +21,14 @@ takes the following argument: \section{Methods}{ \itemize{ -\item \code{$GetDescription()} -\item \code{$HasDescription()} -\item \code{$version()} -\item \code{$num_rows()} -\item \code{$num_columns()} -\item \code{$GetColumnName()} -\item \code{$GetColumn()} -\item \code{$Read(columns)} +\item \verb{$GetDescription()} +\item \verb{$HasDescription()} +\item \verb{$version()} +\item \verb{$num_rows()} +\item \verb{$num_columns()} +\item \verb{$GetColumnName()} +\item \verb{$GetColumn()} +\item \verb{$Read(columns)} } } -\keyword{datasets} diff --git a/r/man/Field.Rd b/r/man/Field.Rd index 68b63a8..d5f147c 100644 --- a/r/man/Field.Rd +++ b/r/man/Field.Rd @@ -33,4 +33,3 @@ field(name, type, metadata) field("x", int32()) } } -\keyword{datasets} diff --git a/r/man/FileStats.Rd b/r/man/FileStats.Rd index 04bd92f..58ed52e 100644 --- a/r/man/FileStats.Rd +++ b/r/man/FileStats.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/filesystem.R -\docType{data} \name{FileStats} \alias{FileStats} \title{FileSystem entry stats} @@ -19,12 +18,11 @@ separator). \section{Active bindings}{ \itemize{ -\item \code{$type}: The file type -\item \code{$path}: The full file path in the filesystem -\item \code{$size}: The size in bytes, if available. Only regular files are +\item \verb{$type}: The file type +\item \verb{$path}: The full file path in the filesystem +\item \verb{$size}: The size in bytes, if available. Only regular files are guaranteed to have a size. -\item \code{$mtime}: The time of last modification, if available. +\item \verb{$mtime}: The time of last modification, if available. 
} } -\keyword{datasets} diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 89fa4e6..98b24d3 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -15,7 +15,7 @@ to another implementation after prepending a fixed base path \section{Factory}{ -The \code{$create()} factory methods instantiate the \code{FileSystem} object and +The \verb{$create()} factory methods instantiate the \code{FileSystem} object and take the following arguments, depending on the subclass: \itemize{ \item no argument is needed for instantiating a \code{LocalFileSystem} @@ -26,33 +26,32 @@ take the following arguments, depending on the subclass: \section{Methods}{ \itemize{ -\item \code{$GetTargetStats(x)}: \code{x} may be a \link{Selector} or a character +\item \verb{$GetTargetStats(x)}: \code{x} may be a \link{Selector} or a character vector of paths. Returns a list of \link{FileStats} -\item \code{$CreateDir(path, recursive = TRUE)}: Create a directory and subdirectories. -\item \code{$DeleteDir(path)}: Delete a directory and its contents, recursively. -\item \code{$DeleteDirContents(path)}: Delete a directory's contents, recursively. -Like \code{$DeleteDir()}, +\item \verb{$CreateDir(path, recursive = TRUE)}: Create a directory and subdirectories. +\item \verb{$DeleteDir(path)}: Delete a directory and its contents, recursively. +\item \verb{$DeleteDirContents(path)}: Delete a directory's contents, recursively. +Like \verb{$DeleteDir()}, but doesn't delete the directory itself. Passing an empty path (\code{""}) will wipe the entire filesystem tree. -\item \code{$DeleteFile(path)} : Delete a file. -\item \code{$DeleteFiles(paths)} : Delete many files. The default implementation +\item \verb{$DeleteFile(path)} : Delete a file. +\item \verb{$DeleteFiles(paths)} : Delete many files. The default implementation issues individual delete operations in sequence. -\item \code{$Move(src, dest)}: Move / rename a file or directory. If the destination +\item \verb{$Move(src, dest)}: Move / rename a file or directory. If the destination exists: if it is a non-empty directory, an error is returned otherwise, if it has the same type as the source, it is replaced otherwise, behavior is unspecified (implementation-dependent). -\item \code{$CopyFile(src, dest)}: Copy a file. If the destination exists and is a +\item \verb{$CopyFile(src, dest)}: Copy a file. If the destination exists and is a directory, an error is returned. Otherwise, it is replaced. -\item \code{$OpenInputStream(path)}: Open an \link[=InputStream]{input stream} for +\item \verb{$OpenInputStream(path)}: Open an \link[=InputStream]{input stream} for sequential reading. -\item \code{$OpenInputFile(path)}: Open an \link[=RandomAccessFile]{input file} for random +\item \verb{$OpenInputFile(path)}: Open an \link[=RandomAccessFile]{input file} for random access reading. -\item \code{$OpenOutputStream(path)}: Open an \link[=OutputStream]{output stream} for +\item \verb{$OpenOutputStream(path)}: Open an \link[=OutputStream]{output stream} for sequential writing. -\item \code{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for +\item \verb{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for appending. 
} } -\keyword{datasets} diff --git a/r/man/FixedWidthType.Rd b/r/man/FixedWidthType.Rd index e06e8a4..2857826 100644 --- a/r/man/FixedWidthType.Rd +++ b/r/man/FixedWidthType.Rd @@ -13,4 +13,3 @@ class arrow::FixedWidthType TODO } -\keyword{datasets} diff --git a/r/man/InputStream.Rd b/r/man/InputStream.Rd index 57b49c9..b909a77 100644 --- a/r/man/InputStream.Rd +++ b/r/man/InputStream.Rd @@ -17,7 +17,7 @@ buffer. Use these with the various table readers. \section{Factory}{ -The \code{$create()} factory methods instantiate the \code{InputStream} object and +The \verb{$create()} factory methods instantiate the \code{InputStream} object and take the following arguments, depending on the subclass: \itemize{ \item \code{path} For \code{ReadableFile}, a character file name @@ -31,16 +31,15 @@ To instantiate a \code{MemoryMappedFile}, call \code{\link[=mmap_open]{mmap_open \section{Methods}{ \itemize{ -\item \code{$GetSize()}: -\item \code{$supports_zero_copy()}: Logical -\item \code{$seek(position)}: go to that position in the stream -\item \code{$tell()}: return the position in the stream -\item \code{$close()}: close the stream -\item \code{$Read(nbytes)}: read data from the stream, either a specified \code{nbytes} or +\item \verb{$GetSize()}: +\item \verb{$supports_zero_copy()}: Logical +\item \verb{$seek(position)}: go to that position in the stream +\item \verb{$tell()}: return the position in the stream +\item \verb{$close()}: close the stream +\item \verb{$Read(nbytes)}: read data from the stream, either a specified \code{nbytes} or all, if \code{nbytes} is not provided -\item \code{$ReadAt(position, nbytes)}: similar to \code{$seek(position)$Read(nbytes)} -\item \code{$Resize(size)}: for a \code{MemoryMappedFile} that is writeable +\item \verb{$ReadAt(position, nbytes)}: similar to \verb{$seek(position)$Read(nbytes)} +\item \verb{$Resize(size)}: for a \code{MemoryMappedFile} that is writeable } } -\keyword{datasets} diff --git a/r/man/MemoryPool.Rd b/r/man/MemoryPool.Rd index e69fc8b..8bffc76 100644 --- a/r/man/MemoryPool.Rd +++ b/r/man/MemoryPool.Rd @@ -13,4 +13,3 @@ class arrow::MemoryPool TODO } -\keyword{datasets} diff --git a/r/man/Message.Rd b/r/man/Message.Rd index f699d51..84dd90a 100644 --- a/r/man/Message.Rd +++ b/r/man/Message.Rd @@ -13,4 +13,3 @@ class arrow::Message TODO } -\keyword{datasets} diff --git a/r/man/MessageReader.Rd b/r/man/MessageReader.Rd index cabfa66..d198c18 100644 --- a/r/man/MessageReader.Rd +++ b/r/man/MessageReader.Rd @@ -13,4 +13,3 @@ class arrow::MessageReader TODO } -\keyword{datasets} diff --git a/r/man/OutputStream.Rd b/r/man/OutputStream.Rd index 95661d1..c998d5f 100644 --- a/r/man/OutputStream.Rd +++ b/r/man/OutputStream.Rd @@ -18,7 +18,7 @@ example. \section{Factory}{ -The \code{$create()} factory methods instantiate the \code{OutputStream} object and +The \verb{$create()} factory methods instantiate the \code{OutputStream} object and take the following arguments, depending on the subclass: \itemize{ \item \code{path} For \code{FileOutputStream}, a character file name @@ -34,14 +34,13 @@ made into a buffer via \code{buffer()}. 
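
A round-trip sketch using the OutputStream factory above and the methods listed just below, assuming BufferOutputStream$create() can be called with no arguments and that $write() accepts a raw vector:

    stream <- BufferOutputStream$create()
    stream$write(charToRaw("hello"))
    stream$tell()              # position in the stream: 5
    buf <- stream$getvalue()   # BufferOutputStream only
    stream$close()
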
\section{Methods}{ \itemize{ -\item \code{$tell()}: return the position in the stream -\item \code{$close()}: close the stream -\item \code{$write(x)}: send \code{x} to the stream -\item \code{$capacity()}: for \code{BufferOutputStream} -\item \code{$getvalue()}: for \code{BufferOutputStream} -\item \code{$GetExtentBytesWritten()}: for \code{MockOutputStream}, report how many bytes +\item \verb{$tell()}: return the position in the stream +\item \verb{$close()}: close the stream +\item \verb{$write(x)}: send \code{x} to the stream +\item \verb{$capacity()}: for \code{BufferOutputStream} +\item \verb{$getvalue()}: for \code{BufferOutputStream} +\item \verb{$GetExtentBytesWritten()}: for \code{MockOutputStream}, report how many bytes were sent. } } -\keyword{datasets} diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd index 1ebc20c..55f007d 100644 --- a/r/man/ParquetFileReader.Rd +++ b/r/man/ParquetFileReader.Rd @@ -24,10 +24,10 @@ takes the following arguments: \section{Methods}{ \itemize{ -\item \code{$ReadTable(col_select)}: get an \code{arrow::Table} from the file, possibly +\item \verb{$ReadTable(col_select)}: get an \code{arrow::Table} from the file, possibly with columns filtered by a character vector of column names or a \code{tidyselect} specification. -\item \code{$GetSchema()}: get the \code{arrow::Schema} of the data in the file +\item \verb{$GetSchema()}: get the \code{arrow::Schema} of the data in the file } } @@ -40,4 +40,3 @@ tab <- pq$ReadTable(starts_with("c")) tab$schema } } -\keyword{datasets} diff --git a/r/man/ParquetReaderProperties.Rd b/r/man/ParquetReaderProperties.Rd index 90de601..5cf7770 100644 --- a/r/man/ParquetReaderProperties.Rd +++ b/r/man/ParquetReaderProperties.Rd @@ -21,10 +21,9 @@ and takes the following arguments: \section{Methods}{ \itemize{ -\item \code{$read_dictionary(column_index)} -\item \code{$set_read_dictionary(column_index, read_dict)} -\item \code{$use_threads(use_threads)} +\item \verb{$read_dictionary(column_index)} +\item \verb{$set_read_dictionary(column_index, read_dict)} +\item \verb{$use_threads(use_threads)} } } -\keyword{datasets} diff --git a/r/man/PartitionScheme.Rd b/r/man/PartitionScheme.Rd new file mode 100644 index 0000000..3eb65b4 --- /dev/null +++ b/r/man/PartitionScheme.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{PartitionScheme} +\alias{PartitionScheme} +\alias{SchemaPartitionScheme} +\alias{HivePartitionScheme} +\title{Define a partition scheme for a DataSource} +\description{ +Pass a \code{PartitionScheme} to a \link{DataSourceDiscovery}'s \verb{$SetPartitionScheme()} +method to indicate how the files' paths should be interpreted to define +partitioning. + +A \code{SchemaPartitionScheme} describes how to interpret raw path segments, in +order. For example, \code{schema(year = int16(), month = int8())} would define +partitions for file paths like "2019/01/file.parquet", +"2019/02/file.parquet", etc. + +A \code{HivePartitionScheme} is for Hive-style partitioning, which embeds field +names and values in path segments, such as +"/year=2019/month=2/data.parquet". Because fields are named in the path +segments, order does not matter. +} +\section{Factory}{ + +Both \code{SchemaPartitionScheme$create()} and \code{HivePartitionScheme$create()} +factory methods take a \link{Schema} as a single input argument. The helper +function \code{hive_partition(...)} is shorthand for +\code{HivePartitionScheme$create(schema(...))}.
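
For example, using the factories just described (both are defined in R/dataset.R in this patch):

    part <- SchemaPartitionScheme$create(schema(year = int16(), month = int8()))
    # would match paths like "2019/01/file.parquet"

    hp <- hive_partition(year = int16(), month = int8())
    # shorthand for HivePartitionScheme$create(schema(...)); matches
    # paths like "/year=2019/month=2/data.parquet"
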
+} + diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index d5ee46f..750fc0d 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -24,18 +24,18 @@ a sequence of \link[=Field]{fields}, each a contiguous Arrow \link{Array}. \section{S3 Methods and Usage}{ Record batches are data-frame-like, and many methods you expect to work on -a \code{data.frame} are implemented for \code{RecordBatch}. This includes \code{[}, \code{[[}, -\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull +a \code{data.frame} are implemented for \code{RecordBatch}. This includes \verb{[}, \verb{[[}, +\verb{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull the data from an Arrow record batch into R with \code{as.data.frame()}. See the examples. -A caveat about the \code{$} method: because \code{RecordBatch} is an \code{R6} object, -\code{$} is also used to access the object's methods (see below). Methods take +A caveat about the \verb{$} method: because \code{RecordBatch} is an \code{R6} object, +\verb{$} is also used to access the object's methods (see below). Methods take precedence over the batch's columns. So, \code{batch$Slice} would return the "Slice" method function even if there were a column in the batch called "Slice". -A caveat about the \code{[} method for row operations: only "slicing" is +A caveat about the \verb{[} method for row operations: only "slicing" is currently supported. That is, you can select a contiguous range of rows from the batch, but you can't filter with a \code{logical} vector or take an arbitrary selection of rows by integer indices. @@ -46,33 +46,33 @@ arbitrary selection of rows by integer indices. In addition to the more R-friendly S3 methods, a \code{RecordBatch} object has the following R6 methods that map onto the underlying C++ methods: \itemize{ -\item \code{$Equals(other)}: Returns \code{TRUE} if the \code{other} record batch is equal -\item \code{$column(i)}: Extract an \code{Array} by integer position from the batch -\item \code{$column_name(i)}: Get a column's name by integer position -\item \code{$names()}: Get all column names (called by \code{names(batch)}) -\item \code{$GetColumnByName(name)}: Extract an \code{Array} by string name -\item \code{$RemoveColumn(i)}: Drops a column from the batch by integer position -\item \code{$select(spec)}: Return a new record batch with a selection of columns. +\item \verb{$Equals(other)}: Returns \code{TRUE} if the \code{other} record batch is equal +\item \verb{$column(i)}: Extract an \code{Array} by integer position from the batch +\item \verb{$column_name(i)}: Get a column's name by integer position +\item \verb{$names()}: Get all column names (called by \code{names(batch)}) +\item \verb{$GetColumnByName(name)}: Extract an \code{Array} by string name +\item \verb{$RemoveColumn(i)}: Drops a column from the batch by integer position +\item \verb{$select(spec)}: Return a new record batch with a selection of columns. This supports the usual \code{character}, \code{numeric}, and \code{logical} selection methods as well as "tidy select" expressions. -\item \code{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the +\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the batch if \code{NULL}, the default.
-\item \code{$Take(i)}: return an \code{RecordBatch} with rows at positions given by +\item \verb{$Take(i)}: return a \code{RecordBatch} with rows at positions given by integers (R vector or Arrow Array) \code{i}. -\item \code{$Filter(i)}: return an \code{RecordBatch} with rows at positions where logical +\item \verb{$Filter(i)}: return a \code{RecordBatch} with rows at positions where logical vector (or Arrow boolean Array) \code{i} is \code{TRUE}. -\item \code{$serialize()}: Returns a raw vector suitable for interprocess communication -\item \code{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter +\item \verb{$serialize()}: Returns a raw vector suitable for interprocess communication +\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter the schema of the record batch. } There are also some active bindings \itemize{ -\item \code{$num_columns} -\item \code{$num_rows} -\item \code{$schema} -\item \code{$columns}: Returns a list of \code{Array}s +\item \verb{$num_columns} +\item \verb{$num_rows} +\item \verb{$schema} +\item \verb{$columns}: Returns a list of \code{Array}s } } @@ -87,4 +87,3 @@ batch[["cyl"]] as.data.frame(batch[4:8, c("gear", "hp", "wt")]) } } -\keyword{datasets} diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd index 5ed6ba4..b6ae5f7 100644 --- a/r/man/RecordBatchReader.Rd +++ b/r/man/RecordBatchReader.Rd @@ -26,14 +26,13 @@ take a single argument, named according to the class: \section{Methods}{ \itemize{ -\item \code{$read_next_batch()}: Returns a \code{RecordBatch} -\item \code{$schema()}: Returns a \link{Schema} -\item \code{$batches()}: Returns a list of \code{RecordBatch}es -\item \code{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch +\item \verb{$read_next_batch()}: Returns a \code{RecordBatch} +\item \verb{$schema()}: Returns a \link{Schema} +\item \verb{$batches()}: Returns a list of \code{RecordBatch}es +\item \verb{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch by an integer index. -\item \code{$num_record_batches()}: For \code{RecordBatchFileReader}, see how many batches +\item \verb{$num_record_batches()}: For \code{RecordBatchFileReader}, see how many batches are in the file.
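
A sketch of the reader workflow, assuming (per the factory note above) that RecordBatchFileReader$create() takes a single file argument; the file name is hypothetical:

    reader <- RecordBatchFileReader$create("data.arrow")
    reader$num_record_batches()    # how many batches are in the file
    batch <- reader$get_batch(0L)  # one RecordBatch, by integer index
    batches <- reader$batches()    # or all of them as a list
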
} } -\keyword{datasets} diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd index 46e0b87..6f21e0c 100644 --- a/r/man/RecordBatchWriter.Rd +++ b/r/man/RecordBatchWriter.Rd @@ -35,12 +35,11 @@ take a single argument, named according to the class: \section{Methods}{ \itemize{ -\item \code{$write(x)}: Write a \link{RecordBatch}, \link{Table}, or \code{data.frame}, dispatching +\item \verb{$write(x)}: Write a \link{RecordBatch}, \link{Table}, or \code{data.frame}, dispatching to the methods below appropriately -\item \code{$write_batch(batch)}: Write a \code{RecordBatch} to stream -\item \code{$write_table(table)}: Write a \code{Table} to stream -\item \code{$close()}: close stream +\item \verb{$write_batch(batch)}: Write a \code{RecordBatch} to stream +\item \verb{$write_table(table)}: Write a \code{Table} to stream +\item \verb{$close()}: close stream } } -\keyword{datasets} diff --git a/r/man/Scanner.Rd b/r/man/Scanner.Rd new file mode 100644 index 0000000..d231ac0 --- /dev/null +++ b/r/man/Scanner.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{Scanner} +\alias{Scanner} +\alias{ScannerBuilder} +\title{Scan the contents of a dataset} +\description{ +A \code{Scanner} iterates over a \link{Dataset}'s data fragments and returns data +according to the given row filtering and column projection. Use a +\code{ScannerBuilder}, from a \code{Dataset}'s \verb{$NewScan()} method, to construct one. +} +\section{Methods}{ + +\code{ScannerBuilder} has the following methods: +\itemize{ +\item \verb{$Project(cols)}: Indicate that the scan should only return columns given +by \code{cols}, a character vector of column names +\item \verb{$Filter(expr)}: Filter rows by an \link{Expression}. +\item \verb{$UseThreads(threads)}: logical: should the scan use multithreading? +The argument defaults to \code{TRUE}, but you must call the method explicitly to +enable multithreading, because the \code{Scanner}'s own default is \code{FALSE}. +\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset +\item \verb{$Finish()}: Returns a \code{Scanner} +} + +\code{Scanner} currently has a single method, \verb{$ToTable()}, which evaluates the +query and returns an Arrow \link{Table}. +} + diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd index 2f960db..e0a17d9 100644 --- a/r/man/Schema.Rd +++ b/r/man/Schema.Rd @@ -29,10 +29,9 @@ s$field(i) \section{Methods}{ \itemize{ -\item \code{$ToString()}: convert to a string -\item \code{$num_fields()}: returns the number of fields -\item \code{$field(i)}: returns the field at index \code{i} (0-based) +\item \verb{$ToString()}: convert to a string +\item \verb{$num_fields()}: returns the number of fields +\item \verb{$field(i)}: returns the field at index \code{i} (0-based) } } -\keyword{datasets} diff --git a/r/man/Selector.Rd b/r/man/Selector.Rd index fe270ed..4fef4c5 100644 --- a/r/man/Selector.Rd +++ b/r/man/Selector.Rd @@ -1,19 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/filesystem.R -\docType{data} \name{Selector} \alias{Selector} \title{file selector} -\usage{ -Selector -} \description{ file selector } \section{Factory}{ -The \code{$create()} factory method instantiates a \code{Selector} given the 3 fields +The \verb{$create()} factory method instantiates a \code{Selector} given the 3 fields described below.
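
Putting the ScannerBuilder methods documented above together, the low-level scan path looks roughly like this (a sketch: `ds` is an existing Dataset, and the column names and filter value are hypothetical):

    sb <- ds$NewScan()            # a ScannerBuilder
    sb$Project(c("gear", "hp"))   # only return these columns
    sb$Filter(ComparisonExpression$create(
      "==", FieldExpression$create("gear"), ScalarExpression$create(4L)
    ))
    sb$UseThreads()               # opt in to multithreading
    tab <- sb$Finish()$ToTable()  # a Scanner, then an Arrow Table
    df <- as.data.frame(tab)
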
} @@ -29,4 +25,3 @@ selection is returned } } -\keyword{datasets} diff --git a/r/man/Table.Rd b/r/man/Table.Rd index bac7a7b..d20c443 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -25,18 +25,18 @@ the data in \code{...} \section{S3 Methods and Usage}{ Tables are data-frame-like, and many methods you expect to work on -a \code{data.frame} are implemented for \code{Table}. This includes \code{[}, \code{[[}, -\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull +a \code{data.frame} are implemented for \code{Table}. This includes \verb{[}, \verb{[[}, +\verb{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull the data from an Arrow table into R with \code{as.data.frame()}. See the examples. -A caveat about the \code{$} method: because \code{Table} is an \code{R6} object, -\code{$} is also used to access the object's methods (see below). Methods take +A caveat about the \verb{$} method: because \code{Table} is an \code{R6} object, +\verb{$} is also used to access the object's methods (see below). Methods take precedence over the table's columns. So, \code{tab$Slice} would return the "Slice" method function even if there were a column in the table called "Slice". -A caveat about the \code{[} method for row operations: only "slicing" is +A caveat about the \verb{[} method for row operations: only "slicing" is currently supported. That is, you can select a contiguous range of rows from the table, but you can't filter with a \code{logical} vector or take an arbitrary selection of rows by integer indices. @@ -47,33 +47,33 @@ arbitrary selection of rows by integer indices. In addition to the more R-friendly S3 methods, a \code{Table} object has the following R6 methods that map onto the underlying C++ methods: \itemize{ -\item \code{$column(i)}: Extract a \code{ChunkedArray} by integer position from the table -\item \code{$ColumnNames()}: Get all column names (called by \code{names(tab)}) -\item \code{$GetColumnByName(name)}: Extract a \code{ChunkedArray} by string name -\item \code{$field(i)}: Extract a \code{Field} from the table schema by integer position -\item \code{$select(spec)}: Return a new table with a selection of columns. +\item \verb{$column(i)}: Extract a \code{ChunkedArray} by integer position from the table +\item \verb{$ColumnNames()}: Get all column names (called by \code{names(tab)}) +\item \verb{$GetColumnByName(name)}: Extract a \code{ChunkedArray} by string name +\item \verb{$field(i)}: Extract a \code{Field} from the table schema by integer position +\item \verb{$select(spec)}: Return a new table with a selection of columns. This supports the usual \code{character}, \code{numeric}, and \code{logical} selection methods as well as "tidy select" expressions. -\item \code{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the +\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the table if \code{NULL}, the default. -\item \code{$Take(i)}: return an \code{Table} with rows at positions given by +\item \verb{$Take(i)}: return a \code{Table} with rows at positions given by integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be coerced to an R vector before taking.
-\item \code{$Filter(i)}: return an \code{Table} with rows at positions where logical -vector or Arrow boolean-type \code{(Chunked)Array} \code{i} is \code{TRUE}. -\item \code{$serialize(output_stream, ...)}: Write the table to the given +\item \verb{$Filter(i)}: return a \code{Table} with rows at positions where logical +vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}. +\item \verb{$serialize(output_stream, ...)}: Write the table to the given \link{OutputStream} -\item \code{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter +\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter the schema of the table. } There are also some active bindings \itemize{ -\item \code{$num_columns} -\item \code{$num_rows} -\item \code{$schema} -\item \code{$columns}: Returns a list of \code{ChunkedArray}s +\item \verb{$num_columns} +\item \verb{$num_rows} +\item \verb{$schema} +\item \verb{$columns}: Returns a list of \code{ChunkedArray}s } } @@ -88,4 +88,3 @@ tab[["cyl"]] as.data.frame(tab[4:8, c("gear", "hp", "wt")]) } } -\keyword{datasets} diff --git a/r/man/array.Rd b/r/man/array.Rd index 105404f..e9277ee 100644 --- a/r/man/array.Rd +++ b/r/man/array.Rd @@ -32,32 +32,31 @@ a == a \section{Methods}{ \itemize{ -\item \code{$IsNull(i)}: Return true if value at index is null. Does not boundscheck -\item \code{$IsValid(i)}: Return true if value at index is valid. Does not boundscheck -\item \code{$length()}: Size in the number of elements this array contains -\item \code{$offset()}: A relative position into another array's data, to enable zero-copy slicing -\item \code{$null_count()}: The number of null entries in the array -\item \code{$type()}: logical type of data -\item \code{$type_id()}: type id -\item \code{$Equals(other)} : is this array equal to \code{other} -\item \code{$ApproxEquals(other)} : -\item \code{$data()}: return the underlying \link{ArrayData} -\item \code{$as_vector()}: convert to an R vector -\item \code{$ToString()}: string representation of the array -\item \code{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +\item \verb{$IsNull(i)}: Return true if value at index is null. Does not boundscheck +\item \verb{$IsValid(i)}: Return true if value at index is valid. Does not boundscheck +\item \verb{$length()}: Size in the number of elements this array contains +\item \verb{$offset()}: A relative position into another array's data, to enable zero-copy slicing +\item \verb{$null_count()}: The number of null entries in the array +\item \verb{$type()}: logical type of data +\item \verb{$type_id()}: type id +\item \verb{$Equals(other)} : is this array equal to \code{other} +\item \verb{$ApproxEquals(other)} : +\item \verb{$data()}: return the underlying \link{ArrayData} +\item \verb{$as_vector()}: convert to an R vector +\item \verb{$ToString()}: string representation of the array +\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array with the indicated offset and length. If length is \code{NULL}, the slice goes until the end of the array. -\item \code{$Take(i)}: return an \code{Array} with values at positions given by integers +\item \verb{$Take(i)}: return an \code{Array} with values at positions given by integers (R vector or Arrow Array) \code{i}.
-\item \code{$Filter(i)}: return an \code{Array} with values at positions where logical +\item \verb{$Filter(i)}: return an \code{Array} with values at positions where logical vector (or Arrow boolean Array) \code{i} is \code{TRUE}. -\item \code{$RangeEquals(other, start_idx, end_idx, other_start_idx)} : -\item \code{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +\item \verb{$RangeEquals(other, start_idx, end_idx, other_start_idx)} : +\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the data in the array to change its type. -\item \code{$View(type)}: Construct a zero-copy view of this array with the given type. -\item \code{$Validate()} : Perform any validation checks to determine obvious inconsistencies +\item \verb{$View(type)}: Construct a zero-copy view of this array with the given type. +\item \verb{$Validate()} : Perform any validation checks to determine obvious inconsistencies within the array's internal data. This can be an expensive check, potentially \code{O(length)} } } -\keyword{datasets} diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd index b8e8997..0f90d55 100644 --- a/r/man/arrow-package.Rd +++ b/r/man/arrow-package.Rd @@ -26,7 +26,7 @@ Useful links: Authors: \itemize{ - \item Romain François \email{rom...@rstudio.com} (0000-0002-2444-4226) + \item Romain François \email{rom...@rstudio.com} (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) \item Jeroen Ooms \email{jer...@berkeley.edu} \item Apache Arrow \email{d...@arrow.apache.org} [copyright holder] } diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd index 5481ca5..4a479b7 100644 --- a/r/man/buffer.Rd +++ b/r/man/buffer.Rd @@ -26,11 +26,10 @@ contiguous memory with a particular size. \section{Methods}{ \itemize{ -\item \code{$is_mutable()} : -\item \code{$ZeroPadding()} : -\item \code{$size()} : -\item \code{$capacity()}: +\item \verb{$is_mutable()} : +\item \verb{$ZeroPadding()} : +\item \verb{$size()} : +\item \verb{$capacity()}: } } -\keyword{datasets} diff --git a/r/man/cast_options.Rd b/r/man/cast_options.Rd index 7d4ad53..19dfe65 100644 --- a/r/man/cast_options.Rd +++ b/r/man/cast_options.Rd @@ -4,8 +4,12 @@ \alias{cast_options} \title{Cast options} \usage{ -cast_options(safe = TRUE, allow_int_overflow = !safe, - allow_time_truncate = !safe, allow_float_truncate = !safe) +cast_options( + safe = TRUE, + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe +) } \arguments{ \item{safe}{enforce safe conversion} diff --git a/r/man/compression.Rd b/r/man/compression.Rd index f6a6920..7cdb320 100644 --- a/r/man/compression.Rd +++ b/r/man/compression.Rd @@ -29,4 +29,3 @@ factory methods instantiate the object and take the following arguments: Methods are inherited from \link{InputStream} and \link{OutputStream}, respectively } -\keyword{datasets} diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd index 2716a17..d4b9349 100644 --- a/r/man/dictionary.Rd +++ b/r/man/dictionary.Rd @@ -4,8 +4,7 @@ \alias{dictionary} \title{Create a dictionary type} \usage{ -dictionary(index_type = int32(), value_type = utf8(), - ordered = FALSE) +dictionary(index_type = int32(), value_type = utf8(), ordered = FALSE) } \arguments{ \item{index_type}{A DataType for the indices (default \code{\link[=int32]{int32()}})} diff --git a/r/man/hive_partition.Rd b/r/man/hive_partition.Rd new file mode 100644 index 0000000..86627ac --- /dev/null +++ b/r/man/hive_partition.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% 
Please edit documentation in R/dataset.R +\name{hive_partition} +\alias{hive_partition} +\title{Construct a Hive partition scheme} +\usage{ +hive_partition(...) +} +\arguments{ +\item{...}{named list of \link[=data-type]{data types}, passed to \code{\link[=schema]{schema()}}} +} +\value{ +A \code{HivePartitionScheme} +} +\description{ +Hive partitioning embeds field names and values in path segments, such as +"/year=2019/month=2/data.parquet". A \link[=PartitionScheme]{HivePartitionScheme} +is used to parse that in Dataset creation. +} +\details{ +Because fields are named in the path segments, order of fields passed to +\code{hive_partition()} does not matter. +} +\examples{ +\donttest{ +hive_partition(year = int16(), month = int8()) +} +} diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd new file mode 100644 index 0000000..bd03d48 --- /dev/null +++ b/r/man/open_dataset.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{open_dataset} +\alias{open_dataset} +\title{Open a multi-file dataset} +\usage{ +open_dataset(path, schema = NULL, partition = NULL, ...) +} +\arguments{ +\item{path}{String path to a directory containing the data files} + +\item{schema}{\link{Schema} for the dataset. If \code{NULL} (the default), the schema +will be inferred from the files} + +\item{partition}{One of +\itemize{ +\item A \code{Schema}, in which case the file paths relative to \code{path} will be +parsed, and path segments will be matched with the schema fields. For +example, \code{schema(year = int16(), month = int8())} would create partitions +for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc. +\item A \code{HivePartitionScheme}, as returned by \code{\link[=hive_partition]{hive_partition()}} +\item \code{NULL}, the default, for no partitioning +}} + +\item{...}{additional arguments passed to \code{DataSourceDiscovery$create()}} +} +\value{ +A \link{Dataset} R6 object. Use \code{dplyr} methods on it to query the data, +or call \verb{$NewScan()} to construct a query directly. 
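
At the user level, the pieces above combine roughly like this (a sketch: the directory and column names are hypothetical, and the dplyr verbs are those wired up in r/R/dplyr.R; see r/tests/testthat/test-dataset.R for tested usage):

    library(dplyr)
    ds <- open_dataset("path/to/dataset",
                       partition = hive_partition(year = int16(), month = int8()))
    ds %>%
      select(year, month, value) %>%  # plugs into ScannerBuilder$Project()
      filter(year > 2018) %>%         # plugs into ScannerBuilder$Filter()
      collect()                       # evaluates the scan, pulls the result into R
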
+} +\description{ +Open a multi-file dataset +} +\seealso{ +\link{PartitionScheme} for defining partitioning +} diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index cfe1e48..030ab38 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -6,23 +6,57 @@ \alias{read_tsv_arrow} \title{Read a CSV or other delimited file with Arrow} \usage{ -read_delim_arrow(file, delim = ",", quote = "\\"", - escape_double = TRUE, escape_backslash = FALSE, col_names = TRUE, - col_select = NULL, na = c("", "NA"), quoted_na = TRUE, - skip_empty_rows = TRUE, skip = 0L, parse_options = NULL, - convert_options = NULL, read_options = NULL, as_data_frame = TRUE) - -read_csv_arrow(file, quote = "\\"", escape_double = TRUE, - escape_backslash = FALSE, col_names = TRUE, col_select = NULL, - na = c("", "NA"), quoted_na = TRUE, skip_empty_rows = TRUE, - skip = 0L, parse_options = NULL, convert_options = NULL, - read_options = NULL, as_data_frame = TRUE) - -read_tsv_arrow(file, quote = "\\"", escape_double = TRUE, - escape_backslash = FALSE, col_names = TRUE, col_select = NULL, - na = c("", "NA"), quoted_na = TRUE, skip_empty_rows = TRUE, - skip = 0L, parse_options = NULL, convert_options = NULL, - read_options = NULL, as_data_frame = TRUE) +read_delim_arrow( + file, + delim = ",", + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + col_names = TRUE, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE +) + +read_csv_arrow( + file, + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + col_names = TRUE, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE +) + +read_tsv_arrow( + file, + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + col_names = TRUE, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE +) } \arguments{ \item{file}{A character file name, raw vector, or an Arrow input stream} @@ -32,13 +66,13 @@ read_tsv_arrow(file, quote = "\\"", escape_double = TRUE, \item{quote}{Single character used to quote strings.} \item{escape_double}{Does the file escape quotes by doubling them? -i.e. If this option is \code{TRUE}, the value \code{""""} represents -a single quote, \code{\"}.} +i.e. If this option is \code{TRUE}, the value \verb{""""} represents +a single quote, \verb{\\"}.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or -to add special characters like \code{\\n}.} +to add special characters like \verb{\\\\n}.} \item{col_names}{If \code{TRUE}, the first row of the input will be used as the column names and will not be included in the data frame. If \code{FALSE}, column diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index 757d82e..eef590e 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -4,8 +4,13 @@ \alias{read_parquet} \title{Read a Parquet file} \usage{ -read_parquet(file, col_select = NULL, as_data_frame = TRUE, - props = ParquetReaderProperties$create(), ...) 
+read_parquet( + file, + col_select = NULL, + as_data_frame = TRUE, + props = ParquetReaderProperties$create(), + ... +) } \arguments{ \item{file}{A character file name, raw vector, or an Arrow input stream} diff --git a/r/man/reexports.Rd b/r/man/reexports.Rd index 3b1e527..2ee3f0f 100644 --- a/r/man/reexports.Rd +++ b/r/man/reexports.Rd @@ -23,6 +23,6 @@ below to see their documentation. \describe{ \item{bit64}{\code{\link[bit64]{print.integer64}}, \code{\link[bit64]{str.integer64}}} - \item{tidyselect}{\code{\link[tidyselect]{contains}}, \code{\link[tidyselect]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect]{matches}}, \code{\link[tidyselect]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}, \code{\link[tidyselect]{last_col}}} + \item{tidyselect}{\code{\link[tidyselect]{contains}}, \code{\link[tidyselect]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect]{last_col}}, \code{\link[tidyselect]{matches}}, \code{\link[tidyselect]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}} }} diff --git a/r/man/write_arrow.Rd b/r/man/write_arrow.Rd index c4d6703..31d1131 100644 --- a/r/man/write_arrow.Rd +++ b/r/man/write_arrow.Rd @@ -11,7 +11,7 @@ write_arrow(x, sink, ...) \item{sink}{where to serialize to \itemize{ -\item A \link[=RecordBatchWriter]{arrow::RecordBatchWriter}: the \code{$write()} +\item A \link[=RecordBatchWriter]{arrow::RecordBatchWriter}: the \verb{$write()} of \code{x} is used. The stream is left open. This uses the streaming format or the binary file format depending on the type of the writer. \item A string file path: \code{x} is serialized with diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index b2471d7..d9b6fed 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -4,18 +4,26 @@ \alias{write_parquet} \title{Write Parquet file to disk} \usage{ -write_parquet(x, sink, chunk_size = NULL, version = NULL, - compression = NULL, compression_level = NULL, - use_dictionary = NULL, write_statistics = NULL, - data_page_size = NULL, properties = ParquetWriterProperties$create(x, - version = version, compression = compression, compression_level = - compression_level, use_dictionary = use_dictionary, write_statistics = - write_statistics, data_page_size = data_page_size), - use_deprecated_int96_timestamps = FALSE, coerce_timestamps = NULL, +write_parquet( + x, + sink, + chunk_size = NULL, + version = NULL, + compression = NULL, + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + properties = ParquetWriterProperties$create(x, version = version, compression = + compression, compression_level = compression_level, use_dictionary = use_dictionary, + write_statistics = write_statistics, data_page_size = data_page_size), + use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, allow_truncated_timestamps = FALSE, arrow_properties = ParquetArrowWriterProperties$create(use_deprecated_int96_timestamps - = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, - allow_truncated_timestamps = allow_truncated_timestamps)) + = use_deprecated_int96_timestamps, coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps) +) } \arguments{ \item{x}{An \link[=Table]{arrow::Table}, or an object convertible to it.} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 3fd3d7a..58ed098 100644 --- a/r/src/arrowExports.cpp +++ 
b/r/src/arrowExports.cpp @@ -1338,6 +1338,241 @@ RcppExport SEXP _arrow_csv___TableReader__Read(SEXP table_reader_sexp){ } #endif +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::DataSourceDiscovery> dataset___FSDSDiscovery__Make(const std::shared_ptr<arrow::fs::FileSystem>& fs, const std::shared_ptr<arrow::fs::Selector>& selector); +RcppExport SEXP _arrow_dataset___FSDSDiscovery__Make(SEXP fs_sexp, SEXP selector_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::fs::FileSystem>&>::type fs(fs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::fs::Selector>&>::type selector(selector_sexp); + return Rcpp::wrap(dataset___FSDSDiscovery__Make(fs, selector)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___FSDSDiscovery__Make(SEXP fs_sexp, SEXP selector_sexp){ + Rf_error("Cannot call dataset___FSDSDiscovery__Make(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::DataSource> dataset___DSDiscovery__Finish(const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery); +RcppExport SEXP _arrow_dataset___DSDiscovery__Finish(SEXP discovery_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::DataSourceDiscovery>&>::type discovery(discovery_sexp); + return Rcpp::wrap(dataset___DSDiscovery__Finish(discovery)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___DSDiscovery__Finish(SEXP discovery_sexp){ + Rf_error("Cannot call dataset___DSDiscovery__Finish(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Schema> dataset___DSDiscovery__Inspect(const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery); +RcppExport SEXP _arrow_dataset___DSDiscovery__Inspect(SEXP discovery_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::DataSourceDiscovery>&>::type discovery(discovery_sexp); + return Rcpp::wrap(dataset___DSDiscovery__Inspect(discovery)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___DSDiscovery__Inspect(SEXP discovery_sexp){ + Rf_error("Cannot call dataset___DSDiscovery__Inspect(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +void dataset___DSDiscovery__SetPartitionScheme(const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery, const std::shared_ptr<arrow::dataset::PartitionScheme>& part); +RcppExport SEXP _arrow_dataset___DSDiscovery__SetPartitionScheme(SEXP discovery_sexp, SEXP part_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::DataSourceDiscovery>&>::type discovery(discovery_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::PartitionScheme>&>::type part(part_sexp); + dataset___DSDiscovery__SetPartitionScheme(discovery, part); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___DSDiscovery__SetPartitionScheme(SEXP discovery_sexp, SEXP part_sexp){ + Rf_error("Cannot call dataset___DSDiscovery__SetPartitionScheme(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::SchemaPartitionScheme> dataset___SchemaPartitionScheme(const std::shared_ptr<arrow::Schema>& schm); +RcppExport SEXP _arrow_dataset___SchemaPartitionScheme(SEXP schm_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp); + return Rcpp::wrap(dataset___SchemaPartitionScheme(schm)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___SchemaPartitionScheme(SEXP schm_sexp){ + Rf_error("Cannot call dataset___SchemaPartitionScheme(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::HivePartitionScheme> dataset___HivePartitionScheme(const std::shared_ptr<arrow::Schema>& schm); +RcppExport SEXP _arrow_dataset___HivePartitionScheme(SEXP schm_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp); + return Rcpp::wrap(dataset___HivePartitionScheme(schm)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___HivePartitionScheme(SEXP schm_sexp){ + Rf_error("Cannot call dataset___HivePartitionScheme(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::Dataset> dataset___Dataset__create(const std::vector<std::shared_ptr<arrow::dataset::DataSource>>& sources, const std::shared_ptr<arrow::Schema>& schm); +RcppExport SEXP _arrow_dataset___Dataset__create(SEXP sources_sexp, SEXP schm_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::vector<std::shared_ptr<arrow::dataset::DataSource>>&>::type sources(sources_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp); + return Rcpp::wrap(dataset___Dataset__create(sources, schm)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Dataset__create(SEXP sources_sexp, SEXP schm_sexp){ + Rf_error("Cannot call dataset___Dataset__create(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Schema> dataset___Dataset__schema(const std::unique_ptr<arrow::dataset::Dataset>& ds); +RcppExport SEXP _arrow_dataset___Dataset__schema(SEXP ds_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::Dataset>&>::type ds(ds_sexp); + return Rcpp::wrap(dataset___Dataset__schema(ds)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Dataset__schema(SEXP ds_sexp){ + Rf_error("Cannot call dataset___Dataset__schema(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::unique_ptr<arrow::dataset::ScannerBuilder> dataset___Dataset__NewScan(const std::shared_ptr<arrow::dataset::Dataset>& ds); +RcppExport SEXP _arrow_dataset___Dataset__NewScan(SEXP ds_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Dataset>&>::type ds(ds_sexp); + return Rcpp::wrap(dataset___Dataset__NewScan(ds)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Dataset__NewScan(SEXP ds_sexp){ + Rf_error("Cannot call dataset___Dataset__NewScan(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +void dataset___ScannerBuilder__Project(const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, const std::vector<std::string>& cols); +RcppExport SEXP _arrow_dataset___ScannerBuilder__Project(SEXP sb_sexp, SEXP cols_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::ScannerBuilder>&>::type sb(sb_sexp); + Rcpp::traits::input_parameter<const std::vector<std::string>&>::type cols(cols_sexp); + dataset___ScannerBuilder__Project(sb, cols); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___ScannerBuilder__Project(SEXP sb_sexp, SEXP cols_sexp){ + Rf_error("Cannot call dataset___ScannerBuilder__Project(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +void dataset___ScannerBuilder__Filter(const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, const std::shared_ptr<arrow::dataset::Expression>& expr); +RcppExport SEXP _arrow_dataset___ScannerBuilder__Filter(SEXP sb_sexp, SEXP expr_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::ScannerBuilder>&>::type sb(sb_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type expr(expr_sexp); + dataset___ScannerBuilder__Filter(sb, expr); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___ScannerBuilder__Filter(SEXP sb_sexp, SEXP expr_sexp){ + Rf_error("Cannot call dataset___ScannerBuilder__Filter(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +void dataset___ScannerBuilder__UseThreads(const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, bool threads); +RcppExport SEXP _arrow_dataset___ScannerBuilder__UseThreads(SEXP sb_sexp, SEXP threads_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::ScannerBuilder>&>::type sb(sb_sexp); + Rcpp::traits::input_parameter<bool>::type threads(threads_sexp); + dataset___ScannerBuilder__UseThreads(sb, threads); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___ScannerBuilder__UseThreads(SEXP sb_sexp, SEXP threads_sexp){ + Rf_error("Cannot call dataset___ScannerBuilder__UseThreads(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Schema> dataset___ScannerBuilder__schema(const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb); +RcppExport SEXP _arrow_dataset___ScannerBuilder__schema(SEXP sb_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::ScannerBuilder>&>::type sb(sb_sexp); + return Rcpp::wrap(dataset___ScannerBuilder__schema(sb)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___ScannerBuilder__schema(SEXP sb_sexp){ + Rf_error("Cannot call dataset___ScannerBuilder__schema(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::unique_ptr<arrow::dataset::Scanner> dataset___ScannerBuilder__Finish(const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb); +RcppExport SEXP _arrow_dataset___ScannerBuilder__Finish(SEXP sb_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::ScannerBuilder>&>::type sb(sb_sexp); + return Rcpp::wrap(dataset___ScannerBuilder__Finish(sb)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___ScannerBuilder__Finish(SEXP sb_sexp){ + Rf_error("Cannot call dataset___ScannerBuilder__Finish(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::Table> dataset___Scanner__ToTable(const std::unique_ptr<arrow::dataset::Scanner>& scn); +RcppExport SEXP _arrow_dataset___Scanner__ToTable(SEXP scn_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::unique_ptr<arrow::dataset::Scanner>&>::type scn(scn_sexp); + return Rcpp::wrap(dataset___Scanner__ToTable(scn)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___Scanner__ToTable(SEXP scn_sexp){ + Rf_error("Cannot call dataset___Scanner__ToTable(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // datatype.cpp #if defined(ARROW_R_WITH_ARROW) bool shared_ptr_is_null(SEXP xp); @@ -2064,6 +2299,194 @@ RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){ } #endif +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::FieldExpression> dataset___expr__field_ref(std::string name); +RcppExport SEXP _arrow_dataset___expr__field_ref(SEXP name_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<std::string>::type name(name_sexp); + return Rcpp::wrap(dataset___expr__field_ref(name)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__field_ref(SEXP name_sexp){ + Rf_error("Cannot call dataset___expr__field_ref(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__equal(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__equal(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__equal(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__equal(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__equal(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__not_equal(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__not_equal(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__not_equal(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__not_equal(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__not_equal(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__greater(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__greater(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__greater(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__greater(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater_equal(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__greater_equal(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__greater_equal(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__greater_equal(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__greater_equal(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__less(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__less(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__less(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__less(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less_equal(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__less_equal(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__less_equal(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__less_equal(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__less_equal(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::AndExpression> dataset___expr__and(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__and(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__and(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__and(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__and(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::OrExpression> dataset___expr__or(const std::shared_ptr<arrow::dataset::Expression>& lhs, const std::shared_ptr<arrow::dataset::Expression>& rhs); +RcppExport SEXP _arrow_dataset___expr__or(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type rhs(rhs_sexp); + return Rcpp::wrap(dataset___expr__or(lhs, rhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__or(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call dataset___expr__or(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::NotExpression> dataset___expr__not(const std::shared_ptr<arrow::dataset::Expression>& lhs); +RcppExport SEXP _arrow_dataset___expr__not(SEXP lhs_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type lhs(lhs_sexp); + return Rcpp::wrap(dataset___expr__not(lhs)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__not(SEXP lhs_sexp){ + Rf_error("Cannot call dataset___expr__not(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr<arrow::dataset::ScalarExpression> dataset___expr__scalar(SEXP x); +RcppExport SEXP _arrow_dataset___expr__scalar(SEXP x_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<SEXP>::type x(x_sexp); + return Rcpp::wrap(dataset___expr__scalar(x)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__scalar(SEXP x_sexp){ + Rf_error("Cannot call dataset___expr__scalar(). 
Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +std::string dataset___expr__ToString(const std::shared_ptr<arrow::dataset::Expression>& x); +RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::dataset::Expression>&>::type x(x_sexp); + return Rcpp::wrap(dataset___expr__ToString(x)); +END_RCPP +} +#else +RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){ + Rf_error("Cannot call dataset___expr__ToString(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // feather.cpp #if defined(ARROW_R_WITH_ARROW) void ipc___feather___TableWriter__SetDescription(const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer, const std::string& description); @@ -5098,6 +5521,21 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, + { "_arrow_dataset___FSDSDiscovery__Make", (DL_FUNC) &_arrow_dataset___FSDSDiscovery__Make, 2}, + { "_arrow_dataset___DSDiscovery__Finish", (DL_FUNC) &_arrow_dataset___DSDiscovery__Finish, 1}, + { "_arrow_dataset___DSDiscovery__Inspect", (DL_FUNC) &_arrow_dataset___DSDiscovery__Inspect, 1}, + { "_arrow_dataset___DSDiscovery__SetPartitionScheme", (DL_FUNC) &_arrow_dataset___DSDiscovery__SetPartitionScheme, 2}, + { "_arrow_dataset___SchemaPartitionScheme", (DL_FUNC) &_arrow_dataset___SchemaPartitionScheme, 1}, + { "_arrow_dataset___HivePartitionScheme", (DL_FUNC) &_arrow_dataset___HivePartitionScheme, 1}, + { "_arrow_dataset___Dataset__create", (DL_FUNC) &_arrow_dataset___Dataset__create, 2}, + { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, + { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, + { "_arrow_dataset___ScannerBuilder__Project", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Project, 2}, + { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, + { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, + { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, + { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, + { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, { "_arrow_shared_ptr_is_null", (DL_FUNC) &_arrow_shared_ptr_is_null, 1}, { "_arrow_unique_ptr_is_null", (DL_FUNC) &_arrow_unique_ptr_is_null, 1}, { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, @@ -5147,6 +5585,18 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, + { "_arrow_dataset___expr__field_ref", (DL_FUNC) &_arrow_dataset___expr__field_ref, 1}, + { "_arrow_dataset___expr__equal", (DL_FUNC) &_arrow_dataset___expr__equal, 2}, + { "_arrow_dataset___expr__not_equal", (DL_FUNC) &_arrow_dataset___expr__not_equal, 2}, + { "_arrow_dataset___expr__greater", (DL_FUNC) 
&_arrow_dataset___expr__greater, 2}, + { "_arrow_dataset___expr__greater_equal", (DL_FUNC) &_arrow_dataset___expr__greater_equal, 2}, + { "_arrow_dataset___expr__less", (DL_FUNC) &_arrow_dataset___expr__less, 2}, + { "_arrow_dataset___expr__less_equal", (DL_FUNC) &_arrow_dataset___expr__less_equal, 2}, + { "_arrow_dataset___expr__and", (DL_FUNC) &_arrow_dataset___expr__and, 2}, + { "_arrow_dataset___expr__or", (DL_FUNC) &_arrow_dataset___expr__or, 2}, + { "_arrow_dataset___expr__not", (DL_FUNC) &_arrow_dataset___expr__not, 1}, + { "_arrow_dataset___expr__scalar", (DL_FUNC) &_arrow_dataset___expr__scalar, 1}, + { "_arrow_dataset___expr__ToString", (DL_FUNC) &_arrow_dataset___expr__ToString, 1}, { "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, { "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, { "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 69f5cb3..b9b742d 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -90,6 +90,24 @@ class ConstReferenceSmartPtrInputParameter { const T* ptr; }; +template <typename T> +class ConstReferenceVectorSmartPtrInputParameter { + public: + using const_reference = const std::vector<T>&; + + explicit ConstReferenceVectorSmartPtrInputParameter(SEXP self) : vec() { + R_xlen_t n = XLENGTH(self); + for (R_xlen_t i = 0; i < n; i++) { + vec.push_back(*internal::r6_to_smart_pointer<const T*>(VECTOR_ELT(self, i))); + } + } + + inline operator const_reference() { return vec; } + + private: + std::vector<T> vec; +}; + namespace traits { template <typename T> @@ -102,6 +120,12 @@ struct input_parameter<const std::unique_ptr<T>&> { typedef typename Rcpp::ConstReferenceSmartPtrInputParameter<std::unique_ptr<T>> type; }; +template <typename T> +struct input_parameter<const std::vector<std::shared_ptr<T>>&> { + typedef typename Rcpp::ConstReferenceVectorSmartPtrInputParameter<std::shared_ptr<T>> + type; +}; + struct wrap_type_shared_ptr_tag {}; struct wrap_type_unique_ptr_tag {}; @@ -184,6 +208,7 @@ inline std::shared_ptr<T> extract(SEXP x) { #include <arrow/api.h> #include <arrow/compute/api.h> #include <arrow/csv/reader.h> +#include <arrow/dataset/api.h> #include <arrow/filesystem/filesystem.h> #include <arrow/filesystem/localfs.h> #include <arrow/io/compressed.h> diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp new file mode 100644 index 0000000..14f0375 --- /dev/null +++ b/r/src/dataset.cpp @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
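The `input_parameter` specialization above is what lets an R `list()` of R6-wrapped pointers cross into a C++ `std::vector<std::shared_ptr<T>>` argument: the converter walks the list and unwraps each element's shared pointer. A minimal R-side sketch of the call shape this enables (the objects `src1`, `src2`, and `schm` are hypothetical placeholders; `Dataset$create` is the binding defined in r/src/dataset.cpp below):

  # src1, src2 are DataSource R6 objects, schm a Schema (all hypothetical here).
  # The list() is converted element by element into a
  # std::vector<std::shared_ptr<arrow::dataset::DataSource>> by the class above.
  ds <- Dataset$create(list(src1, src2), schm)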
+ +#include "./arrow_types.h" + +#if defined(ARROW_R_WITH_ARROW) + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::DataSourceDiscovery> dataset___FSDSDiscovery__Make( + const std::shared_ptr<arrow::fs::FileSystem>& fs, + const std::shared_ptr<arrow::fs::Selector>& selector) { + std::shared_ptr<arrow::dataset::DataSourceDiscovery> discovery; + // TODO(npr): add format as an argument, don't hard-code Parquet + auto format = std::make_shared<arrow::dataset::ParquetFileFormat>(); + + STOP_IF_NOT_OK(arrow::dataset::FileSystemDataSourceDiscovery::Make(fs.get(), *selector, + format, &discovery)); + return discovery; +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::DataSource> dataset___DSDiscovery__Finish( + const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery) { + std::shared_ptr<arrow::dataset::DataSource> out; + STOP_IF_NOT_OK(discovery->Finish(&out)); + return out; +} + +// [[arrow::export]] +std::shared_ptr<arrow::Schema> dataset___DSDiscovery__Inspect( + const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery) { + std::shared_ptr<arrow::Schema> out; + STOP_IF_NOT_OK(discovery->Inspect(&out)); + return out; +} + +// [[arrow::export]] +void dataset___DSDiscovery__SetPartitionScheme( + const std::shared_ptr<arrow::dataset::DataSourceDiscovery>& discovery, + const std::shared_ptr<arrow::dataset::PartitionScheme>& part) { + STOP_IF_NOT_OK(discovery->SetPartitionScheme(part)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::SchemaPartitionScheme> dataset___SchemaPartitionScheme( + const std::shared_ptr<arrow::Schema>& schm) { + return std::make_shared<arrow::dataset::SchemaPartitionScheme>(schm); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::HivePartitionScheme> dataset___HivePartitionScheme( + const std::shared_ptr<arrow::Schema>& schm) { + return std::make_shared<arrow::dataset::HivePartitionScheme>(schm); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::Dataset> dataset___Dataset__create( + const std::vector<std::shared_ptr<arrow::dataset::DataSource>>& sources, + const std::shared_ptr<arrow::Schema>& schm) { + return std::make_shared<arrow::dataset::Dataset>(sources, schm); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Schema> dataset___Dataset__schema( + const std::unique_ptr<arrow::dataset::Dataset>& ds) { + return ds->schema(); +} + +// [[arrow::export]] +std::unique_ptr<arrow::dataset::ScannerBuilder> dataset___Dataset__NewScan( + const std::shared_ptr<arrow::dataset::Dataset>& ds) { + std::unique_ptr<arrow::dataset::ScannerBuilder> out; + STOP_IF_NOT_OK(ds->NewScan(&out)); + return out; +} + +// [[arrow::export]] +void dataset___ScannerBuilder__Project( + const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, + const std::vector<std::string>& cols) { + STOP_IF_NOT_OK(sb->Project(cols)); +} + +// [[arrow::export]] +void dataset___ScannerBuilder__Filter( + const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, + const std::shared_ptr<arrow::dataset::Expression>& expr) { + STOP_IF_NOT_OK(sb->Filter(expr)); +} + +// [[arrow::export]] +void dataset___ScannerBuilder__UseThreads( + const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb, bool threads) { + STOP_IF_NOT_OK(sb->UseThreads(threads)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::Schema> dataset___ScannerBuilder__schema( + const std::unique_ptr<arrow::dataset::ScannerBuilder>& sb) { + return sb->schema(); +} + +// [[arrow::export]] +std::unique_ptr<arrow::dataset::Scanner> dataset___ScannerBuilder__Finish( + const 
std::unique_ptr<arrow::dataset::ScannerBuilder>& sb) { + std::unique_ptr<arrow::dataset::Scanner> out; + STOP_IF_NOT_OK(sb->Finish(&out)); + return out; +} + +// [[arrow::export]] +std::shared_ptr<arrow::Table> dataset___Scanner__ToTable( + const std::unique_ptr<arrow::dataset::Scanner>& scn) { + std::shared_ptr<arrow::Table> out; + STOP_IF_NOT_OK(scn->ToTable(&out)); + return out; +} + +#endif diff --git a/r/src/expression.cpp b/r/src/expression.cpp new file mode 100644 index 0000000..773dc7a --- /dev/null +++ b/r/src/expression.cpp @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "./arrow_types.h" + +#if defined(ARROW_R_WITH_ARROW) + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::FieldExpression> dataset___expr__field_ref( + std::string name) { + return std::make_shared<arrow::dataset::FieldExpression>(std::move(name)); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__equal( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::equal(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__not_equal( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::not_equal(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::greater(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater_equal( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::greater_equal(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::less(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less_equal( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::less_equal(lhs, rhs); +} + +// [[arrow::export]] +std::shared_ptr<arrow::dataset::AndExpression> dataset___expr__and( + const std::shared_ptr<arrow::dataset::Expression>& lhs, + const std::shared_ptr<arrow::dataset::Expression>& rhs) { + return arrow::dataset::and_(lhs, rhs); +} + +// [[arrow::export]] 
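Chained together, the exports above cover the whole discovery-to-table path described in the commit message. A sketch of the corresponding R calls, limited to methods exercised by the new test file later in this patch (the directory path and the `part` field are placeholders, not part of the patch):

  library(arrow)
  fs <- LocalFileSystem$create()
  selector <- Selector$create("/path/to/parquet/files", recursive = TRUE)  # placeholder path
  dsd <- FileSystemDataSourceDiscovery$create(fs, selector)  # Parquet-only for now, per the TODO above
  schm <- dsd$Inspect()                                      # infer a Schema from the files
  dsd$SetPartitionScheme(SchemaPartitionScheme$create(schema(part = double())))
  ds <- Dataset$create(list(dsd$Finish()), schm)
  sb <- ds$NewScan()
  sb$Project(c("chr", "lgl"))                    # column pruning
  sb$Filter(FieldExpression$create("dbl") == 8)  # pushed-down predicate
  tab <- sb$Finish()$ToTable()                   # Scanner -> arrow::Table
  df <- as.data.frame(tab)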
diff --git a/r/src/expression.cpp b/r/src/expression.cpp
new file mode 100644
index 0000000..773dc7a
--- /dev/null
+++ b/r/src/expression.cpp
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::FieldExpression> dataset___expr__field_ref(
+    std::string name) {
+  return std::make_shared<arrow::dataset::FieldExpression>(std::move(name));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__equal(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::equal(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__not_equal(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::not_equal(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::greater(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__greater_equal(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::greater_equal(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::less(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ComparisonExpression> dataset___expr__less_equal(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::less_equal(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::AndExpression> dataset___expr__and(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::and_(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::OrExpression> dataset___expr__or(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs,
+    const std::shared_ptr<arrow::dataset::Expression>& rhs) {
+  return arrow::dataset::or_(lhs, rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::NotExpression> dataset___expr__not(
+    const std::shared_ptr<arrow::dataset::Expression>& lhs) {
+  return arrow::dataset::not_(lhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::dataset::ScalarExpression> dataset___expr__scalar(SEXP x) {
+  switch (TYPEOF(x)) {
+    case LGLSXP:
+      return arrow::dataset::scalar(Rf_asLogical(x));
+    case REALSXP:
+      return arrow::dataset::scalar(Rf_asReal(x));
+    case INTSXP:
+      return arrow::dataset::scalar(Rf_asInteger(x));
+    default:
+      // TODO more types (character, factor, Date, POSIXt, etc.)
+      Rcpp::stop(
+          tfm::format("R object of type %s not supported", Rf_type2char(TYPEOF(x))));
+  }
+  return nullptr;
+}
+
+// [[arrow::export]]
+std::string dataset___expr__ToString(
+    const std::shared_ptr<arrow::dataset::Expression>& x) {
+  return x->ToString();
+}
+
+#endif
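On the R side, these factory functions are wrapped in operator overloads (see r/R/expression.R), so a filter reads as ordinary comparison syntax. A small sketch of the behavior, as exercised in test-expression.R further down:

  f <- FieldExpression$create("dbl")
  expr <- f > 2 & !(f >= 8)  # AndExpression over ComparisonExpression/NotExpression nodes
  print(f > 4)               # GREATER(field(dbl), scalar<double>(4.000000))

Note that `dataset___expr__scalar()` currently accepts only logical, integer, and double values (the `TYPEOF` switch above); other R types are deferred to ARROW-7093.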
+ +context("Datasets") + +library(dplyr) + +dataset_dir <- tempfile() +dir.create(dataset_dir) + +hive_dir <- tempfile() +dir.create(hive_dir) + +df1 <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + part = 1 +) +df2 <- tibble( + int = 101:110, + dbl = as.numeric(51:60), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + part = 2 +) + +test_that("Setup (putting data in the dir)", { + dir.create(file.path(dataset_dir, 1)) + dir.create(file.path(dataset_dir, 2)) + write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet")) + write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet")) + expect_length(dir(dataset_dir, recursive = TRUE), 2) + + dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE) + dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE) + write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet")) + write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet")) + expect_length(dir(hive_dir, recursive = TRUE), 2) +}) + +test_that("Simple interface for datasets", { + ds <- open_dataset(dataset_dir, partition = schema(part = uint8())) + expect_is(ds, "Dataset") + expect_equivalent( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + expect_equivalent( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6L & part == 1) %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Hive partitioning", { + ds <- open_dataset(hive_dir, partition = hive_partition(other = utf8(), group = uint8())) + expect_is(ds, "Dataset") + expect_equivalent( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) +}) + +test_that("Assembling a Dataset manually and getting a Table", { + fs <- LocalFileSystem$create() + selector <- Selector$create(dataset_dir, recursive = TRUE) + dsd <- FileSystemDataSourceDiscovery$create(fs, selector) + expect_is(dsd, "FileSystemDataSourceDiscovery") + schm <- dsd$Inspect() + expect_is(schm, "Schema") + expect_equal( + schm, + ParquetFileReader$create(file.path(dataset_dir, 1, "file1.parquet"))$GetSchema() + ) + dsd$SetPartitionScheme(SchemaPartitionScheme$create(schema(part = double()))) + datasource <- dsd$Finish() + expect_is(datasource, "DataSource") + + ds <- Dataset$create(list(datasource), schm) + expect_is(ds, "Dataset") + # TODO: this should fail when "part" is in the schema + expect_equal(names(ds), names(df1)) + + sb <- ds$NewScan() + expect_is(sb, "ScannerBuilder") + expect_equal(sb$schema, schm) + expect_equal(names(sb), names(df1)) + sb$Project(c("chr", "lgl")) + sb$Filter(FieldExpression$create("dbl") == 8) + scn <- sb$Finish() + expect_is(scn, "Scanner") + tab <- scn$ToTable() + expect_is(tab, "Table") + expect_equivalent( + as.data.frame(tab), + df1[8, c("chr", "lgl")] + ) +}) diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index 05f0ca1..e6886ba 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -18,21 +18,21 @@ context("Expressions") test_that("Can create an 
expression", { - expect_is(Array$create(1:5) + 4, "Expression") + expect_is(Array$create(1:5) + 4, "array_expression") }) test_that("Recursive expression generation", { a <- Array$create(1:5) - expect_is(a == 4 | a == 3, "Expression") + expect_is(a == 4 | a == 3, "array_expression") }) -test_that("as.vector(expression)", { +test_that("as.vector(array_expression)", { a <- Array$create(1:5) expect_equal(as.vector(a + 4), 5:9) expect_equal(as.vector(a == 4 | a == 3), c(FALSE, FALSE, TRUE, TRUE, FALSE)) }) -test_that("Expression print method", { +test_that("array_expression print method", { a <- Array$create(1:5) expect_output( print(a == 4 | a == 3), @@ -40,3 +40,22 @@ test_that("Expression print method", { fixed = TRUE ) }) + +test_that("C++ expressions", { + f <- FieldExpression$create("f") + g <- FieldExpression$create("g") + expect_is(f == g, "ComparisonExpression") + expect_is(f == 4, "ComparisonExpression") + expect_is(f <= 2L, "ComparisonExpression") + expect_is(f != FALSE, "ComparisonExpression") + expect_is(f > 4, "ComparisonExpression") + expect_is(f < 4 & f > 2, "AndExpression") + expect_is(f < 4 | f > 2, "OrExpression") + expect_is(!(f < 4), "NotExpression") + expect_output( + print(f > 4), + # We can do better than this right? + 'ComparisonExpression\nGREATER(field(f), scalar<double>(4.000000))', + fixed = TRUE + ) +}) diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index f50adc2..9c9bd7a 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -22,6 +22,7 @@ test_that("Alternate type names are supported", { schema(b = double(), c = bool(), d = string(), e = float(), f = halffloat()), schema(b = float64(), c = boolean(), d = utf8(), e = float32(), f = float16()) ) + expect_equal(names(schema(b = double(), c = bool(), d = string())), c("b", "c", "d")) }) test_that("reading schema from Buffer", { diff --git a/r/tools/autobrew b/r/tools/autobrew index 673d512..9636c6f 100644 --- a/r/tools/autobrew +++ b/r/tools/autobrew @@ -46,7 +46,7 @@ fi # Hardcode this for my custom autobrew build rm -f $BREWDIR/lib/*.dylib -PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow -lthrift -llz4 -lboost_system -lboost_filesystem -lboost_regex -ldouble-conversion -lsnappy" +PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow_dataset -larrow -lthrift -llz4 -lboost_system -lboost_filesystem -lboost_regex -ldouble-conversion -lsnappy" # Prevent CRAN builder from linking against old libs in /usr/local/lib for FILE in $BREWDIR/Cellar/*/*/lib/*.a; do