This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5096803 ARROW-7962: [R][Dataset] Followup to "Consolidate Source and
Dataset classes"
5096803 is described below
commit 5096803803d6dbda41fdd600bd354672eb4be2f3
Author: Neal Richardson <[email protected]>
AuthorDate: Fri Feb 28 08:46:49 2020 -0500
ARROW-7962: [R][Dataset] Followup to "Consolidate Source and Dataset
classes"
This commit was pushed to #6470 along with my "approval" review, but a
force push overwrote it.
cc @bkietz
Closes #6502 from nealrichardson/dataset-source-followup and squashes the
following commits:
c1c5e7510 <Neal Richardson> Some R edits
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 5 +-
r/R/dataset.R | 63 ++++++++--------------
r/_pkgdown.yml | 2 +-
r/man/Dataset.Rd | 41 ++++++++++----
r/man/FileFormat.Rd | 10 ++++
r/man/Scanner.Rd | 3 ++
...{open_dataset_factory.Rd => dataset_factory.Rd} | 27 +++++-----
r/man/open_dataset.Rd | 4 +-
r/tests/testthat/test-dataset.R | 20 +++----
9 files changed, 93 insertions(+), 82 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index b368798..b4bf77d 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -92,6 +92,7 @@ export(FileOutputStream)
export(FileSelector)
export(FileStats)
export(FileSystem)
+export(FileSystemDataset)
export(FileSystemDatasetFactory)
export(FileType)
export(FixedSizeBufferWriter)
@@ -133,8 +134,8 @@ export(StructArray)
export(SubTreeFileSystem)
export(Table)
export(TimeUnit)
-export(UnionDatasetFactory)
export(Type)
+export(UnionDataset)
export(arrow_available)
export(bool)
export(boolean)
@@ -143,6 +144,7 @@ export(cast_options)
export(chunked_array)
export(codec_is_available)
export(contains)
+export(dataset_factory)
export(date32)
export(date64)
export(decimal)
@@ -171,7 +173,6 @@ export(null)
export(num_range)
export(one_of)
export(open_dataset)
-export(open_dataset_factory)
export(read_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
diff --git a/r/R/dataset.R b/r/R/dataset.R
index c9cee01..ce75743 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -24,7 +24,7 @@
#' `Dataset`, then use `dplyr` methods to query it.
#'
#' @param sources Either a string path to a directory containing data files,
-#' or a list of `DatasetFactory` objects as created by
[open_dataset_factory()].
+#' or a list of `DatasetFactory` objects as created by [dataset_factory()].
#' @param schema [Schema] for the dataset. If `NULL` (the default), the schema
#' will be inferred from the data sources.
#' @param partitioning When `sources` is a file path, one of
@@ -39,7 +39,7 @@
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
-#' @param ... additional arguments passed to `open_dataset_factory()` when
+#' @param ... additional arguments passed to `dataset_factory()` when
#' `sources` is a file path, otherwise ignored.
#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
#' or call [`$NewScan()`][Scanner] to construct a query directly.
@@ -47,11 +47,7 @@
#' @seealso `vignette("dataset", package = "arrow")`
#' @include arrow-package.R
open_dataset <- function(sources, schema = NULL, partitioning =
hive_partition(), ...) {
- if (is.character(sources)) {
- factory <- open_dataset_factory(sources, partitioning = partitioning, ...)
- } else {
- factory <- open_dataset_factory(children = sources)
- }
+ factory <- DatasetFactory$create(sources, partitioning = partitioning, ...)
factory$Finish(schema)
}
@@ -79,7 +75,7 @@ open_dataset <- function(sources, schema = NULL, partitioning
= hive_partition()
#' discovering files in the local file system, the only currently supported
#' file system.
#'
-#' For the `DatasetFactory$create()` factory method, see
[open_dataset_factory()], an
+#' For the `DatasetFactory$create()` factory method, see [dataset_factory()],
an
#' alias for it. A `DatasetFactory` has:
#'
#' - `$Inspect()`: Returns a common [Schema] for all data discovered by the
factory.
@@ -92,12 +88,6 @@ open_dataset <- function(sources, schema = NULL,
partitioning = hive_partition()
#' * `format`: A string identifier of the format of the files in `path`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias
for
#' the Arrow file format)
-#'
-#' `UnionDatasetFactory$create()` can be used to unify child `DatasetFactory`s
into
-#' a single `DatasetFactory`. Use it when (for example) your data is in
multiple
-#' file systems or formats.
-#' * `children`: child `DatasetFactory`s to be unified
-#'
#' @section Methods:
#'
#' A `Dataset` has the following methods:
@@ -197,22 +187,21 @@ DatasetFactory <- R6Class("DatasetFactory", inherit =
Object,
Inspect = function() shared_ptr(Schema,
dataset___DatasetFactory__Inspect(self))
)
)
-DatasetFactory$create <- function(path,
- children = NULL,
+DatasetFactory$create <- function(x,
filesystem = c("auto", "local"),
format = c("parquet", "arrow", "ipc"),
partitioning = NULL,
allow_non_existent = FALSE,
recursive = TRUE,
...) {
- if (!is.null(children)) {
- return(shared_ptr(DatasetFactory,
dataset___UnionDatasetFactory__Make(children)))
+ if (is.list(x) && all(map_lgl(x, ~inherits(., "DatasetFactory")))) {
+ return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(x)))
}
if (!inherits(filesystem, "FileSystem")) {
filesystem <- match.arg(filesystem)
if (filesystem == "auto") {
- # When there are other FileSystems supported, detect e.g. S3 from path
+ # When there are other FileSystems supported, detect e.g. S3 from x
filesystem <- "local"
}
filesystem <- list(
@@ -221,7 +210,7 @@ DatasetFactory$create <- function(path,
)[[filesystem]]$create(...)
}
selector <- FileSelector$create(
- path,
+ x,
allow_non_existent = allow_non_existent,
recursive = recursive
)
@@ -252,16 +241,16 @@ DatasetFactory$create <- function(path,
#'
#' If you would only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
-#' directly. Use `open_dataset_factory()` when you
+#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
-#' @param path A string file path containing data files
-#' @param children A list of `DatasetFactory` objects whose datasets should be
-#' unified. If this argument is specified it will be used to construct a
+#' @param x A string path containing data files, or
+#' a list of `DatasetFactory` objects whose datasets should be
+#' grouped. If this argument is specified it will be used to construct a
#' `UnionDatasetFactory` and other arguments will be ignored.
#' @param filesystem A string identifier for the filesystem corresponding to
-#' `path`. Currently only "local" is supported.
-#' @param format A string identifier of the format of the files in `path`.
+#' `x`. Currently only "local" is supported.
+#' @param format A string identifier of the format of the files in `x`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias for
#' the Arrow file format)
#' @param partitioning One of
@@ -276,26 +265,16 @@ DatasetFactory$create <- function(path,
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
-#' @param allow_non_existent logical: is `path` allowed to not exist? Default
+#' @param allow_non_existent logical: is `x` allowed to not exist? Default
#' `FALSE`. See [FileSelector].
#' @param recursive logical: should files be discovered in subdirectories of
-#' `path`? Default `TRUE`.
+#' `x`? Default `TRUE`.
#' @param ... Additional arguments passed to the [FileSystem] `$create()`
method
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
-open_dataset_factory <- DatasetFactory$create
-
-#' @usage NULL
-#' @format NULL
-#' @rdname Dataset
-#' @export
-UnionDatasetFactory <- R6Class("UnionDatasetFactory", inherit = DatasetFactory)
-UnionDatasetFactory$create <- function(children) {
- assert_is_list_of(children, "DatasetFactory")
- shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children))
-}
+dataset_factory <- DatasetFactory$create
#' @usage NULL
#' @format NULL
@@ -305,9 +284,9 @@ FileSystemDatasetFactory <-
R6Class("FileSystemDatasetFactory",
inherit = DatasetFactory
)
FileSystemDatasetFactory$create <- function(filesystem,
- selector,
- format,
- partitioning = NULL) {
+ selector,
+ format,
+ partitioning = NULL) {
assert_is(filesystem, "FileSystem")
assert_is(selector, "FileSelector")
assert_is(format, "FileFormat")
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 25b76f0..165892a 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -61,7 +61,7 @@ reference:
- title: Multi-file datasets
contents:
- open_dataset
- - open_dataset_factory
+ - dataset_factory
- hive_partition
- Dataset
- Partitioning
diff --git a/r/man/Dataset.Rd b/r/man/Dataset.Rd
index 81bca3b..0d44c6d 100644
--- a/r/man/Dataset.Rd
+++ b/r/man/Dataset.Rd
@@ -2,8 +2,9 @@
% Please edit documentation in R/dataset.R
\name{Dataset}
\alias{Dataset}
+\alias{FileSystemDataset}
+\alias{UnionDataset}
\alias{DatasetFactory}
-\alias{UnionDatasetFactory}
\alias{FileSystemDatasetFactory}
\title{Multi-file datasets}
\value{
@@ -16,11 +17,6 @@ can accelerate queries that only touch some partitions
(files).
A \code{Dataset} contains one or more \code{Fragments}, such as files, of
potentially
differing type and partitioning.
-\code{DatasetFactory} is used to create a \code{Dataset}, inspect the
\link{Schema} of the
-fragments contained in it, and declare a partitioning.
-\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for
-discovering files in the local file system, the only currently supported
-file system.
The \code{Dataset$create()} method instantiates a \code{Dataset} which wraps
child Datasets.
It takes the following arguments:
@@ -34,11 +30,29 @@ It takes the following arguments:
Start a new scan of the data
Return the Dataset's \code{Schema}
+
+Return the Dataset's type.
+
+Return the files contained in this \code{FileSystemDataset}
+
+Return the format of files in this \code{Dataset}
+
+Return the UnionDataset's child \code{Dataset}s
}
\section{Factory}{
-For the \code{DatasetFactory$create()} factory method, see
\code{\link[=open_dataset_factory]{open_dataset_factory()}}, an
-alias for it.
+\code{DatasetFactory} is used to create a \code{Dataset}, inspect the
\link{Schema} of the
+fragments contained in it, and declare a partitioning.
+\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for
+discovering files in the local file system, the only currently supported
+file system.
+
+For the \code{DatasetFactory$create()} factory method, see
\code{\link[=dataset_factory]{dataset_factory()}}, an
+alias for it. A \code{DatasetFactory} has:
+\itemize{
+\item \verb{$Inspect()}: Returns a common \link{Schema} for all data
discovered by the factory.
+\item \verb{$Finish(schema)}: Returns a \code{Dataset}
+}
\code{FileSystemDatasetFactory$create()} is a lower-level factory method and
takes the following arguments:
@@ -60,10 +74,15 @@ A \code{Dataset} has the following methods:
\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset
}
-A \code{DatasetFactory} has:
+\code{FileSystemDataset} has the following methods:
\itemize{
-\item \verb{$Inspect()}: Returns a common \link{Schema} for all data
discovered by the factory.
-\item \verb{$Finish(schema)}: Returns a \code{Dataset}
+\item \verb{$files}: Active binding, returns the files of the
\code{FileSystemDataset}
+\item \verb{$format}: Active binding, returns the \link{FileFormat} of the
\code{FileSystemDataset}
+}
+
+\code{UnionDataset} has the following methods:
+\itemize{
+\item \verb{$children}: Active binding, returns all child \code{Dataset}s.
}
}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index 80af81f..deb564e 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -9,6 +9,8 @@
A \code{FileFormat} holds information about how to read and parse the files
included in a \code{Dataset}. There are subclasses corresponding to the
supported
file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}).
+
+Return the \code{FileFormat}'s type
}
\section{Factory}{
@@ -18,6 +20,14 @@ file formats (\code{ParquetFileFormat} and
\code{IpcFileFormat}).
Currently supported options are "parquet", "arrow", and "ipc" (an alias for
the Arrow file format)
\item \code{...}: Additional format-specific options
+format="parquet":
+\itemize{
+\item \code{use_buffered_stream}: Read files through buffered input streams
rather than
+loading entire row groups at once. This may be enabled
+to reduce memory overhead. Disabled by default.
+\item \code{buffer_size}: Size of buffered stream, if enabled. Default is 8KB.
+\item \code{dict_columns}: Names of columns which should be read as
dictionaries.
+}
}
It returns the appropriate subclass of \code{FileFormat} (e.g.
\code{ParquetFileFormat})
diff --git a/r/man/Scanner.Rd b/r/man/Scanner.Rd
index f85ca75..a665c0b 100644
--- a/r/man/Scanner.Rd
+++ b/r/man/Scanner.Rd
@@ -19,6 +19,9 @@ by \code{cols}, a character vector of column names
\item \verb{$UseThreads(threads)}: logical: should the scan use multithreading?
The method's default input is \code{TRUE}, but you must call the method to
enable
multithreading because the scanner default is \code{FALSE}.
+\item \verb{$BatchSize(batch_size)}: integer: Maximum row count of scanned
record
+batches, default is 32K. If scanned record batches are overflowing memory
+then this method can be called to reduce their size.
\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset
\item \verb{$Finish()}: Returns a \code{Scanner}
}
diff --git a/r/man/open_dataset_factory.Rd b/r/man/dataset_factory.Rd
similarity index 80%
rename from r/man/open_dataset_factory.Rd
rename to r/man/dataset_factory.Rd
index afb1a20..877a952 100644
--- a/r/man/open_dataset_factory.Rd
+++ b/r/man/dataset_factory.Rd
@@ -1,12 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset.R
-\name{open_dataset_factory}
-\alias{open_dataset_factory}
+\name{dataset_factory}
+\alias{dataset_factory}
\title{Create a DatasetFactory}
\usage{
-open_dataset_factory(
- path,
- children = NULL,
+dataset_factory(
+ x,
filesystem = c("auto", "local"),
format = c("parquet", "arrow", "ipc"),
partitioning = NULL,
@@ -16,15 +15,15 @@ open_dataset_factory(
)
}
\arguments{
-\item{path}{A string file path containing data files}
-
-\item{children}{A list of \code{DatasetFactory} objects whose datasets should
be
-unified. If this argument is specified other arguments will be ignored.}
+\item{x}{A string path containing data files, or
+a list of \code{DatasetFactory} objects whose datasets should be
+grouped. If this argument is specified it will be used to construct a
+\code{UnionDatasetFactory} and other arguments will be ignored.}
\item{filesystem}{A string identifier for the filesystem corresponding to
-\code{path}. Currently only "local" is supported.}
+\code{x}. Currently only "local" is supported.}
-\item{format}{A string identifier of the format of the files in \code{path}.
+\item{format}{A string identifier of the format of the files in \code{x}.
Currently supported options are "parquet", "arrow", and "ipc" (an alias for
the Arrow file format)}
@@ -43,11 +42,11 @@ Hive-style path segments
\item \code{NULL} for no partitioning
}}
-\item{allow_non_existent}{logical: is \code{path} allowed to not exist? Default
+\item{allow_non_existent}{logical: is \code{x} allowed to not exist? Default
\code{FALSE}. See \link{FileSelector}.}
\item{recursive}{logical: should files be discovered in subdirectories of
-\code{path}? Default \code{TRUE}.}
+\code{x}? Default \code{TRUE}.}
\item{...}{Additional arguments passed to the \link{FileSystem}
\verb{$create()} method}
}
@@ -64,6 +63,6 @@ This function helps you construct a \code{DatasetFactory}
that you can pass to
\details{
If you would only have a single \code{DatasetFactory} (for example, you have a
single directory containing Parquet files), you can call \code{open_dataset()}
-directly. Use \code{open_dataset_factory()} when you
+directly. Use \code{dataset_factory()} when you
want to combine different directories, file systems, or file formats.
}
diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd
index c3105e5..e668ff5 100644
--- a/r/man/open_dataset.Rd
+++ b/r/man/open_dataset.Rd
@@ -8,7 +8,7 @@ open_dataset(sources, schema = NULL, partitioning =
hive_partition(), ...)
}
\arguments{
\item{sources}{Either a string path to a directory containing data files,
-or a list of \code{DatasetFactory} objects as created by
\code{\link[=open_dataset_factory]{open_dataset_factory()}}.}
+or a list of \code{DatasetFactory} objects as created by
\code{\link[=dataset_factory]{dataset_factory()}}.}
\item{schema}{\link{Schema} for the dataset. If \code{NULL} (the default), the
schema
will be inferred from the data sources.}
@@ -28,7 +28,7 @@ Hive-style path segments
\item \code{NULL} for no partitioning
}}
-\item{...}{additional arguments passed to \code{open_dataset_factory()} when
+\item{...}{additional arguments passed to \code{dataset_factory()} when
\code{sources} is a file path, otherwise ignored.}
}
\value{
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 103a7a1..b1ec3a4 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -19,15 +19,15 @@ context("Datasets")
library(dplyr)
-tempdir <- function() {
+make_temp_dir <- function() {
path <- tempfile()
dir.create(path)
normalizePath(path, winslash = "/")
}
-dataset_dir <- tempdir()
-hive_dir <- tempdir()
-ipc_dir <- tempdir()
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+ipc_dir <- make_temp_dir()
first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
df1 <- tibble(
@@ -165,9 +165,9 @@ test_that("IPC/Arrow format data", {
test_that("Dataset with multiple file formats", {
skip("https://issues.apache.org/jira/browse/ARROW-7653")
- ds <- open_dataset(children=list(
- open_dataset_factory(dataset_dir, format = "parquet", partitioning =
"part"),
- open_dataset_factory(ipc_dir, format = "arrow", partitioning = "part")
+ ds <- open_dataset(list(
+ dataset_factory(dataset_dir, format = "parquet", partitioning = "part"),
+ dataset_factory(ipc_dir, format = "arrow", partitioning = "part")
))
expect_identical(names(ds), c(names(df1), "part"))
expect_equivalent(
@@ -411,12 +411,12 @@ test_that("Assembling a Dataset manually and getting a
Table", {
})
test_that("Assembling multiple DatasetFactories with DatasetFactory", {
- factory1 <- open_dataset_factory(file.path(dataset_dir, 1), format =
"parquet")
+ factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet")
expect_is(factory1, "FileSystemDatasetFactory")
- factory2 <- open_dataset_factory(file.path(dataset_dir, 2), format =
"parquet")
+ factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet")
expect_is(factory2, "FileSystemDatasetFactory")
- factory <- DatasetFactory$create(children=list(factory1, factory2))
+ factory <- DatasetFactory$create(list(factory1, factory2))
expect_is(factory, "DatasetFactory")
schm <- factory$Inspect()