This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5096803 ARROW-7962: [R][Dataset] Followup to "Consolidate Source and
Dataset classes"
5096803 is described below
commit 5096803803d6dbda41fdd600bd354672eb4be2f3
Author: Neal Richardson <[email protected]>
AuthorDate: Fri Feb 28 08:46:49 2020 -0500
ARROW-7962: [R][Dataset] Followup to "Consolidate Source and Dataset
classes"
This commit was pushed to #6470 along with my "approval" review, but a
force push overwrote it.
cc @bkietz
Closes #6502 from nealrichardson/dataset-source-followup and squashes the
following commits:
c1c5e7510 <Neal Richardson> Some R edits
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/NAMESPACE | 5 +-
r/R/dataset.R | 63 ++++++++--------------
r/_pkgdown.yml | 2 +-
r/man/Dataset.Rd | 41 ++++++++++----
r/man/FileFormat.Rd | 10 ++++
r/man/Scanner.Rd | 3 ++
...{open_dataset_factory.Rd => dataset_factory.Rd} | 27 +++++-----
r/man/open_dataset.Rd | 4 +-
r/tests/testthat/test-dataset.R | 20 +++----
9 files changed, 93 insertions(+), 82 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index b368798..b4bf77d 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -92,6 +92,7 @@ export(FileOutputStream)
export(FileSelector)
export(FileStats)
export(FileSystem)
+export(FileSystemDataset)
export(FileSystemDatasetFactory)
export(FileType)
export(FixedSizeBufferWriter)
@@ -133,8 +134,8 @@ export(StructArray)
export(SubTreeFileSystem)
export(Table)
export(TimeUnit)
-export(UnionDatasetFactory)
export(Type)
+export(UnionDataset)
export(arrow_available)
export(bool)
export(boolean)
@@ -143,6 +144,7 @@ export(cast_options)
export(chunked_array)
export(codec_is_available)
export(contains)
+export(dataset_factory)
export(date32)
export(date64)
export(decimal)
@@ -171,7 +173,6 @@ export(null)
export(num_range)
export(one_of)
export(open_dataset)
-export(open_dataset_factory)
export(read_arrow)
export(read_csv_arrow)
export(read_delim_arrow)
diff --git a/r/R/dataset.R b/r/R/dataset.R
index c9cee01..ce75743 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -24,7 +24,7 @@
#' `Dataset`, then use `dplyr` methods to query it.
#'
#' @param sources Either a string path to a directory containing data files,
-#' or a list of `DatasetFactory` objects as created by
[open_dataset_factory()].
+#' or a list of `DatasetFactory` objects as created by [dataset_factory()].
#' @param schema [Schema] for the dataset. If `NULL` (the default), the schema
#' will be inferred from the data sources.
#' @param partitioning When `sources` is a file path, one of
@@ -39,7 +39,7 @@
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
-#' @param ... additional arguments passed to `open_dataset_factory()` when
+#' @param ... additional arguments passed to `dataset_factory()` when
#' `sources` is a file path, otherwise ignored.
#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
#' or call [`$NewScan()`][Scanner] to construct a query directly.
@@ -47,11 +47,7 @@
#' @seealso `vignette("dataset", package = "arrow")`
#' @include arrow-package.R
open_dataset <- function(sources, schema = NULL, partitioning =
hive_partition(), ...) {
- if (is.character(sources)) {
- factory <- open_dataset_factory(sources, partitioning = partitioning, ...)
- } else {
- factory <- open_dataset_factory(children = sources)
- }
+ factory <- DatasetFactory$create(sources, partitioning = partitioning, ...)
factory$Finish(schema)
}
@@ -79,7 +75,7 @@ open_dataset <- function(sources, schema = NULL, partitioning
= hive_partition()
#' discovering files in the local file system, the only currently supported
#' file system.
#'
-#' For the `DatasetFactory$create()` factory method, see
[open_dataset_factory()], an
+#' For the `DatasetFactory$create()` factory method, see [dataset_factory()],
an
#' alias for it. A `DatasetFactory` has:
#'
#' - `$Inspect()`: Returns a common [Schema] for all data discovered by the
factory.
@@ -92,12 +88,6 @@ open_dataset <- function(sources, schema = NULL,
partitioning = hive_partition()
#' * `format`: A string identifier of the format of the files in `path`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias
for
#' the Arrow file format)
-#'
-#' `UnionDatasetFactory$create()` can be used to unify child `DatasetFactory`s
into
-#' a single `DatasetFactory`. Use it when (for example) your data is in
multiple
-#' file systems or formats.
-#' * `children`: child `DatasetFactory`s to be unified
-#'
#' @section Methods:
#'
#' A `Dataset` has the following methods:
@@ -197,22 +187,21 @@ DatasetFactory <- R6Class("DatasetFactory", inherit =
Object,
Inspect = function() shared_ptr(Schema,
dataset___DatasetFactory__Inspect(self))
)
)
-DatasetFactory$create <- function(path,
- children = NULL,
+DatasetFactory$create <- function(x,
filesystem = c("auto", "local"),
format = c("parquet", "arrow", "ipc"),
partitioning = NULL,
allow_non_existent = FALSE,
recursive = TRUE,
...) {
- if (!is.null(children)) {
- return(shared_ptr(DatasetFactory,
dataset___UnionDatasetFactory__Make(children)))
+ if (is.list(x) && all(map_lgl(x, ~inherits(., "DatasetFactory")))) {
+ return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(x)))
}
if (!inherits(filesystem, "FileSystem")) {
filesystem <- match.arg(filesystem)
if (filesystem == "auto") {
- # When there are other FileSystems supported, detect e.g. S3 from path
+ # When there are other FileSystems supported, detect e.g. S3 from x
filesystem <- "local"
}
filesystem <- list(
@@ -221,7 +210,7 @@ DatasetFactory$create <- function(path,
)[[filesystem]]$create(...)
}
selector <- FileSelector$create(
- path,
+ x,
allow_non_existent = allow_non_existent,
recursive = recursive
)
@@ -252,16 +241,16 @@ DatasetFactory$create <- function(path,
#'
#' If you would only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
-#' directly. Use `open_dataset_factory()` when you
+#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
-#' @param path A string file path containing data files
-#' @param children A list of `DatasetFactory` objects whose datasets should be
-#' unified. If this argument is specified it will be used to construct a
+#' @param x A string path containing data files, or
+#' a list of `DatasetFactory` objects whose datasets should be
+#' grouped. If this argument is specified it will be used to construct a
#' `UnionDatasetFactory` and other arguments will be ignored.
#' @param filesystem A string identifier for the filesystem corresponding to
-#' `path`. Currently only "local" is supported.
-#' @param format A string identifier of the format of the files in `path`.
+#' `x`. Currently only "local" is supported.
+#' @param format A string identifier of the format of the files in `x`.
#' Currently supported options are "parquet", "arrow", and "ipc" (an alias for
#' the Arrow file format)
#' @param partitioning One of
@@ -276,26 +265,16 @@ DatasetFactory$create <- function(path,
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
-#' @param allow_non_existent logical: is `path` allowed to not exist? Default
+#' @param allow_non_existent logical: is `x` allowed to not exist? Default
#' `FALSE`. See [FileSelector].
#' @param recursive logical: should files be discovered in subdirectories of
-#' `path`? Default `TRUE`.
+#' `x`? Default `TRUE`.
#' @param ... Additional arguments passed to the [FileSystem] `$create()`
method
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
-open_dataset_factory <- DatasetFactory$create
-
-#' @usage NULL
-#' @format NULL
-#' @rdname Dataset
-#' @export
-UnionDatasetFactory <- R6Class("UnionDatasetFactory", inherit = DatasetFactory)
-UnionDatasetFactory$create <- function(children) {
- assert_is_list_of(children, "DatasetFactory")
- shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(children))
-}
+dataset_factory <- DatasetFactory$create
#' @usage NULL
#' @format NULL
@@ -305,9 +284,9 @@ FileSystemDatasetFactory <-
R6Class("FileSystemDatasetFactory",
inherit = DatasetFactory
)
FileSystemDatasetFactory$create <- function(filesystem,
- selector,
- format,
- partitioning = NULL) {
+ selector,
+ format,
+ partitioning = NULL) {
assert_is(filesystem, "FileSystem")
assert_is(selector, "FileSelector")
assert_is(format, "FileFormat")
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 25b76f0..165892a 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -61,7 +61,7 @@ reference:
- title: Multi-file datasets
contents:
- open_dataset
- - open_dataset_factory
+ - dataset_factory
- hive_partition
- Dataset
- Partitioning
diff --git a/r/man/Dataset.Rd b/r/man/Dataset.Rd
index 81bca3b..0d44c6d 100644
--- a/r/man/Dataset.Rd
+++ b/r/man/Dataset.Rd
@@ -2,8 +2,9 @@
% Please edit documentation in R/dataset.R
\name{Dataset}
\alias{Dataset}
+\alias{FileSystemDataset}
+\alias{UnionDataset}
\alias{DatasetFactory}
-\alias{UnionDatasetFactory}
\alias{FileSystemDatasetFactory}
\title{Multi-file datasets}
\value{
@@ -16,11 +17,6 @@ can accelerate queries that only touch some partitions
(files).
A \code{Dataset} contains one or more \code{Fragments}, such as files, of
potentially
differing type and partitioning.
-\code{DatasetFactory} is used to create a \code{Dataset}, inspect the
\link{Schema} of the
-fragments contained in it, and declare a partitioning.
-\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for
-discovering files in the local file system, the only currently supported
-file system.
The \code{Dataset$create()} method instantiates a \code{Dataset} which wraps
child Datasets.
It takes the following arguments:
@@ -34,11 +30,29 @@ It takes the following arguments:
Start a new scan of the data
Return the Dataset's \code{Schema}
+
+Return the Dataset's type.
+
+Return the files contained in this \code{FileSystemDataset}
+
+Return the format of files in this \code{Dataset}
+
+Return the UnionDataset's child \code{Dataset}s
}
\section{Factory}{
-For the \code{DatasetFactory$create()} factory method, see
\code{\link[=open_dataset_factory]{open_dataset_factory()}}, an
-alias for it.
+\code{DatasetFactory} is used to create a \code{Dataset}, inspect the
\link{Schema} of the
+fragments contained in it, and declare a partitioning.
+\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for
+discovering files in the local file system, the only currently supported
+file system.
+
+For the \code{DatasetFactory$create()} factory method, see
\code{\link[=dataset_factory]{dataset_factory()}}, an
+alias for it. A \code{DatasetFactory} has:
+\itemize{
+\item \verb{$Inspect()}: Returns a common \link{Schema} for all data
discovered by the factory.
+\item \verb{$Finish(schema)}: Returns a \code{Dataset}
+}
\code{FileSystemDatasetFactory$create()} is a lower-level factory method and
takes the following arguments:
@@ -60,10 +74,15 @@ A \code{Dataset} has the following methods:
\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset
}
-A \code{DatasetFactory} has:
+\code{FileSystemDataset} has the following methods:
\itemize{
-\item \verb{$Inspect()}: Returns a common \link{Schema} for all data
discovered by the factory.
-\item \verb{$Finish(schema)}: Returns a \code{Dataset}
+\item \verb{$files}: Active binding, returns the files of the
\code{FileSystemDataset}
+\item \verb{$format}: Active binding, returns the \link{FileFormat} of the
\code{FileSystemDataset}
+}
+
+\code{UnionDataset} has the following methods:
+\itemize{
+\item \verb{$children}: Active binding, returns all child \code{Dataset}s.
}
}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index 80af81f..deb564e 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -9,6 +9,8 @@
A \code{FileFormat} holds information about how to read and parse the files
included in a \code{Dataset}. There are subclasses corresponding to the
supported
file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}).
+
+Return the \code{FileFormat}'s type
}
\section{Factory}{
@@ -18,6 +20,14 @@ file formats (\code{ParquetFileFormat} and
\code{IpcFileFormat}).
Currently supported options are "parquet", "arrow", and "ipc" (an alias for
the Arrow file format)
\item \code{...}: Additional format-specific options
+format="parquet":
+\itemize{
+\item \code{use_buffered_stream}: Read files through buffered input streams
rather than
+loading entire row groups at once. This may be enabled
+to reduce memory overhead. Disabled by default.
+\item \code{buffer_size}: Size of buffered stream, if enabled. Default is 8KB.
+\item \code{dict_columns}: Names of columns which should be read as
dictionaries.
+}
}
It returns the appropriate subclass of \code{FileFormat} (e.g.
\code{ParquetFileFormat})
diff --git a/r/man/Scanner.Rd b/r/man/Scanner.Rd
index f85ca75..a665c0b 100644
--- a/r/man/Scanner.Rd
+++ b/r/man/Scanner.Rd
@@ -19,6 +19,9 @@ by \code{cols}, a character vector of column names
\item \verb{$UseThreads(threads)}: logical: should the scan use multithreading?
The method's default input is \code{TRUE}, but you must call the method to
enable
multithreading because the scanner default is \code{FALSE}.
+\item \verb{$BatchSize(batch_size)}: integer: Maximum row count of scanned
record
+batches, default is 32K. If scanned record batches are overflowing memory
+then this method can be called to reduce their size.
\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset
\item \verb{$Finish()}: Returns a \code{Scanner}
}
diff --git a/r/man/open_dataset_factory.Rd b/r/man/dataset_factory.Rd
similarity index 80%
rename from r/man/open_dataset_factory.Rd
rename to r/man/dataset_factory.Rd
index afb1a20..877a952 100644
--- a/r/man/open_dataset_factory.Rd
+++ b/r/man/dataset_factory.Rd
@@ -1,12 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset.R
-\name{open_dataset_factory}
-\alias{open_dataset_factory}
+\name{dataset_factory}
+\alias{dataset_factory}
\title{Create a DatasetFactory}
\usage{
-open_dataset_factory(
- path,
- children = NULL,
+dataset_factory(
+ x,
filesystem = c("auto", "local"),
format = c("parquet", "arrow", "ipc"),
partitioning = NULL,
@@ -16,15 +15,15 @@ open_dataset_factory(
)
}
\arguments{
-\item{path}{A string file path containing data files}
-
-\item{children}{A list of \code{DatasetFactory} objects whose datasets should
be
-unified. If this argument is specified other arguments will be ignored.}
+\item{x}{A string path containing data files, or
+a list of \code{DatasetFactory} objects whose datasets should be
+grouped. If this argument is specified it will be used to construct a
+\code{UnionDatasetFactory} and other arguments will be ignored.}
\item{filesystem}{A string identifier for the filesystem corresponding to
-\code{path}. Currently only "local" is supported.}
+\code{x}. Currently only "local" is supported.}
-\item{format}{A string identifier of the format of the files in \code{path}.
+\item{format}{A string identifier of the format of the files in \code{x}.
Currently supported options are "parquet", "arrow", and "ipc" (an alias for
the Arrow file format)}
@@ -43,11 +42,11 @@ Hive-style path segments
\item \code{NULL} for no partitioning
}}
-\item{allow_non_existent}{logical: is \code{path} allowed to not exist? Default
+\item{allow_non_existent}{logical: is \code{x} allowed to not exist? Default
\code{FALSE}. See \link{FileSelector}.}
\item{recursive}{logical: should files be discovered in subdirectories of
-\code{path}? Default \code{TRUE}.}
+\code{x}? Default \code{TRUE}.}
\item{...}{Additional arguments passed to the \link{FileSystem}
\verb{$create()} method}
}
@@ -64,6 +63,6 @@ This function helps you construct a \code{DatasetFactory}
that you can pass to
\details{
If you would only have a single \code{DatasetFactory} (for example, you have a
single directory containing Parquet files), you can call \code{open_dataset()}
-directly. Use \code{open_dataset_factory()} when you
+directly. Use \code{dataset_factory()} when you
want to combine different directories, file systems, or file formats.
}
diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd
index c3105e5..e668ff5 100644
--- a/r/man/open_dataset.Rd
+++ b/r/man/open_dataset.Rd
@@ -8,7 +8,7 @@ open_dataset(sources, schema = NULL, partitioning =
hive_partition(), ...)
}
\arguments{
\item{sources}{Either a string path to a directory containing data files,
-or a list of \code{DatasetFactory} objects as created by
\code{\link[=open_dataset_factory]{open_dataset_factory()}}.}
+or a list of \code{DatasetFactory} objects as created by
\code{\link[=dataset_factory]{dataset_factory()}}.}
\item{schema}{\link{Schema} for the dataset. If \code{NULL} (the default), the
schema
will be inferred from the data sources.}
@@ -28,7 +28,7 @@ Hive-style path segments
\item \code{NULL} for no partitioning
}}
-\item{...}{additional arguments passed to \code{open_dataset_factory()} when
+\item{...}{additional arguments passed to \code{dataset_factory()} when
\code{sources} is a file path, otherwise ignored.}
}
\value{
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 103a7a1..b1ec3a4 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -19,15 +19,15 @@ context("Datasets")
library(dplyr)
-tempdir <- function() {
+make_temp_dir <- function() {
path <- tempfile()
dir.create(path)
normalizePath(path, winslash = "/")
}
-dataset_dir <- tempdir()
-hive_dir <- tempdir()
-ipc_dir <- tempdir()
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+ipc_dir <- make_temp_dir()
first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
df1 <- tibble(
@@ -165,9 +165,9 @@ test_that("IPC/Arrow format data", {
test_that("Dataset with multiple file formats", {
skip("https://issues.apache.org/jira/browse/ARROW-7653")
- ds <- open_dataset(children=list(
- open_dataset_factory(dataset_dir, format = "parquet", partitioning =
"part"),
- open_dataset_factory(ipc_dir, format = "arrow", partitioning = "part")
+ ds <- open_dataset(list(
+ dataset_factory(dataset_dir, format = "parquet", partitioning = "part"),
+ dataset_factory(ipc_dir, format = "arrow", partitioning = "part")
))
expect_identical(names(ds), c(names(df1), "part"))
expect_equivalent(
@@ -411,12 +411,12 @@ test_that("Assembling a Dataset manually and getting a
Table", {
})
test_that("Assembling multiple DatasetFactories with DatasetFactory", {
- factory1 <- open_dataset_factory(file.path(dataset_dir, 1), format =
"parquet")
+ factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet")
expect_is(factory1, "FileSystemDatasetFactory")
- factory2 <- open_dataset_factory(file.path(dataset_dir, 2), format =
"parquet")
+ factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet")
expect_is(factory2, "FileSystemDatasetFactory")
- factory <- DatasetFactory$create(children=list(factory1, factory2))
+ factory <- DatasetFactory$create(list(factory1, factory2))
expect_is(factory, "DatasetFactory")
schm <- factory$Inspect()