This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c6534a59a3 ARROW-16776: [R] dplyr::glimpse method for arrow table and
datasets (#13563)
c6534a59a3 is described below
commit c6534a59a38acd31856284bcdfa36ecea7d11479
Author: Neal Richardson <[email protected]>
AuthorDate: Tue Jul 12 15:48:16 2022 -0400
ARROW-16776: [R] dplyr::glimpse method for arrow table and datasets (#13563)
See reprex (sans terminal formatting) in
[r/tests/testthat/_snaps/dplyr-glimpse.md](https://github.com/apache/arrow/pull/13563/files#diff-e8d50da600908f077796a43b7600c17d34448671c7975bb8c4056a484ac2999e)
Not all queries can be glimpse()d: some would require evaluating the whole
query, which may be expensive (and can't be interrupted yet, see ARROW-11841).
Note that the existing `print()` methods aren't affected by this. There is
still the idea that the print methods for Table/RecordBatch should print some
data (ARROW-16777 and others), but that should probably be column-oriented
instead of row-oriented like glimpse().
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/DESCRIPTION | 3 +
r/NAMESPACE | 2 +
r/R/arrow-object.R | 6 +-
r/R/arrow-package.R | 3 +-
r/R/chunked-array.R | 3 +-
r/R/dplyr-count.R | 2 +-
r/R/dplyr-glimpse.R | 160 +++++++++++++++++++++++++++++++
r/R/dplyr.R | 47 ++++++++-
r/R/extension.R | 22 +----
r/R/filesystem.R | 1 -
r/R/query-engine.R | 4 +-
r/tests/testthat/_snaps/dplyr-glimpse.md | 152 +++++++++++++++++++++++++++++
r/tests/testthat/test-chunked-array.txt | 4 +
r/tests/testthat/test-data-type.R | 19 ++--
r/tests/testthat/test-dplyr-glimpse.R | 102 ++++++++++++++++++++
r/tests/testthat/test-dplyr-query.R | 140 +++++++++++++++++++++++++++
r/tests/testthat/test-extension.R | 2 +-
r/tests/testthat/test-schema.R | 11 +--
18 files changed, 637 insertions(+), 46 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 2cbbec054a..a7408d27d6 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -44,6 +44,7 @@ RoxygenNote: 7.2.0
Config/testthat/edition: 3
VignetteBuilder: knitr
Suggests:
+ cli,
DBI,
dbplyr,
decor,
@@ -53,6 +54,7 @@ Suggests:
hms,
knitr,
lubridate,
+ pillar,
pkgload,
reticulate,
rmarkdown,
@@ -103,6 +105,7 @@ Collate:
'dplyr-funcs-type.R'
'expression.R'
'dplyr-funcs.R'
+ 'dplyr-glimpse.R'
'dplyr-group-by.R'
'dplyr-join.R'
'dplyr-mutate.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 023e9bb831..86eb958471 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -453,6 +453,8 @@ importFrom(tidyselect,starts_with)
importFrom(tidyselect,vars_pull)
importFrom(tidyselect,vars_rename)
importFrom(tidyselect,vars_select)
+importFrom(utils,capture.output)
+importFrom(utils,getFromNamespace)
importFrom(utils,head)
importFrom(utils,install.packages)
importFrom(utils,modifyList)
diff --git a/r/R/arrow-object.R b/r/R/arrow-object.R
index 0a82f85877..ac067d4aa5 100644
--- a/r/R/arrow-object.R
+++ b/r/R/arrow-object.R
@@ -31,14 +31,16 @@ ArrowObject <- R6Class("ArrowObject",
}
assign(".:xp:.", xp, envir = self)
},
- print = function(...) {
+ class_title = function() {
if (!is.null(self$.class_title)) {
# Allow subclasses to override just printing the class name first
class_title <- self$.class_title()
} else {
class_title <- class(self)[[1]]
}
- cat(class_title, "\n", sep = "")
+ },
+ print = function(...) {
+ cat(self$class_title(), "\n", sep = "")
if (!is.null(self$ToString)) {
cat(self$ToString(), "\n", sep = "")
}
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 05270ef6bb..a2c37d0ce3 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -41,7 +41,7 @@
"group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
"arrange", "rename", "pull", "relocate", "compute", "collapse",
"distinct", "left_join", "right_join", "inner_join", "full_join",
- "semi_join", "anti_join", "count", "tally", "rename_with", "union",
"union_all"
+ "semi_join", "anti_join", "count", "tally", "rename_with", "union",
"union_all", "glimpse"
)
)
for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader",
"arrow_dplyr_query")) {
@@ -50,6 +50,7 @@
}
}
s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
+ s3_register("pillar::type_sum", "DataType")
for (cl in c(
"Array", "RecordBatch", "ChunkedArray", "Table", "Schema",
diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R
index c16f562017..dd1beb2afd 100644
--- a/r/R/chunked-array.R
+++ b/r/R/chunked-array.R
@@ -120,7 +120,8 @@ ChunkedArray <- R6Class("ChunkedArray",
ChunkedArray__Validate(self)
},
ToString = function() {
- ChunkedArray__ToString(self)
+ typ <- paste0("<", self$type$ToString(), ">")
+ paste(typ, ChunkedArray__ToString(self), sep = "\n")
},
Equals = function(other, ...) {
inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other)
diff --git a/r/R/dplyr-count.R b/r/R/dplyr-count.R
index 747212bc7b..50badb8459 100644
--- a/r/R/dplyr-count.R
+++ b/r/R/dplyr-count.R
@@ -39,7 +39,7 @@ count.Dataset <- count.ArrowTabular <-
count.RecordBatchReader <- count.arrow_dp
#' @importFrom rlang sym :=
tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) {
- check_name <- utils::getFromNamespace("check_name", "dplyr")
+ check_name <- getFromNamespace("check_name", "dplyr")
name <- check_name(name, dplyr::group_vars(x))
if (quo_is_null(enquo(wt))) {
diff --git a/r/R/dplyr-glimpse.R b/r/R/dplyr-glimpse.R
new file mode 100644
index 0000000000..8a70f4c5b7
--- /dev/null
+++ b/r/R/dplyr-glimpse.R
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom utils getFromNamespace
+glimpse.ArrowTabular <- function(x,
+ width = getOption("pillar.width",
getOption("width")),
+ ...) {
+ # This function is inspired by pillar:::glimpse.tbl(), with some adaptations
+
+ # We use cli:: and pillar:: throughout this function. We don't need to check
+ # to see if they're installed because dplyr depends on pillar, which depends
+ # on cli, and we're only in this function though S3 dispatch on
dplyr::glimpse
+ if (!is.finite(width)) {
+ abort("`width` must be finite.")
+ }
+
+ # We need a couple of internal functions in pillar for formatting
+ pretty_int <- getFromNamespace("big_mark", "pillar")
+ make_shorter <- getFromNamespace("str_trunc", "pillar")
+ tickify <- getFromNamespace("tick_if_needed", "pillar")
+
+ # Even though this is the ArrowTabular method, we use it for
arrow_dplyr_query
+ # so make some accommodations. (Others are handled by S3 method dispatch.)
+ if (inherits(x, "arrow_dplyr_query")) {
+ class_title <- paste(source_data(x)$class_title(), "(query)")
+ } else {
+ class_title <- x$class_title()
+ }
+ cli::cat_line(class_title)
+
+ dims <- dim(x)
+ cli::cat_line(sprintf(
+ "%s rows x %s columns", pretty_int(dims[1]), pretty_int(dims[2])
+ ))
+
+ if (dims[2] == 0) {
+ return(invisible(x))
+ }
+
+ nrows <- as.integer(width / 3)
+ head_tab <- dplyr::compute(head(x, nrows))
+ # Take the schema from this Table because if x is arrow_dplyr_query, some
+ # output types could be a best guess (in implicit_schema()).
+ schema <- head_tab$schema
+
+ # Assemble the column names and types
+ # We use the Arrow type names here. See type_sum.DataType() below.
+ var_types <- map_chr(schema$fields, ~
format(pillar::new_pillar_type(.$type)))
+ # glimpse.tbl() left-aligns the var names (pads with whitespace to the right)
+ # and appends the types next to them. Because those type names are
+ # aggressively truncated to all be roughly the same length, this means the
+ # data glimpse that follows is also mostly aligned.
+ # However, Arrow type names are longer and variable length, and we're only
+ # truncating the nested type information inside of <...>. So, to keep the
+ # data glimpses aligned, we "justify" align the name and type: add the
padding
+ # whitespace between them so that the total width is equal.
+ var_headings <- paste("$", center_pad(tickify(names(x)), var_types))
+
+ # Assemble the data glimpse
+ df <- as.data.frame(head_tab)
+ formatted_data <- map_chr(df, function(.) {
+ tryCatch(
+ paste(pillar::format_glimpse(.), collapse = ", "),
+ # This could error e.g. if you have a VctrsExtensionType and the package
+ # that defines methods for the data is not loaded
+ error = function(e) conditionMessage(e)
+ )
+ })
+ # Here and elsewhere in the glimpse code, you have to use
pillar::get_extent()
+ # instead of nchar() because get_extent knows how to deal with ANSI escapes
+ # etc.--it counts how much space on the terminal will be taken when printed.
+ data_width <- width - pillar::get_extent(var_headings)
+ truncated_data <- make_shorter(formatted_data, data_width)
+
+ # Print the table body (var name, type, data glimpse)
+ cli::cat_line(var_headings, " ", truncated_data)
+
+ # TODO: use crayon to style these footers?
+ if (inherits(x, "arrow_dplyr_query")) {
+ cli::cat_line("Call `print()` for query details")
+ } else if (any(grepl("<...>", var_types, fixed = TRUE)) ||
schema$HasMetadata) {
+ cli::cat_line("Call `print()` for full schema details")
+ }
+ invisible(x)
+}
+
+# Dataset has an efficient head() method via Scanner so this is fine
+glimpse.Dataset <- glimpse.ArrowTabular
+
+glimpse.arrow_dplyr_query <- function(x,
+ width = getOption("pillar.width",
getOption("width")),
+ ...) {
+ if (any(map_lgl(all_sources(x), ~ inherits(., "RecordBatchReader")))) {
+ msg <- paste(
+ "Cannot glimpse() data from a RecordBatchReader because it can only be",
+ "read one time. Call `compute()` to evaluate the query first."
+ )
+ message(msg)
+ print(x)
+ } else if (query_on_dataset(x) && !query_can_stream(x)) {
+ msg <- paste(
+ "This query requires a full table scan, so glimpse() may be",
+ "expensive. Call `compute()` to evaluate the query first."
+ )
+ message(msg)
+ print(x)
+ } else {
+ # Go for it
+ glimpse.ArrowTabular(x, width = width, ...)
+ }
+}
+
+glimpse.RecordBatchReader <- function(x,
+ width = getOption("pillar.width",
getOption("width")),
+ ...) {
+ # TODO(ARROW-17038): to_arrow() on duckdb con should hold con not RBR so it
+ # can be run more than once (like duckdb does on the other side)
+ msg <- paste(
+ "Cannot glimpse() data from a RecordBatchReader because it can only be",
+ "read one time; call `as_arrow_table()` to consume it first."
+ )
+ message(msg)
+ print(x)
+}
+
+glimpse.ArrowDatum <- function(x, width, ...) {
+ cli::cat_line(gsub("[ \n]+", " ", x$ToString()))
+ invisible(x)
+}
+
+type_sum.DataType <- function(x) {
+ if (inherits(x, "VctrsExtensionType")) {
+ # ptype() holds a vctrs type object, which pillar knows how to format
+ paste0("ext<", pillar::type_sum(x$ptype()), ">")
+ } else {
+ # Trim long type names with <...>
+ sub("<.*>", "<...>", x$ToString())
+ }
+}
+
+center_pad <- function(left, right) {
+ left_sizes <- pillar::get_extent(left)
+ right_sizes <- pillar::get_extent(right)
+ total_width <- max(left_sizes + right_sizes) + 1L
+ paste0(left, strrep(" ", total_width - left_sizes - right_sizes), right)
+}
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 8018cb5a60..b048d98018 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -264,7 +264,9 @@ abandon_ship <- function(call, .data, msg) {
eval.parent(call, 2)
}
-query_on_dataset <- function(x) inherits(source_data(x), c("Dataset",
"RecordBatchReader"))
+query_on_dataset <- function(x) {
+ any(map_lgl(all_sources(x), ~ inherits(., c("Dataset",
"RecordBatchReader"))))
+}
source_data <- function(x) {
if (!inherits(x, "arrow_dplyr_query")) {
@@ -276,13 +278,48 @@ source_data <- function(x) {
}
}
-is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+all_sources <- function(x) {
+ if (is.null(x)) {
+ x
+ } else if (!inherits(x, "arrow_dplyr_query")) {
+ list(x)
+ } else {
+ c(
+ all_sources(x$.data),
+ all_sources(x$join$right_data),
+ all_sources(x$union_all$right_data)
+ )
+ }
+}
-has_aggregation <- function(x) {
- # TODO: update with joins (check right side data too)
- !is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data))
+query_can_stream <- function(x) {
+ # Queries that just select/filter/mutate can stream:
+ # you can take head() without evaluating over the whole dataset
+ if (inherits(x, "arrow_dplyr_query")) {
+ # Aggregations require all of the data
+ is.null(x$aggregations) &&
+ # Sorting does too
+ length(x$arrange_vars) == 0 &&
+ # Joins are ok as long as the right-side data is in memory
+ # (we have to hash the whole dataset to join it)
+ !query_on_dataset(x$join$right_data) &&
+ # But need to check that this non-dataset join can stream
+ query_can_stream(x$join$right_data) &&
+ # Also check that any unioned datasets also can stream
+ query_can_stream(x$union_all$right_data) &&
+ # Recursively check any queries that have been collapsed
+ query_can_stream(x$.data)
+ } else {
+ # Not a query, so it must be a Table/Dataset (or NULL)
+ # Note that if you have a RecordBatchReader, you *can* stream,
+ # but the reader is consumed. If that's a problem, you should check
+ # for RBRs outside of this function.
+ TRUE
+ }
}
+is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+
has_head_tail <- function(x) {
!is.null(x$head) || !is.null(x$tail) || (is_collapsed(x) &&
has_head_tail(x$.data))
}
diff --git a/r/R/extension.R b/r/R/extension.R
index e31f4934a7..be492c845f 100644
--- a/r/R/extension.R
+++ b/r/R/extension.R
@@ -193,7 +193,7 @@ ExtensionType <- R6Class("ExtensionType",
sprintf(
"<%s %s...>",
class(self)[1],
- paste(format(utils::head(metadata_raw, 20)), collapse = " ")
+ paste(format(head(metadata_raw, 20)), collapse = " ")
)
} else {
sprintf(
@@ -420,31 +420,19 @@ unregister_extension_type <- function(extension_name) {
arrow__UnregisterRExtensionType(extension_name)
}
+#' @importFrom utils capture.output
VctrsExtensionType <- R6Class("VctrsExtensionType",
inherit = ExtensionType,
public = list(
- ptype = function() {
- private$.ptype
- },
+ ptype = function() private$.ptype,
ToString = function() {
- tf <- tempfile()
- sink(tf)
- on.exit({
- sink(NULL)
- unlink(tf)
- })
- print(self$ptype())
- paste0(readLines(tf), collapse = "\n")
+ paste0(capture.output(print(self$ptype())), collapse = "\n")
},
deserialize_instance = function() {
private$.ptype <- unserialize(self$extension_metadata())
},
ExtensionEquals = function(other) {
- if (!inherits(other, "VctrsExtensionType")) {
- return(FALSE)
- }
-
- identical(self$ptype(), other$ptype())
+ inherits(other, "VctrsExtensionType") && identical(self$ptype(),
other$ptype())
},
as_vector = function(extension_array) {
if (inherits(extension_array, "ChunkedArray")) {
diff --git a/r/R/filesystem.R b/r/R/filesystem.R
index 75997431a4..3cebbc30c8 100644
--- a/r/R/filesystem.R
+++ b/r/R/filesystem.R
@@ -451,7 +451,6 @@ s3_bucket <- function(bucket, ...) {
#' @usage NULL
#' @format NULL
#' @rdname FileSystem
-#' @importFrom utils modifyList
#' @export
GcsFileSystem <- R6Class("GcsFileSystem",
inherit = FileSystem
diff --git a/r/R/query-engine.R b/r/R/query-engine.R
index 513b861d41..511bf3dbc2 100644
--- a/r/R/query-engine.R
+++ b/r/R/query-engine.R
@@ -226,9 +226,9 @@ ExecPlan <- R6Class("ExecPlan",
slice_size <- node$extras$head %||% node$extras$tail
if (!is.null(slice_size)) {
out <- head(out, slice_size)
+ # We already have everything we need for the head, so StopProducing
+ self$Stop()
}
- # Can we now tell `self$Stop()` to StopProducing? We already have
- # everything we need for the head (but it seems to segfault:
ARROW-14329)
} else if (!is.null(node$extras$tail)) {
# TODO(ARROW-16630): proper BottomK support
# Reverse the row order to get back what we expect
diff --git a/r/tests/testthat/_snaps/dplyr-glimpse.md
b/r/tests/testthat/_snaps/dplyr-glimpse.md
new file mode 100644
index 0000000000..6daca0850d
--- /dev/null
+++ b/r/tests/testthat/_snaps/dplyr-glimpse.md
@@ -0,0 +1,152 @@
+# glimpse() Table/ChunkedArray
+
+ Code
+ glimpse(tab)
+ Output
+ Table
+ 10 rows x 7 columns
+ $ int <int32> 1, 2, 3, NA, 5, 6, 7, 8, 9, 10
+ $ dbl <double> 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, NA, 10.1
+ $ dbl2 <double> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+ $ lgl <bool> TRUE, NA, TRUE, FALSE, TRUE, NA, NA, FALSE,
FALSE, NA
+ $ false <bool> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, ~
+ $ chr <string> "a", "b", "c", "d", "e", NA, "g", "h", "i", "j"
+ $ fct <dictionary<...>> a, b, c, d, NA, NA, g, h, i, j
+ Call `print()` for full schema details
+
+---
+
+ Code
+ glimpse(tab$chr)
+ Output
+ <string> [ [ "a", "b", "c", "d", "e", null, "g", "h", "i", "j" ] ]
+
+# glimpse() RecordBatch/Array
+
+ Code
+ glimpse(batch)
+ Output
+ RecordBatch
+ 10 rows x 7 columns
+ $ int <int32> 1, 2, 3, NA, 5, 6, 7, 8, 9, 10
+ $ dbl <double> 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, NA, 10.1
+ $ dbl2 <double> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+ $ lgl <bool> TRUE, NA, TRUE, FALSE, TRUE, NA, NA, FALSE,
FALSE, NA
+ $ false <bool> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, ~
+ $ chr <string> "a", "b", "c", "d", "e", NA, "g", "h", "i", "j"
+ $ fct <dictionary<...>> a, b, c, d, NA, NA, g, h, i, j
+ Call `print()` for full schema details
+
+---
+
+ Code
+ glimpse(batch$int)
+ Output
+ <int32> [ 1, 2, 3, null, 5, 6, 7, 8, 9, 10 ]
+
+# glimpse() with VctrsExtensionType
+
+ Code
+ glimpse(haven)
+ Output
+ Table
+ 2 rows x 3 columns
+ $ num <double> 5.1, 4.9
+ $ cat_int <ext<hvn_lbll>> 3, 1
+ $ cat_chr <ext<hvn_lbll>> Can't convert `x` <haven_labelled> to
<character>.
+ Call `print()` for full schema details
+
+---
+
+ Code
+ glimpse(haven[[3]])
+ Output
+ <<haven_labelled[0]>> [ [ "B", "B" ] ]
+
+# glimpse prints message about schema if there are complex types
+
+ Code
+ glimpse(dictionary_but_no_metadata)
+ Output
+ Table
+ 5 rows x 2 columns
+ $ a <int32> 1, 2, 3, 4, 5
+ $ b <dictionary<...>> 1, 2, 3, 4, 5
+ Call `print()` for full schema details
+
+---
+
+ Code
+ glimpse(Table$create(a = 1))
+ Output
+ Table
+ 1 rows x 1 columns
+ $ a <double> 1
+
+# glimpse() calls print() instead of showing data for RBR
+
+ Code
+ example_data %>% as_record_batch_reader() %>% glimpse()
+ Message <simpleMessage>
+ Cannot glimpse() data from a RecordBatchReader because it can only be
read one time; call `as_arrow_table()` to consume it first.
+ Output
+ RecordBatchReader
+ int: int32
+ dbl: double
+ dbl2: double
+ lgl: bool
+ false: bool
+ chr: string
+ fct: dictionary<values=string, indices=int8>
+
+---
+
+ Code
+ example_data %>% as_record_batch_reader() %>% select(int) %>% glimpse()
+ Message <simpleMessage>
+ Cannot glimpse() data from a RecordBatchReader because it can only be
read one time. Call `compute()` to evaluate the query first.
+ Output
+ RecordBatchReader (query)
+ int: int32
+
+ See $.data for the source Arrow object
+
+# glimpse() on Dataset
+
+ Code
+ glimpse(ds)
+ Output
+ FileSystemDataset with 2 Parquet files
+ 20 rows x 7 columns
+ $ int <int32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 101, 102,
103, 104, ~
+ $ dbl <double> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 51, 52, 53,
54, 55, ~
+ $ lgl <bool> TRUE, FALSE, NA, TRUE, FALSE, TRUE, FALSE,
NA, TRUE~
+ $ chr <string> "a", "b", "c", "d", "e", "f", "g", "h",
"i", "j", "~
+ $ fct <dictionary<...>> A, B, C, D, E, F, G, H, I, J, J, I, H, G,
F, E, D, ~
+ $ ts <timestamp[us, tz=UTC]> 2015-04-30 03:12:39, 2015-05-01 03:12:39,
2015-05-0~
+ $ group <int32> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, ~
+ Call `print()` for full schema details
+
+# glimpse() on Dataset query only shows data for streaming eval
+
+ Code
+ ds %>% summarize(max(int)) %>% glimpse()
+ Message <simpleMessage>
+ This query requires a full table scan, so glimpse() may be expensive.
Call `compute()` to evaluate the query first.
+ Output
+ FileSystemDataset (query)
+ max(int): int32
+
+ See $.data for the source Arrow object
+
+# glimpse() on in-memory query shows data even if aggregating
+
+ Code
+ example_data %>% arrow_table() %>% summarize(sum(int, na.rm = TRUE)) %>%
+ glimpse()
+ Output
+ Table (query)
+ ?? rows x 1 columns
+ $ `sum(int, na.rm = TRUE)` <int64> 51
+ Call `print()` for query details
+
diff --git a/r/tests/testthat/test-chunked-array.txt
b/r/tests/testthat/test-chunked-array.txt
index c7101359d7..e2a691a99e 100644
--- a/r/tests/testthat/test-chunked-array.txt
+++ b/r/tests/testthat/test-chunked-array.txt
@@ -1,5 +1,6 @@
> chunked_array(c(1, 2, 3), c(4, 5, 6))
ChunkedArray
+<double>
[
[
1,
@@ -15,6 +16,7 @@ ChunkedArray
> chunked_array(1:30, c(4, 5, 6))
ChunkedArray
+<int32>
[
[
1,
@@ -48,6 +50,7 @@ ChunkedArray
> chunked_array(1:30)
ChunkedArray
+<int32>
[
[
1,
@@ -76,6 +79,7 @@ ChunkedArray
> chunked_array(factor(c("a", "b")), factor(c("c", "d")))
ChunkedArray
+<dictionary<values=string, indices=int8>>
[
-- dictionary:
diff --git a/r/tests/testthat/test-data-type.R
b/r/tests/testthat/test-data-type.R
index 88333fb314..16fcf8e0a3 100644
--- a/r/tests/testthat/test-data-type.R
+++ b/r/tests/testthat/test-data-type.R
@@ -593,15 +593,16 @@ test_that("DataType$code()", {
expect_code_roundtrip(dictionary(index_type = int8(), value_type =
large_utf8()))
expect_code_roundtrip(dictionary(index_type = int8(), ordered = TRUE))
- skip("until rlang 1.0")
- expect_snapshot({
- (expect_error(
- DayTimeInterval__initialize()$code()
- ))
- (expect_error(
- struct(a = DayTimeInterval__initialize())$code()
- ))
- })
+ skip_if(packageVersion("rlang") < 1)
+ # Are these unsupported for a reason?
+ expect_error(
+ eval(DayTimeInterval__initialize()$code()),
+ "Unsupported type"
+ )
+ expect_error(
+ eval(struct(a = DayTimeInterval__initialize())$code()),
+ "Unsupported type"
+ )
})
test_that("as_data_type() works for DataType", {
diff --git a/r/tests/testthat/test-dplyr-glimpse.R
b/r/tests/testthat/test-dplyr-glimpse.R
new file mode 100644
index 0000000000..d39fef9e82
--- /dev/null
+++ b/r/tests/testthat/test-dplyr-glimpse.R
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(dplyr, warn.conflicts = FALSE)
+
+test_that("glimpse() Table/ChunkedArray", {
+ tab <- Table$create(example_data)
+ expect_snapshot(glimpse(tab))
+ expect_snapshot(glimpse(tab$chr))
+})
+
+test_that("glimpse() RecordBatch/Array", {
+ batch <- RecordBatch$create(example_data)
+ expect_snapshot(glimpse(batch))
+ expect_snapshot(glimpse(batch$int))
+})
+
+test_that("glimpse() with VctrsExtensionType", {
+ haven <- Table$create(haven_data)
+ expect_snapshot(glimpse(haven))
+ expect_snapshot(glimpse(haven[[3]]))
+})
+
+test_that("glimpse prints message about schema if there are complex types", {
+ dictionary_but_no_metadata <- Table$create(a = 1:5, b = factor(1:5))
+ expect_snapshot(glimpse(dictionary_but_no_metadata))
+ # No message here
+ expect_snapshot(glimpse(Table$create(a = 1)))
+})
+
+test_that("glimpse() calls print() instead of showing data for RBR", {
+ expect_snapshot({
+ example_data %>%
+ as_record_batch_reader() %>%
+ glimpse()
+ })
+ expect_snapshot({
+ example_data %>%
+ as_record_batch_reader() %>%
+ select(int) %>%
+ glimpse()
+ })
+})
+
+skip_if_not_available("dataset")
+big_df <- rbind(
+ cbind(df1, group = 1),
+ cbind(df2, group = 2)
+)
+ds_dir <- make_temp_dir()
+write_dataset(big_df, ds_dir, partitioning = "group")
+
+ds <- open_dataset(ds_dir)
+
+test_that("glimpse() on Dataset", {
+ expect_snapshot(glimpse(ds))
+})
+
+test_that("glimpse() on Dataset query only shows data for streaming eval", {
+ # Because dataset scan row order is not deterministic, we can't snapshot
+ # the whole output. Instead check for an indication that glimpse method ran
+ # instead of the regular print() method that is the fallback
+ expect_output(
+ ds %>%
+ select(int, chr) %>%
+ filter(int > 2) %>%
+ mutate(twice = int * 2) %>%
+ glimpse(),
+ "Call `print()` for query details",
+ fixed = TRUE
+ )
+
+ # This doesn't show the data and falls back to print()
+ expect_snapshot({
+ ds %>%
+ summarize(max(int)) %>%
+ glimpse()
+ })
+})
+
+test_that("glimpse() on in-memory query shows data even if aggregating", {
+ expect_snapshot({
+ example_data %>%
+ arrow_table() %>%
+ summarize(sum(int, na.rm = TRUE)) %>%
+ glimpse()
+ })
+})
diff --git a/r/tests/testthat/test-dplyr-query.R
b/r/tests/testthat/test-dplyr-query.R
index d55ed07cfc..62633eb1d6 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -293,3 +293,143 @@ test_that("No duplicate field names are allowed in an
arrow_dplyr_query", {
)
)
})
+
+test_that("all_sources() finds all data sources in a query", {
+ tab <- Table$create(a = 1)
+ ds <- InMemoryDataset$create(tab)
+ expect_equal(all_sources(tab), list(tab))
+ expect_equal(
+ tab %>%
+ filter(a > 0) %>%
+ summarize(a = sum(a)) %>%
+ arrange(desc(a)) %>%
+ all_sources(),
+ list(tab)
+ )
+ expect_equal(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(ds) %>%
+ all_sources(),
+ list(tab, ds)
+ )
+
+ expect_equal(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(ds) %>%
+ left_join(tab) %>%
+ all_sources(),
+ list(tab, ds, tab)
+ )
+ expect_equal(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(ds, tab)) %>%
+ left_join(tab) %>%
+ all_sources(),
+ list(tab, ds, tab, tab)
+ )
+})
+
+test_that("query_on_dataset() looks at all data sources in a query", {
+ tab <- Table$create(a = 1)
+ ds <- InMemoryDataset$create(tab)
+ expect_false(query_on_dataset(tab))
+ expect_true(query_on_dataset(ds))
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ summarize(a = sum(a)) %>%
+ arrange(desc(a)) %>%
+ query_on_dataset()
+ )
+ expect_true(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(ds) %>%
+ query_on_dataset()
+ )
+
+ expect_true(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(ds, tab)) %>%
+ left_join(tab) %>%
+ query_on_dataset()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(tab, tab)) %>%
+ left_join(tab) %>%
+ query_on_dataset()
+ )
+})
+
+test_that("query_can_stream()", {
+ tab <- Table$create(a = 1)
+ ds <- InMemoryDataset$create(tab)
+ expect_true(query_can_stream(tab))
+ expect_true(query_can_stream(ds))
+ expect_true(query_can_stream(NULL))
+ expect_true(
+ ds %>%
+ filter(a > 0) %>%
+ query_can_stream()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ arrange(desc(a)) %>%
+ query_can_stream()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ summarize(a = sum(a)) %>%
+ query_can_stream()
+ )
+ expect_true(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(ds) %>%
+ query_can_stream()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(summarize(ds, a = sum(a))) %>%
+ query_can_stream()
+ )
+
+ expect_true(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(ds, tab)) %>%
+ left_join(tab) %>%
+ query_can_stream()
+ )
+ expect_true(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(tab, tab)) %>%
+ left_join(tab) %>%
+ query_can_stream()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ union_all(left_join(tab, tab)) %>%
+ left_join(ds) %>%
+ query_can_stream()
+ )
+ expect_false(
+ tab %>%
+ filter(a > 0) %>%
+ arrange(a) %>%
+ union_all(left_join(tab, tab)) %>%
+ left_join(tab) %>%
+ query_can_stream()
+ )
+})
diff --git a/r/tests/testthat/test-extension.R
b/r/tests/testthat/test-extension.R
index a4a37c9127..638869dc8c 100644
--- a/r/tests/testthat/test-extension.R
+++ b/r/tests/testthat/test-extension.R
@@ -183,7 +183,7 @@ test_that("vctrs extension type works", {
expect_r6_class(type, "VctrsExtensionType")
expect_identical(type$ptype(), vctrs::vec_ptype(custom_vctr))
expect_true(type$Equals(type))
- expect_match(type$ToString(), "arrow_custom_test")
+ expect_identical(type$ToString(), "<arrow_custom_test[0]>")
array_in <- vctrs_extension_array(custom_vctr)
expect_true(array_in$type$Equals(type))
diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R
index c7046de3cb..3a35569f7f 100644
--- a/r/tests/testthat/test-schema.R
+++ b/r/tests/testthat/test-schema.R
@@ -43,12 +43,11 @@ test_that("Schema$code()", {
schema(a = int32(), b = struct(c = double(), d = utf8()), e =
list_of(binary()))
)
- skip("until rlang 1.0")
- expect_snapshot({
- (expect_error(
- schema(x = int32(), y = DayTimeInterval__initialize())$code()
- ))
- })
+ skip_if(packageVersion("rlang") < 1)
+ expect_error(
+ eval(schema(x = int32(), y = DayTimeInterval__initialize())$code()),
+ "Unsupported type"
+ )
})
test_that("Schema with non-nullable fields", {