[arrow] branch master updated: ARROW-16776: [R] dplyr::glimpse method for arrow table and datasets (#13563)

npr Tue, 12 Jul 2022 12:48:33 -0700

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new c6534a59a3 ARROW-16776: [R] dplyr::glimpse method for arrow table and 
datasets (#13563)
c6534a59a3 is described below

commit c6534a59a38acd31856284bcdfa36ecea7d11479
Author: Neal Richardson <[email protected]>
AuthorDate: Tue Jul 12 15:48:16 2022 -0400

    ARROW-16776: [R] dplyr::glimpse method for arrow table and datasets (#13563)
    
    See reprex (sans terminal formatting) in 
[r/tests/testthat/_snaps/dplyr-glimpse.md](https://github.com/apache/arrow/pull/13563/files#diff-e8d50da600908f077796a43b7600c17d34448671c7975bb8c4056a484ac2999e)
    
    Not all queries can be glimpse()d: some would require evaluating the whole 
query, which may be expensive (and can't be interrupted yet, see ARROW-11841).
    
    Note that the existing `print()` methods aren't affected by this. There is 
still the idea that the print methods for Table/RecordBatch should print some 
data (ARROW-16777 and others), but that should probably be column-oriented 
instead of row-oriented like glimpse().
    
    Authored-by: Neal Richardson <[email protected]>
    Signed-off-by: Neal Richardson <[email protected]>
---
 r/DESCRIPTION                            |   3 +
 r/NAMESPACE                              |   2 +
 r/R/arrow-object.R                       |   6 +-
 r/R/arrow-package.R                      |   3 +-
 r/R/chunked-array.R                      |   3 +-
 r/R/dplyr-count.R                        |   2 +-
 r/R/dplyr-glimpse.R                      | 160 +++++++++++++++++++++++++++++++
 r/R/dplyr.R                              |  47 ++++++++-
 r/R/extension.R                          |  22 +----
 r/R/filesystem.R                         |   1 -
 r/R/query-engine.R                       |   4 +-
 r/tests/testthat/_snaps/dplyr-glimpse.md | 152 +++++++++++++++++++++++++++++
 r/tests/testthat/test-chunked-array.txt  |   4 +
 r/tests/testthat/test-data-type.R        |  19 ++--
 r/tests/testthat/test-dplyr-glimpse.R    | 102 ++++++++++++++++++++
 r/tests/testthat/test-dplyr-query.R      | 140 +++++++++++++++++++++++++++
 r/tests/testthat/test-extension.R        |   2 +-
 r/tests/testthat/test-schema.R           |  11 +--
 18 files changed, 637 insertions(+), 46 deletions(-)

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 2cbbec054a..a7408d27d6 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -44,6 +44,7 @@ RoxygenNote: 7.2.0
 Config/testthat/edition: 3
 VignetteBuilder: knitr
 Suggests:
+    cli,
     DBI,
     dbplyr,
     decor,
@@ -53,6 +54,7 @@ Suggests:
     hms,
     knitr,
     lubridate,
+    pillar,
     pkgload,
     reticulate,
     rmarkdown,
@@ -103,6 +105,7 @@ Collate:
     'dplyr-funcs-type.R'
     'expression.R'
     'dplyr-funcs.R'
+    'dplyr-glimpse.R'
     'dplyr-group-by.R'
     'dplyr-join.R'
     'dplyr-mutate.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 023e9bb831..86eb958471 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -453,6 +453,8 @@ importFrom(tidyselect,starts_with)
 importFrom(tidyselect,vars_pull)
 importFrom(tidyselect,vars_rename)
 importFrom(tidyselect,vars_select)
+importFrom(utils,capture.output)
+importFrom(utils,getFromNamespace)
 importFrom(utils,head)
 importFrom(utils,install.packages)
 importFrom(utils,modifyList)
diff --git a/r/R/arrow-object.R b/r/R/arrow-object.R
index 0a82f85877..ac067d4aa5 100644
--- a/r/R/arrow-object.R
+++ b/r/R/arrow-object.R
@@ -31,14 +31,16 @@ ArrowObject <- R6Class("ArrowObject",
       }
       assign(".:xp:.", xp, envir = self)
     },
-    print = function(...) {
+    class_title = function() {
       if (!is.null(self$.class_title)) {
         # Allow subclasses to override just printing the class name first
         class_title <- self$.class_title()
       } else {
         class_title <- class(self)[[1]]
       }
-      cat(class_title, "\n", sep = "")
+    },
+    print = function(...) {
+      cat(self$class_title(), "\n", sep = "")
       if (!is.null(self$ToString)) {
         cat(self$ToString(), "\n", sep = "")
       }
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 05270ef6bb..a2c37d0ce3 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -41,7 +41,7 @@
       "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
       "arrange", "rename", "pull", "relocate", "compute", "collapse",
       "distinct", "left_join", "right_join", "inner_join", "full_join",
-      "semi_join", "anti_join", "count", "tally", "rename_with", "union", 
"union_all"
+      "semi_join", "anti_join", "count", "tally", "rename_with", "union", 
"union_all", "glimpse"
     )
   )
   for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", 
"arrow_dplyr_query")) {
@@ -50,6 +50,7 @@
     }
   }
   s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
+  s3_register("pillar::type_sum", "DataType")
 
   for (cl in c(
     "Array", "RecordBatch", "ChunkedArray", "Table", "Schema",
diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R
index c16f562017..dd1beb2afd 100644
--- a/r/R/chunked-array.R
+++ b/r/R/chunked-array.R
@@ -120,7 +120,8 @@ ChunkedArray <- R6Class("ChunkedArray",
       ChunkedArray__Validate(self)
     },
     ToString = function() {
-      ChunkedArray__ToString(self)
+      typ <- paste0("<", self$type$ToString(), ">")
+      paste(typ, ChunkedArray__ToString(self), sep = "\n")
     },
     Equals = function(other, ...) {
       inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other)
diff --git a/r/R/dplyr-count.R b/r/R/dplyr-count.R
index 747212bc7b..50badb8459 100644
--- a/r/R/dplyr-count.R
+++ b/r/R/dplyr-count.R
@@ -39,7 +39,7 @@ count.Dataset <- count.ArrowTabular <- 
count.RecordBatchReader <- count.arrow_dp
 
 #' @importFrom rlang sym :=
 tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) {
-  check_name <- utils::getFromNamespace("check_name", "dplyr")
+  check_name <- getFromNamespace("check_name", "dplyr")
   name <- check_name(name, dplyr::group_vars(x))
 
   if (quo_is_null(enquo(wt))) {
diff --git a/r/R/dplyr-glimpse.R b/r/R/dplyr-glimpse.R
new file mode 100644
index 0000000000..8a70f4c5b7
--- /dev/null
+++ b/r/R/dplyr-glimpse.R
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom utils getFromNamespace
+glimpse.ArrowTabular <- function(x,
+                                 width = getOption("pillar.width", 
getOption("width")),
+                                 ...) {
+  # This function is inspired by pillar:::glimpse.tbl(), with some adaptations
+
+  # We use cli:: and pillar:: throughout this function. We don't need to check
+  # to see if they're installed because dplyr depends on pillar, which depends
+  # on cli, and we're only in this function though S3 dispatch on 
dplyr::glimpse
+  if (!is.finite(width)) {
+    abort("`width` must be finite.")
+  }
+
+  # We need a couple of internal functions in pillar for formatting
+  pretty_int <- getFromNamespace("big_mark", "pillar")
+  make_shorter <- getFromNamespace("str_trunc", "pillar")
+  tickify <- getFromNamespace("tick_if_needed", "pillar")
+
+  # Even though this is the ArrowTabular method, we use it for 
arrow_dplyr_query
+  # so make some accommodations. (Others are handled by S3 method dispatch.)
+  if (inherits(x, "arrow_dplyr_query")) {
+    class_title <- paste(source_data(x)$class_title(), "(query)")
+  } else {
+    class_title <- x$class_title()
+  }
+  cli::cat_line(class_title)
+
+  dims <- dim(x)
+  cli::cat_line(sprintf(
+    "%s rows x %s columns", pretty_int(dims[1]), pretty_int(dims[2])
+  ))
+
+  if (dims[2] == 0) {
+    return(invisible(x))
+  }
+
+  nrows <- as.integer(width / 3)
+  head_tab <- dplyr::compute(head(x, nrows))
+  # Take the schema from this Table because if x is arrow_dplyr_query, some
+  # output types could be a best guess (in implicit_schema()).
+  schema <- head_tab$schema
+
+  # Assemble the column names and types
+  # We use the Arrow type names here. See type_sum.DataType() below.
+  var_types <- map_chr(schema$fields, ~ 
format(pillar::new_pillar_type(.$type)))
+  # glimpse.tbl() left-aligns the var names (pads with whitespace to the right)
+  # and appends the types next to them. Because those type names are
+  # aggressively truncated to all be roughly the same length, this means the
+  # data glimpse that follows is also mostly aligned.
+  # However, Arrow type names are longer and variable length, and we're only
+  # truncating the nested type information inside of <...>. So, to keep the
+  # data glimpses aligned, we "justify" align the name and type: add the 
padding
+  # whitespace between them so that the total width is equal.
+  var_headings <- paste("$", center_pad(tickify(names(x)), var_types))
+
+  # Assemble the data glimpse
+  df <- as.data.frame(head_tab)
+  formatted_data <- map_chr(df, function(.) {
+    tryCatch(
+      paste(pillar::format_glimpse(.), collapse = ", "),
+      # This could error e.g. if you have a VctrsExtensionType and the package
+      # that defines methods for the data is not loaded
+      error = function(e) conditionMessage(e)
+    )
+  })
+  # Here and elsewhere in the glimpse code, you have to use 
pillar::get_extent()
+  # instead of nchar() because get_extent knows how to deal with ANSI escapes
+  # etc.--it counts how much space on the terminal will be taken when printed.
+  data_width <- width - pillar::get_extent(var_headings)
+  truncated_data <- make_shorter(formatted_data, data_width)
+
+  # Print the table body (var name, type, data glimpse)
+  cli::cat_line(var_headings, " ", truncated_data)
+
+  # TODO: use crayon to style these footers?
+  if (inherits(x, "arrow_dplyr_query")) {
+    cli::cat_line("Call `print()` for query details")
+  } else if (any(grepl("<...>", var_types, fixed = TRUE)) || 
schema$HasMetadata) {
+    cli::cat_line("Call `print()` for full schema details")
+  }
+  invisible(x)
+}
+
+# Dataset has an efficient head() method via Scanner so this is fine
+glimpse.Dataset <- glimpse.ArrowTabular
+
+glimpse.arrow_dplyr_query <- function(x,
+                                      width = getOption("pillar.width", 
getOption("width")),
+                                      ...) {
+  if (any(map_lgl(all_sources(x), ~ inherits(., "RecordBatchReader")))) {
+    msg <- paste(
+      "Cannot glimpse() data from a RecordBatchReader because it can only be",
+      "read one time. Call `compute()` to evaluate the query first."
+    )
+    message(msg)
+    print(x)
+  } else if (query_on_dataset(x) && !query_can_stream(x)) {
+    msg <- paste(
+      "This query requires a full table scan, so glimpse() may be",
+      "expensive. Call `compute()` to evaluate the query first."
+    )
+    message(msg)
+    print(x)
+  } else {
+    # Go for it
+    glimpse.ArrowTabular(x, width = width, ...)
+  }
+}
+
+glimpse.RecordBatchReader <- function(x,
+                                      width = getOption("pillar.width", 
getOption("width")),
+                                      ...) {
+  # TODO(ARROW-17038): to_arrow() on duckdb con should hold con not RBR so it
+  # can be run more than once (like duckdb does on the other side)
+  msg <- paste(
+    "Cannot glimpse() data from a RecordBatchReader because it can only be",
+    "read one time; call `as_arrow_table()` to consume it first."
+  )
+  message(msg)
+  print(x)
+}
+
+glimpse.ArrowDatum <- function(x, width, ...) {
+  cli::cat_line(gsub("[ \n]+", " ", x$ToString()))
+  invisible(x)
+}
+
+type_sum.DataType <- function(x) {
+  if (inherits(x, "VctrsExtensionType")) {
+    # ptype() holds a vctrs type object, which pillar knows how to format
+    paste0("ext<", pillar::type_sum(x$ptype()), ">")
+  } else {
+    # Trim long type names with <...>
+    sub("<.*>", "<...>", x$ToString())
+  }
+}
+
+center_pad <- function(left, right) {
+  left_sizes <- pillar::get_extent(left)
+  right_sizes <- pillar::get_extent(right)
+  total_width <- max(left_sizes + right_sizes) + 1L
+  paste0(left, strrep(" ", total_width - left_sizes - right_sizes), right)
+}
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 8018cb5a60..b048d98018 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -264,7 +264,9 @@ abandon_ship <- function(call, .data, msg) {
   eval.parent(call, 2)
 }
 
-query_on_dataset <- function(x) inherits(source_data(x), c("Dataset", 
"RecordBatchReader"))
+query_on_dataset <- function(x) {
+  any(map_lgl(all_sources(x), ~ inherits(., c("Dataset", 
"RecordBatchReader"))))
+}
 
 source_data <- function(x) {
   if (!inherits(x, "arrow_dplyr_query")) {
@@ -276,13 +278,48 @@ source_data <- function(x) {
   }
 }
 
-is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+all_sources <- function(x) {
+  if (is.null(x)) {
+    x
+  } else if (!inherits(x, "arrow_dplyr_query")) {
+    list(x)
+  } else {
+    c(
+      all_sources(x$.data),
+      all_sources(x$join$right_data),
+      all_sources(x$union_all$right_data)
+    )
+  }
+}
 
-has_aggregation <- function(x) {
-  # TODO: update with joins (check right side data too)
-  !is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data))
+query_can_stream <- function(x) {
+  # Queries that just select/filter/mutate can stream:
+  # you can take head() without evaluating over the whole dataset
+  if (inherits(x, "arrow_dplyr_query")) {
+    # Aggregations require all of the data
+    is.null(x$aggregations) &&
+      # Sorting does too
+      length(x$arrange_vars) == 0 &&
+      # Joins are ok as long as the right-side data is in memory
+      # (we have to hash the whole dataset to join it)
+      !query_on_dataset(x$join$right_data) &&
+      # But need to check that this non-dataset join can stream
+      query_can_stream(x$join$right_data) &&
+      # Also check that any unioned datasets also can stream
+      query_can_stream(x$union_all$right_data) &&
+      # Recursively check any queries that have been collapsed
+      query_can_stream(x$.data)
+  } else {
+    # Not a query, so it must be a Table/Dataset (or NULL)
+    # Note that if you have a RecordBatchReader, you *can* stream,
+    # but the reader is consumed. If that's a problem, you should check
+    # for RBRs outside of this function.
+    TRUE
+  }
 }
 
+is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+
 has_head_tail <- function(x) {
   !is.null(x$head) || !is.null(x$tail) || (is_collapsed(x) && 
has_head_tail(x$.data))
 }
diff --git a/r/R/extension.R b/r/R/extension.R
index e31f4934a7..be492c845f 100644
--- a/r/R/extension.R
+++ b/r/R/extension.R
@@ -193,7 +193,7 @@ ExtensionType <- R6Class("ExtensionType",
           sprintf(
             "<%s %s...>",
             class(self)[1],
-            paste(format(utils::head(metadata_raw, 20)), collapse = " ")
+            paste(format(head(metadata_raw, 20)), collapse = " ")
           )
         } else {
           sprintf(
@@ -420,31 +420,19 @@ unregister_extension_type <- function(extension_name) {
   arrow__UnregisterRExtensionType(extension_name)
 }
 
+#' @importFrom utils capture.output
 VctrsExtensionType <- R6Class("VctrsExtensionType",
   inherit = ExtensionType,
   public = list(
-    ptype = function() {
-      private$.ptype
-    },
+    ptype = function() private$.ptype,
     ToString = function() {
-      tf <- tempfile()
-      sink(tf)
-      on.exit({
-        sink(NULL)
-        unlink(tf)
-      })
-      print(self$ptype())
-      paste0(readLines(tf), collapse = "\n")
+      paste0(capture.output(print(self$ptype())), collapse = "\n")
     },
     deserialize_instance = function() {
       private$.ptype <- unserialize(self$extension_metadata())
     },
     ExtensionEquals = function(other) {
-      if (!inherits(other, "VctrsExtensionType")) {
-        return(FALSE)
-      }
-
-      identical(self$ptype(), other$ptype())
+      inherits(other, "VctrsExtensionType") && identical(self$ptype(), 
other$ptype())
     },
     as_vector = function(extension_array) {
       if (inherits(extension_array, "ChunkedArray")) {
diff --git a/r/R/filesystem.R b/r/R/filesystem.R
index 75997431a4..3cebbc30c8 100644
--- a/r/R/filesystem.R
+++ b/r/R/filesystem.R
@@ -451,7 +451,6 @@ s3_bucket <- function(bucket, ...) {
 #' @usage NULL
 #' @format NULL
 #' @rdname FileSystem
-#' @importFrom utils modifyList
 #' @export
 GcsFileSystem <- R6Class("GcsFileSystem",
   inherit = FileSystem
diff --git a/r/R/query-engine.R b/r/R/query-engine.R
index 513b861d41..511bf3dbc2 100644
--- a/r/R/query-engine.R
+++ b/r/R/query-engine.R
@@ -226,9 +226,9 @@ ExecPlan <- R6Class("ExecPlan",
         slice_size <- node$extras$head %||% node$extras$tail
         if (!is.null(slice_size)) {
           out <- head(out, slice_size)
+          # We already have everything we need for the head, so StopProducing
+          self$Stop()
         }
-        # Can we now tell `self$Stop()` to StopProducing? We already have
-        # everything we need for the head (but it seems to segfault: 
ARROW-14329)
       } else if (!is.null(node$extras$tail)) {
         # TODO(ARROW-16630): proper BottomK support
         # Reverse the row order to get back what we expect
diff --git a/r/tests/testthat/_snaps/dplyr-glimpse.md 
b/r/tests/testthat/_snaps/dplyr-glimpse.md
new file mode 100644
index 0000000000..6daca0850d
--- /dev/null
+++ b/r/tests/testthat/_snaps/dplyr-glimpse.md
@@ -0,0 +1,152 @@
+# glimpse() Table/ChunkedArray
+
+    Code
+      glimpse(tab)
+    Output
+      Table
+      10 rows x 7 columns
+      $ int           <int32> 1, 2, 3, NA, 5, 6, 7, 8, 9, 10
+      $ dbl          <double> 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, NA, 10.1
+      $ dbl2         <double> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+      $ lgl            <bool> TRUE, NA, TRUE, FALSE, TRUE, NA, NA, FALSE, 
FALSE, NA
+      $ false          <bool> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, ~
+      $ chr          <string> "a", "b", "c", "d", "e", NA, "g", "h", "i", "j"
+      $ fct <dictionary<...>> a, b, c, d, NA, NA, g, h, i, j
+      Call `print()` for full schema details
+
+---
+
+    Code
+      glimpse(tab$chr)
+    Output
+      <string> [ [ "a", "b", "c", "d", "e", null, "g", "h", "i", "j" ] ]
+
+# glimpse() RecordBatch/Array
+
+    Code
+      glimpse(batch)
+    Output
+      RecordBatch
+      10 rows x 7 columns
+      $ int           <int32> 1, 2, 3, NA, 5, 6, 7, 8, 9, 10
+      $ dbl          <double> 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, NA, 10.1
+      $ dbl2         <double> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+      $ lgl            <bool> TRUE, NA, TRUE, FALSE, TRUE, NA, NA, FALSE, 
FALSE, NA
+      $ false          <bool> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, ~
+      $ chr          <string> "a", "b", "c", "d", "e", NA, "g", "h", "i", "j"
+      $ fct <dictionary<...>> a, b, c, d, NA, NA, g, h, i, j
+      Call `print()` for full schema details
+
+---
+
+    Code
+      glimpse(batch$int)
+    Output
+      <int32> [ 1, 2, 3, null, 5, 6, 7, 8, 9, 10 ]
+
+# glimpse() with VctrsExtensionType
+
+    Code
+      glimpse(haven)
+    Output
+      Table
+      2 rows x 3 columns
+      $ num            <double> 5.1, 4.9
+      $ cat_int <ext<hvn_lbll>> 3, 1
+      $ cat_chr <ext<hvn_lbll>> Can't convert `x` <haven_labelled> to 
<character>.
+      Call `print()` for full schema details
+
+---
+
+    Code
+      glimpse(haven[[3]])
+    Output
+      <<haven_labelled[0]>> [ [ "B", "B" ] ]
+
+# glimpse prints message about schema if there are complex types
+
+    Code
+      glimpse(dictionary_but_no_metadata)
+    Output
+      Table
+      5 rows x 2 columns
+      $ a           <int32> 1, 2, 3, 4, 5
+      $ b <dictionary<...>> 1, 2, 3, 4, 5
+      Call `print()` for full schema details
+
+---
+
+    Code
+      glimpse(Table$create(a = 1))
+    Output
+      Table
+      1 rows x 1 columns
+      $ a <double> 1
+
+# glimpse() calls print() instead of showing data for RBR
+
+    Code
+      example_data %>% as_record_batch_reader() %>% glimpse()
+    Message <simpleMessage>
+      Cannot glimpse() data from a RecordBatchReader because it can only be 
read one time; call `as_arrow_table()` to consume it first.
+    Output
+      RecordBatchReader
+      int: int32
+      dbl: double
+      dbl2: double
+      lgl: bool
+      false: bool
+      chr: string
+      fct: dictionary<values=string, indices=int8>
+
+---
+
+    Code
+      example_data %>% as_record_batch_reader() %>% select(int) %>% glimpse()
+    Message <simpleMessage>
+      Cannot glimpse() data from a RecordBatchReader because it can only be 
read one time. Call `compute()` to evaluate the query first.
+    Output
+      RecordBatchReader (query)
+      int: int32
+      
+      See $.data for the source Arrow object
+
+# glimpse() on Dataset
+
+    Code
+      glimpse(ds)
+    Output
+      FileSystemDataset with 2 Parquet files
+      20 rows x 7 columns
+      $ int                <int32> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 101, 102, 
103, 104, ~
+      $ dbl               <double> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 51, 52, 53, 
54, 55, ~
+      $ lgl                 <bool> TRUE, FALSE, NA, TRUE, FALSE, TRUE, FALSE, 
NA, TRUE~
+      $ chr               <string> "a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "~
+      $ fct      <dictionary<...>> A, B, C, D, E, F, G, H, I, J, J, I, H, G, 
F, E, D, ~
+      $ ts <timestamp[us, tz=UTC]> 2015-04-30 03:12:39, 2015-05-01 03:12:39, 
2015-05-0~
+      $ group              <int32> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 
2, 2, 2, ~
+      Call `print()` for full schema details
+
+# glimpse() on Dataset query only shows data for streaming eval
+
+    Code
+      ds %>% summarize(max(int)) %>% glimpse()
+    Message <simpleMessage>
+      This query requires a full table scan, so glimpse() may be expensive. 
Call `compute()` to evaluate the query first.
+    Output
+      FileSystemDataset (query)
+      max(int): int32
+      
+      See $.data for the source Arrow object
+
+# glimpse() on in-memory query shows data even if aggregating
+
+    Code
+      example_data %>% arrow_table() %>% summarize(sum(int, na.rm = TRUE)) %>%
+        glimpse()
+    Output
+      Table (query)
+      ?? rows x 1 columns
+      $ `sum(int, na.rm = TRUE)` <int64> 51
+      Call `print()` for query details
+
diff --git a/r/tests/testthat/test-chunked-array.txt 
b/r/tests/testthat/test-chunked-array.txt
index c7101359d7..e2a691a99e 100644
--- a/r/tests/testthat/test-chunked-array.txt
+++ b/r/tests/testthat/test-chunked-array.txt
@@ -1,5 +1,6 @@
 > chunked_array(c(1, 2, 3), c(4, 5, 6))
 ChunkedArray
+<double>
 [
   [
     1,
@@ -15,6 +16,7 @@ ChunkedArray
 
 > chunked_array(1:30, c(4, 5, 6))
 ChunkedArray
+<int32>
 [
   [
     1,
@@ -48,6 +50,7 @@ ChunkedArray
 
 > chunked_array(1:30)
 ChunkedArray
+<int32>
 [
   [
     1,
@@ -76,6 +79,7 @@ ChunkedArray
 
 > chunked_array(factor(c("a", "b")), factor(c("c", "d")))
 ChunkedArray
+<dictionary<values=string, indices=int8>>
 [
 
   -- dictionary:
diff --git a/r/tests/testthat/test-data-type.R 
b/r/tests/testthat/test-data-type.R
index 88333fb314..16fcf8e0a3 100644
--- a/r/tests/testthat/test-data-type.R
+++ b/r/tests/testthat/test-data-type.R
@@ -593,15 +593,16 @@ test_that("DataType$code()", {
   expect_code_roundtrip(dictionary(index_type = int8(), value_type = 
large_utf8()))
   expect_code_roundtrip(dictionary(index_type = int8(), ordered = TRUE))
 
-  skip("until rlang 1.0")
-  expect_snapshot({
-    (expect_error(
-      DayTimeInterval__initialize()$code()
-    ))
-    (expect_error(
-      struct(a = DayTimeInterval__initialize())$code()
-    ))
-  })
+  skip_if(packageVersion("rlang") < 1)
+  # Are these unsupported for a reason?
+  expect_error(
+    eval(DayTimeInterval__initialize()$code()),
+    "Unsupported type"
+  )
+  expect_error(
+    eval(struct(a = DayTimeInterval__initialize())$code()),
+    "Unsupported type"
+  )
 })
 
 test_that("as_data_type() works for DataType", {
diff --git a/r/tests/testthat/test-dplyr-glimpse.R 
b/r/tests/testthat/test-dplyr-glimpse.R
new file mode 100644
index 0000000000..d39fef9e82
--- /dev/null
+++ b/r/tests/testthat/test-dplyr-glimpse.R
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(dplyr, warn.conflicts = FALSE)
+
+test_that("glimpse() Table/ChunkedArray", {
+  tab <- Table$create(example_data)
+  expect_snapshot(glimpse(tab))
+  expect_snapshot(glimpse(tab$chr))
+})
+
+test_that("glimpse() RecordBatch/Array", {
+  batch <- RecordBatch$create(example_data)
+  expect_snapshot(glimpse(batch))
+  expect_snapshot(glimpse(batch$int))
+})
+
+test_that("glimpse() with VctrsExtensionType", {
+  haven <- Table$create(haven_data)
+  expect_snapshot(glimpse(haven))
+  expect_snapshot(glimpse(haven[[3]]))
+})
+
+test_that("glimpse prints message about schema if there are complex types", {
+  dictionary_but_no_metadata <- Table$create(a = 1:5, b = factor(1:5))
+  expect_snapshot(glimpse(dictionary_but_no_metadata))
+  # No message here
+  expect_snapshot(glimpse(Table$create(a = 1)))
+})
+
+test_that("glimpse() calls print() instead of showing data for RBR", {
+  expect_snapshot({
+    example_data %>%
+      as_record_batch_reader() %>%
+      glimpse()
+  })
+  expect_snapshot({
+    example_data %>%
+      as_record_batch_reader() %>%
+      select(int) %>%
+      glimpse()
+  })
+})
+
+skip_if_not_available("dataset")
+big_df <- rbind(
+  cbind(df1, group = 1),
+  cbind(df2, group = 2)
+)
+ds_dir <- make_temp_dir()
+write_dataset(big_df, ds_dir, partitioning = "group")
+
+ds <- open_dataset(ds_dir)
+
+test_that("glimpse() on Dataset", {
+  expect_snapshot(glimpse(ds))
+})
+
+test_that("glimpse() on Dataset query only shows data for streaming eval", {
+  # Because dataset scan row order is not deterministic, we can't snapshot
+  # the whole output. Instead check for an indication that glimpse method ran
+  # instead of the regular print() method that is the fallback
+  expect_output(
+    ds %>%
+      select(int, chr) %>%
+      filter(int > 2) %>%
+      mutate(twice = int * 2) %>%
+      glimpse(),
+    "Call `print()` for query details",
+    fixed = TRUE
+  )
+
+  # This doesn't show the data and falls back to print()
+  expect_snapshot({
+    ds %>%
+      summarize(max(int)) %>%
+      glimpse()
+  })
+})
+
+test_that("glimpse() on in-memory query shows data even if aggregating", {
+  expect_snapshot({
+    example_data %>%
+      arrow_table() %>%
+      summarize(sum(int, na.rm = TRUE)) %>%
+      glimpse()
+  })
+})
diff --git a/r/tests/testthat/test-dplyr-query.R 
b/r/tests/testthat/test-dplyr-query.R
index d55ed07cfc..62633eb1d6 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -293,3 +293,143 @@ test_that("No duplicate field names are allowed in an 
arrow_dplyr_query", {
     )
   )
 })
+
+test_that("all_sources() finds all data sources in a query", {
+  tab <- Table$create(a = 1)
+  ds <- InMemoryDataset$create(tab)
+  expect_equal(all_sources(tab), list(tab))
+  expect_equal(
+    tab %>%
+      filter(a > 0) %>%
+      summarize(a = sum(a)) %>%
+      arrange(desc(a)) %>%
+      all_sources(),
+    list(tab)
+  )
+  expect_equal(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(ds) %>%
+      all_sources(),
+    list(tab, ds)
+  )
+
+  expect_equal(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(ds) %>%
+      left_join(tab) %>%
+      all_sources(),
+    list(tab, ds, tab)
+  )
+  expect_equal(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(ds, tab)) %>%
+      left_join(tab) %>%
+      all_sources(),
+    list(tab, ds, tab, tab)
+  )
+})
+
+test_that("query_on_dataset() looks at all data sources in a query", {
+  tab <- Table$create(a = 1)
+  ds <- InMemoryDataset$create(tab)
+  expect_false(query_on_dataset(tab))
+  expect_true(query_on_dataset(ds))
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      summarize(a = sum(a)) %>%
+      arrange(desc(a)) %>%
+      query_on_dataset()
+  )
+  expect_true(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(ds) %>%
+      query_on_dataset()
+  )
+
+  expect_true(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(ds, tab)) %>%
+      left_join(tab) %>%
+      query_on_dataset()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(tab, tab)) %>%
+      left_join(tab) %>%
+      query_on_dataset()
+  )
+})
+
+test_that("query_can_stream()", {
+  tab <- Table$create(a = 1)
+  ds <- InMemoryDataset$create(tab)
+  expect_true(query_can_stream(tab))
+  expect_true(query_can_stream(ds))
+  expect_true(query_can_stream(NULL))
+  expect_true(
+    ds %>%
+      filter(a > 0) %>%
+      query_can_stream()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      arrange(desc(a)) %>%
+      query_can_stream()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      summarize(a = sum(a)) %>%
+      query_can_stream()
+  )
+  expect_true(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(ds) %>%
+      query_can_stream()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(summarize(ds, a = sum(a))) %>%
+      query_can_stream()
+  )
+
+  expect_true(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(ds, tab)) %>%
+      left_join(tab) %>%
+      query_can_stream()
+  )
+  expect_true(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(tab, tab)) %>%
+      left_join(tab) %>%
+      query_can_stream()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      union_all(left_join(tab, tab)) %>%
+      left_join(ds) %>%
+      query_can_stream()
+  )
+  expect_false(
+    tab %>%
+      filter(a > 0) %>%
+      arrange(a) %>%
+      union_all(left_join(tab, tab)) %>%
+      left_join(tab) %>%
+      query_can_stream()
+  )
+})
diff --git a/r/tests/testthat/test-extension.R 
b/r/tests/testthat/test-extension.R
index a4a37c9127..638869dc8c 100644
--- a/r/tests/testthat/test-extension.R
+++ b/r/tests/testthat/test-extension.R
@@ -183,7 +183,7 @@ test_that("vctrs extension type works", {
   expect_r6_class(type, "VctrsExtensionType")
   expect_identical(type$ptype(), vctrs::vec_ptype(custom_vctr))
   expect_true(type$Equals(type))
-  expect_match(type$ToString(), "arrow_custom_test")
+  expect_identical(type$ToString(), "<arrow_custom_test[0]>")
 
   array_in <- vctrs_extension_array(custom_vctr)
   expect_true(array_in$type$Equals(type))
diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R
index c7046de3cb..3a35569f7f 100644
--- a/r/tests/testthat/test-schema.R
+++ b/r/tests/testthat/test-schema.R
@@ -43,12 +43,11 @@ test_that("Schema$code()", {
     schema(a = int32(), b = struct(c = double(), d = utf8()), e = 
list_of(binary()))
   )
 
-  skip("until rlang 1.0")
-  expect_snapshot({
-    (expect_error(
-      schema(x = int32(), y = DayTimeInterval__initialize())$code()
-    ))
-  })
+  skip_if(packageVersion("rlang") < 1)
+  expect_error(
+    eval(schema(x = int32(), y = DayTimeInterval__initialize())$code()),
+    "Unsupported type"
+  )
 })
 
 test_that("Schema with non-nullable fields", {

[arrow] branch master updated: ARROW-16776: [R] dplyr::glimpse method for arrow table and datasets (#13563)

Reply via email to