dragosmg commented on code in PR #13541:
URL: https://github.com/apache/arrow/pull/13541#discussion_r927524623


##########
r/tests/testthat/test-dplyr-query.R:
##########
@@ -433,3 +433,343 @@ test_that("query_can_stream()", {
       query_can_stream()
   )
 })
+
+test_that("show_exec_plan(), show_query() and explain()", {
+  # minimal test - this fails if we don't coerce the input to 
`show_exec_plan()`
+  # to be an `arrow_dplyr_query`
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
+      "ProjectNode.*",             # output columns
+      "TableSourceNode"            # entry point
+    )
+  )
+
+  # minimal test - show_query()
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      show_query(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
+      "ProjectNode.*",             # output new columns
+      "TableSourceNode"            # entry point
+    )
+  )
+
+  # minimal test - explain()
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      explain(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
+      "ProjectNode.*",             # output columns
+      "TableSourceNode"            # entry point
+    )
+  )
+
+  # arrow_table and mutate
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      filter(dbl > 2, chr != "e") %>%
+      select(chr, int, lgl) %>%
+      mutate(int_plus_ten = int + 10) %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",           # boiler plate for ExecPlan
+      "chr, int, lgl, \"int_plus_ten\".*",   # selected columns
+      "FilterNode.*",                        # filter node
+      "(dbl > 2).*",                         # filter expressions
+      "chr != \"e\".*",
+      "TableSourceNode"                      # entry point
+    )
+  )
+
+  # arrow_table and mutate - show_query()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      filter(dbl > 2, chr != "e") %>%
+      select(chr, int, lgl) %>%
+      mutate(int_plus_ten = int + 10) %>%
+      show_query(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",           # boiler plate for ExecPlan
+      "chr, int, lgl, \"int_plus_ten\".*",   # selected columns
+      "FilterNode.*",                        # filter node
+      "(dbl > 2).*",                         # filter expressions
+      "chr != \"e\".*",
+      "TableSourceNode"                      # entry point
+    )
+  )
+
+  # arrow_table and mutate - explain()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      filter(dbl > 2, chr != "e") %>%
+      select(chr, int, lgl) %>%
+      mutate(int_plus_ten = int + 10) %>%
+      explain(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",           # boiler plate for ExecPlan
+      "chr, int, lgl, \"int_plus_ten\".*",   # selected columns
+      "FilterNode.*",                        # filter node
+      "(dbl > 2).*",                         # filter expressions
+      "chr != \"e\".*",
+      "TableSourceNode"                      # entry point
+    )
+  )
+
+  # record_batch and mutate
+  expect_output(
+    tbl %>%
+      record_batch() %>%
+      filter(dbl > 2, chr != "e") %>%
+      select(chr, int, lgl) %>%
+      mutate(int_plus_ten = int + 10) %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",           # boiler plate for ExecPlan
+      "chr, int, lgl, \"int_plus_ten\".*",   # selected columns
+      "(dbl > 2).*",                         # the filter expressions
+      "chr != \"e\".*",
+      "TableSourceNode"                      # the entry point"
+    )
+  )
+
+  # test with group_by and summarise
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      group_by(lgl) %>%
+      summarise(avg = mean(dbl, na.rm = TRUE)) %>%
+      ungroup() %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",            # boiler plate for ExecPlan
+      "ProjectNode.*",                        # output columns
+      "GroupByNode.*",                        # the group_by statement
+      "keys=.*lgl.*",                         # the key for the aggregations
+      "aggregates=.*hash_mean.*avg.*",        # the aggregations
+      "ProjectNode.*",                        # the input columns
+      "TableSourceNode"                       # the entry point
+    )
+  )
+
+  # test with group_by and summarise - show_query()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      group_by(lgl) %>%
+      summarise(avg = mean(dbl, na.rm = TRUE)) %>%
+      ungroup() %>%
+      show_query(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",            # boiler plate for ExecPlan
+      "ProjectNode.*",                        # output columns
+      "GroupByNode.*",                        # the group_by statement
+      "keys=.*lgl.*",                         # the key for the aggregations
+      "aggregates=.*hash_mean.*avg.*",        # the aggregations
+      "ProjectNode.*",                        # the input columns
+      "TableSourceNode"                       # the entry point
+    )
+  )
+
+  # test with group_by and summarise - explain()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      group_by(lgl) %>%
+      summarise(avg = mean(dbl, na.rm = TRUE)) %>%
+      ungroup() %>%
+      explain(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",            # boiler plate for ExecPlan
+      "ProjectNode.*",                        # output columns
+      "GroupByNode.*",                        # group_by statement
+      "keys=.*lgl.*",                         # key for the aggregations
+      "aggregates=.*hash_mean.*avg.*",        # aggregations
+      "ProjectNode.*",                        # input columns
+      "TableSourceNode"                       # entry point
+    )
+  )
+
+  # test with join
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      left_join(
+        example_data %>%
+          arrow_table() %>%
+          mutate(doubled_dbl = dbl * 2) %>%
+          select(int, doubled_dbl),
+        by = "int"
+      ) %>%
+      select(int, verses, doubled_dbl) %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",              # boiler plate for ExecPlan
+      "ProjectNode.*",                          # output columns
+      "HashJoinNode.*",                         # the join
+      "ProjectNode.*",                          # input columns for the second 
table
+      "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate
+      "TableSourceNode.*",                      # second table
+      "ProjectNode.*",                          # input columns for the first 
table
+      "TableSourceNode"                         # first table
+    )
+  )
+
+  # test with join - show_query()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      left_join(
+        example_data %>%
+          arrow_table() %>%
+          mutate(doubled_dbl = dbl * 2) %>%
+          select(int, doubled_dbl),
+        by = "int"
+      ) %>%
+      select(int, verses, doubled_dbl) %>%
+      show_query(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",              # boiler plate for ExecPlan
+      "ProjectNode.*",                          # output columns
+      "HashJoinNode.*",                         # join
+      "ProjectNode.*",                          # input columns for the second 
table
+      "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # the mutate
+      "TableSourceNode.*",                      # second table
+      "ProjectNode.*",                          # input columns for the first 
table
+      "TableSourceNode"                         # first table
+    )
+  )
+
+  # test with join - explain()
+  expect_output(
+    tbl %>%
+      arrow_table() %>%
+      left_join(
+        example_data %>%
+          arrow_table() %>%
+          mutate(doubled_dbl = dbl * 2) %>%
+          select(int, doubled_dbl),
+        by = "int"
+      ) %>%
+      select(int, verses, doubled_dbl) %>%
+      explain(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",              # boiler plate for ExecPlan
+      "ProjectNode.*",                          # output columns
+      "HashJoinNode.*",                         # join
+      "ProjectNode.*",                          # input columns for the second 
table
+      "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate
+      "TableSourceNode.*",                      # second table
+      "ProjectNode.*",                          # input columns for the first 
table
+      "TableSourceNode"                         # first table
+    )
+  )
+
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      filter(mpg > 20) %>%
+      arrange(desc(wt)) %>%
+      show_exec_plan(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",   # boiler plate for ExecPlan
+      "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node
+      "ProjectNode.*",               # output columns
+      "FilterNode.*",                # filter node
+      "TableSourceNode.*"            # entry point
+    )
+  )
+
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      filter(mpg > 20) %>%
+      arrange(desc(wt)) %>%
+      show_query(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",    # boiler plate for ExecPlan
+      "OrderBySinkNode.*wt.*DESC.*",  # arrange goes via the OrderBy sink node
+      "ProjectNode.*",                # output columns
+      "FilterNode.*",                 # filter node
+      "TableSourceNode.*"             # entry point
+    )
+  )
+
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      filter(mpg > 20) %>%
+      arrange(desc(wt)) %>%
+      explain(),
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*",   # boiler plate for ExecPlan
+      "OrderBySinkNode.*wt.*DESC.*", # arrange goes via the OrderBy sink node
+      "ProjectNode.*",               # output columns
+      "FilterNode.*",                # filter node
+      "TableSourceNode.*"            # entry point
+    )
+  )
+
+  expect_output(
+    mtcars %>%
+      arrow_table() %>%
+      filter(mpg > 20) %>%
+      arrange(desc(wt)) %>%
+      head(3) %>%
+      show_exec_plan(),
+    # for some reason the FilterNode disappears when head/tail are involved +
+    # we do not have additional information regarding the SinkNode +
+    # the entry point is now a SourceNode and not a TableSourceNode
+    regexp = paste0(
+      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
+      "SinkNode.*",                #
+      "ProjectNode.*",             # output columns
+      "SourceNode.*"               # entry point
+    )
+  )
+
+  expect_output(

Review Comment:
   I had already removed the duplicate tests in a previous commit; I'm just 
adding this comment to confirm it's been done. We now only test 
`show_exec_plan()`, and I added a comment explaining why we don't test 
`show_query()` and `explain()`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to