nealrichardson commented on a change in pull request #9745: URL: https://github.com/apache/arrow/pull/9745#discussion_r599985192
########## File path: r/R/arrow-datum.R ########## @@ -138,3 +138,16 @@ as.integer.ArrowDatum <- function(x, ...) as.integer(as.vector(x), ...) #' @export as.character.ArrowDatum <- function(x, ...) as.character(as.vector(x), ...) + +#' @export +sort.ArrowDatum <- function(x, decreasing = FALSE, na.last = NA, ...) { + if (is.na(na.last)) { + x <- x$Filter(!is.na(x)) + x$Take(x$SortIndices(descending = decreasing)) + } else if (na.last) { + x$Take(x$SortIndices(descending = decreasing)) + } else { + x <- Table$create(x = x, isnax = as.integer(is.na(x))) Review comment: This deserves some comments explaining it, plus a reference to the C++ JIRA that will allow us to delete this hack. Also, I think it would be more readable with less `x`, i.e. you can call the Table something else, and the `x` column in the table doesn't need to be called `x` either. ########## File path: r/R/dplyr.R ########## @@ -364,6 +383,7 @@ set_filters <- function(.data, expressions) { collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { x <- ensure_group_vars(x) + x <- ensure_arrange_vars(x) # this sets x$temp_columns Review comment: Does `temp_columns` need to be added to `x`? It looks like it doesn't live outside of this function. You could spell this like ``` temp_columns <- find_extra_arrange_columns(x) ``` ########## File path: r/R/dplyr.R ########## @@ -631,17 +671,85 @@ abandon_ship <- function(call, .data, msg = NULL) { eval.parent(call, 2) } -arrange.arrow_dplyr_query <- function(.data, ...) { +arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { + call <- match.call() + exprs <- quos(...) + if (.by_group) { + exprs <- c(quos(!!!dplyr::groups(.data)), exprs) + } + if (length(exprs) == 0) { + # Nothing to do + return(.data) + } .data <- arrow_dplyr_query(.data) if (query_on_dataset(.data)) { not_implemented_for_dataset("arrange()") } - # TODO(ARROW-11703) move this to Arrow - call <- match.call() - abandon_ship(call, .data) + is_dataset <- query_on_dataset(.data) + if (is_dataset) { + return(abandon_ship(call, .data)) + } + # find and remove any dplyr::desc() and tidy-eval + # the arrange expressions inside an Arrow data_mask + sorts <- vector("list", length(exprs)) + descs <- logical(0) + mask <- arrow_mask(.data) + for (i in seq_along(exprs)) { + x <- find_and_remove_desc(exprs[[i]]) + exprs[[i]] <- x[["quos"]] + sorts[[i]] <- arrow_eval(exprs[[i]], mask) + if (inherits(sorts[[i]], "try-error")) { + msg <- paste('Expression', as_label(exprs[[i]]), 'not supported in Arrow') + return(abandon_ship(call, .data, msg)) + } + names(sorts)[i] <- tryCatch( Review comment: Can you leave a comment explaining this tryCatch? ########## File path: r/tests/testthat/test-compute-sort.R ########## @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("compute: sorting") + +library(dplyr) + +tbl <- example_data_for_sorting + +test_that("Scalar sort", { + expect_identical( + as.vector(sort(Scalar$create(42L))), + 42L + ) +}) + +test_that("Array sort on integers", { + expect_equal( + Array$create(tbl$int)$SortIndices(), + Array$create(0L:9L, type = uint64()) + ) + expect_equal( + Array$create(rev(tbl$int))$SortIndices(descending = TRUE), + Array$create(c(1L:9L, 0L), type = uint64()) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int))), + sort(tbl$int) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), na.last = NA)), + sort(tbl$int, na.last = NA) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), na.last = TRUE)), + sort(tbl$int, na.last = TRUE) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), na.last = FALSE)), + sort(tbl$int, na.last = FALSE) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), decreasing = TRUE)), + sort(tbl$int, decreasing = TRUE) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), decreasing = TRUE, na.last = TRUE)), + sort(tbl$int, decreasing = TRUE, na.last = TRUE) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), decreasing = TRUE, na.last = FALSE)), + sort(tbl$int, decreasing = TRUE, na.last = FALSE) + ) +}) + +test_that("ChunkedArray sort on integers", { + expect_equal( + ChunkedArray$create(tbl$int[1:5], tbl$int[6:10])$SortIndices(), + Array$create(0L:9L, type = uint64()) + ) + expect_equal( + ChunkedArray$create(rev(tbl$int)[1:5], rev(tbl$int)[6:10])$SortIndices(descending = TRUE), + Array$create(c(1L:9L, 0L), type = uint64()) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]))), + sort(tbl$int) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), na.last = NA)), + sort(tbl$int, na.last = NA) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), na.last = TRUE)), + sort(tbl$int, na.last = TRUE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), na.last = FALSE)), + sort(tbl$int, na.last = FALSE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), decreasing = TRUE)), + sort(tbl$int, decreasing = TRUE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), decreasing = TRUE, na.last = TRUE)), + sort(tbl$int, decreasing = TRUE, na.last = TRUE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$int[1:5], tbl$int[6:10]), decreasing = TRUE, na.last = FALSE)), + sort(tbl$int, decreasing = TRUE, na.last = FALSE) + ) +}) + +test_that("Array/ChunkedArray sort on strings", { + expect_equal( + as.vector(sort(Array$create(tbl$chr), decreasing = TRUE, na.last = FALSE)), + sort(tbl$chr, decreasing = TRUE, na.last = FALSE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$chr[1:5], tbl$chr[6:10]), decreasing = TRUE, na.last = FALSE)), + sort(tbl$chr, decreasing = TRUE, na.last = FALSE) + ) +}) + +test_that("Array/ChunkedArray sort on floats", { + skip("ARROW-12055") + expect_equal( + as.vector(sort(Array$create(tbl$dbl), decreasing = TRUE, na.last = FALSE)), + sort(tbl$dbl, decreasing = TRUE, na.last = FALSE) + ) + expect_equal( + as.vector(sort(ChunkedArray$create(tbl$dbl[1:5], tbl$dbl[6:10]), decreasing = TRUE, na.last = FALSE)), + sort(tbl$dbl, decreasing = TRUE, na.last = FALSE) + ) +}) + +test_that("Table/RecordBatch sort", { Review comment: These tests aren't super readable ########## File path: r/R/dplyr.R ########## @@ -390,6 +414,16 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { tab <- RecordBatch$create(!!!cols) } } + # Arrange rows + if (length(x$arrange_vars) > 0) { + x$arrange_vars <- get_field_names(x$arrange_vars) Review comment: I don't understand how this works. I don't see how `get_field_names` will return something useful if the arrange_vars are expressions. And then on the next line you take `names()` of that anyway, so you're ignoring the expressions. ########## File path: r/R/dplyr.R ########## @@ -364,6 +383,7 @@ set_filters <- function(.data, expressions) { collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { x <- ensure_group_vars(x) + x <- ensure_arrange_vars(x) # this sets x$temp_columns Review comment: I would also add a comment that explains how, unlike group_by_vars, which always are kept even if you don't `select` them, we have to do this for sorting because you could arrange then select without them but because of how the lazy query is built, we don't carry out the sort until later. ########## File path: r/tests/testthat/test-compute-sort.R ########## @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("compute: sorting") + +library(dplyr) + +tbl <- example_data_for_sorting + +test_that("Scalar sort", { + expect_identical( + as.vector(sort(Scalar$create(42L))), + 42L + ) +}) + +test_that("Array sort on integers", { + expect_equal( + Array$create(tbl$int)$SortIndices(), + Array$create(0L:9L, type = uint64()) + ) + expect_equal( + Array$create(rev(tbl$int))$SortIndices(descending = TRUE), + Array$create(c(1L:9L, 0L), type = uint64()) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int))), + sort(tbl$int) + ) + expect_equal( + as.vector(sort(Array$create(tbl$int), na.last = NA)), Review comment: I get that you're testing the various arguments, which do result in different sort orders, but it's a little odd that your test data is already ordered. ########## File path: r/R/dplyr.R ########## @@ -631,17 +671,85 @@ abandon_ship <- function(call, .data, msg = NULL) { eval.parent(call, 2) } -arrange.arrow_dplyr_query <- function(.data, ...) { +arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { + call <- match.call() + exprs <- quos(...) + if (.by_group) { + exprs <- c(quos(!!!dplyr::groups(.data)), exprs) + } + if (length(exprs) == 0) { + # Nothing to do + return(.data) + } .data <- arrow_dplyr_query(.data) if (query_on_dataset(.data)) { not_implemented_for_dataset("arrange()") } - # TODO(ARROW-11703) move this to Arrow - call <- match.call() - abandon_ship(call, .data) + is_dataset <- query_on_dataset(.data) + if (is_dataset) { Review comment: I believe this duplicates the block immediately above it. Also, unclear that you need to embargo datasets since you're only evaluating the sort on the Table after collecting(). ########## File path: r/R/dplyr.R ########## @@ -631,17 +671,85 @@ abandon_ship <- function(call, .data, msg = NULL) { eval.parent(call, 2) } -arrange.arrow_dplyr_query <- function(.data, ...) { +arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { + call <- match.call() + exprs <- quos(...) + if (.by_group) { Review comment: This is subtle: so "by_group" sorting means adding group vars to the sort. Can you leave a comment? (I was confused about this at first.) ########## File path: r/tests/testthat/test-compute-sort.R ########## @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("compute: sorting") + +library(dplyr) + +tbl <- example_data_for_sorting + +test_that("Scalar sort", { + expect_identical( + as.vector(sort(Scalar$create(42L))), + 42L + ) +}) + +test_that("Array sort on integers", { + expect_equal( + Array$create(tbl$int)$SortIndices(), + Array$create(0L:9L, type = uint64()) + ) + expect_equal( + Array$create(rev(tbl$int))$SortIndices(descending = TRUE), + Array$create(c(1L:9L, 0L), type = uint64()) + ) + expect_equal( Review comment: I believe there is an `expect_vector()` (defined in `helper-expectations.R`) that can help with these -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org