This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1114958706 GH-49186: [R] Support dplyr::filter_out() in Arrow dplyr
backend (#49256)
1114958706 is described below
commit 111495870686ef269254232b876de3aee2f919b6
Author: larry77 <[email protected]>
AuthorDate: Mon Feb 16 12:56:24 2026 +0100
GH-49186: [R] Support dplyr::filter_out() in Arrow dplyr backend (#49256)
### Rationale for this change
New function in dplyr not yet implemented in Arrow
### What changes are included in this PR?
This PR adds support for dplyr::filter_out() in the Arrow R dplyr backend.
The implementation reuses the existing filter() machinery and extends
set_filters() with an `exclude` flag. When exclude = TRUE, the predicate
is transformed to match dplyr semantics (drop rows where predicate is TRUE,
keep rows where predicate is FALSE or NA).
Multiple filter_out() predicates are combined before exclusion so that
filter_out(a, b) matches dplyr semantics (i.e. drop rows where a & b is
TRUE).
This works for arrow_table(), RecordBatchReader, and open_dataset(), and
preserves lazy evaluation for larger-than-memory datasets.
Tests are added to verify basic behavior, NA handling, and multiple
predicates.
Note: local test run hits a with_language() locale issue ('.cache' not
found),
which appears environment-specific and unrelated to these changes.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Just the new function
* GitHub Issue: #49257
* GitHub Issue: #49186
Lead-authored-by: Lorenzo Isella <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Co-authored-by: Lorenzo ISELLA <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/arrow-package.R | 1 +
r/R/dplyr-filter.R | 121 +++++++++++++++++++++++++++++------
r/R/dplyr-funcs-doc.R | 3 +-
r/man/acero.Rd | 5 +-
r/tests/testthat/test-dplyr-filter.R | 48 ++++++++++++++
5 files changed, 154 insertions(+), 24 deletions(-)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index a1167433c9..5a596dffe3 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -38,6 +38,7 @@
supported_dplyr_methods <- list(
select = NULL,
filter = NULL,
+ filter_out = NULL,
collect = NULL,
summarise = c(
"window functions not currently supported;",
diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R
index 18f5c929af..26fa1bf7d5 100644
--- a/r/R/dplyr-filter.R
+++ b/r/R/dplyr-filter.R
@@ -17,27 +17,61 @@
# The following S3 methods are registered on load if dplyr is present
-filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve =
FALSE) {
- try_arrow_dplyr({
- # TODO something with the .preserve argument
- out <- as_adq(.data)
+apply_filter_impl <- function(
+ .data,
+ ...,
+ .by = NULL,
+ .preserve = FALSE,
+ negate = FALSE
+) {
+ # TODO something with the .preserve argument
+ out <- as_adq(.data)
- by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data")
+ by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data")
- if (by$from_by) {
- out$group_by_vars <- by$names
- }
+ if (by$from_by) {
+ out$group_by_vars <- by$names
+ }
+
+ expanded_filters <- expand_across(out, quos(...))
+ if (length(expanded_filters) == 0) {
+ # Nothing to do
+ return(as_adq(.data))
+ }
+
+ # tidy-eval the filter expressions inside an Arrow data_mask
+ mask <- arrow_mask(out)
+
+ if (isTRUE(negate)) {
+ # filter_out(): combine all predicates with &, then negate
+ combined <- NULL
+
+ for (expr in expanded_filters) {
+ filt <- arrow_eval(expr, mask)
- expanded_filters <- expand_across(out, quos(...))
- if (length(expanded_filters) == 0) {
- # Nothing to do
- return(as_adq(.data))
+ if (length(mask$.aggregations)) {
+ # dplyr lets you filter on e.g. x < mean(x), but we haven't
implemented it.
+ # But we could, the same way it works in mutate() via join, if someone
asks.
+ # Until then, just error.
+ arrow_not_supported(
+ .actual_msg = "Expression not supported in filter_out() in Arrow",
+ call = expr
+ )
+ }
+
+ if (is_list_of(filt, "Expression")) {
+ filt <- Reduce("&", filt)
+ }
+
+ combined <- if (is.null(combined)) filt else (combined & filt)
}
- # tidy-eval the filter expressions inside an Arrow data_mask
- mask <- arrow_mask(out)
+ out <- set_filters(out, combined, negate = TRUE)
+ } else {
+ # filter(): apply each predicate sequentially
for (expr in expanded_filters) {
filt <- arrow_eval(expr, mask)
+
if (length(mask$.aggregations)) {
# dplyr lets you filter on e.g. x < mean(x), but we haven't
implemented it.
# But we could, the same way it works in mutate() via join, if someone
asks.
@@ -47,19 +81,55 @@ filter.arrow_dplyr_query <- function(.data, ..., .by =
NULL, .preserve = FALSE)
call = expr
)
}
- out <- set_filters(out, filt)
- }
- if (by$from_by) {
- out$group_by_vars <- character()
+ out <- set_filters(out, filt, negate = FALSE)
}
+ }
+
+ if (by$from_by) {
+ out$group_by_vars <- character()
+ }
- out
+ out
+}
+
+filter.arrow_dplyr_query <- function(
+ .data,
+ ...,
+ .by = NULL,
+ .preserve = FALSE
+) {
+ try_arrow_dplyr({
+ apply_filter_impl(
+ .data,
+ ...,
+ .by = {{ .by }},
+ .preserve = .preserve,
+ negate = FALSE
+ )
})
}
filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <-
filter.arrow_dplyr_query
-set_filters <- function(.data, expressions) {
+filter_out.arrow_dplyr_query <- function(
+ .data,
+ ...,
+ .by = NULL,
+ .preserve = FALSE
+) {
+ try_arrow_dplyr({
+ apply_filter_impl(
+ .data,
+ ...,
+ .by = {{ .by }},
+ .preserve = .preserve,
+ negate = TRUE
+ )
+ })
+}
+filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader
<- filter_out.arrow_dplyr_query
+
+set_filters <- function(.data, expressions, negate = FALSE) {
if (length(expressions)) {
if (is_list_of(expressions, "Expression")) {
# expressions is a list of Expressions. AND them together and set them
on .data
@@ -67,7 +137,16 @@ set_filters <- function(.data, expressions) {
} else if (inherits(expressions, "Expression")) {
new_filter <- expressions
} else {
- stop("filter expressions must be either an expression or a list of
expressions", call. = FALSE)
+ stop(
+ "filter expressions must be either an expression or a list of
expressions",
+ call. = FALSE
+ )
+ }
+
+ if (isTRUE(negate)) {
+ # dplyr::filter_out() semantics: drop rows where predicate is TRUE;
+ # keep rows where predicate is FALSE or NA.
+ new_filter <- (!new_filter) | is.na(new_filter)
}
if (isTRUE(.data$filtered_rows)) {
diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R
index bbd1c91a02..9293d14c94 100644
--- a/r/R/dplyr-funcs-doc.R
+++ b/r/R/dplyr-funcs-doc.R
@@ -19,7 +19,7 @@
#' Functions available in Arrow dplyr queries
#'
-#' The `arrow` package contains methods for 37 `dplyr` table functions, many of
+#' The `arrow` package contains methods for 38 `dplyr` table functions, many of
#' which are "verbs" that do transformations to one or more tables.
#' The package also has mappings of 224 R functions to the corresponding
#' functions in the Arrow compute library. These allow you to write code inside
@@ -45,6 +45,7 @@
#' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` returns a
non-missing value if present, only returning missing values if all are missing.
#' * [`explain()`][dplyr::explain()]
#' * [`filter()`][dplyr::filter()]
+#' * [`filter_out()`][dplyr::filter_out()]
#' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored
#' * [`glimpse()`][dplyr::glimpse()]
#' * [`group_by()`][dplyr::group_by()]
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
index dcaca04d2f..ee156cc912 100644
--- a/r/man/acero.Rd
+++ b/r/man/acero.Rd
@@ -7,7 +7,7 @@
\alias{arrow-dplyr}
\title{Functions available in Arrow dplyr queries}
\description{
-The \code{arrow} package contains methods for 37 \code{dplyr} table functions,
many of
+The \code{arrow} package contains methods for 38 \code{dplyr} table functions,
many of
which are "verbs" that do transformations to one or more tables.
The package also has mappings of 224 R functions to the corresponding
functions in the Arrow compute library. These allow you to write code inside
@@ -32,6 +32,7 @@ Table into an R \code{tibble}.
\item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE}
returns a non-missing value if present, only returning missing values if all
are missing.
\item \code{\link[dplyr:explain]{explain()}}
\item \code{\link[dplyr:filter]{filter()}}
+\item \code{\link[dplyr:filter]{filter_out()}}
\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument
is ignored
\item \code{\link[dplyr:glimpse]{glimpse()}}
\item \code{\link[dplyr:group_by]{group_by()}}
@@ -198,7 +199,7 @@ Valid values are "s", "ms" (default), "us", "ns".
\itemize{
\item \code{\link[dplyr:across]{across()}}
\item \code{\link[dplyr:between]{between()}}
-\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and
\code{.size} arguments not supported
+\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype}
and \code{.size} arguments not supported
\item \code{\link[dplyr:coalesce]{coalesce()}}
\item \code{\link[dplyr:desc]{desc()}}
\item \code{\link[dplyr:across]{if_all()}}
diff --git a/r/tests/testthat/test-dplyr-filter.R
b/r/tests/testthat/test-dplyr-filter.R
index d56e25fca3..3912e518ed 100644
--- a/r/tests/testthat/test-dplyr-filter.R
+++ b/r/tests/testthat/test-dplyr-filter.R
@@ -498,3 +498,51 @@ test_that("filter() with aggregation expressions errors", {
"not supported in filter"
)
})
+
+test_that("filter_out() basic", {
+ compare_dplyr_binding(
+ .input |>
+ filter_out(chr == "b") |>
+ select(chr, int, lgl) |>
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter_out() keeps NA values in predicate result", {
+ compare_dplyr_binding(
+ .input |>
+ filter_out(lgl) |>
+ select(chr, int, lgl) |>
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter_out() with multiple conditions", {
+ compare_dplyr_binding(
+ .input |>
+ filter_out(dbl > 2, chr %in% c("d", "f")) |>
+ collect(),
+ tbl
+ )
+})
+
+test_that("More complex select/filter_out", {
+ compare_dplyr_binding(
+ .input |>
+ filter_out(dbl > 2, chr == "d" | chr == "f") |>
+ select(chr, int, lgl) |>
+ filter(int < 5) |>
+ select(int, chr) |>
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input |>
+ filter_out(!is.na(int)) |>
+ collect(),
+ tbl
+ )
+})