jonkeane commented on code in PR #49536:
URL: https://github.com/apache/arrow/pull/49536#discussion_r3045612412
##########
r/R/dplyr-funcs-conditional.R:
##########
@@ -15,6 +15,169 @@
# specific language governing permissions and limitations
# under the License.
+#' Parse logical condition formulas
+#'
+#' Converts condition ~ value formulas into Arrow expressions. Unlike
+#' [parse_value_mapping()], the LHS must be a logical expression (not a value
+#' to match against).
+#'
+#' @param formulas A list of two-sided formulas where LHS is a logical
condition
+#' and RHS is the value to use when TRUE (e.g., `x > 5 ~ "high"`).
+#' @param mask The data mask for evaluating formula expressions.
+#'
+#' @return A list with `query` (list of logical expressions) and `value`
+#' (list of replacement expressions).
+#'
+#' @keywords internal
+#' @noRd
+parse_condition_formulas <- function(formulas, mask) {
+ fn <- call_name(rlang::caller_call())
+ # Compact NULL entries (allows conditional formulas like: if (cond) x ~ y)
+ formulas <- compact(formulas)
+ n <- length(formulas)
+ query <- vector("list", n)
+ value <- vector("list", n)
+ # Process each formula: condition ~ value
+ for (i in seq_len(n)) {
+ f <- formulas[[i]]
+ if (!is_formula(f, lhs = TRUE)) {
+ validation_error(paste0("Each argument to ", fn, "() must be a two-sided
formula"))
+ }
+ # f[[2]] is LHS (logical condition), f[[3]] is RHS (value when TRUE)
+ query[[i]] <- arrow_eval(f[[2]], mask)
+ value[[i]] <- arrow_eval(f[[3]], mask)
+ # Validate LHS is logical (unlike parse_value_mapping which does equality
matching)
+ if (!call_binding("is.logical", query[[i]])) {
+ validation_error(paste0("Left side of each formula in ", fn, "() must be
a logical expression"))
+ }
+ }
+ list(query = query, value = value)
+}
+
+#' Create case_when Expression from query/value lists
+#' @param query List of logical Arrow Expressions.
+#' @param value List of value Arrow Expressions.
+#' @return An Arrow Expression representing the case_when.
+#' @keywords internal
+#' @noRd
+build_case_when_expr <- function(query, value) {
+ Expression$create(
+ "case_when",
+ args = c(
+ Expression$create(
+ "make_struct",
+ args = query,
+ options = list(field_names = as.character(seq_along(query)))
+ ),
+ value
+ )
+ )
+}
+
+#' Build a match expression for x against a value (scalar, NA, or vector).
+#' @param x Arrow Expression for the column to match against.
+#' @param match_value Value to match - R scalar, vector, or NA. Expressions
+#' are compared with equality.
+#' @return Arrow Expression that is TRUE when x matches match_value.
+#' @keywords internal
+#' @noRd
+build_match_expr <- function(x, match_value) {
+ # Expressions or length-1 non-NA: use equality directly
+ if (inherits(match_value, "Expression") || length(match_value) == 1 &&
!is.na(match_value)) {
+ return(x == match_value)
+ }
+
+ # R scalar NA requires is.na() since x == NA returns NA in Arrow
+ if (length(match_value) == 1) {
+ return(call_binding("is.na", x))
+ }
+
+ # R vector: use %in%, handling NA separately if present
+ has_na <- any(is.na(match_value))
+ non_na_values <- match_value[!is.na(match_value)]
+
+ if (length(non_na_values) == 0) {
+ call_binding("is.na", x)
+ } else if (has_na) {
+ call_binding("%in%", x, non_na_values) | call_binding("is.na", x)
+ } else {
+ call_binding("%in%", x, match_value)
+ }
+}
+
+#' Build query/value lists from parallel from/to vectors.
+#' NA values in `from` use is.na() for matching.
+#' @param x Arrow Expression for the column to match against.
+#' @param from Vector of values to match.
+#' @param to Vector of replacement values (recycled to length of `from`).
+#' @return list(query, value) for use with build_case_when_expr().
+#' @keywords internal
+#' @noRd
+parse_from_to_mapping <- function(x, from, to) {
+ n <- length(from)
+ to <- vctrs::vec_recycle(to, n)
+ query <- map(from, ~ build_match_expr(x, .x))
+ value <- map(to, Expression$scalar)
+ list(query = query, value = value)
+}
Review Comment:
Do we use this elsewhere? I'm fine to have it as a separate function but
it's a little funny to have 5 lines as a separate function that's used once,
no? On the other hand, having a description of what it's doing is nice, so
maybe it's good to keep it as is?
##########
r/tests/testthat/test-dplyr-funcs-conditional.R:
##########
@@ -296,8 +296,8 @@ test_that("case_when()", {
)
expect_arrow_eval_error(
case_when(int > 5 ~ 1, .default = c(0, 1)),
- "`.default` must have size 1, not size 2",
- class = "validation_error"
+ "`case_when\\(\\)` with vectorized `.default` not supported in Arrow",
+ class = "arrow_not_supported"
Review Comment:
I'm not totally sure I think we should make this change. The original
message is clearer to me about what needs to happen. I also don't mind that
it's a `validation_error` since that is what it is.
##########
r/tests/testthat/test-dplyr-funcs-conditional.R:
##########
@@ -599,3 +599,280 @@ test_that("when_all()", {
class = "arrow_not_supported"
)
})
+
+test_that("replace_when()", {
+ # replaces matching values, keeps original otherwise
+ compare_dplyr_binding(
+ .input |>
+ mutate(result = replace_when(int, int > 5 ~ 100L)) |>
+ collect(),
+ tbl
+ )
+
+ # multiple conditions
+ compare_dplyr_binding(
+ .input |>
+ mutate(result = replace_when(int, int > 7 ~ 100L, int < 3 ~ 0L)) |>
+ collect(),
+ tbl
+ )
+
+ # overlapping conditions - first match wins
+ compare_dplyr_binding(
+ .input |>
+ mutate(result = replace_when(int, int > 3 ~ 100L, int > 5 ~ 200L)) |>
+ collect(),
+ tbl
+ )
+
+ # no formulas returns x unchanged
+ compare_dplyr_binding(
+ .input |>
+ mutate(result = replace_when(int)) |>
+ collect(),
+ tbl
+ )
+
+ # NULL formulas are compacted out (allows conditional formulas)
Review Comment:
I see what is being tested but I don't totally understand why these are
called "NULL formulas". Maybe something like
```suggestion
# Conditions on LHS of formulas are compacted out
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]