jonkeane commented on a change in pull request #10724:
URL: https://github.com/apache/arrow/pull/10724#discussion_r682676388
##########
File path: r/R/dplyr-functions.R
##########
@@ -634,20 +634,53 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE,
week_start = getOption
}
nse_funcs$log <- function(x, base = exp(1)) {
-
+
if (base == exp(1)) {
return(Expression$create("ln_checked", x))
}
-
+
if (base == 2) {
return(Expression$create("log2_checked", x))
}
-
+
if (base == 10) {
return(Expression$create("log10_checked", x))
- }
+ }
# ARROW-13345
stop("`base` values other than exp(1), 2 and 10 not supported in Arrow",
call. = FALSE)
}
nse_funcs$logb <- nse_funcs$log
+
+nse_funcs$if_else <- function(condition, true, false, missing = NULL){
+ # We ought to assert that the types of the true and false conditions will
result
+ # in the same types. We can't compare the objects themselves directly because
+ # they might be expressions (that will result in a type) or R objects that
will
+ # need to be compared to see if they are compatible with arrow types.
+ # ARROW-13186 might make this easier with a more robust way.
+ # TODO: do this ^^^
+
+ # if_else only supports boolean, numeric, or temporal types right now
+ # TODO: remove when ARROW-12955 merges
+ # If true/false are R types, we can use `is.*` directly
+ invalid_r_types <- is.character(true) || is.character(false) ||
is.list(true) ||
+ is.list(false) || is.factor(true) || is.factor(false)
+ # However, if they are expressions, we need to use the functions from
nse_funcs
+ invalid_expression_types_true <- inherits(true, "Expression") && (
+ nse_funcs$is.character(true) || nse_funcs$is.list(true) ||
nse_funcs$is.factor(true)
+ )
+ invalid_expression_types_false <- inherits(false, "Expression") && (
+ nse_funcs$is.character(false) || nse_funcs$is.list(false) ||
nse_funcs$is.factor(false)
+ )
+ if (invalid_r_types | invalid_expression_types_true |
invalid_expression_types_false) {
+ stop("`true` and `false` character values not yet supported in Arrow",
call. = FALSE)
+ }
Review comment:
I would say that the base R version here is wrong, or at least never what
someone actually wants.
I don't think we want to get into the business of coalescing/merging the
dictionaries to be the same (there are many edge cases that can lead to very
funny outcomes). But emulating the dplyr behavior seems reasonable here: use
the levels of the first, merge the values together, and give any value that's
not in the levels of the first an NA, plus a warning that the dictionaries
didn't match.
##########
File path: r/R/dplyr-functions.R
##########
@@ -634,20 +634,53 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE,
week_start = getOption
}
nse_funcs$log <- function(x, base = exp(1)) {
-
+
if (base == exp(1)) {
return(Expression$create("ln_checked", x))
}
-
+
if (base == 2) {
return(Expression$create("log2_checked", x))
}
-
+
if (base == 10) {
return(Expression$create("log10_checked", x))
- }
+ }
# ARROW-13345
stop("`base` values other than exp(1), 2 and 10 not supported in Arrow",
call. = FALSE)
}
nse_funcs$logb <- nse_funcs$log
+
+nse_funcs$if_else <- function(condition, true, false, missing = NULL){
+ # We ought to assert that the types of the true and false conditions will
result
+ # in the same types. We can't compare the objects themselves directly because
+ # they might be expressions (that will result in a type) or R objects that
will
+ # need to be compared to see if they are compatible with arrow types.
+ # ARROW-13186 might make this easier with a more robust way.
+ # TODO: do this ^^^
+
+ # if_else only supports boolean, numeric, or temporal types right now
+ # TODO: remove when ARROW-12955 merges
+ # If true/false are R types, we can use `is.*` directly
+ invalid_r_types <- is.character(true) || is.character(false) ||
is.list(true) ||
+ is.list(false) || is.factor(true) || is.factor(false)
+ # However, if they are expressions, we need to use the functions from
nse_funcs
+ invalid_expression_types_true <- inherits(true, "Expression") && (
+ nse_funcs$is.character(true) || nse_funcs$is.list(true) ||
nse_funcs$is.factor(true)
+ )
+ invalid_expression_types_false <- inherits(false, "Expression") && (
+ nse_funcs$is.character(false) || nse_funcs$is.list(false) ||
nse_funcs$is.factor(false)
+ )
+ if (invalid_r_types | invalid_expression_types_true |
invalid_expression_types_false) {
+ stop("`true` and `false` character values not yet supported in Arrow",
call. = FALSE)
+ }
Review comment:
Aaah, ok. Would it be possible to do something like this, then: use the
levels of the first, merge the values together, and error on any value that's
not in the levels of the first? (There we could direct the person to either
re-encode the dictionaries or `NULL` the offending values.)
In my experience it's not uncommon to have a circumstance where you filter
down to rows that have values that overlap (even though the full dictionaries
are different), and forcing someone to re-encode there, when no offending value
would ever be present, could be a bit frustrating.
##########
File path: r/R/dplyr-functions.R
##########
@@ -634,20 +634,53 @@ nse_funcs$wday <- function(x, label = FALSE, abbr = TRUE,
week_start = getOption
}
nse_funcs$log <- function(x, base = exp(1)) {
-
+
if (base == exp(1)) {
return(Expression$create("ln_checked", x))
}
-
+
if (base == 2) {
return(Expression$create("log2_checked", x))
}
-
+
if (base == 10) {
return(Expression$create("log10_checked", x))
- }
+ }
# ARROW-13345
stop("`base` values other than exp(1), 2 and 10 not supported in Arrow",
call. = FALSE)
}
nse_funcs$logb <- nse_funcs$log
+
+nse_funcs$if_else <- function(condition, true, false, missing = NULL){
+ # We ought to assert that the types of the true and false conditions will
result
+ # in the same types. We can't compare the objects themselves directly because
+ # they might be expressions (that will result in a type) or R objects that
will
+ # need to be compared to see if they are compatible with arrow types.
+ # ARROW-13186 might make this easier with a more robust way.
+ # TODO: do this ^^^
+
+ # if_else only supports boolean, numeric, or temporal types right now
+ # TODO: remove when ARROW-12955 merges
+ # If true/false are R types, we can use `is.*` directly
+ invalid_r_types <- is.character(true) || is.character(false) ||
is.list(true) ||
+ is.list(false) || is.factor(true) || is.factor(false)
+ # However, if they are expressions, we need to use the functions from
nse_funcs
+ invalid_expression_types_true <- inherits(true, "Expression") && (
+ nse_funcs$is.character(true) || nse_funcs$is.list(true) ||
nse_funcs$is.factor(true)
+ )
+ invalid_expression_types_false <- inherits(false, "Expression") && (
+ nse_funcs$is.character(false) || nse_funcs$is.list(false) ||
nse_funcs$is.factor(false)
+ )
+ if (invalid_r_types | invalid_expression_types_true |
invalid_expression_types_false) {
+ stop("`true` and `false` character values not yet supported in Arrow",
call. = FALSE)
+ }
Review comment:
That sounds great, because null-encoding is basically the only other
safe option and is also something people commonly want.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]