This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new d052ccb0 feat(r/sedonadb): Improve DataFrame API for R bindings (#651)
d052ccb0 is described below
commit d052ccb0ab84ce08a36386e93ecd9bd4d708f909
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Feb 23 14:16:59 2026 -0600
feat(r/sedonadb): Improve DataFrame API for R bindings (#651)
Co-authored-by: Copilot <[email protected]>
---
r/sedonadb/NAMESPACE | 10 ++
r/sedonadb/R/000-wrappers.R | 44 ++++++
r/sedonadb/R/dataframe.R | 183 ++++++++++++++++++++++++-
r/sedonadb/R/expression.R | 155 ++++++++++++++++++++-
r/sedonadb/bootstrap.R | 4 +
r/sedonadb/man/dot-fns.Rd | 18 +++
r/sedonadb/man/sd_arrange.Rd | 26 ++++
r/sedonadb/man/sd_expr_column.Rd | 24 +++-
r/sedonadb/man/sd_group_by.Rd | 32 +++++
r/sedonadb/man/sd_summarise.Rd | 28 ++++
r/sedonadb/src/init.c | 36 +++++
r/sedonadb/src/rust/api.h | 9 ++
r/sedonadb/src/rust/src/context.rs | 12 +-
r/sedonadb/src/rust/src/dataframe.rs | 28 ++++
r/sedonadb/src/rust/src/expression.rs | 23 +++-
r/sedonadb/tests/testthat/_snaps/dataframe.md | 28 ++++
r/sedonadb/tests/testthat/_snaps/expression.md | 74 ++++++++++
r/sedonadb/tests/testthat/test-context.R | 5 +
r/sedonadb/tests/testthat/test-dataframe.R | 73 +++++++++-
r/sedonadb/tests/testthat/test-expression.R | 36 +++++
20 files changed, 836 insertions(+), 12 deletions(-)
diff --git a/r/sedonadb/NAMESPACE b/r/sedonadb/NAMESPACE
index a7f631bf..c5c3be0f 100644
--- a/r/sedonadb/NAMESPACE
+++ b/r/sedonadb/NAMESPACE
@@ -2,6 +2,7 @@
S3method("$<-",savvy_sedonadb__sealed)
S3method("[[<-",savvy_sedonadb__sealed)
+S3method(.DollarNames,sedonadb_fns)
S3method(as.data.frame,sedonadb_dataframe)
S3method(as_nanoarrow_array_stream,sedonadb_dataframe)
S3method(as_sedonadb_dataframe,data.frame)
@@ -21,16 +22,19 @@ S3method(dim,sedonadb_dataframe)
S3method(dimnames,sedonadb_dataframe)
S3method(head,sedonadb_dataframe)
S3method(infer_nanoarrow_schema,sedonadb_dataframe)
+S3method(names,sedonadb_fns)
S3method(print,"sedonadb::InternalContext__bundle")
S3method(print,"sedonadb::InternalDataFrame__bundle")
S3method(print,"sedonadb::SedonaDBExprFactory__bundle")
S3method(print,"sedonadb::SedonaDBExpr__bundle")
S3method(print,SedonaDBExpr)
S3method(print,sedonadb_dataframe)
+export(.fns)
export(as_sd_expr)
export(as_sedonadb_dataframe)
export(as_sedonadb_literal)
export(is_sd_expr)
+export(sd_arrange)
export(sd_collect)
export(sd_compute)
export(sd_configure_proj)
@@ -38,6 +42,7 @@ export(sd_count)
export(sd_drop_view)
export(sd_expr_aggregate_function)
export(sd_expr_alias)
+export(sd_expr_any_function)
export(sd_expr_binary)
export(sd_expr_cast)
export(sd_expr_column)
@@ -46,17 +51,22 @@ export(sd_expr_literal)
export(sd_expr_negative)
export(sd_expr_scalar_function)
export(sd_filter)
+export(sd_group_by)
export(sd_preview)
export(sd_read_parquet)
export(sd_register_udf)
export(sd_select)
export(sd_sql)
+export(sd_summarise)
+export(sd_summarize)
export(sd_to_view)
export(sd_transmute)
+export(sd_ungroup)
export(sd_view)
export(sd_write_parquet)
export(sedonadb_adbc)
importFrom(nanoarrow,as_nanoarrow_array_stream)
importFrom(nanoarrow,infer_nanoarrow_schema)
+importFrom(utils,.DollarNames)
importFrom(utils,head)
useDynLib(sedonadb, .registration = TRUE)
diff --git a/r/sedonadb/R/000-wrappers.R b/r/sedonadb/R/000-wrappers.R
index 40bb9263..dbe4a405 100644
--- a/r/sedonadb/R/000-wrappers.R
+++ b/r/sedonadb/R/000-wrappers.R
@@ -107,6 +107,12 @@ NULL
}
}
+`InternalContext_list_functions` <- function(self) {
+ function() {
+ .Call(savvy_InternalContext_list_functions__impl, `self`)
+ }
+}
+
`InternalContext_read_parquet` <- function(self) {
function(`paths`) {
.savvy_wrap_InternalDataFrame(.Call(
@@ -157,6 +163,7 @@ NULL
ptr
)
e$`deregister_table` <- `InternalContext_deregister_table`(ptr)
+ e$`list_functions` <- `InternalContext_list_functions`(ptr)
e$`read_parquet` <- `InternalContext_read_parquet`(ptr)
e$`register_scalar_udf` <- `InternalContext_register_scalar_udf`(ptr)
e$`scalar_udf_xptr` <- `InternalContext_scalar_udf_xptr`(ptr)
@@ -189,6 +196,28 @@ class(`InternalContext`) <- c(
### wrapper functions for InternalDataFrame
+`InternalDataFrame_aggregate` <- function(self) {
+ function(`group_by_exprs_sexp`, `exprs_sexp`) {
+ .savvy_wrap_InternalDataFrame(.Call(
+ savvy_InternalDataFrame_aggregate__impl,
+ `self`,
+ `group_by_exprs_sexp`,
+ `exprs_sexp`
+ ))
+ }
+}
+
+`InternalDataFrame_arrange` <- function(self) {
+ function(`exprs_sexp`, `is_descending_sexp`) {
+ .savvy_wrap_InternalDataFrame(.Call(
+ savvy_InternalDataFrame_arrange__impl,
+ `self`,
+ `exprs_sexp`,
+ `is_descending_sexp`
+ ))
+ }
+}
+
`InternalDataFrame_collect` <- function(self) {
function(`out`) {
.Call(savvy_InternalDataFrame_collect__impl, `self`, `out`)
@@ -333,6 +362,8 @@ class(`InternalContext`) <- c(
`.savvy_wrap_InternalDataFrame` <- function(ptr) {
e <- new.env(parent = emptyenv())
e$.ptr <- ptr
+ e$`aggregate` <- `InternalDataFrame_aggregate`(ptr)
+ e$`arrange` <- `InternalDataFrame_arrange`(ptr)
e$`collect` <- `InternalDataFrame_collect`(ptr)
e$`compute` <- `InternalDataFrame_compute`(ptr)
e$`count` <- `InternalDataFrame_count`(ptr)
@@ -445,6 +476,18 @@ class(`SedonaDBExpr`) <-
c("sedonadb::SedonaDBExpr__bundle", "savvy_sedonadb__se
}
}
+`SedonaDBExprFactory_any_function` <- function(self) {
+ function(`name`, `args`, `na_rm` = NULL) {
+ .savvy_wrap_SedonaDBExpr(.Call(
+ savvy_SedonaDBExprFactory_any_function__impl,
+ `self`,
+ `name`,
+ `args`,
+ `na_rm`
+ ))
+ }
+}
+
`SedonaDBExprFactory_binary` <- function(self) {
function(`op`, `lhs`, `rhs`) {
`lhs` <- .savvy_extract_ptr(`lhs`, "sedonadb::SedonaDBExpr")
@@ -485,6 +528,7 @@ class(`SedonaDBExpr`) <-
c("sedonadb::SedonaDBExpr__bundle", "savvy_sedonadb__se
e <- new.env(parent = emptyenv())
e$.ptr <- ptr
e$`aggregate_function` <- `SedonaDBExprFactory_aggregate_function`(ptr)
+ e$`any_function` <- `SedonaDBExprFactory_any_function`(ptr)
e$`binary` <- `SedonaDBExprFactory_binary`(ptr)
e$`column` <- `SedonaDBExprFactory_column`(ptr)
e$`scalar_function` <- `SedonaDBExprFactory_scalar_function`(ptr)
diff --git a/r/sedonadb/R/dataframe.R b/r/sedonadb/R/dataframe.R
index 58a47bf9..4bb4456d 100644
--- a/r/sedonadb/R/dataframe.R
+++ b/r/sedonadb/R/dataframe.R
@@ -188,6 +188,27 @@ sd_preview <- function(.data, n = NULL, ascii = NULL,
width = NULL) {
ascii = ascii
)
+ schema <- nanoarrow::infer_nanoarrow_schema(.data)
+ if (is.null(.data$group_by)) {
+ grouped_label <- ""
+ grouped_vars <- ""
+ } else {
+ grouped_label <- "grouped "
+ grouped_vars <- sprintf(
+ " | [%s]",
+ paste0("`", names(.data$group_by), "`", collapse = ", ")
+ )
+ }
+
+ cat(
+ sprintf(
+ "<%ssedonab_dataframe: NA x %d%s>\n",
+ grouped_label,
+ length(schema$children),
+ grouped_vars
+ )
+ )
+
cat(content)
cat(paste0("Preview of up to ", n, " row(s)\n"))
@@ -207,6 +228,10 @@ sd_preview <- function(.data, n = NULL, ascii = NULL,
width = NULL) {
#'
sd_select <- function(.data, ...) {
.data <- as_sedonadb_dataframe(.data)
+ if (!is.null(.data$group_by)) {
+ stop("sd_select() does not support grouped input")
+ }
+
schema <- nanoarrow::infer_nanoarrow_schema(.data)
ptype <- nanoarrow::infer_nanoarrow_ptype(schema)
loc <- tidyselect::eval_select(rlang::expr(c(...)), data = ptype)
@@ -231,10 +256,14 @@ sd_select <- function(.data, ...) {
#'
sd_transmute <- function(.data, ...) {
.data <- as_sedonadb_dataframe(.data)
+ if (!is.null(.data$group_by)) {
+ stop("sd_transmute() does not support grouped input")
+ }
+
expr_quos <- rlang::enquos(...)
env <- parent.frame()
- expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), env)
+ expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), env, ctx = .data$ctx)
r_exprs <- expr_quos |> rlang::quos_auto_name() |>
lapply(rlang::quo_get_expr)
sd_exprs <- lapply(r_exprs, sd_eval_expr, expr_ctx = expr_ctx, env = env)
@@ -266,12 +295,16 @@ sd_transmute <- function(.data, ...) {
#'
sd_filter <- function(.data, ...) {
.data <- as_sedonadb_dataframe(.data)
+ if (!is.null(.data$group_by)) {
+ stop("sd_filter() does not support grouped input")
+ }
+
rlang::check_dots_unnamed()
expr_quos <- rlang::enquos(...)
env <- parent.frame()
- expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), env)
+ expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), env, ctx = .data$ctx)
r_exprs <- expr_quos |> lapply(rlang::quo_get_expr)
sd_exprs <- lapply(r_exprs, sd_eval_expr, expr_ctx = expr_ctx, env = env)
@@ -279,6 +312,138 @@ sd_filter <- function(.data, ...) {
new_sedonadb_dataframe(.data$ctx, df)
}
+#' Order rows of a SedonaDB data frame using column values
+#'
+#' @inheritParams sd_count
+#' @param ... Unnamed expressions to sort by. These are evaluated in the same
+#'   way as in [dplyr::arrange()], and wrapping an expression in `desc()`
+#'   sorts it in descending order. Extra dplyr features such as `across()`,
+#'   `.by_group`, or `.locale` are not supported.
+#'
+#' @returns An object of class sedonadb_dataframe
+#' @export
+#'
+#' @examples
+#' data.frame(x = c(10:1, NA)) |> sd_arrange(x)
+#' data.frame(x = c(1:10, NA)) |> sd_arrange(desc(x))
+#'
+sd_arrange <- function(.data, ...) {
+  .data <- as_sedonadb_dataframe(.data)
+  if (!is.null(.data$group_by)) {
+    stop("sd_arrange() does not support grouped input")
+  }
+
+  rlang::check_dots_unnamed()
+
+  sort_quos <- rlang::enquos(...)
+  caller_env <- parent.frame()
+
+  expr_ctx <- sd_expr_ctx(
+    infer_nanoarrow_schema(.data),
+    caller_env,
+    ctx = .data$ctx
+  )
+
+  # desc() only marks the sort direction, so it is stripped from each
+  # expression (and remembered) before the expressions are evaluated
+  sort_spec <- unwrap_desc(lapply(sort_quos, rlang::quo_get_expr))
+
+  sort_exprs <- lapply(
+    sort_spec$inner_exprs,
+    function(r_expr) {
+      sd_eval_expr(r_expr, expr_ctx = expr_ctx, env = caller_env)
+    }
+  )
+
+  sorted_df <- .data$df$arrange(sort_exprs, sort_spec$is_descending)
+  new_sedonadb_dataframe(.data$ctx, sorted_df)
+}
+
+#' Group SedonaDB DataFrames by one or more expressions
+#'
+#' Note that unlike [dplyr::group_by()], these groups are dropped after
+#' any transformations.
+#'
+#' @inheritParams sd_count
+#' @param ... Expressions whose unique combinations will be used as groups
+#'   when a later aggregate expression is computed. Unnamed expressions are
+#'   named automatically. These are evaluated in the same way as in
+#'   [dplyr::group_by()], except that neither `.add` nor `.drop` is
+#'   supported.
+#'
+#' @returns An object of class sedonadb_dataframe
+#' @export
+#'
+#' @examples
+#' data.frame(letter = c(rep("a", 3), rep("b", 4), rep("c", 3)), x = 1:10) |>
+#'   sd_group_by(letter) |>
+#'   sd_summarise(x = sum(x))
+#'
+sd_group_by <- function(.data, ...) {
+  .data <- as_sedonadb_dataframe(.data)
+  group_quos <- rlang::enquos(...)
+  caller_env <- parent.frame()
+
+  expr_ctx <- sd_expr_ctx(
+    infer_nanoarrow_schema(.data),
+    caller_env,
+    ctx = .data$ctx
+  )
+  r_exprs <- lapply(rlang::quos_auto_name(group_quos), rlang::quo_get_expr)
+  sd_exprs <- lapply(
+    r_exprs,
+    function(r_expr) {
+      sd_eval_expr(r_expr, expr_ctx = expr_ctx, env = caller_env)
+    }
+  )
+
+  # Alias every named expression so that the resulting column carries the
+  # name the user expects
+  col_names <- names(r_exprs)
+  needs_alias <- !is.na(col_names) & col_names != ""
+  for (i in which(needs_alias)) {
+    sd_exprs[[i]] <- sd_expr_alias(
+      sd_exprs[[i]],
+      col_names[[i]],
+      expr_ctx$factory
+    )
+  }
+
+  new_sedonadb_dataframe(.data$ctx, .data$df, group_by = sd_exprs)
+}
+
+#' @rdname sd_group_by
+#' @export
+sd_ungroup <- function(.data) {
+  ungrouped <- as_sedonadb_dataframe(.data)
+  # Clearing the stored grouping expressions is all that is needed: the
+  # underlying data frame is not modified by sd_group_by()
+  ungrouped$group_by <- NULL
+  ungrouped
+}
+
+#' Aggregate SedonaDB DataFrames to a single row per group
+#'
+#' @inheritParams sd_count
+#' @param ... Aggregate expressions. These are evaluated in the same way as
+#'   in [dplyr::summarise()], except that the outer expression must be an
+#'   aggregate expression (e.g., `sum(x) + 1` is not currently possible).
+#'
+#' @returns An object of class sedonadb_dataframe
+#' @export
+#'
+#' @examples
+#' data.frame(x = c(10:1, NA)) |> sd_summarise(x = sum(x, na.rm = TRUE))
+#'
+sd_summarise <- function(.data, ...) {
+  .data <- as_sedonadb_dataframe(.data)
+
+  agg_quos <- rlang::enquos(...)
+  caller_env <- parent.frame()
+
+  expr_ctx <- sd_expr_ctx(
+    infer_nanoarrow_schema(.data),
+    caller_env,
+    ctx = .data$ctx
+  )
+  r_exprs <- lapply(rlang::quos_auto_name(agg_quos), rlang::quo_get_expr)
+  sd_exprs <- lapply(
+    r_exprs,
+    function(r_expr) {
+      sd_eval_expr(r_expr, expr_ctx = expr_ctx, env = caller_env)
+    }
+  )
+
+  # Alias every named expression so that the resulting column carries the
+  # name the user expects
+  col_names <- names(r_exprs)
+  needs_alias <- !is.na(col_names) & col_names != ""
+  for (i in which(needs_alias)) {
+    sd_exprs[[i]] <- sd_expr_alias(
+      sd_exprs[[i]],
+      col_names[[i]],
+      expr_ctx$factory
+    )
+  }
+
+  # as.list() turns the NULL of an ungrouped data frame into an empty list
+  aggregated <- .data$df$aggregate(as.list(.data$group_by), sd_exprs)
+  new_sedonadb_dataframe(.data$ctx, aggregated)
+}
+
+# Defined as a direct alias rather than a forwarding wrapper: sd_summarise()
+# evaluates its expressions in parent.frame(), and a wrapper would make that
+# the wrapper's own frame, hiding local variables of the actual caller.
+#' @rdname sd_summarise
+#' @export
+sd_summarize <- sd_summarise
+
#' Write DataFrame to (Geo)Parquet files
#'
#' Write this DataFrame to one or more (Geo)Parquet files. For input that
contains
@@ -333,6 +498,9 @@ sd_write_parquet <- function(
overwrite_bbox_columns = FALSE
) {
.data <- as_sedonadb_dataframe(.data)
+ if (!is.null(.data$group_by)) {
+ stop("sd_write_parquet() does not support grouped input")
+ }
# Determine single_file_output default based on path and partition_by
if (is.null(single_file_output)) {
@@ -360,8 +528,15 @@ sd_write_parquet <- function(
invisible(.data)
}
-new_sedonadb_dataframe <- function(ctx, internal_df) {
- structure(list(ctx = ctx, df = internal_df), class = "sedonadb_dataframe")
+# Low-level constructor for sedonadb_dataframe objects. `ctx` and
+# `internal_df` are stored as given; `group_by` holds the grouping
+# expressions, with an empty or missing grouping stored as NULL.
+new_sedonadb_dataframe <- function(ctx, internal_df, ..., group_by = NULL) {
+  groups <- if (length(group_by) > 0) group_by else NULL
+
+  out <- list(ctx = ctx, df = internal_df, group_by = groups)
+  class(out) <- "sedonadb_dataframe"
+  out
+}
#' @importFrom utils head
diff --git a/r/sedonadb/R/expression.R b/r/sedonadb/R/expression.R
index 0d97c898..223106c0 100644
--- a/r/sedonadb/R/expression.R
+++ b/r/sedonadb/R/expression.R
@@ -30,6 +30,14 @@
#' @param lhs,rhs Arguments to a binary expression
#' @param factory A [sd_expr_factory()]. This factory wraps a SedonaDB context
#' and is used to resolve scalar functions and/or retrieve options.
+#' @param ctx A SedonaDB context or NULL to use the default context.
+#' @param args A list of SedonaDBExpr or object coercible to one with
+#' [as_sd_expr()].
+#' @param na.rm For aggregate expressions, should nulls be ignored? The R
+#' idiom is to respect null; however, the SQL idiom is to drop them. The
+#' default value follows the R idiom (`na.rm = FALSE`).
+#' @param distinct For aggregate expressions, use only distinct values.
+#' @param ... Reserved for future use
#'
#' @returns An object of class SedonaDBExpr
#' @export
@@ -63,6 +71,19 @@ sd_expr_negative <- function(expr, factory =
sd_expr_factory()) {
as_sd_expr(expr, factory = factory)$negate()
}
+#' @rdname sd_expr_column
+#' @export
+sd_expr_any_function <- function(
+  function_name,
+  args,
+  ...,
+  na.rm = NULL, # nolint: object_name_linter
+  factory = sd_expr_factory()
+) {
+  # Coerce every argument to a SedonaDBExpr before handing the call to the
+  # factory, which looks the function up by name
+  sd_args <- lapply(args, function(arg) as_sd_expr(arg, factory = factory))
+  factory$any_function(function_name, sd_args, na_rm = na.rm)
+}
+
#' @rdname sd_expr_column
#' @export
sd_expr_scalar_function <- function(function_name, args, factory =
sd_expr_factory()) {
@@ -122,8 +143,12 @@ is_sd_expr <- function(x) {
#' @rdname sd_expr_column
#' @export
-sd_expr_factory <- function() {
- SedonaDBExprFactory$new(ctx())
+sd_expr_factory <- function(ctx = NULL) {
+  # The call ctx() still reaches the ctx() function even though the argument
+  # of the same name is NULL here: R skips non-function bindings when it
+  # looks up the target of a call
+  resolved_ctx <- if (is.null(ctx)) ctx() else ctx
+  SedonaDBExprFactory$new(resolved_ctx)
+}
#' @export
@@ -134,6 +159,30 @@ print.SedonaDBExpr <- function(x, ...) {
invisible(x)
}
+#' SedonaDB Functions
+#'
+#' An escape hatch for calling a SedonaDB/DataFusion function directly by
+#' name (e.g., `.fns$some_function(x)` inside an expression) when its
+#' translation is not yet registered or is otherwise misbehaving.
+#'
+#' @export .fns
+.fns <- structure(list(), class = "sedonadb_fns")
+
+# For IDE autocomplete: reports the function names listed by the context
+# returned from ctx()
+#' @export
+names.sedonadb_fns <- function(x) {
+  ctx()$list_functions()
+}
+
+# nolint start: object_name_linter
+#' @importFrom utils .DollarNames
+#' @export
+.DollarNames.sedonadb_fns <- function(x, pattern = "") {
+  # Completion candidates are the available function names matching the
+  # regular expression supplied by the completion machinery
+  fn_names <- names(x)
+  grep(pattern, fn_names, value = TRUE)
+}
+# nolint end
+
#' Evaluate an R expression into a SedonaDB expression
#'
#' @param expr An R expression (e.g., the result of `quote()`).
@@ -161,6 +210,13 @@ sd_eval_expr <- function(expr, expr_ctx = sd_expr_ctx(env
= env), env = parent.f
sd_eval_expr_inner <- function(expr, expr_ctx) {
if (rlang::is_call(expr)) {
+ # Special syntax for the escape hatch of "just call a DataFusion function"
is
+ # the expression .fns$datafusion_fn_name(arg1, arg2)
+ if (rlang::is_call(expr[[1]], "$") && rlang::is_symbol(expr[[1]][[2]],
".fns")) {
+ fn_key <- as.character(expr[[1]][[3]])
+ return(sd_eval_datafusion_fn(fn_key, expr, expr_ctx))
+ }
+
# Extract `pkg::fun` or `fun` if this is a usual call (e.g., not
# something fancy like `fun()()`)
call_name <- rlang::call_name(expr)
@@ -179,6 +235,28 @@ sd_eval_expr_inner <- function(expr, expr_ctx) {
}
}
+# Evaluate a `.fns$fn_key(...)` call: every argument of `expr` is evaluated
+# with sd_eval_expr_inner() and the results are forwarded to the function
+# named `fn_key`. `na.rm` is the only named argument accepted.
+sd_eval_datafusion_fn <- function(fn_key, expr, expr_ctx) {
+  fn_args <- lapply(
+    expr[-1],
+    function(arg) sd_eval_expr_inner(arg, expr_ctx = expr_ctx)
+  )
+
+  # na.rm is passed separately from the positional arguments
+  na_rm <- fn_args$na.rm
+  fn_args$na.rm <- NULL
+
+  is_named <- rlang::have_name(fn_args)
+  if (any(is_named)) {
+    stop(
+      sprintf(
+        "Expected unnamed arguments to SedonaDB SQL function but got %s",
+        paste(names(fn_args)[is_named], collapse = ", ")
+      )
+    )
+  }
+
+  sd_expr_any_function(
+    fn_key,
+    fn_args,
+    na.rm = na_rm,
+    factory = expr_ctx$factory
+  )
+}
+
sd_eval_translation <- function(fn_key, expr, expr_ctx) {
# Replace the function with the translation in such a way that
# any error resulting from the call doesn't have an absolute garbage error
@@ -200,6 +278,26 @@ sd_eval_default <- function(expr, expr_ctx) {
rlang::eval_tidy(expr, data = expr_ctx$data, env = expr_ctx$env)
}
+# Needed for sd_arrange(), where wrapping an expression in desc() is how a
+# descending sort order is specified. Splits a list of R expressions into
+# `inner_exprs` (each expression with an outer desc() or dplyr::desc()
+# removed) and `is_descending` (a parallel logical vector that is TRUE where
+# the expression was wrapped). Both always have the same length as `exprs`.
+unwrap_desc <- function(exprs) {
+  inner_exprs <- vector("list", length(exprs))
+  is_descending <- logical(length(exprs))
+
+  for (i in seq_along(exprs)) {
+    expr <- exprs[[i]]
+
+    # ns = c("", "dplyr") matches desc() and dplyr::desc() only; with the
+    # default ns = NULL a desc() from any other namespace would match too
+    if (rlang::is_call(expr, "desc", ns = c("", "dplyr"))) {
+      if (length(expr) != 2L) {
+        stop("desc() must be called with exactly one argument")
+      }
+
+      expr <- expr[[2]]
+      is_descending[i] <- TRUE
+    }
+
+    # Assign through `[<-` with a list: `inner_exprs[[i]] <- NULL` would
+    # delete the slot and leave the result out of step with is_descending
+    inner_exprs[i] <- list(expr)
+  }
+
+  list(inner_exprs = inner_exprs, is_descending = is_descending)
+}
+
#' Expression evaluation context
#'
#' A context to use for evaluating a set of related R expressions into
@@ -210,10 +308,12 @@ sd_eval_default <- function(expr, expr_ctx) {
#' [nanoarrow::as_nanoarrow_schema()]. This is used to create the data mask
#' for expressions.
#' @param env The expression environment. This is needed to evaluate
expressions.
+#' @param ctx A SedonaDB context whose function registry should be used to
resolve
+#' functions.
#'
#' @return An object of class sedonadb_expr_ctx
#' @noRd
-sd_expr_ctx <- function(schema = NULL, env = parent.frame()) {
+sd_expr_ctx <- function(schema = NULL, env = parent.frame(), ctx = NULL) {
if (is.null(schema)) {
schema <- nanoarrow::na_struct()
}
@@ -225,7 +325,7 @@ sd_expr_ctx <- function(schema = NULL, env =
parent.frame()) {
structure(
list(
- factory = sd_expr_factory(),
+ factory = sd_expr_factory(ctx = ctx),
schema = schema,
data = rlang::as_data_mask(data),
env = env,
@@ -256,6 +356,43 @@ sd_register_translation <- function(qualified_name, fn) {
invisible(fn)
}
+#' Register a translation that always forwards its arguments to DataFusion
+#'
+#' The registered translation passes its arguments, which must be unnamed
+#' apart from `na.rm`, to the function called `fn_name`.
+#'
+#' @param fn_name The name of the function
+#' @returns The registered translation function, invisibly
+#' @noRd
+sd_register_datafusion_fn <- function(fn_name) {
+  # The closure below must capture the value of fn_name at registration time
+  force(fn_name)
+
+  forward_call <- function(.ctx, ...) {
+    fn_args <- list(...)
+
+    # na.rm is passed separately from the positional arguments
+    na_rm <- fn_args$na.rm
+    fn_args$na.rm <- NULL
+
+    is_named <- rlang::have_name(fn_args)
+    if (any(is_named)) {
+      stop(
+        sprintf(
+          "Expected unnamed arguments to SedonaDB SQL function but got %s",
+          paste(names(fn_args)[is_named], collapse = ", ")
+        )
+      )
+    }
+
+    sd_expr_any_function(
+      fn_name,
+      fn_args,
+      na.rm = na_rm,
+      factory = .ctx$factory
+    )
+  }
+
+  default_fns[[fn_name]] <- forward_call
+  invisible(forward_call)
+}
+
default_fns <- new.env(parent = emptyenv())
# Register translations lazily because SQL users don't need them and because
@@ -265,6 +402,12 @@ ensure_translations_registered <- function() {
return()
}
+ # Register default translations for our st_, sd_, and rs_ functions
+ fn_names <- utils::.DollarNames(.fns, "^(st|rs|sd)_")
+ for (fn_name in fn_names) {
+ sd_register_datafusion_fn(fn_name)
+ }
+
sd_register_translation("base::abs", function(.ctx, x) {
sd_expr_scalar_function("abs", list(x), factory = .ctx$factory)
})
@@ -303,4 +446,8 @@ ensure_translations_registered <- function() {
})
)
}
+
+ sd_register_translation("dplyr::n", function(.ctx) {
+ sd_expr_aggregate_function("count", list(1L), na.rm = FALSE, factory =
.ctx$factory)
+ })
}
diff --git a/r/sedonadb/bootstrap.R b/r/sedonadb/bootstrap.R
index 80e36ea2..d1974f24 100644
--- a/r/sedonadb/bootstrap.R
+++ b/r/sedonadb/bootstrap.R
@@ -40,6 +40,9 @@ file.copy(
recursive = TRUE
)
+# Remove unused libgpuspatial crate
+unlink("src/c/sedona-libgpuspatial", recursive = TRUE)
+
# Tweak workspace Cargo.toml
top_cargo_toml <- "src/Cargo.toml"
lines <- readLines(top_cargo_toml)
@@ -48,4 +51,5 @@ lines <- gsub("r/sedonadb/src/rust", "rust", lines, fixed =
TRUE)
# remove unnecessary workspace members
lines <- gsub('"python/sedonadb",', "", lines, fixed = TRUE)
lines <- gsub('"sedona-cli",', "", lines, fixed = TRUE)
+lines <- gsub('"c/sedona-libgpuspatial",', "", lines, fixed = TRUE)
writeLines(lines, top_cargo_toml)
diff --git a/r/sedonadb/man/dot-fns.Rd b/r/sedonadb/man/dot-fns.Rd
new file mode 100644
index 00000000..4912b449
--- /dev/null
+++ b/r/sedonadb/man/dot-fns.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expression.R
+\docType{data}
+\name{.fns}
+\alias{.fns}
+\title{SedonaDB Functions}
+\format{
+An object of class \code{sedonadb_fns} of length 0.
+}
+\usage{
+.fns
+}
+\description{
+This object is an escape hatch for calling SedonaDB/DataFusion functions
+directly for translations that are not yet registered or are otherwise
+misbehaving.
+}
+\keyword{datasets}
diff --git a/r/sedonadb/man/sd_arrange.Rd b/r/sedonadb/man/sd_arrange.Rd
new file mode 100644
index 00000000..cb43e964
--- /dev/null
+++ b/r/sedonadb/man/sd_arrange.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe.R
+\name{sd_arrange}
+\alias{sd_arrange}
+\title{Order rows of a SedonaDB data frame using column values}
+\usage{
+sd_arrange(.data, ...)
+}
+\arguments{
+\item{.data}{A sedonadb_dataframe or an object that can be coerced to one.}
+
+\item{...}{Unnamed expressions for arrange expressions. These are evaluated
+in the same way as \code{\link[dplyr:arrange]{dplyr::arrange()}} except does
not support extra
+dplyr features such as \code{across()}, \code{.by_group}, or \code{.locale}.}
+}
+\value{
+An object of class sedonadb_dataframe
+}
+\description{
+Order rows of a SedonaDB data frame using column values
+}
+\examples{
+data.frame(x = c(10:1, NA)) |> sd_arrange(x)
+data.frame(x = c(1:10, NA)) |> sd_arrange(desc(x))
+
+}
diff --git a/r/sedonadb/man/sd_expr_column.Rd b/r/sedonadb/man/sd_expr_column.Rd
index 50c41a89..f3a6a521 100644
--- a/r/sedonadb/man/sd_expr_column.Rd
+++ b/r/sedonadb/man/sd_expr_column.Rd
@@ -5,6 +5,7 @@
\alias{sd_expr_literal}
\alias{sd_expr_binary}
\alias{sd_expr_negative}
+\alias{sd_expr_any_function}
\alias{sd_expr_scalar_function}
\alias{sd_expr_aggregate_function}
\alias{sd_expr_cast}
@@ -22,6 +23,14 @@ sd_expr_binary(op, lhs, rhs, factory = sd_expr_factory())
sd_expr_negative(expr, factory = sd_expr_factory())
+sd_expr_any_function(
+ function_name,
+ args,
+ ...,
+ na.rm = NULL,
+ factory = sd_expr_factory()
+)
+
sd_expr_scalar_function(function_name, args, factory = sd_expr_factory())
sd_expr_aggregate_function(
@@ -41,7 +50,7 @@ as_sd_expr(x, factory = sd_expr_factory())
is_sd_expr(x)
-sd_expr_factory()
+sd_expr_factory(ctx = NULL)
}
\arguments{
\item{column_name}{A column name}
@@ -64,7 +73,20 @@ R function names (e.g., \code{>}, \code{<}, \code{+},
\code{-}).}
\item{function_name}{The name of the function to call. This name is resolved
from the context associated with \code{factory}.}
+\item{args}{A list of SedonaDBExpr or object coercible to one with
+\code{\link[=as_sd_expr]{as_sd_expr()}}.}
+
+\item{...}{Reserved for future use}
+
+\item{na.rm}{For aggregate expressions, should nulls be ignored? The R
+idiom is to respect null; however, the SQL idiom is to drop them. The
+default value follows the R idiom (\code{na.rm = FALSE}).}
+
+\item{distinct}{For aggregate expressions, use only distinct values.}
+
\item{alias}{An alias to apply to \code{expr}.}
+
+\item{ctx}{A SedonaDB context or NULL to use the default context.}
}
\value{
An object of class SedonaDBExpr
diff --git a/r/sedonadb/man/sd_group_by.Rd b/r/sedonadb/man/sd_group_by.Rd
new file mode 100644
index 00000000..95ac2200
--- /dev/null
+++ b/r/sedonadb/man/sd_group_by.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe.R
+\name{sd_group_by}
+\alias{sd_group_by}
+\alias{sd_ungroup}
+\title{Group SedonaDB DataFrames by one or more expressions}
+\usage{
+sd_group_by(.data, ...)
+
+sd_ungroup(.data)
+}
+\arguments{
+\item{.data}{A sedonadb_dataframe or an object that can be coerced to one.}
+
+\item{...}{Named expressions whose unique combination will be used as
+groups to potentially compute a future aggregate expression. These are
+evaluated in the same way as \code{\link[dplyr:group_by]{dplyr::group_by()}}
except \code{.add} nor
+\code{.drop} are supported.}
+}
+\value{
+An object of class sedonadb_dataframe
+}
+\description{
+Note that unlike \code{\link[dplyr:group_by]{dplyr::group_by()}}, these groups
are dropped after
+any transformations.
+}
+\examples{
+data.frame(letter = c(rep("a", 3), rep("b", 4), rep("c", 3)), x = 1:10) |>
+ sd_group_by(letter) |>
+ sd_summarise(x = sum(x))
+
+}
diff --git a/r/sedonadb/man/sd_summarise.Rd b/r/sedonadb/man/sd_summarise.Rd
new file mode 100644
index 00000000..9534aba5
--- /dev/null
+++ b/r/sedonadb/man/sd_summarise.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe.R
+\name{sd_summarise}
+\alias{sd_summarise}
+\alias{sd_summarize}
+\title{Aggregate SedonaDB DataFrames to a single row per group}
+\usage{
+sd_summarise(.data, ...)
+
+sd_summarize(.data, ...)
+}
+\arguments{
+\item{.data}{A sedonadb_dataframe or an object that can be coerced to one.}
+
+\item{...}{Aggregate expressions. These are evaluated in the same way as
+\code{\link[dplyr:summarise]{dplyr::summarise()}} except the outer expression
must be an aggregate
+expression (e.g., \code{sum(x) + 1} is not currently possible).}
+}
+\value{
+An object of class sedonadb_dataframe
+}
+\description{
+Aggregate SedonaDB DataFrames to a single row per group
+}
+\examples{
+data.frame(x = c(10:1, NA)) |> sd_summarise(x = sum(x, na.rm = TRUE))
+
+}
diff --git a/r/sedonadb/src/init.c b/r/sedonadb/src/init.c
index 48cc3290..95c44800 100644
--- a/r/sedonadb/src/init.c
+++ b/r/sedonadb/src/init.c
@@ -101,6 +101,11 @@ SEXP savvy_InternalContext_deregister_table__impl(SEXP
self__,
return handle_result(res);
}
+SEXP savvy_InternalContext_list_functions__impl(SEXP self__) {
+ SEXP res = savvy_InternalContext_list_functions__ffi(self__);
+ return handle_result(res);
+}
+
SEXP savvy_InternalContext_new__impl(void) {
SEXP res = savvy_InternalContext_new__ffi();
return handle_result(res);
@@ -134,6 +139,21 @@ SEXP savvy_InternalContext_view__impl(SEXP self__, SEXP
c_arg__table_ref) {
return handle_result(res);
}
+SEXP savvy_InternalDataFrame_aggregate__impl(SEXP self__,
+ SEXP c_arg__group_by_exprs_sexp,
+ SEXP c_arg__exprs_sexp) {
+ SEXP res = savvy_InternalDataFrame_aggregate__ffi(
+ self__, c_arg__group_by_exprs_sexp, c_arg__exprs_sexp);
+ return handle_result(res);
+}
+
+SEXP savvy_InternalDataFrame_arrange__impl(SEXP self__, SEXP c_arg__exprs_sexp,
+ SEXP c_arg__is_descending_sexp) {
+ SEXP res = savvy_InternalDataFrame_arrange__ffi(self__, c_arg__exprs_sexp,
+ c_arg__is_descending_sexp);
+ return handle_result(res);
+}
+
SEXP savvy_InternalDataFrame_collect__impl(SEXP self__, SEXP c_arg__out) {
SEXP res = savvy_InternalDataFrame_collect__ffi(self__, c_arg__out);
return handle_result(res);
@@ -257,6 +277,14 @@ SEXP
savvy_SedonaDBExprFactory_aggregate_function__impl(SEXP self__,
return handle_result(res);
}
+SEXP savvy_SedonaDBExprFactory_any_function__impl(SEXP self__, SEXP
c_arg__name,
+ SEXP c_arg__args,
+ SEXP c_arg__na_rm) {
+ SEXP res = savvy_SedonaDBExprFactory_any_function__ffi(
+ self__, c_arg__name, c_arg__args, c_arg__na_rm);
+ return handle_result(res);
+}
+
SEXP savvy_SedonaDBExprFactory_binary__impl(SEXP self__, SEXP c_arg__op,
SEXP c_arg__lhs, SEXP c_arg__rhs) {
SEXP res = savvy_SedonaDBExprFactory_binary__ffi(self__, c_arg__op,
@@ -304,6 +332,8 @@ static const R_CallMethodDef CallEntries[] = {
(DL_FUNC)&savvy_InternalContext_data_frame_from_table_provider__impl, 2},
{"savvy_InternalContext_deregister_table__impl",
(DL_FUNC)&savvy_InternalContext_deregister_table__impl, 2},
+ {"savvy_InternalContext_list_functions__impl",
+ (DL_FUNC)&savvy_InternalContext_list_functions__impl, 1},
{"savvy_InternalContext_new__impl",
(DL_FUNC)&savvy_InternalContext_new__impl, 0},
{"savvy_InternalContext_read_parquet__impl",
@@ -316,6 +346,10 @@ static const R_CallMethodDef CallEntries[] = {
(DL_FUNC)&savvy_InternalContext_sql__impl, 2},
{"savvy_InternalContext_view__impl",
(DL_FUNC)&savvy_InternalContext_view__impl, 2},
+ {"savvy_InternalDataFrame_aggregate__impl",
+ (DL_FUNC)&savvy_InternalDataFrame_aggregate__impl, 3},
+ {"savvy_InternalDataFrame_arrange__impl",
+ (DL_FUNC)&savvy_InternalDataFrame_arrange__impl, 3},
{"savvy_InternalDataFrame_collect__impl",
(DL_FUNC)&savvy_InternalDataFrame_collect__impl, 2},
{"savvy_InternalDataFrame_compute__impl",
@@ -356,6 +390,8 @@ static const R_CallMethodDef CallEntries[] = {
(DL_FUNC)&savvy_SedonaDBExpr_negate__impl, 1},
{"savvy_SedonaDBExprFactory_aggregate_function__impl",
(DL_FUNC)&savvy_SedonaDBExprFactory_aggregate_function__impl, 5},
+ {"savvy_SedonaDBExprFactory_any_function__impl",
+ (DL_FUNC)&savvy_SedonaDBExprFactory_any_function__impl, 4},
{"savvy_SedonaDBExprFactory_binary__impl",
(DL_FUNC)&savvy_SedonaDBExprFactory_binary__impl, 4},
{"savvy_SedonaDBExprFactory_column__impl",
diff --git a/r/sedonadb/src/rust/api.h b/r/sedonadb/src/rust/api.h
index b43df3f5..f61a5df1 100644
--- a/r/sedonadb/src/rust/api.h
+++ b/r/sedonadb/src/rust/api.h
@@ -30,6 +30,7 @@ SEXP
savvy_InternalContext_data_frame_from_table_provider__ffi(
SEXP self__, SEXP c_arg__provider_xptr);
SEXP savvy_InternalContext_deregister_table__ffi(SEXP self__,
SEXP c_arg__table_ref);
+SEXP savvy_InternalContext_list_functions__ffi(SEXP self__);
SEXP savvy_InternalContext_new__ffi(void);
SEXP savvy_InternalContext_read_parquet__ffi(SEXP self__, SEXP c_arg__paths);
SEXP savvy_InternalContext_register_scalar_udf__ffi(
@@ -39,6 +40,11 @@ SEXP savvy_InternalContext_sql__ffi(SEXP self__, SEXP
c_arg__query);
SEXP savvy_InternalContext_view__ffi(SEXP self__, SEXP c_arg__table_ref);
// methods and associated functions for InternalDataFrame
+SEXP savvy_InternalDataFrame_aggregate__ffi(SEXP self__,
+ SEXP c_arg__group_by_exprs_sexp,
+ SEXP c_arg__exprs_sexp);
+SEXP savvy_InternalDataFrame_arrange__ffi(SEXP self__, SEXP c_arg__exprs_sexp,
+ SEXP c_arg__is_descending_sexp);
SEXP savvy_InternalDataFrame_collect__ffi(SEXP self__, SEXP c_arg__out);
SEXP savvy_InternalDataFrame_compute__ffi(SEXP self__, SEXP c_arg__ctx);
SEXP savvy_InternalDataFrame_count__ffi(SEXP self__);
@@ -76,6 +82,9 @@ SEXP savvy_SedonaDBExprFactory_aggregate_function__ffi(SEXP
self__,
SEXP c_arg__args,
SEXP c_arg__na_rm,
SEXP c_arg__distinct);
+SEXP savvy_SedonaDBExprFactory_any_function__ffi(SEXP self__, SEXP c_arg__name,
+ SEXP c_arg__args,
+ SEXP c_arg__na_rm);
SEXP savvy_SedonaDBExprFactory_binary__ffi(SEXP self__, SEXP c_arg__op,
SEXP c_arg__lhs, SEXP c_arg__rhs);
SEXP savvy_SedonaDBExprFactory_column__ffi(SEXP self__, SEXP c_arg__name,
diff --git a/r/sedonadb/src/rust/src/context.rs
b/r/sedonadb/src/rust/src/context.rs
index a8c2c0e6..ca6ffc45 100644
--- a/r/sedonadb/src/rust/src/context.rs
+++ b/r/sedonadb/src/rust/src/context.rs
@@ -20,7 +20,7 @@ use arrow_array::RecordBatchReader;
use arrow_schema::ArrowError;
use datafusion::catalog::{MemTable, TableProvider};
use datafusion_ffi::udf::FFI_ScalarUDF;
-use savvy::{savvy, savvy_err, IntoExtPtrSexp, Result};
+use savvy::{savvy, savvy_err, IntoExtPtrSexp, OwnedStringSexp, Result};
use sedona::{context::SedonaContext,
record_batch_reader_provider::RecordBatchReaderProvider};
use sedona_geoparquet::provider::GeoParquetReadOptions;
@@ -123,6 +123,16 @@ impl InternalContext {
Ok(())
}
+ pub fn list_functions(&self) -> savvy::Result<savvy::Sexp> {
+ let mut fn_names = Vec::new();
+ let state = self.inner.ctx.state();
+ fn_names.extend(state.scalar_functions().keys());
+ fn_names.extend(state.aggregate_functions().keys());
+
+ let fn_names_sexp = OwnedStringSexp::try_from(fn_names.as_slice())?;
+ fn_names_sexp.into()
+ }
+
pub fn scalar_udf_xptr(&self, name: &str) -> savvy::Result<savvy::Sexp> {
if let Some(udf) = self.inner.ctx.state().scalar_functions().get(name)
{
let ffi_scalar_udf: FFI_ScalarUDF = udf.clone().into();
diff --git a/r/sedonadb/src/rust/src/dataframe.rs
b/r/sedonadb/src/rust/src/dataframe.rs
index 2e24fdfc..d5bdb460 100644
--- a/r/sedonadb/src/rust/src/dataframe.rs
+++ b/r/sedonadb/src/rust/src/dataframe.rs
@@ -332,4 +332,32 @@ impl InternalDataFrame {
Ok(new_data_frame(inner, self.runtime.clone()))
}
+
+ fn arrange(
+ &self,
+ exprs_sexp: savvy::Sexp,
+ is_descending_sexp: savvy::Sexp,
+ ) -> savvy::Result<InternalDataFrame> {
+ let exprs = SedonaDBExprFactory::exprs(exprs_sexp)?;
+ let is_descending_lglsxp =
savvy::LogicalSexp::try_from(is_descending_sexp)?;
+
+ let sort_exprs = zip(exprs, is_descending_lglsxp.iter())
+ .map(|(expr, is_descending)| SortExpr::new(expr, !is_descending,
false))
+ .collect::<Vec<_>>();
+
+ let inner = self.inner.clone().sort(sort_exprs)?;
+ Ok(new_data_frame(inner, self.runtime.clone()))
+ }
+
+ fn aggregate(
+ &self,
+ group_by_exprs_sexp: savvy::Sexp,
+ exprs_sexp: savvy::Sexp,
+ ) -> savvy::Result<InternalDataFrame> {
+ let exprs = SedonaDBExprFactory::exprs(exprs_sexp)?;
+ let group_by_exprs = SedonaDBExprFactory::exprs(group_by_exprs_sexp)?;
+
+ let inner = self.inner.clone().aggregate(group_by_exprs, exprs)?;
+ Ok(new_data_frame(inner, self.runtime.clone()))
+ }
}
diff --git a/r/sedonadb/src/rust/src/expression.rs
b/r/sedonadb/src/rust/src/expression.rs
index e0753fd2..be51a085 100644
--- a/r/sedonadb/src/rust/src/expression.rs
+++ b/r/sedonadb/src/rust/src/expression.rs
@@ -133,6 +133,27 @@ impl SedonaDBExprFactory {
Ok(SedonaDBExpr { inner })
}
+ // Wrapper for the case where a function might be aggregate or scalar
+ fn any_function(
+ &self,
+ name: &str,
+ args: savvy::Sexp,
+ na_rm: Option<bool>,
+ ) -> savvy::Result<SedonaDBExpr> {
+ if self
+ .ctx
+ .ctx
+ .state()
+ .aggregate_functions()
+ .contains_key(name)
+ || na_rm.is_some()
+ {
+ self.aggregate_function(name, args, na_rm, None)
+ } else {
+ self.scalar_function(name, args)
+ }
+ }
+
fn scalar_function(&self, name: &str, args: savvy::Sexp) ->
savvy::Result<SedonaDBExpr> {
if let Some(udf) = self.ctx.ctx.state().scalar_functions().get(name) {
let args = Self::exprs(args)?;
@@ -152,7 +173,7 @@ impl SedonaDBExprFactory {
) -> savvy::Result<SedonaDBExpr> {
if let Some(udf) =
self.ctx.ctx.state().aggregate_functions().get(name) {
let args = Self::exprs(args)?;
- let null_treatment = if na_rm.unwrap_or(true) {
+ let null_treatment = if na_rm.unwrap_or(false) {
NullTreatment::IgnoreNulls
} else {
NullTreatment::RespectNulls
diff --git a/r/sedonadb/tests/testthat/_snaps/dataframe.md
b/r/sedonadb/tests/testthat/_snaps/dataframe.md
new file mode 100644
index 00000000..2164d4c6
--- /dev/null
+++ b/r/sedonadb/tests/testthat/_snaps/dataframe.md
@@ -0,0 +1,28 @@
+# dataframe can be printed
+
+ Code
+ print(df)
+ Output
+ <sedonab_dataframe: NA x 1>
+ +------------+
+ | pt |
+ | geometry |
+ +------------+
+ | POINT(0 1) |
+ +------------+
+ Preview of up to 6 row(s)
+
+---
+
+ Code
+ print(grouped)
+ Output
+ <grouped sedonab_dataframe: NA x 1 | [`x`]>
+ +------------+
+ | pt |
+ | geometry |
+ +------------+
+ | POINT(0 1) |
+ +------------+
+ Preview of up to 6 row(s)
+
diff --git a/r/sedonadb/tests/testthat/_snaps/expression.md
b/r/sedonadb/tests/testthat/_snaps/expression.md
index 5d7452a5..a2c02a0d 100644
--- a/r/sedonadb/tests/testthat/_snaps/expression.md
+++ b/r/sedonadb/tests/testthat/_snaps/expression.md
@@ -110,6 +110,80 @@
<SedonaDBExpr>
abs((- Int32(1)))
+# function calls explicitly referencing DataFusion functions work
+
+ Code
+ sd_eval_expr(quote(.fns$abs(-1L)))
+ Output
+ <SedonaDBExpr>
+ abs((- Int32(1)))
+
+---
+
+ Code
+ sd_eval_expr(quote(.fns$sum(-1L)))
+ Output
+ <SedonaDBExpr>
+ sum((- Int32(1))) RESPECT NULLS
+
+---
+
+ Code
+ sd_eval_expr(quote(.fns$sum(-1L, na.rm = TRUE)))
+ Output
+ <SedonaDBExpr>
+ sum((- Int32(1))) IGNORE NULLS
+
+---
+
+ Code
+ sd_eval_expr(quote(.fns$sum(-1L, na.rm = FALSE)))
+ Output
+ <SedonaDBExpr>
+ sum((- Int32(1))) RESPECT NULLS
+
+---
+
+ Error evaluating translated expression `.fns$absolutely_not(-1L)`
+ Caused by error:
+ ! Scalar UDF 'absolutely_not' not found
+
+---
+
+ Error evaluating translated expression `.fns$absolutely_not(x = -1L)`
+ Caused by error in `sd_eval_datafusion_fn()`:
+ ! Expected unnamed arguments to SedonaDB SQL function but got x
+
+# function calls referencing SedonaDB SQL functions work
+
+ Code
+ sd_eval_expr(quote(st_point(0, 1)))
+ Output
+ <SedonaDBExpr>
+ st_point(Float64(0), Float64(1))
+
+---
+
+ Code
+ sd_eval_expr(quote(st_envelope_agg(st_point(0, 1))))
+ Output
+ <SedonaDBExpr>
+ st_envelope_agg(st_point(Float64(0), Float64(1))) RESPECT NULLS
+
+---
+
+ Code
+ sd_eval_expr(quote(st_envelope_agg(st_point(0, 1), na.rm = TRUE)))
+ Output
+ <SedonaDBExpr>
+ st_envelope_agg(st_point(Float64(0), Float64(1))) IGNORE NULLS
+
+---
+
+ Error evaluating translated expression `st_point(1, y = 2)`
+ Caused by error:
+ ! Expected unnamed arguments to SedonaDB SQL function but got y
+
# function calls without a translation are evaluated in R
Code
diff --git a/r/sedonadb/tests/testthat/test-context.R
b/r/sedonadb/tests/testthat/test-context.R
index f84055c3..0f2647dc 100644
--- a/r/sedonadb/tests/testthat/test-context.R
+++ b/r/sedonadb/tests/testthat/test-context.R
@@ -68,3 +68,8 @@ test_that("configure_proj() errors for invalid inputs", {
"Invalid search path"
)
})
+
+test_that(".fns can have its contents listed", {
+ expect_contains(names(.fns), "st_intersects")
+ expect_contains(.DollarNames(.fns, "st_int"), "st_intersects")
+})
diff --git a/r/sedonadb/tests/testthat/test-dataframe.R
b/r/sedonadb/tests/testthat/test-dataframe.R
index f150a8ea..6697254c 100644
--- a/r/sedonadb/tests/testthat/test-dataframe.R
+++ b/r/sedonadb/tests/testthat/test-dataframe.R
@@ -127,7 +127,10 @@ test_that("dataframe can be converted to an array stream",
{
test_that("dataframe can be printed", {
df <- sd_sql("SELECT ST_Point(0, 1) as pt")
- expect_output(expect_identical(print(df), df), "POINT")
+ expect_snapshot(print(df))
+
+ grouped <- df |> sd_group_by(x = .fns$st_x(pt))
+ expect_snapshot(print(grouped))
})
test_that("dataframe print uses ASCII when requested", {
@@ -354,3 +357,71 @@ test_that("sd_filter() works with dplyr-like filter
syntax", {
data.frame(x = integer())
)
})
+
+test_that("sd_arrange() works with dplyr-like arrange syntax", {
+  df_in <- data.frame(x = 1:10, y = letters[10:1])
+
+  # Zero sort keys: sd_arrange() itself must return the rows unchanged
+  expect_identical(
+    df_in |> sd_arrange() |> sd_collect(),
+    df_in
+  )
+
+  # One sort key (input is already ascending in x)
+  expect_identical(
+    df_in |> sd_arrange(x) |> sd_collect(),
+    df_in
+  )
+
+  # Check descending with desc()
+  expect_identical(
+    df_in |> sd_arrange(desc(x)) |> sd_collect(),
+    data.frame(x = 10:1, y = letters[1:10])
+  )
+
+  # Check descending with dplyr::desc()
+  expect_identical(
+    df_in |> sd_arrange(dplyr::desc(x)) |> sd_collect(),
+    data.frame(x = 10:1, y = letters[1:10])
+  )
+})
+
+test_that("sd_group_by() and sd_ungroup() modify .data$group_by", {
+ df_in <- data.frame(letter = letters[1:10], x = 1:10)
+
+ # Check that we can group
+ grouped <- df_in |> sd_group_by(letter, x)
+ expect_identical(names(grouped$group_by), c("letter", "x"))
+
+ # Check that we can ungroup
+ expect_identical(names(sd_ungroup(grouped)$group_by), NULL)
+
+ # Check that we can ungroup using sd_group_by()
+ expect_identical(names(sd_group_by(grouped)$group_by), NULL)
+})
+
+test_that("sd_summarise() + sd_group_by() works as expected", {
+ df_in <- data.frame(letter = c(rep("a", 3), rep("b", 4), rep("c", 3)), x =
1:10)
+
+ expect_identical(
+ df_in |>
+ sd_group_by(letter) |>
+ sd_summarise(x = sum(x), n = n()) |>
+ sd_arrange(letter) |>
+ sd_collect(),
+ data.frame(
+ letter = c("a", "b", "c"),
+ x = c(1 + 2 + 3, 4 + 5 + 6 + 7, 8 + 9 + 10),
+ n = c(3, 4, 3)
+ )
+ )
+})
+
+test_that("sd_summarise() works with dplyr-like summarise syntax", {
+ df_in <- data.frame(x = as.double(1:10))
+
+ expect_identical(
+ df_in |> sd_summarise(x = sum(x)) |> sd_collect(),
+ data.frame(x = sum(as.double(1:10)))
+ )
+})
diff --git a/r/sedonadb/tests/testthat/test-expression.R
b/r/sedonadb/tests/testthat/test-expression.R
index f0f3d5af..2c282529 100644
--- a/r/sedonadb/tests/testthat/test-expression.R
+++ b/r/sedonadb/tests/testthat/test-expression.R
@@ -58,6 +58,32 @@ test_that("function calls with a translation become function
calls", {
expect_snapshot(sd_eval_expr(quote(base::abs(-1L))))
})
+test_that("function calls explicitly referencing DataFusion functions work", {
+ # Scalar function
+ expect_snapshot(sd_eval_expr(quote(.fns$abs(-1L))))
+
+ # Aggregate function
+ expect_snapshot(sd_eval_expr(quote(.fns$sum(-1L))))
+ expect_snapshot(sd_eval_expr(quote(.fns$sum(-1L, na.rm = TRUE))))
+ expect_snapshot(sd_eval_expr(quote(.fns$sum(-1L, na.rm = FALSE))))
+
+ # Check for a reasonable error if this is not a valid name or we have
+ # named arguments
+ expect_snapshot_error(sd_eval_expr(quote(.fns$absolutely_not(-1L))))
+ expect_snapshot_error(sd_eval_expr(quote(.fns$absolutely_not(x = -1L))))
+})
+
+test_that("function calls referencing SedonaDB SQL functions work", {
+ expect_snapshot(sd_eval_expr(quote(st_point(0, 1))))
+ expect_snapshot(sd_eval_expr(quote(st_envelope_agg(st_point(0, 1)))))
+ expect_snapshot(
+ sd_eval_expr(quote(st_envelope_agg(st_point(0, 1), na.rm = TRUE)))
+ )
+
+ # Check for reasonable errors (named arguments are not allowed)
+ expect_snapshot_error(sd_eval_expr(quote(st_point(1, y = 2))))
+})
+
test_that("function calls without a translation are evaluated in R", {
function_without_a_translation <- function(x) x + 1L
expect_snapshot(sd_eval_expr(quote(function_without_a_translation(1L))))
@@ -78,3 +104,13 @@ test_that("errors that occur during evaluation have
reasonable context", {
function_without_a_translation <- function(x) x + 1L
expect_snapshot(sd_eval_expr(quote(stop("this will error"))), error = TRUE)
})
+
+test_that("unwrap_desc() can unwrap calls to desc()", {
+ expect_identical(
+ unwrap_desc(list(quote(desc(a)), quote(dplyr::desc(b)), quote(c))),
+ list(
+ inner_exprs = list(quote(a), quote(b), quote(c)),
+ is_descending = c(TRUE, TRUE, FALSE)
+ )
+ )
+})